| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| SharedData |
|
| 2.0;2 |
| 1 | /* | |
| 2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
| 3 | * contributor license agreements. See the NOTICE file distributed with | |
| 4 | * this work for additional information regarding copyright ownership. | |
| 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
| 6 | * (the "License"); you may not use this file except in compliance with | |
| 7 | * the License. You may obtain a copy of the License at | |
| 8 | * | |
| 9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 | * | |
| 11 | * Unless required by applicable law or agreed to in writing, software | |
| 12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 | * See the License for the specific language governing permissions and | |
| 15 | * limitations under the License. | |
| 16 | */ | |
| 17 | ||
| 18 | package org.apache.any23.plugin.crawler; | |
| 19 | ||
| 20 | import edu.uci.ics.crawler4j.crawler.Page; | |
| 21 | ||
| 22 | import java.util.List; | |
| 23 | import java.util.regex.Pattern; | |
| 24 | ||
| 25 | /** | |
| 26 | * This class hosts shared data structures accessible | |
| 27 | * to all the {@link DefaultWebCrawler} instances | |
| 28 | * run by the {@link SiteCrawler}. | |
| 29 | * | |
| 30 | * @author Michele Mostarda (mostarda@fbk.eu) | |
| 31 | */ | |
| 32 | public class SharedData { | |
| 33 | ||
| 34 | /** | |
| 35 | * Singleton instance. | |
| 36 | */ | |
| 37 | private static SharedData instance; | |
| 38 | ||
| 39 | /** | |
| 40 | * Crawl seed. | |
| 41 | */ | |
| 42 | private final String seed; | |
| 43 | ||
| 44 | /** | |
| 45 | * Crawl page filter pattern. | |
| 46 | */ | |
| 47 | private final Pattern pattern; | |
| 48 | ||
| 49 | /** | |
| 50 | * List of crawler listeners. | |
| 51 | */ | |
| 52 | private final List<CrawlerListener> listeners; | |
| 53 | ||
| 54 | // /** | |
| 55 | // * Output triple handler. | |
| 56 | // */ | |
| 57 | // private final TripleHandler tripleHandler; | |
| 58 | ||
| 59 | /** | |
| 60 | * @return the singleton instance. | |
| 61 | */ | |
| 62 | protected static SharedData getInstance() { | |
| 63 | 0 | if(instance == null) throw new IllegalStateException("The configuration has not yet initialized."); |
| 64 | 0 | return instance; |
| 65 | } | |
| 66 | ||
| 67 | /** | |
| 68 | * Initializes the crawler data. | |
| 69 | * | |
| 70 | * @param seed crawler seed. | |
| 71 | * @param regex page filter regex. | |
| 72 | * @param listeners the listeners to be notified of the crawler activity. | |
| 73 | */ | |
| 74 | protected static void setCrawlData(String seed, Pattern regex, List<CrawlerListener> listeners) { | |
| 75 | 0 | instance = new SharedData(seed, regex, listeners); |
| 76 | 0 | } |
| 77 | ||
| 78 | /** | |
| 79 | * Internal constructor. | |
| 80 | * | |
| 81 | * @param seed | |
| 82 | * @param pattern | |
| 83 | * @param listeners | |
| 84 | */ | |
| 85 | 0 | private SharedData(String seed, Pattern pattern, List<CrawlerListener> listeners) { |
| 86 | 0 | if(seed == null || seed.trim().length() == 0) |
| 87 | 0 | throw new IllegalArgumentException( |
| 88 | String.format("Invalid seed '%s'", seed) | |
| 89 | ); | |
| 90 | ||
| 91 | 0 | this.seed = seed; |
| 92 | 0 | this.pattern = pattern; |
| 93 | 0 | this.listeners = listeners; |
| 94 | 0 | } |
| 95 | ||
| 96 | /** | |
| 97 | * @return crawl seed. | |
| 98 | */ | |
| 99 | protected String getSeed() { | |
| 100 | 0 | return seed; |
| 101 | } | |
| 102 | ||
| 103 | /** | |
| 104 | * @return page filter pattern. | |
| 105 | */ | |
| 106 | protected Pattern getPattern() { | |
| 107 | 0 | return pattern; |
| 108 | } | |
| 109 | ||
| 110 | /** | |
| 111 | * Notifies all listeners that a page has been discovered. | |
| 112 | * | |
| 113 | * @param page the discovered page. | |
| 114 | */ | |
| 115 | protected void notifyPage(Page page) { | |
| 116 | 0 | for(CrawlerListener listener : listeners) { |
| 117 | 0 | listener.visitedPage(page); |
| 118 | } | |
| 119 | 0 | } |
| 120 | ||
| 121 | } |