1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.any23.plugin.crawler;
19
20 import edu.uci.ics.crawler4j.crawler.Page;
21
22 import java.util.List;
23 import java.util.regex.Pattern;
24
25 /**
26 * This class hosts shared data structures accessible
27 * to all the {@link DefaultWebCrawler} instances
28 * run by the {@link SiteCrawler}.
29 *
30 * @author Michele Mostarda (mostarda@fbk.eu)
31 */
32 public class SharedData {
33
34 /**
35 * Singleton instance.
36 */
37 private static SharedData instance;
38
39 /**
40 * Crawl seed.
41 */
42 private final String seed;
43
44 /**
45 * Crawl page filter pattern.
46 */
47 private final Pattern pattern;
48
49 /**
50 * List of crawler listeners.
51 */
52 private final List<CrawlerListener> listeners;
53
54 // /**
55 // * Output triple handler.
56 // */
57 // private final TripleHandler tripleHandler;
58
59 /**
60 * @return the singleton instance.
61 */
62 protected static SharedData getInstance() {
63 if(instance == null) throw new IllegalStateException("The configuration has not yet initialized.");
64 return instance;
65 }
66
67 /**
68 * Initializes the crawler data.
69 *
70 * @param seed crawler seed.
71 * @param regex page filter regex.
72 * @param listeners the listeners to be notified of the crawler activity.
73 */
74 protected static void setCrawlData(String seed, Pattern regex, List<CrawlerListener> listeners) {
75 instance = new SharedData(seed, regex, listeners);
76 }
77
78 /**
79 * Internal constructor.
80 *
81 * @param seed
82 * @param pattern
83 * @param listeners
84 */
85 private SharedData(String seed, Pattern pattern, List<CrawlerListener> listeners) {
86 if(seed == null || seed.trim().length() == 0)
87 throw new IllegalArgumentException(
88 String.format("Invalid seed '%s'", seed)
89 );
90
91 this.seed = seed;
92 this.pattern = pattern;
93 this.listeners = listeners;
94 }
95
96 /**
97 * @return crawl seed.
98 */
99 protected String getSeed() {
100 return seed;
101 }
102
103 /**
104 * @return page filter pattern.
105 */
106 protected Pattern getPattern() {
107 return pattern;
108 }
109
110 /**
111 * Notifies all listeners that a page has been discovered.
112 *
113 * @param page the discovered page.
114 */
115 protected void notifyPage(Page page) {
116 for(CrawlerListener listener : listeners) {
117 listener.visitedPage(page);
118 }
119 }
120
121 }