| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| DefaultWebCrawler |
|
| 3.5;3.5 |
| 1 | /* | |
| 2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
| 3 | * contributor license agreements. See the NOTICE file distributed with | |
| 4 | * this work for additional information regarding copyright ownership. | |
| 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
| 6 | * (the "License"); you may not use this file except in compliance with | |
| 7 | * the License. You may obtain a copy of the License at | |
| 8 | * | |
| 9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 | * | |
| 11 | * Unless required by applicable law or agreed to in writing, software | |
| 12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 | * See the License for the specific language governing permissions and | |
| 15 | * limitations under the License. | |
| 16 | */ | |
| 17 | ||
| 18 | package org.apache.any23.plugin.crawler; | |
| 19 | ||
| 20 | import edu.uci.ics.crawler4j.crawler.Page; | |
| 21 | import edu.uci.ics.crawler4j.crawler.WebCrawler; | |
| 22 | import edu.uci.ics.crawler4j.url.WebURL; | |
| 23 | import org.slf4j.Logger; | |
| 24 | import org.slf4j.LoggerFactory; | |
| 25 | ||
| 26 | import java.util.regex.Pattern; | |
| 27 | ||
| 28 | /** | |
| 29 | * Default {@link WebCrawler} implementation. | |
| 30 | * | |
| 31 | * @author Michele Mostarda (mostarda@fbk.eu) | |
| 32 | */ | |
| 33 | 0 | public class DefaultWebCrawler extends WebCrawler { |
| 34 | ||
| 35 | 0 | private static final Logger logger = LoggerFactory.getLogger(DefaultWebCrawler.class); |
| 36 | ||
| 37 | /** | |
| 38 | * Shared data reference. | |
| 39 | */ | |
| 40 | 0 | private final SharedData sharedData = SharedData.getInstance(); |
| 41 | ||
| 42 | /** | |
| 43 | * Page filter pattern. | |
| 44 | */ | |
| 45 | 0 | private final Pattern pattern = sharedData.getPattern(); |
| 46 | ||
| 47 | /** | |
| 48 | * Override this method to specify whether the given URL should be visited or not. | |
| 49 | */ | |
| 50 | @Override | |
| 51 | public boolean shouldVisit(WebURL url) { | |
| 52 | 0 | if (url.getURL() == null) return false; |
| 53 | 0 | final String href = url.getURL().toLowerCase(); |
| 54 | 0 | if( ! href.startsWith( sharedData.getSeed() ) ) return false; |
| 55 | 0 | return pattern == null || ! pattern.matcher(href).matches(); |
| 56 | } | |
| 57 | ||
| 58 | /** | |
| 59 | * Override this method to implement the single page processing logic. | |
| 60 | */ | |
| 61 | @Override | |
| 62 | public void visit(Page page) { | |
| 63 | 0 | logger.trace("Visiting page: " + page.getWebURL().getURL()); |
| 64 | 0 | sharedData.notifyPage(page); |
| 65 | 0 | } |
| 66 | ||
| 67 | } | |
| 68 |