| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| Extractor |
|
| 1.0;1 | ||||
| Extractor$BlindExtractor |
|
| 1.0;1 | ||||
| Extractor$ContentExtractor |
|
| 1.0;1 | ||||
| Extractor$TagSoupDOMExtractor |
|
| 1.0;1 |
| 1 | /* | |
| 2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
| 3 | * contributor license agreements. See the NOTICE file distributed with | |
| 4 | * this work for additional information regarding copyright ownership. | |
| 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
| 6 | * (the "License"); you may not use this file except in compliance with | |
| 7 | * the License. You may obtain a copy of the License at | |
| 8 | * | |
| 9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 | * | |
| 11 | * Unless required by applicable law or agreed to in writing, software | |
| 12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 | * See the License for the specific language governing permissions and | |
| 15 | * limitations under the License. | |
| 16 | */ | |
| 17 | ||
| 18 | package org.apache.any23.extractor; | |
| 19 | ||
| 20 | import org.openrdf.model.URI; | |
| 21 | import org.w3c.dom.Document; | |
| 22 | ||
| 23 | import java.io.IOException; | |
| 24 | import java.io.InputStream; | |
| 25 | ||
| 26 | /** | |
| 27 | * It defines the signature of a generic Extractor. | |
| 28 | * | |
| 29 | * @param <Input> the type of the input data to be processed. | |
| 30 | */ | |
| 31 | public interface Extractor<Input> { | |
| 32 | ||
| 33 | /** | |
| 34 | * This interface specializes an {@link Extractor} able to handle | |
| 35 | * {@link org.openrdf.model.URI} as input format. Use it if you need to fetch a document before the extraction. | |
| 36 | */ | |
| 37 | public interface BlindExtractor extends Extractor<URI> { | |
| 38 | } | |
| 39 | ||
| 40 | /** | |
| 41 | * This interface specializes an {@link Extractor} able to handle | |
| 42 | * {@link java.io.InputStream} as input format. | |
| 43 | */ | |
| 44 | public interface ContentExtractor extends Extractor<InputStream> { | |
| 45 | ||
| 46 | /** | |
| 47 | * If <code>true</code>, the extractor will stop at the first parsing error; | |
| 48 | * if <code>false</code>, the extractor will attempt to ignore all parsing errors. | |
| 49 | * | |
| 50 | * @param f <code>true</code> to stop at the first parsing error, <code>false</code> to attempt to ignore parsing errors. | |
| 51 | */ | |
| 52 | void setStopAtFirstError(boolean f); | |
| 53 | ||
| 54 | } | |
| 55 | ||
| 56 | /** | |
| 57 | * This interface specializes an {@link Extractor} able to handle | |
| 58 | * {@link org.w3c.dom.Document} as input format. | |
| 59 | */ | |
| 60 | public interface TagSoupDOMExtractor extends Extractor<Document> { | |
| 61 | } | |
| 62 | ||
| 63 | /** | |
| 64 | * Executes the extractor. Will be invoked only once; extractors are | |
| 65 | * not reusable. | |
| 66 | * | |
| 67 | * @param extractionParameters the parameters to be applied during the extraction. | |
| 68 | * @param context the document context. | |
| 69 | * @param in the extractor input data. | |
| 70 | * @param out the collector for the extracted data. | |
| 71 | * @throws IOException On error while reading from the input stream. | |
| 72 | * @throws ExtractionException On other error, such as parse errors. | |
| 73 | */ | |
| 74 | void run(ExtractionParameters extractionParameters, ExtractionContext context, Input in, ExtractionResult out) | |
| 75 | throws IOException, ExtractionException; | |
| 76 | ||
| 77 | /** | |
| 78 | * Returns a {@link ExtractorDescription} of this extractor. | |
| 79 | * | |
| 80 | * @return the object representing the extractor description. | |
| 81 | */ | |
| 82 | ExtractorDescription getDescription(); | |
| 83 | ||
| 84 | } |