1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.any23.extractor;
19
20 import org.eclipse.rdf4j.model.IRI;
21 import org.w3c.dom.Document;
22
23 import java.io.IOException;
24 import java.io.InputStream;
25
26 /**
27 * It defines the signature of a generic Extractor.
28 *
29 * @param <Input>
30 * the type of the input data to be processed.
31 */
32 public interface Extractor<Input> {
33
34 /**
35 * This interface specializes an {@link Extractor} able to handle {@link java.net.URI} as input format. Use it if
36 * you need to fetch a document before the extraction
37 */
38 public interface BlindExtractor extends Extractor<IRI> {
39 }
40
41 /**
42 * This interface specializes an {@link Extractor} able to handle {@link java.io.InputStream} as input format.
43 */
44 public interface ContentExtractor extends Extractor<InputStream> {
45
46 /**
47 * If <code>true</code>, the extractor will stop at first parsing error, if<code>false</code> the extractor will
48 * attempt to ignore all parsing errors.
49 *
50 * @param f
51 * tolerance flag.
52 */
53 void setStopAtFirstError(boolean f);
54
55 }
56
57 /**
58 * This interface specializes an {@link Extractor} able to handle {@link org.w3c.dom.Document} as input format.
59 */
60 public interface TagSoupDOMExtractor extends Extractor<Document> {
61 }
62
63 /**
64 * Executes the extractor. Will be invoked only once, extractors are not reusable.
65 *
66 * @param extractionParameters
67 * the parameters to be applied during the extraction.
68 * @param context
69 * The document context.
70 * @param in
71 * The extractor input data.
72 * @param out
73 * the collector for the extracted data.
74 *
75 * @throws IOException
76 * On error while reading from the input stream.
77 * @throws ExtractionException
78 * On other error, such as parse errors.
79 */
80 void run(ExtractionParameters extractionParameters, ExtractionContext context, Input in, ExtractionResult out)
81 throws IOException, ExtractionException;
82
83 /**
84 * Returns a {@link ExtractorDescription} of this extractor.
85 *
86 * @return the object representing the extractor description.
87 */
88 ExtractorDescription getDescription();
89
90 }