1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.any23;
19
20 import org.apache.any23.configuration.Configuration;
21 import org.apache.any23.configuration.DefaultConfiguration;
22 import org.apache.any23.extractor.ExtractionException;
23 import org.apache.any23.extractor.ExtractionParameters;
24 import org.apache.any23.extractor.ExtractorFactory;
25 import org.apache.any23.extractor.ExtractorGroup;
26 import org.apache.any23.extractor.ExtractorRegistryImpl;
27 import org.apache.any23.extractor.SingleDocumentExtraction;
28 import org.apache.any23.extractor.SingleDocumentExtractionReport;
29 import org.apache.any23.http.AcceptHeaderBuilder;
30 import org.apache.any23.http.DefaultHTTPClient;
31 import org.apache.any23.http.DefaultHTTPClientConfiguration;
32 import org.apache.any23.http.HTTPClient;
33 import org.apache.any23.mime.MIMEType;
34 import org.apache.any23.mime.MIMETypeDetector;
35 import org.apache.any23.mime.TikaMIMETypeDetector;
36 import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
37 import org.apache.any23.source.DocumentSource;
38 import org.apache.any23.source.FileDocumentSource;
39 import org.apache.any23.source.HTTPDocumentSource;
40 import org.apache.any23.source.LocalCopyFactory;
41 import org.apache.any23.source.MemCopyFactory;
42 import org.apache.any23.source.StringDocumentSource;
43 import org.apache.any23.writer.TripleHandler;
44 import org.slf4j.Logger;
45 import org.slf4j.LoggerFactory;
46
47 import java.io.File;
48 import java.io.IOException;
49 import java.net.URI;
50 import java.net.URISyntaxException;
51 import java.util.ArrayList;
52 import java.util.Arrays;
53 import java.util.Collection;
54 import java.util.Locale;
55
56 /**
57 * A facade with convenience methods for typical <i>Any23</i> extraction operations.
58 *
59 * @author Richard Cyganiak (richard@cyganiak.de)
60 * @author Michele Mostarda (michele.mostarda@gmail.com)
61 */
62 public class Any23 {
63
64 /**
65 * Any23 core library version. NOTE: there's also a version string in pom.xml, they should match.
66 */
67 public static final String VERSION = DefaultConfiguration.singleton().getPropertyOrFail("any23.core.version");
68
69 /**
70 * Default HTTP User Agent defined in default configuration.
71 */
72 public static final String DEFAULT_HTTP_CLIENT_USER_AGENT = DefaultConfiguration.singleton()
73 .getPropertyOrFail("any23.http.user.agent.default");
74
75 protected static final Logger logger = LoggerFactory.getLogger(Any23.class);
76
77 private final Configuration configuration;
78 private final String defaultUserAgent;
79
80 private MIMETypeDetector mimeTypeDetector = new TikaMIMETypeDetector(new WhiteSpacesPurifier());
81
82 private HTTPClient httpClient = new DefaultHTTPClient();
83
84 private boolean httpClientInitialized = false;
85
86 private final ExtractorGroup factories;
87 private LocalCopyFactory streamCache;
88 private String userAgent;
89
/**
 * Constructor that allows the specification of a custom configuration and of a group of extractors. All the other
 * constructors end up delegating to this one.
 *
 * @param configuration
 *            configuration used to build the <i>Any23</i> instance, must not be <code>null</code>.
 * @param extractorGroup
 *            the group of extractors to be applied; if <code>null</code> the group returned by
 *            <code>ExtractorRegistryImpl.getInstance().getExtractorGroup()</code> is used.
 *
 * @throws NullPointerException
 *             if <code>configuration</code> is <code>null</code>.
 */
public Any23(Configuration configuration, ExtractorGroup extractorGroup) {
    if (configuration == null) {
        throw new NullPointerException("configuration must be not null.");
    }
    this.configuration = configuration;
    // Building the configuration dump is skipped unless debug logging is enabled.
    if (logger.isDebugEnabled()) {
        logger.debug(configuration.getConfigurationDump());
    }

    this.defaultUserAgent = configuration.getPropertyOrFail("any23.http.user.agent.default");

    this.factories = (extractorGroup == null) ? ExtractorRegistryImpl.getInstance().getExtractorGroup()
            : extractorGroup;

    // Assign the field directly instead of going through the public (overridable) setCacheFactory(): a
    // constructor must not call a method that a subclass could override, because the override would run
    // against a partially constructed object. A freshly created MemCopyFactory is never null, so the setter's
    // null check is not needed here.
    this.streamCache = new MemCopyFactory();
}
112
/**
 * Builds an instance that uses the default configuration ({@link DefaultConfiguration#singleton()}) together with
 * the given group of extractors.
 *
 * @param extractorGroup
 *            the group of extractors to be applied.
 */
public Any23(ExtractorGroup extractorGroup) {
    this(DefaultConfiguration.singleton(), extractorGroup);
}
122
/**
 * Builds an instance from a custom configuration and the names of the extractors to apply.
 *
 * @param configuration
 *            a {@link Configuration} object
 * @param extractorNames
 *            names of the extractors to be applied; a <code>null</code> array is forwarded as a <code>null</code>
 *            extractor group.
 */
public Any23(Configuration configuration, String... extractorNames) {
    this(configuration, toExtractorGroup(extractorNames));
}

/**
 * Resolves extractor names into an extractor group through the extractor registry.
 *
 * @param extractorNames
 *            names of the extractors, possibly <code>null</code>.
 *
 * @return the group registered for the given names, or <code>null</code> if <code>extractorNames</code> is
 *         <code>null</code>.
 */
private static ExtractorGroup toExtractorGroup(String[] extractorNames) {
    if (extractorNames == null) {
        return null;
    }
    return ExtractorRegistryImpl.getInstance().getExtractorGroup(Arrays.asList(extractorNames));
}
135
/**
 * Builds an instance that uses the default configuration ({@link DefaultConfiguration#singleton()}) together with
 * the extractors identified by the given names.
 *
 * @param extractorNames
 *            names of the extractors to be applied.
 */
public Any23(String... extractorNames) {
    this(DefaultConfiguration.singleton(), extractorNames);
}
145
/**
 * Builds an instance from a custom {@link Configuration} without naming any extractor: a <code>null</code> name
 * array is passed on to the name-based constructor.
 *
 * @param configuration
 *            a {@link Configuration} object
 */
public Any23(Configuration configuration) {
    // The cast selects the (Configuration, String...) constructor with a null array.
    this(configuration, (String[]) null);
}
155
/**
 * Builds an instance backed by the default configuration, i.e. {@link DefaultConfiguration#singleton()}.
 */
public Any23() {
    this(DefaultConfiguration.singleton());
}
162
163 /**
164 * Sets the <i>HTTP Header User Agent</i>, see <i>RFC 2616-14.43</i>.
165 *
166 * @param userAgent
167 * text describing the user agent.
168 */
169 public void setHTTPUserAgent(String userAgent) {
170 if (httpClientInitialized) {
171 throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
172 }
173 if (userAgent == null) {
174 userAgent = defaultUserAgent;
175 }
176 if (userAgent.trim().length() == 0) {
177 throw new IllegalArgumentException(String.format(Locale.ROOT, "Invalid user agent: '%s'", userAgent));
178 }
179 this.userAgent = userAgent;
180 }
181
182 /**
183 * Returns the <i>HTTP Header User Agent</i>, see <i>RFC 2616-14.43</i>.
184 *
185 * @return text describing the user agent.
186 */
187 public String getHTTPUserAgent() {
188 return this.userAgent;
189 }
190
191 /**
192 * Allows to set the {@link org.apache.any23.http.HTTPClient} implementation used to retrieve contents. The default
193 * instance is {@link org.apache.any23.http.DefaultHTTPClient}.
194 *
195 * @param httpClient
196 * a valid client instance.
197 *
198 * @throws IllegalStateException
199 * if invoked after client has been initialized.
200 */
201 public void setHTTPClient(HTTPClient httpClient) {
202 if (httpClient == null) {
203 throw new NullPointerException("httpClient cannot be null.");
204 }
205 if (httpClientInitialized) {
206 throw new IllegalStateException("Cannot change HTTP configuration after client has been initialized");
207 }
208 this.httpClient = httpClient;
209 }
210
211 /**
212 * Returns the current {@link org.apache.any23.http.HTTPClient} implementation.
213 *
214 * @return instance of HTTPClient.
215 *
216 * @throws IOException
217 * if the HTTP client has not initialized.
218 */
219 public HTTPClient getHTTPClient() throws IOException {
220 if (!httpClientInitialized) {
221 if (userAgent == null) {
222 throw new IOException("Must call " + Any23.class.getSimpleName()
223 + ".setHTTPUserAgent(String) before extracting from HTTP IRI");
224 }
225 httpClient.init(new DefaultHTTPClientConfiguration(this.getAcceptHeader()));
226 httpClientInitialized = true;
227 }
228 return httpClient;
229 }
230
231 /**
232 * Allows to set a {@link org.apache.any23.source.LocalCopyFactory} instance.
233 *
234 * @param cache
235 * valid cache instance.
236 */
237 public void setCacheFactory(LocalCopyFactory cache) {
238 if (cache == null) {
239 throw new NullPointerException("cache cannot be null.");
240 }
241 this.streamCache = cache;
242 }
243
244 /**
245 * Allows to set an instance of {@link org.apache.any23.mime.MIMETypeDetector}.
246 *
247 * @param detector
248 * a valid detector instance, if <code>null</code> all the detectors will be used.
249 */
250 public void setMIMETypeDetector(MIMETypeDetector detector) {
251 this.mimeTypeDetector = detector;
252 }
253
254 /**
255 * <p>
256 * Returns the most appropriate {@link DocumentSource} for the given<code>documentIRI</code>.
257 * </p>
258 * <p>
259 * <b>N.B.</b> <code>documentIRI's</code> <i>should</i> contain a protocol. E.g. <b>http:</b>, <b>https:</b>,
260 * <b>file:</b>
261 * </p>
262 *
263 * @param documentIRI
264 * the document <i>IRI</i>.
265 *
266 * @return a new instance of DocumentSource.
267 *
268 * @throws URISyntaxException
269 * if an error occurs while parsing the <code>documentIRI</code> as a <i>IRI</i>.
270 * @throws IOException
271 * if an error occurs while initializing the internal {@link org.apache.any23.http.HTTPClient}.
272 */
273 public DocumentSource createDocumentSource(String documentIRI) throws URISyntaxException, IOException {
274 if (documentIRI == null)
275 throw new NullPointerException("documentIRI cannot be null.");
276 if (documentIRI.toLowerCase(Locale.ROOT).startsWith("file:")) {
277 return new FileDocumentSource(new File(new URI(documentIRI)));
278 }
279 if (documentIRI.toLowerCase(Locale.ROOT).startsWith("http:")
280 || documentIRI.toLowerCase(Locale.ROOT).startsWith("https:")) {
281 return new HTTPDocumentSource(getHTTPClient(), documentIRI);
282 }
283 throw new IllegalArgumentException(String.format(Locale.ROOT,
284 "Unsupported protocol for document IRI: '%s' . " + "Check that document IRI contains a protocol.",
285 documentIRI));
286 }
287
288 /**
289 * Performs metadata extraction from the content of the given <code>in</code> document source, sending the generated
290 * events to the specified <code>outputHandler</code>.
291 *
292 * @param eps
293 * the extraction parameters to be applied.
294 * @param in
295 * the input document source.
296 * @param outputHandler
297 * handler responsible for collecting of the extracted metadata.
298 * @param encoding
299 * explicit encoding see <a href="http://www.iana.org/assignments/character-sets">available
300 * encodings</a>.
301 *
302 * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
303 *
304 * @throws IOException
305 * if there is an error reading the {@link org.apache.any23.source.DocumentSource}
306 * @throws org.apache.any23.extractor.ExtractionException
307 * if there is an error during extraction
308 */
309 public ExtractionReport extract(ExtractionParameters eps, DocumentSource in, TripleHandler outputHandler,
310 String encoding) throws IOException, ExtractionException {
311 final SingleDocumentExtraction ex = new SingleDocumentExtraction(configuration, in, factories, outputHandler);
312 ex.setMIMETypeDetector(mimeTypeDetector);
313 ex.setLocalCopyFactory(streamCache);
314 ex.setParserEncoding(encoding);
315 final SingleDocumentExtractionReport sder = ex.run(eps);
316 return new ExtractionReport(ex.getMatchingExtractors(), ex.getParserEncoding(), ex.getDetectedMIMEType(),
317 sder.getValidationReport(), sder.getExtractorToIssues());
318 }
319
320 /**
321 * Performs metadata extraction on the <code>in</code> string associated to the <code>documentIRI</code> IRI,
322 * declaring <code>contentType</code> and <code>encoding</code>. The generated events are sent to the specified
323 * <code>outputHandler</code>.
324 *
325 * @param in
326 * raw data to be analyzed.
327 * @param documentIRI
328 * IRI from which the raw data has been extracted.
329 * @param contentType
330 * declared data content type.
331 * @param encoding
332 * declared data encoding.
333 * @param outputHandler
334 * handler responsible for collecting of the extracted metadata.
335 *
336 * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
337 *
338 * @throws IOException
339 * if there is an error reading the {@link org.apache.any23.source.DocumentSource}
340 * @throws org.apache.any23.extractor.ExtractionException
341 * if there is an error during extraction
342 */
343 public ExtractionReport extract(String in, String documentIRI, String contentType, String encoding,
344 TripleHandler outputHandler) throws IOException, ExtractionException {
345 return extract(new StringDocumentSource(in, documentIRI, contentType, encoding), outputHandler);
346 }
347
348 /**
349 * Performs metadata extraction on the <code>in</code> string associated to the <code>documentIRI</code> IRI,
350 * sending the generated events to the specified <code>outputHandler</code>.
351 *
352 * @param in
353 * raw data to be analyzed.
354 * @param documentIRI
355 * IRI from which the raw data has been extracted.
356 * @param outputHandler
357 * handler responsible for collecting of the extracted metadata.
358 *
359 * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
360 *
361 * @throws IOException
362 * if there is an error reading the {@link org.apache.any23.source.DocumentSource}
363 * @throws org.apache.any23.extractor.ExtractionException
364 * if there is an error during extraction
365 */
366 public ExtractionReport extract(String in, String documentIRI, TripleHandler outputHandler)
367 throws IOException, ExtractionException {
368 return extract(new StringDocumentSource(in, documentIRI), outputHandler);
369 }
370
371 /**
372 * Performs metadata extraction from the content of the given <code>file</code> sending the generated events to the
373 * specified <code>outputHandler</code>.
374 *
375 * @param file
376 * file containing raw data.
377 * @param outputHandler
378 * handler responsible for collecting of the extracted metadata.
379 *
380 * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
381 *
382 * @throws IOException
383 * if there is an error reading the {@link org.apache.any23.source.DocumentSource}
384 * @throws org.apache.any23.extractor.ExtractionException
385 * if there is an error during extraction
386 */
387 public ExtractionReport extract(File file, TripleHandler outputHandler) throws IOException, ExtractionException {
388 return extract(new FileDocumentSource(file), outputHandler);
389 }
390
391 /**
392 * Performs metadata extraction from the content of the given <code>documentIRI</code> sending the generated events
393 * to the specified <code>outputHandler</code>. If the <i>IRI</i> is replied with a redirect, the last will be
394 * followed.
395 *
396 * @param eps
397 * the parameters to be applied to the extraction.
398 * @param documentIRI
399 * the IRI from which retrieve document.
400 * @param outputHandler
401 * handler responsible for collecting of the extracted metadata.
402 *
403 * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
404 *
405 * @throws IOException
406 * if there is an error reading the {@link org.apache.any23.source.DocumentSource}
407 * @throws org.apache.any23.extractor.ExtractionException
408 * if there is an error during extraction
409 */
410 public ExtractionReport extract(ExtractionParameters eps, String documentIRI, TripleHandler outputHandler)
411 throws IOException, ExtractionException {
412 try {
413 return extract(eps, createDocumentSource(documentIRI), outputHandler);
414 } catch (URISyntaxException ex) {
415 throw new ExtractionException("Error while extracting data from document IRI.", ex);
416 }
417 }
418
419 /**
420 * Performs metadata extraction from the content of the given <code>documentIRI</code> sending the generated events
421 * to the specified <code>outputHandler</code>. If the <i>IRI</i> is replied with a redirect, the last will be
422 * followed.
423 *
424 * @param documentIRI
425 * the IRI from which retrieve document.
426 * @param outputHandler
427 * handler responsible for collecting of the extracted metadata.
428 *
429 * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
430 *
431 * @throws IOException
432 * if there is an error reading the {@link org.apache.any23.source.DocumentSource}
433 * @throws org.apache.any23.extractor.ExtractionException
434 * if there is an error during extraction
435 */
436 public ExtractionReport extract(String documentIRI, TripleHandler outputHandler)
437 throws IOException, ExtractionException {
438 return extract((ExtractionParameters) null, documentIRI, outputHandler);
439 }
440
441 /**
442 * Performs metadata extraction from the content of the given <code>in</code> document source, sending the generated
443 * events to the specified <code>outputHandler</code>.
444 *
445 * @param in
446 * the input document source.
447 * @param outputHandler
448 * handler responsible for collecting of the extracted metadata.
449 * @param encoding
450 * explicit encoding see <a href="http://www.iana.org/assignments/character-sets">available
451 * encodings</a>.
452 *
453 * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
454 *
455 * @throws IOException
456 * if there is an error reading the {@link org.apache.any23.source.DocumentSource}
457 * @throws org.apache.any23.extractor.ExtractionException
458 * if there is an error during extraction
459 */
460 public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler, String encoding)
461 throws IOException, ExtractionException {
462 return extract(null, in, outputHandler, encoding);
463 }
464
465 /**
466 * Performs metadata extraction from the content of the given <code>in</code> document source, sending the generated
467 * events to the specified <code>outputHandler</code>.
468 *
469 * @param in
470 * the input document source.
471 * @param outputHandler
472 * handler responsible for collecting of the extracted metadata.
473 *
474 * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
475 *
476 * @throws IOException
477 * if there is an error reading the {@link org.apache.any23.source.DocumentSource}
478 * @throws org.apache.any23.extractor.ExtractionException
479 * if there is an error during extraction
480 */
481 public ExtractionReport extract(DocumentSource in, TripleHandler outputHandler)
482 throws IOException, ExtractionException {
483 return extract(null, in, outputHandler, null);
484 }
485
486 /**
487 * Performs metadata extraction from the content of the given <code>in</code> document source, sending the generated
488 * events to the specified <code>outputHandler</code>.
489 *
490 * @param eps
491 * the parameters to be applied for the extraction phase.
492 * @param in
493 * the input document source.
494 * @param outputHandler
495 * handler responsible for collecting of the extracted metadata.
496 *
497 * @return <code>true</code> if some extraction occurred, <code>false</code> otherwise.
498 *
499 * @throws IOException
500 * if there is an error reading the {@link org.apache.any23.source.DocumentSource}
501 * @throws org.apache.any23.extractor.ExtractionException
502 * if there is an error during extraction
503 */
504 public ExtractionReport extract(ExtractionParameters eps, DocumentSource in, TripleHandler outputHandler)
505 throws IOException, ExtractionException {
506 return extract(eps, in, outputHandler, null);
507 }
508
509 private String getAcceptHeader() {
510 Collection<MIMEType> mimeTypes = new ArrayList<>();
511 for (ExtractorFactory<?> factory : factories) {
512 mimeTypes.addAll(factory.getSupportedMIMETypes());
513 }
514 return new AcceptHeaderBuilder(mimeTypes).getAcceptHeader();
515 }
516
517 }