1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.any23.extractor.html;
19
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Parser;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
30
31 /**
32 * @author Hans Brende
33 */
34 public class JsoupUtils {
35
36 public static Document parse(InputStream input, String documentIRI, String encoding) throws IOException {
37 // Jsoup doesn't allow null document URIs
38 if (documentIRI == null) {
39 documentIRI = "";
40 }
41
42 // workaround for Jsoup issue #1009
43 if (encoding == null) {
44
45 int c;
46 do {
47 c = input.read();
48 } while (c != -1 && Character.isWhitespace(c));
49
50 if (c != -1) {
51 int capacity = 256;
52 byte[] bytes = new byte[capacity];
53 int length = 0;
54 bytes[length++] = (byte) c;
55
56 if (c == '<') {
57 c = input.read();
58 if (c != -1) {
59 bytes[length++] = (byte) c;
60 if (c == '?') {
61 c = input.read();
62
63 while (c != -1) {
64 if (length == capacity) {
65 capacity *= 2;
66 bytes = Arrays.copyOf(bytes, capacity);
67 }
68 bytes[length++] = (byte) c;
69
70 if (c == '>') {
71 if (length >= 20 && bytes[length - 2] == '?') {
72 String decl = "<" + new String(bytes, 2, length - 4, StandardCharsets.UTF_8)
73 + ">";
74 org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(decl, documentIRI,
75 Parser.xmlParser());
76 for (org.jsoup.nodes.Element el : doc.children()) {
77 if ("xml".equalsIgnoreCase(el.tagName())) {
78 String enc = el.attr("encoding");
79 if (enc != null && !enc.isEmpty()) {
80 encoding = enc;
81 break;
82 }
83 }
84 }
85 }
86 break;
87 }
88
89 c = input.read();
90 }
91 }
92 }
93
94 }
95
96 input = new SequenceInputStream(new ByteArrayInputStream(bytes, 0, length), input);
97 }
98
99 }
100
101 // Use Parser.htmlParser() to parse javascript correctly
102 return Jsoup.parse(input, encoding, documentIRI, Parser.htmlParser());
103 }
104
105 }