| 1 | |
|
| 2 | |
|
| 3 | |
|
| 4 | |
|
| 5 | |
|
| 6 | |
|
| 7 | |
|
| 8 | |
|
| 9 | |
|
| 10 | |
|
| 11 | |
|
| 12 | |
|
| 13 | |
|
| 14 | |
|
| 15 | |
|
| 16 | |
|
| 17 | |
|
| 18 | |
package org.apache.any23.extractor.html; |
| 19 | |
|
| 20 | |
import org.apache.any23.extractor.ExtractionException; |
| 21 | |
import org.apache.any23.rdf.Any23ValueFactoryWrapper; |
| 22 | |
import org.apache.any23.rdf.RDFUtils; |
| 23 | |
import org.openrdf.model.URI; |
| 24 | |
import org.openrdf.model.impl.ValueFactoryImpl; |
| 25 | |
import org.slf4j.Logger; |
| 26 | |
import org.slf4j.LoggerFactory; |
| 27 | |
import org.w3c.dom.NamedNodeMap; |
| 28 | |
import org.w3c.dom.Node; |
| 29 | |
import org.w3c.dom.NodeList; |
| 30 | |
import org.w3c.dom.Text; |
| 31 | |
|
| 32 | |
import javax.xml.xpath.XPath; |
| 33 | |
import javax.xml.xpath.XPathConstants; |
| 34 | |
import javax.xml.xpath.XPathExpressionException; |
| 35 | |
import javax.xml.xpath.XPathFactory; |
| 36 | |
import java.net.URISyntaxException; |
| 37 | |
import java.util.ArrayList; |
| 38 | |
import java.util.List; |
| 39 | |
|
| 40 | |
|
| 41 | |
|
| 42 | |
|
| 43 | |
|
| 44 | |
|
| 45 | |
|
| 46 | |
|
| 47 | |
public class HTMLDocument { |
| 48 | |
|
| 49 | 0 | private final static XPath xPathEngine = XPathFactory.newInstance().newXPath(); |
| 50 | 0 | private final static Logger log = LoggerFactory.getLogger(HTMLDocument.class); |
| 51 | |
|
| 52 | |
private Node document; |
| 53 | |
private java.net.URI baseURI; |
| 54 | |
|
| 55 | 0 | private final Any23ValueFactoryWrapper valueFactory = |
| 56 | |
new Any23ValueFactoryWrapper(ValueFactoryImpl.getInstance()); |
| 57 | |
|
| 58 | |
|
| 59 | |
|
| 60 | |
|
| 61 | |
|
| 62 | |
|
| 63 | |
|
| 64 | |
public static TextField readTextField(Node node) { |
| 65 | |
TextField result; |
| 66 | 0 | final String name = node.getNodeName(); |
| 67 | 0 | final NamedNodeMap attributes = node.getAttributes(); |
| 68 | |
|
| 69 | 0 | if (attributes == null ) { |
| 70 | 0 | return new TextField( node.getTextContent(), node); |
| 71 | |
} |
| 72 | |
|
| 73 | 0 | List<Node> values = DomUtils.findAllByClassName(node, "value"); |
| 74 | 0 | if (!values.isEmpty()) { |
| 75 | 0 | String val = ""; |
| 76 | 0 | for (Node n : values) |
| 77 | 0 | val += n.getTextContent(); |
| 78 | 0 | return new TextField( val.trim(), node); |
| 79 | |
} |
| 80 | 0 | if ("ABBR".equals(name) && (null != attributes.getNamedItem("title"))) { |
| 81 | 0 | result = new TextField(attributes.getNamedItem("title").getNodeValue(), node); |
| 82 | 0 | } else if ("A".equals(name)) { |
| 83 | 0 | if (DomUtils.hasAttribute(node, "rel", "tag")) { |
| 84 | 0 | String href = extractRelTag(attributes); |
| 85 | 0 | result = new TextField(href, node); |
| 86 | 0 | } else |
| 87 | 0 | result = new TextField(node.getTextContent(), node); |
| 88 | 0 | } else if ("IMG".equals(name) || "AREA".equals(name)) { |
| 89 | 0 | result = new TextField(attributes.getNamedItem("alt").getNodeValue(), node); |
| 90 | |
} else { |
| 91 | 0 | result = new TextField(node.getTextContent(), node); |
| 92 | |
} |
| 93 | 0 | return result; |
| 94 | |
} |
| 95 | |
|
| 96 | |
|
| 97 | |
|
| 98 | |
|
| 99 | |
|
| 100 | |
|
| 101 | |
|
| 102 | |
public static void readUrlField(List<TextField> res, Node node) { |
| 103 | 0 | String name = node.getNodeName(); |
| 104 | 0 | NamedNodeMap attributes = node.getAttributes(); |
| 105 | 0 | if (null == attributes) { |
| 106 | 0 | res.add( new TextField(node.getTextContent(), node) ); |
| 107 | 0 | return; |
| 108 | |
} |
| 109 | 0 | if ("A".equals(name) || "AREA".equals(name)) { |
| 110 | 0 | Node n = attributes.getNamedItem("href"); |
| 111 | 0 | res.add( new TextField(n.getNodeValue(), n) ); |
| 112 | 0 | } else if ("ABBR".equals(name)) { |
| 113 | 0 | Node n = attributes.getNamedItem("title"); |
| 114 | 0 | res.add( new TextField(n.getNodeValue(), n) ); |
| 115 | 0 | } else if ("IMG".equals(name)) { |
| 116 | 0 | Node n = attributes.getNamedItem("src"); |
| 117 | 0 | res.add( new TextField(n.getNodeValue(), n) ); |
| 118 | 0 | } else if ("OBJECT".equals(name)) { |
| 119 | 0 | Node n = attributes.getNamedItem("data"); |
| 120 | 0 | res.add( new TextField(n.getNodeValue(), n) ); |
| 121 | 0 | } else { |
| 122 | 0 | res.add( new TextField(node.getTextContent().trim(), node) ); |
| 123 | |
} |
| 124 | 0 | } |
| 125 | |
|
| 126 | |
|
| 127 | |
|
| 128 | |
|
| 129 | |
|
| 130 | |
|
| 131 | |
|
| 132 | |
|
| 133 | |
public static String extractRelTag(String hrefAttributeContent) { |
| 134 | 0 | String[] all = hrefAttributeContent.split("[#?]"); |
| 135 | |
|
| 136 | 0 | String path = all[0]; |
| 137 | 0 | int pathLenghtMin1 = path.length() - 1; |
| 138 | 0 | if( '/' == path.charAt(pathLenghtMin1) ) { |
| 139 | 0 | path = path.substring(0, pathLenghtMin1); |
| 140 | |
} |
| 141 | 0 | return path; |
| 142 | |
} |
| 143 | |
|
| 144 | |
|
| 145 | |
|
| 146 | |
|
| 147 | |
|
| 148 | |
|
| 149 | |
|
| 150 | |
|
| 151 | |
public static String extractRelTag(NamedNodeMap attributes) { |
| 152 | 0 | return extractRelTag(attributes.getNamedItem("href").getNodeValue()); |
| 153 | |
} |
| 154 | |
|
| 155 | |
|
| 156 | |
|
| 157 | |
|
| 158 | |
|
| 159 | |
|
| 160 | |
|
| 161 | |
|
| 162 | |
|
| 163 | |
|
| 164 | |
public static String readNodeContent(Node node, boolean prettify) { |
| 165 | 0 | final String content = node.getTextContent(); |
| 166 | 0 | return prettify ? content.trim().replaceAll("\\n", " ").replaceAll(" +", " ") : content; |
| 167 | |
} |
| 168 | |
|
| 169 | |
|
| 170 | |
|
| 171 | |
|
| 172 | |
|
| 173 | |
|
| 174 | 0 | public HTMLDocument(Node document) { |
| 175 | 0 | if (null == document) |
| 176 | 0 | throw new IllegalArgumentException("node cannot be null when constructing an HTMLDocument"); |
| 177 | 0 | this.document = document; |
| 178 | 0 | } |
| 179 | |
|
| 180 | |
|
| 181 | |
|
| 182 | |
|
| 183 | |
|
| 184 | |
public URI resolveURI(String uri) throws ExtractionException { |
| 185 | 0 | return valueFactory.resolveURI(uri, getBaseURI()); |
| 186 | |
} |
| 187 | |
|
| 188 | |
public String find(String xpath) { |
| 189 | 0 | return DomUtils.find(getDocument(), xpath); |
| 190 | |
} |
| 191 | |
|
| 192 | |
public Node findNodeById(String id) { |
| 193 | 0 | return DomUtils.findNodeById(getDocument(), id); |
| 194 | |
} |
| 195 | |
|
| 196 | |
public List<Node> findAll(String xpath) { |
| 197 | 0 | return DomUtils.findAll(getDocument(), xpath); |
| 198 | |
} |
| 199 | |
|
| 200 | |
public String findMicroformattedValue( |
| 201 | |
String objectTag, |
| 202 | |
String object, |
| 203 | |
String fieldTag, |
| 204 | |
String field, |
| 205 | |
String key |
| 206 | |
) { |
| 207 | 0 | Node node = findMicroformattedObjectNode(objectTag, object); |
| 208 | 0 | if (null == node) |
| 209 | 0 | return ""; |
| 210 | |
|
| 211 | 0 | if (DomUtils.hasClassName(node, field)) |
| 212 | 0 | return node.getTextContent(); |
| 213 | |
|
| 214 | |
|
| 215 | |
try { |
| 216 | 0 | String xpath = ".//" + fieldTag + "[contains(@class, '" + field + "')]/" + key; |
| 217 | 0 | String value = (String) xPathEngine.evaluate(xpath, node, XPathConstants.STRING); |
| 218 | 0 | if (null == value) { |
| 219 | 0 | return ""; |
| 220 | |
} |
| 221 | 0 | return value; |
| 222 | 0 | } catch (XPathExpressionException ex) { |
| 223 | 0 | throw new RuntimeException("Should not happen, XPath expression is built locally", ex); |
| 224 | |
} |
| 225 | |
|
| 226 | |
} |
| 227 | |
|
| 228 | |
public Node getDocument() { |
| 229 | 0 | return document; |
| 230 | |
} |
| 231 | |
|
| 232 | |
|
| 233 | |
|
| 234 | |
|
| 235 | |
|
| 236 | |
|
| 237 | |
|
| 238 | |
|
| 239 | |
public TextField getSingularTextField(String className) { |
| 240 | 0 | TextField[] res = getPluralTextField(className); |
| 241 | 0 | if (res.length == 0) |
| 242 | 0 | return new TextField("", null); |
| 243 | 0 | return res[0]; |
| 244 | |
} |
| 245 | |
|
| 246 | |
|
| 247 | |
|
| 248 | |
|
| 249 | |
|
| 250 | |
|
| 251 | |
|
| 252 | |
public TextField[] getPluralTextField(String className) { |
| 253 | 0 | List<TextField> res = new ArrayList<TextField>(); |
| 254 | 0 | List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className); |
| 255 | 0 | for (Node node : nodes) { |
| 256 | 0 | res.add( readTextField(node) ); |
| 257 | |
} |
| 258 | 0 | return res.toArray( new TextField[res.size()] ); |
| 259 | |
} |
| 260 | |
|
| 261 | |
|
| 262 | |
|
| 263 | |
|
| 264 | |
|
| 265 | |
|
| 266 | |
|
| 267 | |
|
| 268 | |
public TextField getSingularUrlField(String className) { |
| 269 | 0 | TextField[] res = getPluralUrlField(className); |
| 270 | 0 | if (res.length < 1) |
| 271 | 0 | return new TextField("", null); |
| 272 | 0 | return res[0]; |
| 273 | |
} |
| 274 | |
|
| 275 | |
|
| 276 | |
|
| 277 | |
|
| 278 | |
|
| 279 | |
|
| 280 | |
|
| 281 | |
public TextField[] getPluralUrlField(String className) { |
| 282 | 0 | List<TextField> res = new ArrayList<TextField>(); |
| 283 | 0 | List<Node> nodes = DomUtils.findAllByClassName(getDocument(), className); |
| 284 | 0 | for (Node node : nodes) |
| 285 | 0 | readUrlField(res, node); |
| 286 | 0 | return res.toArray( new TextField[res.size()] ); |
| 287 | |
} |
| 288 | |
|
| 289 | |
public Node findMicroformattedObjectNode(String objectTag, String name) { |
| 290 | 0 | List<Node> nodes = DomUtils.findAllByTagAndClassName(getDocument(), objectTag, name); |
| 291 | 0 | if (nodes.isEmpty()) |
| 292 | 0 | return null; |
| 293 | 0 | return nodes.get(0); |
| 294 | |
} |
| 295 | |
|
| 296 | |
|
| 297 | |
|
| 298 | |
|
| 299 | |
|
| 300 | |
|
| 301 | |
|
| 302 | |
|
| 303 | |
public String readAttribute(String attribute) { |
| 304 | 0 | return DomUtils.readAttribute(getDocument(), attribute); |
| 305 | |
} |
| 306 | |
|
| 307 | |
|
| 308 | |
|
| 309 | |
|
| 310 | |
|
| 311 | |
|
| 312 | |
|
| 313 | |
public List<Node> findAllByClassName(String clazz) { |
| 314 | 0 | return DomUtils.findAllByClassName(getDocument(), clazz); |
| 315 | |
} |
| 316 | |
|
| 317 | |
|
| 318 | |
|
| 319 | |
|
| 320 | |
|
| 321 | |
|
| 322 | |
|
| 323 | |
public String getText() { |
| 324 | 0 | NodeList children = getDocument().getChildNodes(); |
| 325 | 0 | if(children.getLength() == 1 && children.item(0) instanceof Text) { |
| 326 | 0 | return children.item(0).getTextContent(); |
| 327 | |
} |
| 328 | 0 | return null; |
| 329 | |
} |
| 330 | |
|
| 331 | |
|
| 332 | |
|
| 333 | |
|
| 334 | |
|
| 335 | |
|
| 336 | |
public String getDefaultLanguage() { |
| 337 | 0 | final String xpathLanguageSelector = "/HTML"; |
| 338 | |
Node html; |
| 339 | |
try { |
| 340 | 0 | html = (Node) xPathEngine.evaluate(xpathLanguageSelector, document, XPathConstants.NODE); |
| 341 | 0 | } catch (XPathExpressionException xpeee) { |
| 342 | 0 | throw new IllegalStateException(); |
| 343 | 0 | } |
| 344 | 0 | if (html == null) { |
| 345 | 0 | return null; |
| 346 | |
} |
| 347 | 0 | Node langAttribute = html.getAttributes().getNamedItem("xml:lang"); |
| 348 | 0 | return langAttribute == null ? null : langAttribute.getTextContent(); |
| 349 | |
} |
| 350 | |
|
| 351 | |
|
| 352 | |
|
| 353 | |
|
| 354 | |
|
| 355 | |
|
| 356 | |
public String[] getPathToLocalRoot() { |
| 357 | 0 | return DomUtils.getXPathListForNode(document); |
| 358 | |
} |
| 359 | |
|
| 360 | |
|
| 361 | |
|
| 362 | |
|
| 363 | |
|
| 364 | |
|
| 365 | |
public TextField[] extractRelTagNodes() { |
| 366 | 0 | final List<Node> relTagNodes = DomUtils.findAllByAttributeName(getDocument(), "rel"); |
| 367 | 0 | final List<TextField> result = new ArrayList<TextField>(); |
| 368 | 0 | for(Node relTagNode : relTagNodes) { |
| 369 | 0 | readUrlField(result, relTagNode); |
| 370 | |
} |
| 371 | 0 | return result.toArray( new TextField[result.size()] ); |
| 372 | |
} |
| 373 | |
|
| 374 | |
private java.net.URI getBaseURI() throws ExtractionException { |
| 375 | 0 | if (baseURI == null) { |
| 376 | |
try { |
| 377 | 0 | if (document.getBaseURI() == null) { |
| 378 | 0 | log.warn("document.getBaseURI() is null, this should not happen"); |
| 379 | |
} |
| 380 | 0 | baseURI = new java.net.URI(RDFUtils.fixAbsoluteURI(document.getBaseURI())); |
| 381 | 0 | } catch (IllegalArgumentException ex) { |
| 382 | 0 | throw new ExtractionException("Error in base URI: " + document.getBaseURI(), ex); |
| 383 | 0 | } catch (URISyntaxException ex) { |
| 384 | 0 | throw new ExtractionException("Error in base URI: " + document.getBaseURI(), ex); |
| 385 | 0 | } |
| 386 | |
} |
| 387 | 0 | return baseURI; |
| 388 | |
} |
| 389 | |
|
| 390 | |
|
| 391 | |
|
| 392 | |
|
| 393 | |
|
| 394 | |
public static class TextField { |
| 395 | |
private String value; |
| 396 | |
private Node source; |
| 397 | |
|
| 398 | 0 | public TextField(String value, Node source) { |
| 399 | 0 | this.value = value; |
| 400 | 0 | this.source = source; |
| 401 | 0 | } |
| 402 | |
|
| 403 | |
public String value() { |
| 404 | 0 | return value; |
| 405 | |
} |
| 406 | |
|
| 407 | |
public Node source() { |
| 408 | 0 | return source; |
| 409 | |
} |
| 410 | |
} |
| 411 | |
|
| 412 | |
} |