| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| EntityBasedMicroformatExtractor |
|
| 1.2;1.2 |
| 1 | /* | |
| 2 | * Licensed to the Apache Software Foundation (ASF) under one or more | |
| 3 | * contributor license agreements. See the NOTICE file distributed with | |
| 4 | * this work for additional information regarding copyright ownership. | |
| 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 | |
| 6 | * (the "License"); you may not use this file except in compliance with | |
| 7 | * the License. You may obtain a copy of the License at | |
| 8 | * | |
| 9 | * http://www.apache.org/licenses/LICENSE-2.0 | |
| 10 | * | |
| 11 | * Unless required by applicable law or agreed to in writing, software | |
| 12 | * distributed under the License is distributed on an "AS IS" BASIS, | |
| 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| 14 | * See the License for the specific language governing permissions and | |
| 15 | * limitations under the License. | |
| 16 | */ | |
| 17 | ||
| 18 | package org.apache.any23.extractor.html; | |
| 19 | ||
| 20 | import org.apache.any23.extractor.ExtractionException; | |
| 21 | import org.apache.any23.extractor.ExtractionResult; | |
| 22 | import org.apache.any23.rdf.RDFUtils; | |
| 23 | import org.openrdf.model.BNode; | |
| 24 | import org.w3c.dom.Node; | |
| 25 | ||
| 26 | import java.util.List; | |
| 27 | ||
| 28 | /** | |
| 29 | * Base class for microformat extractors based on entities. | |
| 30 | * | |
| 31 | * @author Gabriele Renzi | |
| 32 | */ | |
| 33 | 0 | public abstract class EntityBasedMicroformatExtractor extends MicroformatExtractor { |
| 34 | ||
| 35 | /** | |
| 36 | * Returns the base class name for the extractor. | |
| 37 | * | |
| 38 | * @return a string containing the base of the extractor. | |
| 39 | */ | |
| 40 | protected abstract String getBaseClassName(); | |
| 41 | ||
| 42 | /** | |
| 43 | * Resets the internal status of the extractor to prepare it to a new extraction section. | |
| 44 | */ | |
| 45 | protected abstract void resetExtractor(); | |
| 46 | ||
| 47 | /** | |
| 48 | * Extracts an entity from a <i>DOM</i> node. | |
| 49 | * | |
| 50 | * @param node the DOM node. | |
| 51 | * @param out the extraction result collector. | |
| 52 | * @return <code>true</code> if the extraction has produces something, <code>false</code> otherwise. | |
| 53 | * @throws ExtractionException | |
| 54 | */ | |
| 55 | protected abstract boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException; | |
| 56 | ||
| 57 | @Override | |
| 58 | public boolean extract() throws ExtractionException { | |
| 59 | 0 | List<Node> nodes = DomUtils.findAllByClassName( getHTMLDocument().getDocument(), getBaseClassName()); |
| 60 | 0 | boolean foundAny = false; |
| 61 | 0 | int count = 1; |
| 62 | 0 | for (Node node : nodes) { |
| 63 | 0 | resetExtractor(); |
| 64 | 0 | String contextID = Integer.toString(count); |
| 65 | 0 | ExtractionResult subResult = openSubResult( getExtractionContext().copy(contextID) ); |
| 66 | 0 | foundAny |= extractEntity(node, subResult); |
| 67 | 0 | subResult.close(); |
| 68 | 0 | count++; |
| 69 | 0 | } |
| 70 | 0 | return foundAny; |
| 71 | } | |
| 72 | ||
| 73 | /** | |
| 74 | * @param node a DOM node representing a blank node | |
| 75 | * @return an RDF blank node corresponding to that DOM node, by using a | |
| 76 | * blank node ID like "MD5 of http://doc-uri/#xpath/to/node" | |
| 77 | */ | |
| 78 | protected BNode getBlankNodeFor(Node node) { | |
| 79 | 0 | return RDFUtils.getBNode(getDocumentURI() + "#" + DomUtils.getXPathForNode(node)); |
| 80 | } | |
| 81 | ||
| 82 | } |