1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.any23.extractor;
19
20 import org.apache.any23.extractor.html.MicroformatExtractor;
21 import org.eclipse.rdf4j.model.BNode;
22 import org.eclipse.rdf4j.model.Resource;
23
24 import java.util.Arrays;
25 import java.util.List;
26 import java.util.Locale;
27
28 /**
29 * This interface models a specific {@link ExtractionResult} able to collect property roots generated by <i>HTML
30 * Microformat</i> extractions.
31 *
32 * @author Michele Mostarda (mostarda@fbk.eu)
33 */
34 public interface TagSoupExtractionResult extends ExtractionResult {
35
36 /**
37 * Adds a root property to the extraction result, specifying also the <i>path</i> corresponding to the root of data
38 * which generated the property and the extractor responsible for such addition.
39 *
40 * @param path
41 * the <i>path</i> from the document root to the local root of the data generating the property.
42 * @param root
43 * the property root node.
44 * @param extractor
45 * the extractor responsible of such extraction.
46 */
47 void addResourceRoot(String[] path, Resource root, Class<? extends MicroformatExtractor> extractor);
48
49 /**
50 * Returns all the collected property roots.
51 *
52 * @return an <b>unmodifiable</b> list of {@link TagSoupExtractionResult.ResourceRoot}s.
53 */
54 List<ResourceRoot> getResourceRoots();
55
56 /**
57 * Adds a property path to the list of the extracted data.
58 *
59 * @param extractor
60 * the identifier of the extractor responsible for retrieving such property.
61 * @param propertySubject
62 * the subject of the property.
63 * @param property
64 * the property IRI.
65 * @param object
66 * the property object if any, <code>null</code> otherwise.
67 * @param path
68 * the path of the <i>HTML</i> node from which the property literal has been extracted.
69 */
70 void addPropertyPath(Class<? extends MicroformatExtractor> extractor, Resource propertySubject, Resource property,
71 BNode object, String[] path);
72
73 /**
74 * Returns all the collected property paths.
75 *
76 * @return a valid list of property paths.
77 */
78 List<PropertyPath> getPropertyPaths();
79
80 /**
81 * Defines a property root object.
82 */
83 class ResourceRoot {
84 private String[] path;
85 private Resource root;
86 private Class<? extends MicroformatExtractor> extractor;
87
88 public ResourceRoot(String[] path, Resource root, Class<? extends MicroformatExtractor> extractor) {
89 if (path == null || path.length == 0) {
90 throw new IllegalArgumentException(
91 String.format(Locale.ROOT, "Invalid xpath: '%s'.", Arrays.toString(path)));
92 }
93 if (root == null) {
94 throw new IllegalArgumentException("Invalid root, cannot be null.");
95 }
96 if (extractor == null) {
97 throw new IllegalArgumentException("Invalid extractor, cannot ne null");
98 }
99 this.path = path;
100 this.root = root;
101 this.extractor = extractor;
102 }
103
104 public String[] getPath() {
105 return path;
106 }
107
108 public Resource getRoot() {
109 return root;
110 }
111
112 public Class<? extends MicroformatExtractor> getExtractor() {
113 return extractor;
114 }
115
116 @Override
117 public String toString() {
118 return String.format(Locale.ROOT, "%s-%s-%s %s", this.getClass().getCanonicalName(), Arrays.toString(path),
119 root, extractor);
120 }
121 }
122
123 /**
124 * Defines a property path object.
125 */
126 class PropertyPath {
127
128 private Class<? extends MicroformatExtractor> extractor;
129 private String[] path;
130 private Resource subject;
131 private Resource property;
132 private BNode object;
133
134 public PropertyPath(String[] path, Resource subject, Resource property, BNode object,
135 Class<? extends MicroformatExtractor> extractor) {
136 if (path == null) {
137 throw new NullPointerException("path cannot be null.");
138 }
139 if (subject == null) {
140 throw new NullPointerException("subject cannot be null.");
141 }
142 if (property == null) {
143 throw new NullPointerException("property cannot be null.");
144 }
145 if (extractor == null) {
146 throw new NullPointerException("extractor cannot be null.");
147 }
148 this.path = path;
149 this.subject = subject;
150 this.property = property;
151 this.object = object;
152 this.extractor = extractor;
153 }
154
155 public String[] getPath() {
156 return path;
157 }
158
159 public Resource getSubject() {
160 return subject;
161 }
162
163 public Resource getProperty() {
164 return property;
165 }
166
167 public BNode getObject() {
168 return object;
169 }
170
171 public Class<? extends MicroformatExtractor> getExtractor() {
172 return extractor;
173 }
174
175 @Override
176 public String toString() {
177 return String.format(Locale.ROOT, "%s %s - %s - %s -- %s -->", this.getClass().getCanonicalName(),
178 Arrays.toString(path), extractor, subject, property);
179 }
180 }
181
182 }