| 1 | |
|
| 2 | |
|
| 3 | |
|
| 4 | |
|
| 5 | |
|
| 6 | |
|
| 7 | |
|
| 8 | |
|
| 9 | |
|
| 10 | |
|
| 11 | |
|
| 12 | |
|
| 13 | |
|
| 14 | |
|
| 15 | |
|
| 16 | |
|
| 17 | |
|
| 18 | |
package org.apache.any23.servlet; |
| 19 | |
|
| 20 | |
import org.apache.any23.Any23; |
| 21 | |
import org.apache.any23.ExtractionReport; |
| 22 | |
import org.apache.any23.extractor.ExtractionException; |
| 23 | |
import org.apache.any23.extractor.ExtractionParameters; |
| 24 | |
import org.apache.any23.filter.IgnoreAccidentalRDFa; |
| 25 | |
import org.apache.any23.source.DocumentSource; |
| 26 | |
import org.apache.any23.validator.SerializationException; |
| 27 | |
import org.apache.any23.validator.ValidationReport; |
| 28 | |
import org.apache.any23.validator.XMLValidationReportSerializer; |
| 29 | |
import org.apache.any23.writer.CompositeTripleHandler; |
| 30 | |
import org.apache.any23.writer.CountingTripleHandler; |
| 31 | |
import org.apache.any23.writer.FormatWriter; |
| 32 | |
import org.apache.any23.writer.ReportingTripleHandler; |
| 33 | |
import org.apache.any23.writer.TripleHandler; |
| 34 | |
import org.apache.any23.writer.WriterRegistry; |
| 35 | |
import sun.security.validator.ValidatorException; |
| 36 | |
|
| 37 | |
import javax.servlet.ServletOutputStream; |
| 38 | |
import javax.servlet.http.HttpServletResponse; |
| 39 | |
import java.io.ByteArrayOutputStream; |
| 40 | |
import java.io.IOException; |
| 41 | |
import java.io.PrintStream; |
| 42 | |
import java.nio.charset.Charset; |
| 43 | |
import java.util.ArrayList; |
| 44 | |
import java.util.List; |
| 45 | |
|
| 46 | |
|
| 47 | |
|
| 48 | |
|
| 49 | |
|
| 50 | |
class WebResponder { |
| 51 | |
|
| 52 | 0 | private static final WriterRegistry writerRegistry = WriterRegistry.getInstance(); |
| 53 | |
|
| 54 | |
|
| 55 | |
|
| 56 | |
|
| 57 | |
private final Any23 runner; |
| 58 | |
|
| 59 | |
|
| 60 | |
|
| 61 | |
|
| 62 | |
private Servlet any23servlet; |
| 63 | |
|
| 64 | |
|
| 65 | |
|
| 66 | |
|
| 67 | |
private HttpServletResponse response; |
| 68 | |
|
| 69 | |
|
| 70 | |
|
| 71 | |
|
| 72 | 0 | private TripleHandler rdfWriter = null; |
| 73 | |
|
| 74 | |
|
| 75 | |
|
| 76 | |
|
| 77 | 0 | private ReportingTripleHandler reporter = null; |
| 78 | |
|
| 79 | |
|
| 80 | |
|
| 81 | |
|
| 82 | 0 | private String outputMediaType = null; |
| 83 | |
|
| 84 | |
|
| 85 | |
|
| 86 | |
|
| 87 | 0 | private ByteArrayOutputStream byteOutStream = new ByteArrayOutputStream(); |
| 88 | |
|
| 89 | 0 | public WebResponder(Servlet any23servlet, HttpServletResponse response) { |
| 90 | 0 | this.any23servlet = any23servlet; |
| 91 | 0 | this.response = response; |
| 92 | 0 | this.runner = new Any23(); |
| 93 | 0 | runner.setHTTPUserAgent("Any23-Servlet"); |
| 94 | 0 | } |
| 95 | |
|
| 96 | |
protected Any23 getRunner() { |
| 97 | 0 | return runner; |
| 98 | |
} |
| 99 | |
|
| 100 | |
public void runExtraction( |
| 101 | |
DocumentSource in, |
| 102 | |
ExtractionParameters eps, |
| 103 | |
String format, |
| 104 | |
boolean report, boolean annotate |
| 105 | |
) throws IOException { |
| 106 | 0 | if (in == null) return; |
| 107 | 0 | if (!initRdfWriter(format, report, annotate)) return; |
| 108 | |
final ExtractionReport er; |
| 109 | |
try { |
| 110 | 0 | er = runner.extract(eps, in, rdfWriter); |
| 111 | 0 | rdfWriter.close(); |
| 112 | 0 | if (! er.hasMatchingExtractors() ) { |
| 113 | 0 | sendError( |
| 114 | |
415, |
| 115 | |
"No suitable extractor found for this media type", |
| 116 | |
null, |
| 117 | |
er.getValidationReport(), |
| 118 | |
report |
| 119 | |
); |
| 120 | 0 | return; |
| 121 | |
} |
| 122 | 0 | } catch (IOException ioe) { |
| 123 | |
|
| 124 | 0 | if (ioe.getCause() != null && ValidatorException.class.equals(ioe.getCause().getClass())) { |
| 125 | 0 | final String errMsg = "Could not fetch input, IO Error."; |
| 126 | 0 | any23servlet.log(errMsg, ioe.getCause()); |
| 127 | 0 | sendError(502, errMsg, ioe, null, report); |
| 128 | 0 | return; |
| 129 | |
} |
| 130 | 0 | any23servlet.log("Could not fetch input", ioe); |
| 131 | 0 | sendError(502, "Could not fetch input.", ioe, null, report); |
| 132 | 0 | return; |
| 133 | 0 | } catch (ExtractionException e) { |
| 134 | |
|
| 135 | 0 | any23servlet.log("Could not parse input", e); |
| 136 | 0 | sendError(502, "Could not parse input.", e, null, report); |
| 137 | 0 | return; |
| 138 | 0 | } catch (Exception e) { |
| 139 | 0 | any23servlet.log("Internal error", e); |
| 140 | 0 | sendError(500, "Internal error.", e, null, report); |
| 141 | 0 | return; |
| 142 | 0 | } |
| 143 | |
|
| 144 | |
|
| 145 | 0 | any23servlet.log("Extraction complete, " + reporter.getTotalTriples() + " triples"); |
| 146 | 0 | if (reporter.getTotalTriples() == 0) { |
| 147 | 0 | sendError( |
| 148 | |
501, |
| 149 | |
"Extraction completed. No triples have been found.", |
| 150 | |
null, |
| 151 | |
er.getValidationReport(), report |
| 152 | |
); |
| 153 | 0 | return; |
| 154 | |
} |
| 155 | |
|
| 156 | |
|
| 157 | 0 | response.setContentType(outputMediaType); |
| 158 | 0 | response.setStatus(200); |
| 159 | |
|
| 160 | 0 | final String charsetEncoding = er.getEncoding(); |
| 161 | 0 | if (Charset.isSupported(charsetEncoding)) { |
| 162 | 0 | response.setCharacterEncoding(er.getEncoding()); |
| 163 | |
} else { |
| 164 | 0 | response.setCharacterEncoding("UTF-8"); |
| 165 | |
} |
| 166 | |
|
| 167 | 0 | final ServletOutputStream sos = response.getOutputStream(); |
| 168 | 0 | final byte[] data = byteOutStream.toByteArray(); |
| 169 | 0 | if(report) { |
| 170 | 0 | final PrintStream ps = new PrintStream(sos); |
| 171 | |
try { |
| 172 | 0 | printHeader(ps); |
| 173 | 0 | printResponse(reporter, er.getValidationReport(), data, ps); |
| 174 | 0 | } catch (Exception e) { |
| 175 | 0 | throw new RuntimeException("An error occurred while serializing the output response.", e); |
| 176 | |
} finally { |
| 177 | 0 | ps.close(); |
| 178 | 0 | } |
| 179 | 0 | } else { |
| 180 | 0 | sos.write(data); |
| 181 | |
} |
| 182 | 0 | } |
| 183 | |
|
| 184 | |
public void sendError(int code, String msg, boolean report) throws IOException { |
| 185 | 0 | sendError(code, msg, null, null, report); |
| 186 | 0 | } |
| 187 | |
|
| 188 | |
private void printHeader(PrintStream ps) { |
| 189 | 0 | ps.println("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>"); |
| 190 | 0 | } |
| 191 | |
|
| 192 | |
private void printResponse(ReportingTripleHandler rth, ValidationReport vr, byte[] data, PrintStream ps) { |
| 193 | 0 | ps.println("<response>"); |
| 194 | 0 | printExtractors(rth, ps); |
| 195 | 0 | printReport(null, null, vr, ps); |
| 196 | 0 | printData(data, ps); |
| 197 | 0 | ps.println("</response>"); |
| 198 | 0 | } |
| 199 | |
|
| 200 | |
private void printExtractors(ReportingTripleHandler rth, PrintStream ps) { |
| 201 | 0 | ps.println("<extractors>"); |
| 202 | 0 | for (String extractor : rth.getExtractorNames()) { |
| 203 | 0 | ps.print("<extractor>"); |
| 204 | 0 | ps.print(extractor); |
| 205 | 0 | ps.println("</extractor>"); |
| 206 | |
} |
| 207 | 0 | ps.println("</extractors>"); |
| 208 | 0 | } |
| 209 | |
|
| 210 | |
private void printReport(String msg, Throwable e, ValidationReport vr, PrintStream ps) { |
| 211 | 0 | XMLValidationReportSerializer reportSerializer = new XMLValidationReportSerializer(); |
| 212 | 0 | ps.println("<report>"); |
| 213 | 0 | ps.printf("<message>%s</message>\n", msg == null ? "" : msg); |
| 214 | 0 | ps.println("<error>"); |
| 215 | 0 | if(e != null) { |
| 216 | 0 | ps.println("<![CDATA["); |
| 217 | 0 | e.printStackTrace(ps); |
| 218 | 0 | ps.println("]]>"); |
| 219 | |
} |
| 220 | 0 | ps.println("</error>"); |
| 221 | |
|
| 222 | |
try { |
| 223 | 0 | reportSerializer.serialize(vr, ps); |
| 224 | 0 | } catch (SerializationException se) { |
| 225 | 0 | ps.println("An error occurred while serializing error."); |
| 226 | 0 | se.printStackTrace(ps); |
| 227 | 0 | } |
| 228 | |
|
| 229 | 0 | ps.println("</report>"); |
| 230 | 0 | } |
| 231 | |
|
| 232 | |
private void printData(byte[] data, PrintStream ps) { |
| 233 | 0 | ps.println("<data>"); |
| 234 | 0 | ps.println("<![CDATA["); |
| 235 | |
try { |
| 236 | 0 | ps.write(data); |
| 237 | 0 | } catch (IOException ioe) { |
| 238 | 0 | ps.println("An error occurred while serializing data."); |
| 239 | 0 | ioe.printStackTrace(ps); |
| 240 | 0 | } |
| 241 | 0 | ps.println("]]>"); |
| 242 | 0 | ps.println("</data>"); |
| 243 | 0 | } |
| 244 | |
|
| 245 | |
private void sendError(int code, String msg, Exception e, ValidationReport vr, boolean report) |
| 246 | |
throws IOException { |
| 247 | 0 | response.setStatus(code); |
| 248 | 0 | response.setContentType("text/plain"); |
| 249 | 0 | final PrintStream ps = new PrintStream(response.getOutputStream()); |
| 250 | 0 | if (report) { |
| 251 | |
try { |
| 252 | 0 | printHeader(ps); |
| 253 | 0 | printReport(msg, e, vr, ps); |
| 254 | |
} finally { |
| 255 | 0 | ps.close(); |
| 256 | 0 | } |
| 257 | |
} else { |
| 258 | 0 | ps.println(msg); |
| 259 | 0 | if (e != null) { |
| 260 | 0 | ps.println("================================================================"); |
| 261 | 0 | e.printStackTrace(ps); |
| 262 | 0 | ps.println("================================================================"); |
| 263 | |
} |
| 264 | |
} |
| 265 | 0 | } |
| 266 | |
|
| 267 | |
private boolean initRdfWriter(String format, boolean report, boolean annotate) throws IOException { |
| 268 | 0 | final FormatWriter fw = getFormatWriter(format, annotate); |
| 269 | 0 | if (fw == null) { |
| 270 | 0 | sendError( |
| 271 | |
400, |
| 272 | |
"Invalid format '" + format + "', try one of: [rdfxml, turtle, ntriples, nquads, trix, json]", |
| 273 | |
null, |
| 274 | |
null, |
| 275 | |
report |
| 276 | |
); |
| 277 | 0 | return false; |
| 278 | |
} |
| 279 | 0 | outputMediaType = WriterRegistry.getMimeType( fw.getClass() ); |
| 280 | 0 | List<TripleHandler> tripleHandlers = new ArrayList<TripleHandler>(); |
| 281 | 0 | tripleHandlers.add(new IgnoreAccidentalRDFa(fw)); |
| 282 | 0 | tripleHandlers.add(new CountingTripleHandler()); |
| 283 | 0 | rdfWriter = new CompositeTripleHandler(tripleHandlers); |
| 284 | 0 | reporter = new ReportingTripleHandler(rdfWriter); |
| 285 | 0 | rdfWriter = reporter; |
| 286 | 0 | return true; |
| 287 | |
} |
| 288 | |
|
| 289 | |
private FormatWriter getFormatWriter(String format, boolean annotate) throws IOException { |
| 290 | |
final String finalFormat; |
| 291 | 0 | if ("rdf".equals(format) || "xml".equals(format) || "rdfxml".equals(format)) { |
| 292 | 0 | finalFormat = "rdfxml"; |
| 293 | 0 | } else if ("turtle".equals(format) || "ttl".equals(format)) { |
| 294 | 0 | finalFormat = "turtle"; |
| 295 | 0 | } else if ("n3".equals(format)) { |
| 296 | 0 | finalFormat = "turtle"; |
| 297 | 0 | } else if ("n-triples".equals(format) || "ntriples".equals(format) || "nt".equals(format)) { |
| 298 | 0 | finalFormat = "ntriples"; |
| 299 | 0 | } else if("nquads".equals(format) || "n-quads".equals(format) || "nq".equals(format)) { |
| 300 | 0 | finalFormat = "nquads"; |
| 301 | 0 | } else if("trix".equals(format)) { |
| 302 | 0 | finalFormat = "trix"; |
| 303 | 0 | } else if("json".equals(format)) { |
| 304 | 0 | finalFormat = "json"; |
| 305 | |
} else { |
| 306 | 0 | return null; |
| 307 | |
} |
| 308 | 0 | final FormatWriter writer = writerRegistry.getWriterInstanceByIdentifier(finalFormat, byteOutStream); |
| 309 | 0 | writer.setAnnotated(annotate); |
| 310 | 0 | return writer; |
| 311 | |
} |
| 312 | |
|
| 313 | |
} |