| 1 | |
|
| 2 | |
|
| 3 | |
|
| 4 | |
|
| 5 | |
|
| 6 | |
|
| 7 | |
|
| 8 | |
|
| 9 | |
|
| 10 | |
|
| 11 | |
|
| 12 | |
|
| 13 | |
|
| 14 | |
|
| 15 | |
|
| 16 | |
|
| 17 | |
|
| 18 | |
package org.apache.any23.mime; |
| 19 | |
|
| 20 | |
import org.apache.any23.extractor.csv.CSVReaderBuilder; |
| 21 | |
import org.apache.any23.mime.purifier.Purifier; |
| 22 | |
import org.apache.any23.mime.purifier.WhiteSpacesPurifier; |
| 23 | |
import org.apache.tika.Tika; |
| 24 | |
import org.apache.tika.config.TikaConfig; |
| 25 | |
import org.apache.tika.metadata.Metadata; |
| 26 | |
import org.apache.tika.mime.MimeType; |
| 27 | |
import org.apache.tika.mime.MimeTypeException; |
| 28 | |
import org.apache.tika.mime.MimeTypes; |
| 29 | |
import org.openrdf.rio.RDFParser; |
| 30 | |
import org.openrdf.rio.turtle.TurtleParser; |
| 31 | |
|
| 32 | |
import java.io.BufferedReader; |
| 33 | |
import java.io.ByteArrayInputStream; |
| 34 | |
import java.io.IOException; |
| 35 | |
import java.io.InputStream; |
| 36 | |
import java.io.InputStreamReader; |
| 37 | |
import java.util.regex.Pattern; |
| 38 | |
|
| 39 | |
|
| 40 | |
|
| 41 | |
|
| 42 | |
|
| 43 | |
|
| 44 | |
|
| 45 | |
|
| 46 | |
public class TikaMIMETypeDetector implements MIMETypeDetector { |
| 47 | |
|
| 48 | |
private Purifier purifier; |
| 49 | |
|
| 50 | |
|
| 51 | |
|
| 52 | |
public static final String N3_MIMETYPE = "text/n3"; |
| 53 | |
|
| 54 | |
public static final String NQUADS_MIMETYPE = "text/nq"; |
| 55 | |
|
| 56 | |
public static final String TURTLE_MIMETYPE = "application/turtle"; |
| 57 | |
|
| 58 | |
public static final String CSV_MIMETYPE = "text/csv"; |
| 59 | |
|
| 60 | |
public static final String RESOURCE_NAME = "/org/apache/any23/mime/tika-config.xml"; |
| 61 | |
|
| 62 | |
|
| 63 | |
|
| 64 | |
|
| 65 | 0 | private static final Pattern[] N3_PATTERNS = { |
| 66 | |
Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\." ), |
| 67 | |
Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\." ), |
| 68 | |
Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\." ), |
| 69 | |
Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\.") |
| 70 | |
}; |
| 71 | |
|
| 72 | |
|
| 73 | |
|
| 74 | |
|
| 75 | 0 | private static final Pattern[] NQUADS_PATTERNS = { |
| 76 | |
Pattern.compile("^\\S+\\s*<\\S+>\\s*<\\S+>\\s*\\<\\S+>\\s*\\." ), |
| 77 | |
Pattern.compile("^\\S+\\s*<\\S+>\\s*_:\\S+\\s*\\<\\S+>\\s*\\." ), |
| 78 | |
Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(@\\S+)?\\s*\\<\\S+>\\s*\\." ), |
| 79 | |
Pattern.compile("^\\S+\\s*<\\S+>\\s*\".*\"(\\^\\^\\S+)?\\s*\\<\\S+>\\s*\\.") |
| 80 | |
}; |
| 81 | |
|
| 82 | 0 | private static TikaConfig config = null; |
| 83 | |
|
| 84 | |
private static Tika tika; |
| 85 | |
|
| 86 | |
private static MimeTypes types; |
| 87 | |
|
| 88 | |
|
| 89 | |
|
| 90 | |
|
| 91 | |
|
| 92 | |
|
| 93 | |
|
| 94 | |
|
| 95 | |
public static boolean checkN3Format(InputStream is) throws IOException { |
| 96 | 0 | return findPattern(N3_PATTERNS, '.', is); |
| 97 | |
} |
| 98 | |
|
| 99 | |
|
| 100 | |
|
| 101 | |
|
| 102 | |
|
| 103 | |
|
| 104 | |
|
| 105 | |
|
| 106 | |
public static boolean checkNQuadsFormat(InputStream is) throws IOException { |
| 107 | 0 | return findPattern(NQUADS_PATTERNS, '.', is); |
| 108 | |
} |
| 109 | |
|
| 110 | |
|
| 111 | |
|
| 112 | |
|
| 113 | |
|
| 114 | |
|
| 115 | |
|
| 116 | |
|
| 117 | |
public static boolean checkTurtleFormat(InputStream is) throws IOException { |
| 118 | 0 | String sample = extractDataSample(is, '.'); |
| 119 | 0 | TurtleParser turtleParser = new TurtleParser(); |
| 120 | 0 | turtleParser.setDatatypeHandling(RDFParser.DatatypeHandling.VERIFY); |
| 121 | 0 | turtleParser.setStopAtFirstError(true); |
| 122 | 0 | turtleParser.setVerifyData(true); |
| 123 | 0 | ByteArrayInputStream bais = new ByteArrayInputStream( sample.getBytes() ); |
| 124 | |
try { |
| 125 | 0 | turtleParser.parse(bais, ""); |
| 126 | 0 | return true; |
| 127 | 0 | } catch (Exception e) { |
| 128 | 0 | return false; |
| 129 | |
} |
| 130 | |
} |
| 131 | |
|
| 132 | |
|
| 133 | |
|
| 134 | |
|
| 135 | |
|
| 136 | |
|
| 137 | |
|
| 138 | |
|
| 139 | |
public static boolean checkCSVFormat(InputStream is) throws IOException { |
| 140 | 0 | return CSVReaderBuilder.isCSV(is); |
| 141 | |
} |
| 142 | |
|
| 143 | |
|
| 144 | |
|
| 145 | |
|
| 146 | |
|
| 147 | |
|
| 148 | |
|
| 149 | |
|
| 150 | |
|
| 151 | |
|
| 152 | |
private static boolean findPattern(Pattern[] patterns, char delimiterChar, InputStream is) |
| 153 | |
throws IOException { |
| 154 | 0 | String sample = extractDataSample(is, delimiterChar); |
| 155 | 0 | for(Pattern pattern : patterns) { |
| 156 | 0 | if(pattern.matcher(sample).find()) { |
| 157 | 0 | return true; |
| 158 | |
} |
| 159 | |
} |
| 160 | 0 | return false; |
| 161 | |
} |
| 162 | |
|
| 163 | |
|
| 164 | |
|
| 165 | |
|
| 166 | |
|
| 167 | |
|
| 168 | |
|
| 169 | |
|
| 170 | |
|
| 171 | |
|
| 172 | |
private static String extractDataSample(InputStream is, char breakChar) throws IOException { |
| 173 | 0 | BufferedReader br = new BufferedReader(new InputStreamReader(is)); |
| 174 | 0 | StringBuilder sb = new StringBuilder(); |
| 175 | 0 | final int MAX_SIZE = 1024 * 2; |
| 176 | |
int c; |
| 177 | 0 | boolean insideBlock = false; |
| 178 | 0 | int read = 0; |
| 179 | 0 | br.mark(MAX_SIZE); |
| 180 | |
try { |
| 181 | 0 | while ((c = br.read()) != -1) { |
| 182 | 0 | read++; |
| 183 | 0 | if (read > MAX_SIZE) { |
| 184 | 0 | break; |
| 185 | |
} |
| 186 | 0 | if ('<' == c) { |
| 187 | 0 | insideBlock = true; |
| 188 | 0 | } else if ('>' == c) { |
| 189 | 0 | insideBlock = false; |
| 190 | 0 | } else if ('"' == c) { |
| 191 | 0 | insideBlock = !insideBlock; |
| 192 | |
} |
| 193 | 0 | sb.append((char) c); |
| 194 | 0 | if (!insideBlock && breakChar == c) { |
| 195 | 0 | break; |
| 196 | |
} |
| 197 | |
} |
| 198 | |
} finally { |
| 199 | 0 | is.reset(); |
| 200 | 0 | br.reset(); |
| 201 | 0 | } |
| 202 | 0 | return sb.toString(); |
| 203 | |
} |
| 204 | |
|
| 205 | 0 | public TikaMIMETypeDetector(Purifier purifier) { |
| 206 | 0 | this.purifier = purifier; |
| 207 | 0 | InputStream is = getResourceAsStream(); |
| 208 | 0 | if (config == null) { |
| 209 | |
try { |
| 210 | 0 | config = new TikaConfig(is); |
| 211 | 0 | } catch (Exception e) { |
| 212 | 0 | throw new RuntimeException("Error while loading Tika configuration.", e); |
| 213 | 0 | } |
| 214 | |
} |
| 215 | |
|
| 216 | 0 | if (types == null) { |
| 217 | 0 | types = config.getMimeRepository(); |
| 218 | |
} |
| 219 | |
|
| 220 | 0 | if(tika == null) { |
| 221 | 0 | tika = new Tika(config); |
| 222 | |
} |
| 223 | 0 | } |
| 224 | |
|
| 225 | |
/**
 * Creates a detector that uses a {@link WhiteSpacesPurifier} to purify its input.
 */
public TikaMIMETypeDetector() {
    this(new WhiteSpacesPurifier());
}
| 228 | |
|
| 229 | |
|
| 230 | |
|
| 231 | |
|
| 232 | |
|
| 233 | |
|
| 234 | |
|
| 235 | |
|
| 236 | |
|
| 237 | |
|
| 238 | |
|
| 239 | |
public MIMEType guessMIMEType( |
| 240 | |
String fileName, |
| 241 | |
InputStream input, |
| 242 | |
MIMEType mimeTypeFromMetadata |
| 243 | |
) { |
| 244 | 0 | if(input != null) { |
| 245 | |
try { |
| 246 | 0 | this.purifier.purify(input); |
| 247 | 0 | } catch (IOException e) { |
| 248 | 0 | throw new RuntimeException("Error while purifying the provided input", e); |
| 249 | 0 | } |
| 250 | |
} |
| 251 | |
|
| 252 | 0 | final Metadata meta = new Metadata(); |
| 253 | 0 | if (mimeTypeFromMetadata != null) |
| 254 | 0 | meta.set(Metadata.CONTENT_TYPE, mimeTypeFromMetadata.getFullType()); |
| 255 | 0 | if (fileName != null) |
| 256 | 0 | meta.set(Metadata.RESOURCE_NAME_KEY, fileName); |
| 257 | |
|
| 258 | |
String type; |
| 259 | |
try { |
| 260 | 0 | final String mt = guessMimeTypeByInputAndMeta(input, meta); |
| 261 | 0 | if( ! MimeTypes.OCTET_STREAM.equals(mt) ) { |
| 262 | 0 | type = mt; |
| 263 | |
} else { |
| 264 | 0 | if( checkN3Format(input) ) { |
| 265 | 0 | type = N3_MIMETYPE; |
| 266 | 0 | } else if( checkNQuadsFormat(input) ) { |
| 267 | 0 | type = NQUADS_MIMETYPE; |
| 268 | 0 | } else if( checkTurtleFormat(input) ) { |
| 269 | 0 | type = TURTLE_MIMETYPE; |
| 270 | 0 | } else if( checkCSVFormat(input) ) { |
| 271 | 0 | type = CSV_MIMETYPE; |
| 272 | |
} |
| 273 | |
else { |
| 274 | 0 | type = MimeTypes.OCTET_STREAM; |
| 275 | |
} |
| 276 | |
} |
| 277 | 0 | } catch (IOException ioe) { |
| 278 | 0 | throw new RuntimeException("Error while retrieving mime type.", ioe); |
| 279 | 0 | } |
| 280 | 0 | return MIMEType.parse(type); |
| 281 | |
} |
| 282 | |
|
| 283 | |
|
| 284 | |
|
| 285 | |
|
| 286 | |
|
| 287 | |
|
| 288 | |
private InputStream getResourceAsStream() { |
| 289 | |
InputStream result; |
| 290 | 0 | result = TikaMIMETypeDetector.class.getResourceAsStream(RESOURCE_NAME); |
| 291 | 0 | if (result == null) { |
| 292 | 0 | result = TikaMIMETypeDetector.class.getClassLoader().getResourceAsStream(RESOURCE_NAME); |
| 293 | 0 | if (result == null) { |
| 294 | 0 | result = ClassLoader.getSystemResourceAsStream(RESOURCE_NAME); |
| 295 | |
} |
| 296 | |
} |
| 297 | 0 | return result; |
| 298 | |
} |
| 299 | |
|
| 300 | |
|
| 301 | |
|
| 302 | |
|
| 303 | |
|
| 304 | |
|
| 305 | |
|
| 306 | |
|
| 307 | |
|
| 308 | |
|
| 309 | |
|
| 310 | |
|
| 311 | |
|
| 312 | |
|
| 313 | |
private String guessMimeTypeByInputAndMeta(InputStream stream, final Metadata metadata) |
| 314 | |
throws IOException { |
| 315 | 0 | if (stream != null) { |
| 316 | 0 | final String type = tika.detect(stream); |
| 317 | 0 | if ( type != null && ! isGenericMIMEType(type) ) { |
| 318 | 0 | return type; |
| 319 | |
} |
| 320 | |
} |
| 321 | |
|
| 322 | |
|
| 323 | 0 | final String contentType = metadata.get(Metadata.CONTENT_TYPE); |
| 324 | 0 | String candidateMIMEType = null; |
| 325 | 0 | if (contentType != null) { |
| 326 | |
try { |
| 327 | 0 | MimeType type = types.forName(contentType); |
| 328 | 0 | if (type != null) { |
| 329 | 0 | if( ! isPlainMIMEType(type.getName()) ) { |
| 330 | 0 | return type.getName(); |
| 331 | |
} else { |
| 332 | 0 | candidateMIMEType = type.getName(); |
| 333 | |
} |
| 334 | |
} |
| 335 | |
} |
| 336 | 0 | catch (MimeTypeException mte) { |
| 337 | |
|
| 338 | 0 | } |
| 339 | |
} |
| 340 | |
|
| 341 | |
|
| 342 | 0 | final String resourceName = metadata.get(Metadata.RESOURCE_NAME_KEY); |
| 343 | 0 | if (resourceName != null) { |
| 344 | 0 | MimeType type = types.getMimeType(resourceName); |
| 345 | 0 | if (type != null) { |
| 346 | 0 | return type.getName(); |
| 347 | |
} |
| 348 | |
} |
| 349 | |
|
| 350 | |
|
| 351 | 0 | if(candidateMIMEType != null) { |
| 352 | 0 | return candidateMIMEType; |
| 353 | |
} else { |
| 354 | 0 | return MimeTypes.OCTET_STREAM; |
| 355 | |
} |
| 356 | |
} |
| 357 | |
|
| 358 | |
private boolean isPlainMIMEType(String type) { |
| 359 | 0 | return |
| 360 | |
type.equals(MimeTypes.OCTET_STREAM) |
| 361 | |
|| |
| 362 | |
type.equals(MimeTypes.PLAIN_TEXT); |
| 363 | |
} |
| 364 | |
|
| 365 | |
private boolean isGenericMIMEType(String type) { |
| 366 | 0 | return |
| 367 | |
isPlainMIMEType(type) |
| 368 | |
|| |
| 369 | |
type.equals(MimeTypes.XML); |
| 370 | |
} |
| 371 | |
|
| 372 | |
} |
| 373 | |
|