/*
 * Decompiled with CFR 0.152.
 */
package org.apache.tika.eval;

import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ArrayBlockingQueue;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.io.FilenameUtils;
import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.fs.FSProperties;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.eval.AbstractProfiler;
import org.apache.tika.eval.EvalFilePaths;
import org.apache.tika.eval.ExtractProfiler;
import org.apache.tika.eval.db.ColInfo;
import org.apache.tika.eval.db.Cols;
import org.apache.tika.eval.db.TableInfo;
import org.apache.tika.eval.io.ExtractReader;
import org.apache.tika.eval.io.ExtractReaderException;
import org.apache.tika.eval.io.IDBWriter;
import org.apache.tika.eval.tokens.ContrastStatistics;
import org.apache.tika.eval.tokens.TokenContraster;
import org.apache.tika.eval.tokens.TokenIntPair;
import org.apache.tika.eval.util.ContentTags;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler;

/**
 * Profiler that loads an "A" extract and a "B" extract for each crawled file,
 * writes per-entry rows for both sides through the inherited writer, and, where
 * both sides yielded tokens, writes a row of A-vs-B token contrast statistics.
 */
public class ExtractComparer
extends AbstractProfiler {
    /** Command-line options for the Compare tool; built in the static initializer. */
    static Options OPTIONS;
    /** Metadata keys starting with this prefix are treated as digests when matching entries. */
    private static final String DIGEST_KEY_PREFIX = "X-TIKA:digest:";
    /** Token-counter field name used for content from extract A. */
    private static final String FIELD_A = "fa";
    /** Token-counter field name used for content from extract B. */
    private static final String FIELD_B = "fb";
    public static TableInfo REF_PAIR_NAMES;
    public static TableInfo COMPARISON_CONTAINERS;
    public static TableInfo CONTENT_COMPARISONS;
    public static TableInfo PROFILES_A;
    public static TableInfo PROFILES_B;
    public static TableInfo EMBEDDED_FILE_PATH_TABLE_A;
    public static TableInfo EMBEDDED_FILE_PATH_TABLE_B;
    public static TableInfo CONTENTS_TABLE_A;
    public static TableInfo CONTENTS_TABLE_B;
    public static TableInfo TAGS_TABLE_A;
    public static TableInfo TAGS_TABLE_B;
    public static TableInfo EXCEPTION_TABLE_A;
    public static TableInfo EXCEPTION_TABLE_B;
    public static TableInfo EXTRACT_EXCEPTION_TABLE_A;
    public static TableInfo EXTRACT_EXCEPTION_TABLE_B;
    // NOTE(review): not read anywhere in this class; kept because the initializer call
    // may have side effects that other code relies on -- confirm before removing.
    private final TikaConfig config = TikaConfig.getDefaultConfig();
    private final Path inputDir;
    private final Path extractsA;
    private final Path extractsB;
    private final TokenContraster tokenContraster = new TokenContraster();
    private final ExtractReader extractReader;

    /**
     * Prints the command-line usage text for the Compare tool, including all
     * registered {@link #OPTIONS}.
     */
    public static void USAGE() {
        HelpFormatter helpFormatter = new HelpFormatter();
        helpFormatter.printHelp(
                80,
                "java -jar tika-eval-x.y.jar Compare -extractsA extractsA -extractsB extractsB -db mydb",
                "Tool: Compare",
                OPTIONS,
                "Note: for the default h2 db, do not include the .mv.db at the end of the db name.");
    }

    /**
     * @param queue         queue of file resources, passed to the superclass
     * @param inputDir      directory being crawled; may be {@code null}
     * @param extractsA     root directory of the "A" extracts
     * @param extractsB     root directory of the "B" extracts
     * @param extractReader reader used to load both extracts
     * @param writer        writer that receives every row, passed to the superclass
     */
    public ExtractComparer(ArrayBlockingQueue<FileResource> queue, Path inputDir, Path extractsA, Path extractsB, ExtractReader extractReader, IDBWriter writer) {
        super(queue, writer);
        this.inputDir = inputDir;
        this.extractsA = extractsA;
        this.extractsB = extractsB;
        this.extractReader = extractReader;
    }

    /**
     * Resolves the A and B extract paths for one crawled file and compares them.
     *
     * @param fileResource the crawled resource; only its metadata is read here
     * @return always {@code true}
     * @throws RuntimeException wrapping anything thrown by {@link #compareFiles},
     *                          with the resource's relative path in the message
     */
    @Override
    public boolean processFileResource(FileResource fileResource) {
        Metadata metadata = fileResource.getMetadata();
        // When the crawl root is one of the extract directories, paths are derived
        // from the extract crawl; otherwise they are derived from the source crawl.
        boolean crawlingExtractDir = this.inputDir != null
                && (this.inputDir.equals(this.extractsA) || this.inputDir.equals(this.extractsB));
        EvalFilePaths fpsA;
        EvalFilePaths fpsB;
        if (crawlingExtractDir) {
            fpsA = this.getPathsFromExtractCrawl(metadata, this.extractsA);
            fpsB = this.getPathsFromExtractCrawl(metadata, this.extractsB);
        } else {
            fpsA = this.getPathsFromSrcCrawl(metadata, this.inputDir, this.extractsA);
            fpsB = this.getPathsFromSrcCrawl(metadata, this.inputDir, this.extractsB);
        }
        try {
            this.compareFiles(fpsA, fpsB);
        }
        catch (Throwable e) {
            // Deliberately broad: every failure is reported together with the offending file.
            throw new RuntimeException("Exception while working on: " + metadata.get(FSProperties.FS_REL_PATH), e);
        }
        return true;
    }

    /**
     * Loads both extracts and writes all comparison rows for one container file.
     *
     * <p>Order of work: load A and B (recording the exception type if loading
     * fails), write the container row, write any extract-exception rows, then,
     * unless both loads failed, walk the entries of A (pairing each with at most
     * one entry of B) and finally the entries of B that were never paired.
     *
     * @param fpsA paths for the A side
     * @param fpsB paths for the B side
     * @throws IOException      if writing a row fails outside the content-writing steps
     * @throws RuntimeException wrapping an {@link IOException} raised while writing content rows
     */
    protected void compareFiles(EvalFilePaths fpsA, EvalFilePaths fpsB) throws IOException {
        ExtractReaderException.TYPE extractExceptionA = null;
        ExtractReaderException.TYPE extractExceptionB = null;
        List<Metadata> metadataListA = null;
        try {
            metadataListA = this.extractReader.loadExtract(fpsA.getExtractFile());
        }
        catch (ExtractReaderException e) {
            extractExceptionA = e.getType();
        }
        List<Metadata> metadataListB = null;
        try {
            metadataListB = this.extractReader.loadExtract(fpsB.getExtractFile());
        }
        catch (ExtractReaderException e) {
            extractExceptionB = e.getType();
        }

        String containerID = Integer.toString(ID.getAndIncrement());
        this.writeComparisonContainerRow(containerID, fpsA, fpsB, metadataListA, metadataListB);
        if (extractExceptionA != null) {
            this.writeExtractException(EXTRACT_EXCEPTION_TABLE_A, containerID, fpsA.getRelativeSourceFilePath().toString(), extractExceptionA);
        }
        if (extractExceptionB != null) {
            this.writeExtractException(EXTRACT_EXCEPTION_TABLE_B, containerID, fpsB.getRelativeSourceFilePath().toString(), extractExceptionB);
        }
        if (metadataListA == null && metadataListB == null) {
            // Nothing was loaded on either side; the container row is all there is to write.
            return;
        }

        List<Integer> numAttachmentsA = ExtractComparer.countAttachments(metadataListA);
        List<Integer> numAttachmentsB = ExtractComparer.countAttachments(metadataListB);
        // Indices into metadataListB that have already been paired with an entry of A.
        HashSet<Integer> handledB = new HashSet<Integer>();

        if (metadataListA != null) {
            for (int i = 0; i < metadataListA.size(); ++i) {
                // The first entry shares the container's id; every other entry gets a fresh id.
                String fileId = i == 0 ? containerID : Integer.toString(ID.getAndIncrement());
                Metadata metadataA = metadataListA.get(i);
                ContentTags contentTagsA = ExtractComparer.getContent(fpsA, metadataA);
                this.writeTagData(fileId, contentTagsA, TAGS_TABLE_A);
                this.writeProfileData(fpsA, i, contentTagsA, metadataA, fileId, containerID, numAttachmentsA, PROFILES_A);
                this.writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);

                Metadata metadataB = null;
                ContentTags contentTagsB = ContentTags.EMPTY_CONTENT_TAGS;
                int matchIndex = this.getMatch(i, metadataListA, metadataListB);
                // Each entry of B is paired at most once.
                if (matchIndex > -1 && !handledB.contains(matchIndex)) {
                    metadataB = metadataListB.get(matchIndex);
                    handledB.add(matchIndex);
                }
                if (metadataB != null) {
                    contentTagsB = ExtractComparer.getContent(fpsB, metadataB);
                    this.writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
                    // NOTE(review): A's index i (not matchIndex) is passed together with
                    // numAttachmentsB -- confirm this is intended.
                    this.writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
                    this.writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
                }
                this.writeEmbeddedFilePathData(i, fileId, metadataA, metadataB);

                this.tokenCounter.clear(FIELD_A);
                this.tokenCounter.clear(FIELD_B);
                try {
                    this.writeContentData(fileId, contentTagsA, FIELD_A, CONTENTS_TABLE_A);
                    this.writeContentData(fileId, contentTagsB, FIELD_B, CONTENTS_TABLE_B);
                }
                catch (IOException e) {
                    throw new RuntimeException(e);
                }
                this.writeContrastRowIfTokensPresent(fileId);
            }
        }
        if (metadataListB != null) {
            this.writeUnmatchedBEntries(fpsB, metadataListB, handledB, containerID, numAttachmentsB);
        }
    }

    /**
     * Writes the single {@link #COMPARISON_CONTAINERS} row for a container file.
     * Lengths that come back negative are written as the empty string.
     */
    private void writeComparisonContainerRow(String containerID, EvalFilePaths fpsA, EvalFilePaths fpsB, List<Metadata> metadataListA, List<Metadata> metadataListB) throws IOException {
        HashMap<Cols, String> contData = new HashMap<Cols, String>();
        contData.put(Cols.CONTAINER_ID, containerID);
        contData.put(Cols.FILE_PATH, fpsA.getRelativeSourceFilePath().toString());
        long srcFileLength = this.getSourceFileLength(metadataListA, metadataListB);
        contData.put(Cols.LENGTH, ExtractComparer.nonNegativeLengthOrEmpty(srcFileLength));
        contData.put(Cols.FILE_EXTENSION, FilenameUtils.getExtension(fpsA.getRelativeSourceFilePath().getFileName().toString()));
        long extractFileLengthA = this.getFileLength(fpsA.getExtractFile());
        contData.put(Cols.EXTRACT_FILE_LENGTH_A, ExtractComparer.nonNegativeLengthOrEmpty(extractFileLengthA));
        long extractFileLengthB = this.getFileLength(fpsB.getExtractFile());
        contData.put(Cols.EXTRACT_FILE_LENGTH_B, ExtractComparer.nonNegativeLengthOrEmpty(extractFileLengthB));
        this.writer.writeRow(COMPARISON_CONTAINERS, contData);
    }

    /** Returns the decimal form of {@code length}, or {@code ""} when it is negative. */
    private static String nonNegativeLengthOrEmpty(long length) {
        return length > -1L ? Long.toString(length) : "";
    }

    /**
     * Writes a {@link #CONTENT_COMPARISONS} row for {@code fileId}, but only when
     * the token counter holds at least one token for both the A and the B field.
     */
    private void writeContrastRowIfTokensPresent(String fileId) throws IOException {
        if (this.tokenCounter.getTokenStatistics(FIELD_A).getTotalTokens() <= 0 || this.tokenCounter.getTokenStatistics(FIELD_B).getTotalTokens() <= 0) {
            return;
        }
        HashMap<Cols, String> data = new HashMap<Cols, String>();
        data.put(Cols.ID, fileId);
        ContrastStatistics contrastStatistics = this.tokenContraster.calculateContrastStatistics(this.tokenCounter.getTokens(FIELD_A), this.tokenCounter.getTokenStatistics(FIELD_A), this.tokenCounter.getTokens(FIELD_B), this.tokenCounter.getTokenStatistics(FIELD_B));
        this.writeContrasts(data, contrastStatistics);
        this.writer.writeRow(CONTENT_COMPARISONS, data);
    }

    /**
     * Writes B-side rows for every entry of {@code metadataListB} whose index is
     * not in {@code handledB}, i.e. entries that were never paired with an A entry.
     */
    private void writeUnmatchedBEntries(EvalFilePaths fpsB, List<Metadata> metadataListB, HashSet<Integer> handledB, String containerID, List<Integer> numAttachmentsB) throws IOException {
        for (int i = 0; i < metadataListB.size(); ++i) {
            if (handledB.contains(i)) {
                continue;
            }
            Metadata metadataB = metadataListB.get(i);
            ContentTags contentTagsB = ExtractComparer.getContent(fpsB, metadataB);
            String fileId = i == 0 ? containerID : Integer.toString(ID.getAndIncrement());
            this.writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
            this.writeProfileData(fpsB, i, contentTagsB, metadataB, fileId, containerID, numAttachmentsB, PROFILES_B);
            this.writeEmbeddedFilePathData(i, fileId, null, metadataB);
            this.writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
            this.tokenCounter.clear(FIELD_B);
            try {
                this.writeContentData(fileId, contentTagsB, FIELD_B, CONTENTS_TABLE_B);
            }
            catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }

    /**
     * Writes the embedded-resource path rows for one entry. Nothing is written for
     * index 0. The A path is written whenever it is present; the B path is written
     * only when it is present and differs from the A path.
     *
     * @param i      index of the entry within its metadata list
     * @param fileId id to store in the row(s)
     * @param mA     A-side metadata, or {@code null}
     * @param mB     B-side metadata, or {@code null}
     * @throws RuntimeException wrapping an {@link IOException} from the writer
     */
    private void writeEmbeddedFilePathData(int i, String fileId, Metadata mA, Metadata mB) {
        if (i == 0) {
            return;
        }
        String pathA = mA == null ? null : mA.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
        String pathB = mB == null ? null : mB.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
        if (pathA != null) {
            this.writeEmbeddedPathRow(EMBEDDED_FILE_PATH_TABLE_A, fileId, pathA);
        }
        if (pathB != null && !pathB.equals(pathA)) {
            this.writeEmbeddedPathRow(EMBEDDED_FILE_PATH_TABLE_B, fileId, pathB);
        }
    }

    /** Writes one (id, embedded path) row to {@code table}, wrapping any {@link IOException}. */
    private void writeEmbeddedPathRow(TableInfo table, String fileId, String embeddedPath) {
        HashMap<Cols, String> row = new HashMap<Cols, String>();
        row.put(Cols.ID, fileId);
        row.put(Cols.EMBEDDED_FILE_PATH, embeddedPath);
        try {
            this.writer.writeRow(table, row);
        }
        catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Returns the source file length derived from list A when that is greater
     * than -1, otherwise whatever is derived from list B.
     */
    private long getSourceFileLength(List<Metadata> metadataListA, List<Metadata> metadataListB) {
        long len = this.getSourceFileLength(metadataListA);
        if (len > -1L) {
            return len;
        }
        return this.getSourceFileLength(metadataListB);
    }

    /**
     * Finds the index in {@code metadataListB} that corresponds to entry {@code i}
     * of {@code metadataListA}. Strategies are tried in this order: index 0 always
     * maps to index 0; a shared digest value; an equal embedded resource path; and,
     * when both lists have the same size, the same index.
     *
     * @return the matching index in B, or -1 if B is {@code null}/empty or nothing matched
     */
    private int getMatch(int i, List<Metadata> metadataListA, List<Metadata> metadataListB) {
        if (metadataListB == null || metadataListB.size() == 0) {
            return -1;
        }
        if (i == 0) {
            return 0;
        }
        Metadata thisMetadata = metadataListA.get(i);
        int match = this.findMatchingDigests(thisMetadata, metadataListB);
        if (match > -1) {
            return match;
        }
        String embeddedPath = thisMetadata.get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
        if (embeddedPath != null) {
            for (int j = 0; j < metadataListB.size(); ++j) {
                String thatEmbeddedPath = metadataListB.get(j).get(AbstractRecursiveParserWrapperHandler.EMBEDDED_RESOURCE_PATH);
                if (embeddedPath.equals(thatEmbeddedPath)) {
                    return j;
                }
            }
        }
        if (metadataListA.size() == metadataListB.size()) {
            return i;
        }
        return -1;
    }

    /**
     * For each key of {@code metadata} that starts with {@link #DIGEST_KEY_PREFIX},
     * looks for the first entry of {@code metadataListB} holding an equal value
     * under the same key.
     *
     * @return index of the first such entry, or -1 if no digest matched
     */
    private int findMatchingDigests(Metadata metadata, List<Metadata> metadataListB) {
        for (String n : metadata.names()) {
            if (!n.startsWith(DIGEST_KEY_PREFIX)) {
                continue;
            }
            String digestA = metadata.get(n);
            if (digestA == null) {
                // A missing value can never match; skip scanning B for this key.
                continue;
            }
            for (int i = 0; i < metadataListB.size(); ++i) {
                if (digestA.equals(metadataListB.get(i).get(n))) {
                    return i;
                }
            }
        }
        return -1;
    }

    /**
     * Copies the four top-N token lists, the overlap and the Dice coefficient
     * from {@code contrastStatistics} into {@code data}.
     */
    private void writeContrasts(Map<Cols, String> data, ContrastStatistics contrastStatistics) {
        this.writeContrastString(data, Cols.TOP_10_MORE_IN_A, contrastStatistics.getTopNMoreA());
        this.writeContrastString(data, Cols.TOP_10_MORE_IN_B, contrastStatistics.getTopNMoreB());
        this.writeContrastString(data, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, contrastStatistics.getTopNUniqueA());
        this.writeContrastString(data, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, contrastStatistics.getTopNUniqueB());
        data.put(Cols.OVERLAP, Double.toString(contrastStatistics.getOverlap()));
        data.put(Cols.DICE_COEFFICIENT, Double.toString(contrastStatistics.getDiceCoefficient()));
    }

    /**
     * Stores {@code tokenIntPairs} under {@code col} as {@code "token: value"}
     * items joined by {@code " | "}; an empty array yields the empty string.
     */
    private void writeContrastString(Map<Cols, String> data, Cols col, TokenIntPair[] tokenIntPairs) {
        StringBuilder sb = new StringBuilder();
        boolean first = true;
        for (TokenIntPair p : tokenIntPairs) {
            if (!first) {
                sb.append(" | ");
            }
            first = false;
            sb.append(p.getToken()).append(": ").append(p.getValue());
        }
        data.put(col, sb.toString());
    }

    static {
        Option extractsAOption = new Option("extractsA", true, "directory for extractsA files");
        extractsAOption.setRequired(true);
        Option extractsBOption = new Option("extractsB", true, "directory for extractsB files");
        extractsBOption.setRequired(true);
        Option inputDirOption = new Option("inputDir", true, "optional: directory of original binary input files if it exists or can be the same as -extractsA or -extractsB. If not specified, -inputDir=-extractsA");
        OPTIONS = new Options()
                .addOption(extractsAOption)
                .addOption(extractsBOption)
                .addOption(inputDirOption)
                .addOption("bc", "optional: tika-batch config file")
                .addOption("numConsumers", true, "optional: number of consumer threads")
                .addOption(new Option("alterExtract", true, "for json-formatted extract files, process full metadata list ('as_is'=default), take just the first/container document ('first_only'), concatenate all content into the first metadata item ('concatenate_content')"))
                .addOption("minExtractLength", true, "minimum extract length to process (in bytes)")
                .addOption("maxExtractLength", true, "maximum extract length to process (in bytes)")
                .addOption("db", true, "db file to which to write results")
                .addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>")
                .addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver")
                .addOption("tablePrefixA", true, "EXPERT: optional prefix for table names for A")
                .addOption("tablePrefixB", true, "EXPERT: optional prefix for table names for B")
                .addOption("drop", true, "drop tables if they exist")
                .addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler")
                .addOption("maxTokens", true, "maximum tokens to process, default=200000")
                .addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000")
                .addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000")
                .addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result");

        // NOTE(review): the numeric type codes below (12, 4, -5, 6) look like java.sql.Types
        // values (VARCHAR, INTEGER, BIGINT, FLOAT) inlined by the decompiler -- confirm
        // against ColInfo before replacing them with named constants.
        REF_PAIR_NAMES = new TableInfo("pair_names",
                new ColInfo(Cols.DIR_NAME_A, 12, 128),
                new ColInfo(Cols.DIR_NAME_B, 12, 128));
        COMPARISON_CONTAINERS = new TableInfo("containers",
                new ColInfo(Cols.CONTAINER_ID, 4, "PRIMARY KEY"),
                new ColInfo(Cols.FILE_PATH, 12, 1024),
                new ColInfo(Cols.FILE_EXTENSION, 12, 12),
                new ColInfo(Cols.LENGTH, -5),
                new ColInfo(Cols.EXTRACT_FILE_LENGTH_A, -5),
                new ColInfo(Cols.EXTRACT_FILE_LENGTH_B, -5));
        CONTENT_COMPARISONS = new TableInfo("content_comparisons",
                new ColInfo(Cols.ID, 4, "PRIMARY KEY"),
                new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, 12, 1024),
                new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, 12, 1024),
                new ColInfo(Cols.TOP_10_MORE_IN_A, 12, 1024),
                new ColInfo(Cols.TOP_10_MORE_IN_B, 12, 1024),
                new ColInfo(Cols.DICE_COEFFICIENT, 6),
                new ColInfo(Cols.OVERLAP, 6));

        // The A/B tables reuse the column layouts of the corresponding ExtractProfiler tables.
        PROFILES_A = new TableInfo("profiles_a", ExtractProfiler.PROFILE_TABLE.getColInfos());
        PROFILES_B = new TableInfo("profiles_b", ExtractProfiler.PROFILE_TABLE.getColInfos());
        EMBEDDED_FILE_PATH_TABLE_A = new TableInfo("emb_path_a", ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos());
        EMBEDDED_FILE_PATH_TABLE_B = new TableInfo("emb_path_b", ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos());
        CONTENTS_TABLE_A = new TableInfo("contents_a", ExtractProfiler.CONTENTS_TABLE.getColInfos());
        CONTENTS_TABLE_B = new TableInfo("contents_b", ExtractProfiler.CONTENTS_TABLE.getColInfos());
        TAGS_TABLE_A = new TableInfo("tags_a", ExtractProfiler.TAGS_TABLE.getColInfos());
        TAGS_TABLE_B = new TableInfo("tags_b", ExtractProfiler.TAGS_TABLE.getColInfos());
        EXCEPTION_TABLE_A = new TableInfo("exceptions_a", ExtractProfiler.EXCEPTION_TABLE.getColInfos());
        EXCEPTION_TABLE_B = new TableInfo("exceptions_b", ExtractProfiler.EXCEPTION_TABLE.getColInfos());
        EXTRACT_EXCEPTION_TABLE_A = new TableInfo("extract_exceptions_a", ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos());
        EXTRACT_EXCEPTION_TABLE_B = new TableInfo("extract_exceptions_b", ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos());
    }
}

