/*
 * Decompiled with CFR 0.152.
 */
package org.apache.tika.eval.langid;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import opennlp.tools.langdetect.LanguageDetector;
import opennlp.tools.langdetect.LanguageDetectorModel;
import opennlp.tools.util.normalizer.CharSequenceNormalizer;
import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
import org.apache.tika.eval.langid.Language;
import org.apache.tika.eval.langid.ProbingLanguageDetector;
import org.apache.tika.eval.textstats.StringStatsCalculator;

/**
 * Language identification backed by an OpenNLP {@link LanguageDetectorModel}.
 *
 * <p>The model is held in a static field shared by every instance. It is either loaded
 * explicitly through {@link #loadBuiltInModels()} / {@link #loadModels(Path)}, or lazily from
 * the bundled classpath resource the first time an instance is constructed while no model is
 * set. All methods in this class that assign the shared model synchronize on the class.
 */
public class LanguageIDWrapper
implements StringStatsCalculator<List<Language>> {
    /** Classpath location of the bundled language detection model. */
    private static final String BUILT_IN_MODEL_RESOURCE = "/opennlp/model_20190626.bin";

    /** Shared model; {@code null} until one of the load methods has succeeded. */
    static LanguageDetectorModel LANG_MODEL;

    /**
     * Maximum text length setting, changed via {@link #setMaxTextLength(int)}.
     * NOTE(review): not read anywhere in this class; presumably consumed by other classes in
     * this package -- confirm.
     */
    static int MAX_TEXT_LENGTH = 50000;

    private final LanguageDetector detector;

    /**
     * Loads the bundled model from the classpath and installs it as the shared model,
     * replacing any model that was set before.
     *
     * @throws IOException if the bundled resource cannot be found or the model cannot be read
     */
    public static synchronized void loadBuiltInModels() throws IOException {
        try (InputStream is = LanguageIDWrapper.class.getResourceAsStream(BUILT_IN_MODEL_RESOURCE)) {
            // getResourceAsStream signals a missing resource with null rather than an exception;
            // report that as the declared IOException instead of handing null to the model.
            if (is == null) {
                throw new IOException("built in lang model not found on classpath: " + BUILT_IN_MODEL_RESOURCE);
            }
            LANG_MODEL = new LanguageDetectorModel(is);
        }
    }

    /**
     * Loads a model from the given file and installs it as the shared model, replacing any
     * model that was set before.
     *
     * @param path location of the model file
     * @throws IOException if the model cannot be read
     */
    public static synchronized void loadModels(Path path) throws IOException {
        LANG_MODEL = new LanguageDetectorModel(path.toFile());
    }

    /**
     * Returns the shared model, loading the bundled one first if none is set. The null check
     * and the load happen under the class lock so concurrent callers trigger at most one load.
     */
    private static synchronized LanguageDetectorModel getOrLoadModel() throws IOException {
        if (LANG_MODEL == null) {
            LanguageIDWrapper.loadBuiltInModels();
        }
        return LANG_MODEL;
    }

    /**
     * Builds the normalizer chain: emoji, URL/e-mail, Twitter, number and shrink normalizers,
     * in that order.
     * NOTE(review): this method is not called anywhere in this class -- the constructor passes
     * an empty normalizer array. Confirm whether that is intentional before wiring it in.
     */
    private static CharSequenceNormalizer[] getNormalizers() {
        return new CharSequenceNormalizer[]{EmojiCharSequenceNormalizer.getInstance(), TikaUrlCharSequenceNormalizer.getInstance(), TwitterCharSequenceNormalizer.getInstance(), NumberCharSequenceNormalizer.getInstance(), ShrinkCharSequenceNormalizer.getInstance()};
    }

    /**
     * Creates a wrapper around a {@link ProbingLanguageDetector} that uses the shared model,
     * loading the bundled model first if no model has been set yet.
     *
     * @throws RuntimeException if the bundled model has to be loaded and loading fails
     */
    public LanguageIDWrapper() {
        LanguageDetectorModel model;
        try {
            model = LanguageIDWrapper.getOrLoadModel();
        }
        catch (IOException e) {
            throw new RuntimeException("couldn't load built in lang models", e);
        }
        this.detector = new ProbingLanguageDetector(model, new CharSequenceNormalizer[0]);
    }

    /**
     * Runs language detection on the given text.
     *
     * @param s text to analyze
     * @return one {@link Language} (language code and confidence) per prediction returned by
     *         the detector, in the order the detector returned them
     */
    public List<Language> getProbabilities(String s) {
        opennlp.tools.langdetect.Language[] detected = this.detector.predictLanguages(s);
        List<Language> ret = new ArrayList<>(detected.length);
        for (opennlp.tools.langdetect.Language prediction : detected) {
            ret.add(new Language(prediction.getLang(), prediction.getConfidence()));
        }
        return ret;
    }

    /**
     * @return the languages reported as supported by the underlying detector
     */
    public String[] getSupportedLanguages() {
        return this.detector.getSupportedLanguages();
    }

    /**
     * Sets the static maximum text length value.
     *
     * @param maxTextLength new value for the shared setting
     */
    public static void setMaxTextLength(int maxTextLength) {
        MAX_TEXT_LENGTH = maxTextLength;
    }

    /**
     * Delegates to {@link #getProbabilities(String)}.
     *
     * @param txt text to analyze
     * @return the detector's predictions for {@code txt}
     */
    @Override
    public List<Language> calculate(String txt) {
        return this.getProbabilities(txt);
    }

    /**
     * Normalizer that replaces every URL match and then every e-mail address match with a
     * single space. Stateless singleton; obtain it through {@link #getInstance()}.
     */
    private static class TikaUrlCharSequenceNormalizer
    implements CharSequenceNormalizer {
        // Quantifiers are bounded ({10,10000}, {1,100}) rather than open-ended.
        private static final Pattern URL_REGEX = Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]{10,10000}");
        private static final Pattern MAIL_REGEX = Pattern.compile("[-_.0-9A-Za-z]{1,100}@[-_0-9A-Za-z]{1,100}[-_.0-9A-Za-z]{1,100}");
        private static final TikaUrlCharSequenceNormalizer INSTANCE = new TikaUrlCharSequenceNormalizer();

        /**
         * @return the shared instance
         */
        public static TikaUrlCharSequenceNormalizer getInstance() {
            return INSTANCE;
        }

        private TikaUrlCharSequenceNormalizer() {
        }

        /**
         * @param charSequence text to normalize
         * @return the text with URL matches, then e-mail matches, each replaced by one space
         */
        @Override
        public CharSequence normalize(CharSequence charSequence) {
            // URLs are stripped first; the e-mail pattern then runs on the URL-free text.
            String withoutUrls = URL_REGEX.matcher(charSequence).replaceAll(" ");
            return MAIL_REGEX.matcher(withoutUrls).replaceAll(" ");
        }
    }
}

