/*
 * Decompiled with CFR 0.152.
 */
package org.apache.asterix.fuzzyjoin;

import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import org.apache.asterix.fuzzyjoin.FuzzyJoinUtil;
import org.apache.asterix.fuzzyjoin.LittleEndianIntOutputStream;
import org.apache.asterix.fuzzyjoin.MutableInteger;
import org.apache.asterix.fuzzyjoin.tokenizer.Tokenizer;
import org.apache.asterix.fuzzyjoin.tokenizer.TokenizerFactory;
import org.apache.asterix.fuzzyjoin.tokenorder.TokenLoad;
import org.apache.asterix.fuzzyjoin.tokenorder.TokenRankFrequency;

/**
 * Command-line utility that tokenizes a record file in two passes.
 *
 * <p>Pass one reads every line of the input file, tokenizes the configured
 * data columns and counts how often each token occurs; the distinct tokens
 * are then written, one per line and sorted by {@link TokenCount#compareTo},
 * to the tokens file. Pass two re-reads the input and, for every record,
 * writes the record id, the number of ranked tokens and the token ranks as
 * integers through a {@link LittleEndianIntOutputStream}.
 *
 * <p>Usage: {@code FuzzyJoinTokenize <inputFile> <tokensFile> <tokenizedFile>}
 */
public class FuzzyJoinTokenize {
    /** Regular expression that splits an input line into its fields. */
    private static final String RECORD_SEPARATOR_REGEX = ":";
    /** Separator character handed to the tokenizer and to the data extraction helper. */
    private static final char TOKEN_SEPARATOR = '_';
    /** Column specification handed to {@code FuzzyJoinUtil.getDataColumns}. */
    private static final String DATA_COLUMNS = "2,3";
    /** Index of the field that holds the integer record id. */
    private static final int RECORD_KEY = 0;

    /**
     * Runs both passes.
     *
     * @param args {@code args[0]} input file, {@code args[1]} tokens file to
     *             create, {@code args[2]} tokenized (binary) file to create
     * @throws IOException              if any file cannot be read or written
     * @throws IllegalArgumentException if fewer than three arguments are given
     * @throws NumberFormatException    if the first field of an input line is
     *                                  not an integer
     */
    public static void main(String[] args) throws IOException {
        if (args == null || args.length < 3) {
            throw new IllegalArgumentException(
                    "usage: FuzzyJoinTokenize <inputFile> <tokensFile> <tokenizedFile>");
        }
        String inputFileName = args[0];
        String tokensFileName = args[1];
        String tokenizedFileName = args[2];

        Tokenizer tokenizer = TokenizerFactory.getTokenizer("Word", "_", TOKEN_SEPARATOR);
        int[] dataColumns = FuzzyJoinUtil.getDataColumns(DATA_COLUMNS);

        HashMap<String, MutableInteger> tokenCount = countTokens(inputFileName, tokenizer, dataColumns);
        writeSortedTokens(tokenCount, tokensFileName);

        // The tokens file written above is read back to build the token ranks
        // used by the second pass, so it must be completely written and closed
        // before this point.
        TokenRankFrequency tokenRankFrequency = new TokenRankFrequency();
        TokenLoad tokenLoad = new TokenLoad(tokensFileName, tokenRankFrequency);
        tokenLoad.loadTokenRank();

        writeTokenizedRecords(inputFileName, tokenizedFileName, tokenizer, dataColumns, tokenRankFrequency);
    }

    /** Splits the data columns of one record (already split into fields) into tokens. */
    private static List<String> tokenizeRecord(Tokenizer tokenizer, String[] fields, int[] dataColumns) {
        return tokenizer.tokenize(FuzzyJoinUtil.getData(fields, dataColumns, TOKEN_SEPARATOR));
    }

    /** Pass one: counts the occurrences of every token found in the input file. */
    private static HashMap<String, MutableInteger> countTokens(String inputFileName, Tokenizer tokenizer,
            int[] dataColumns) throws IOException {
        HashMap<String, MutableInteger> tokenCount = new HashMap<>();
        try (BufferedReader input = new BufferedReader(new FileReader(inputFileName))) {
            String line;
            while ((line = input.readLine()) != null) {
                String[] fields = line.split(RECORD_SEPARATOR_REGEX);
                for (String token : tokenizeRecord(tokenizer, fields, dataColumns)) {
                    MutableInteger count = tokenCount.get(token);
                    if (count == null) {
                        tokenCount.put(token, new MutableInteger(1));
                    } else {
                        count.inc();
                    }
                }
            }
        }
        return tokenCount;
    }

    /** Writes the distinct tokens, sorted by their count, one per line to the tokens file. */
    private static void writeSortedTokens(HashMap<String, MutableInteger> tokenCount, String tokensFileName)
            throws IOException {
        List<TokenCount> tokenCounts = new ArrayList<>(tokenCount.size());
        tokenCount.forEach((token, count) -> tokenCounts.add(new TokenCount(token, count)));
        Collections.sort(tokenCounts);
        try (BufferedWriter outputTokens = new BufferedWriter(new FileWriter(tokensFileName))) {
            for (TokenCount tc : tokenCounts) {
                outputTokens.write(tc.getToken() + "\n");
            }
        }
    }

    /**
     * Pass two: for every input record writes the record id, the number of
     * ranked tokens and then each token rank to the tokenized file.
     */
    private static void writeTokenizedRecords(String inputFileName, String tokenizedFileName, Tokenizer tokenizer,
            int[] dataColumns, TokenRankFrequency tokenRankFrequency) throws IOException {
        try (BufferedReader input = new BufferedReader(new FileReader(inputFileName))) {
            LittleEndianIntOutputStream outputTokenized = new LittleEndianIntOutputStream(
                    new BufferedOutputStream(new FileOutputStream(tokenizedFileName)));
            // Closed in an explicit finally block: the stream type is declared
            // outside this file, so only its close() method is relied upon.
            try {
                String line;
                while ((line = input.readLine()) != null) {
                    String[] splits = line.split(RECORD_SEPARATOR_REGEX);
                    int rid = Integer.parseInt(splits[RECORD_KEY]);
                    outputTokenized.writeInt(rid);
                    List<String> tokens = tokenizeRecord(tokenizer, splits, dataColumns);
                    Collection<Integer> tokensRanked = tokenRankFrequency.getTokenRanks(tokens);
                    outputTokenized.writeInt(tokensRanked.size());
                    for (Integer token : tokensRanked) {
                        outputTokenized.writeInt(token);
                    }
                }
            } finally {
                outputTokenized.close();
            }
        }
    }

    /**
     * A token paired with its occurrence counter. Instances order themselves
     * by the counter, as defined by {@code MutableInteger.compareTo}.
     */
    public static class TokenCount
    implements Comparable<Object> {
        public String token;
        public MutableInteger count;

        /**
         * @param token the token text
         * @param count the counter associated with {@code token}
         */
        public TokenCount(String token, MutableInteger count) {
            this.token = token;
            this.count = count;
        }

        /**
         * Compares this entry with another {@code TokenCount} by count only.
         *
         * @param o the other entry; must be a {@code TokenCount}
         * @return the result of comparing the two counters
         * @throws ClassCastException if {@code o} is not a {@code TokenCount}
         */
        @Override
        public int compareTo(Object o) {
            TokenCount tc = (TokenCount)o;
            return this.count.compareTo(tc.count);
        }

        /** @return the token text */
        public String getToken() {
            return this.token;
        }

        /** @return the token and its count separated by a single space */
        @Override
        public String toString() {
            return this.token + " " + this.count;
        }
    }
}

