/*
 * Decompiled with CFR 0.152.
 */
package edu.uci.ics.crawler4j.parser;

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.exceptions.ParseException;
import edu.uci.ics.crawler4j.parser.AllTagMapper;
import edu.uci.ics.crawler4j.parser.ExtractedUrlAnchorPair;
import edu.uci.ics.crawler4j.parser.HtmlContentHandler;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.parser.HtmlParser;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import edu.uci.ics.crawler4j.url.WebURL;
import java.io.ByteArrayInputStream;
import java.io.InputStream;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import org.apache.tika.metadata.DublinCore;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;

/**
 * {@link HtmlParser} implementation that delegates the actual HTML parsing to
 * Apache Tika and copies the results (body text, title, meta tags, outgoing
 * links and the decoded HTML) into an {@link HtmlParseData}.
 */
public class TikaHtmlParser
implements HtmlParser {
    protected static final Logger logger = LoggerFactory.getLogger(TikaHtmlParser.class);
    private final CrawlConfig config;
    private final org.apache.tika.parser.html.HtmlParser htmlParser;
    private final ParseContext parseContext;

    /**
     * Creates a parser whose Tika parse context uses an {@link AllTagMapper}
     * as its {@link HtmlMapper}.
     *
     * @param config crawl configuration; read for the outgoing-link limit
     * @throws InstantiationException if the {@link AllTagMapper} cannot be instantiated
     * @throws IllegalAccessException if the {@link AllTagMapper} constructor is not accessible
     */
    public TikaHtmlParser(CrawlConfig config) throws InstantiationException, IllegalAccessException {
        this.config = config;
        this.htmlParser = new org.apache.tika.parser.html.HtmlParser();
        this.parseContext = new ParseContext();
        this.parseContext.set(HtmlMapper.class, AllTagMapper.class.newInstance());
    }

    /**
     * Parses the raw content of {@code page} and returns the extracted data.
     *
     * @param page       page whose content bytes are parsed
     * @param contextURL URL against which relative links are resolved, unless
     *                   the document supplies its own base URL
     * @return the populated parse result
     * @throws ParseException if Tika fails to parse the content, or if the
     *                        charset declared on the page is not supported
     */
    @Override
    public HtmlParseData parse(Page page, String contextURL) throws ParseException {
        HtmlParseData parsedData = new HtmlParseData();
        HtmlContentHandler contentHandler = new HtmlContentHandler();
        Metadata metadata = new Metadata();
        try (InputStream inputStream = new ByteArrayInputStream(page.getContentData())) {
            this.htmlParser.parse(inputStream, contentHandler, metadata, this.parseContext);
        } catch (Exception e) {
            // The trailing throwable makes SLF4J log the stack trace as well as the message.
            logger.error("{}, while parsing: {}", e.getMessage(), page.getWebURL().getURL(), e);
            throw new ParseException();
        }

        String contentCharset = this.chooseEncoding(page, metadata);
        parsedData.setContentCharset(contentCharset);
        parsedData.setText(contentHandler.getBodyText().trim());
        parsedData.setTitle(metadata.get(DublinCore.TITLE));
        parsedData.setMetaTags(contentHandler.getMetaTags());
        parsedData.setOutgoingUrls(this.getOutgoingUrls(contextURL, contentHandler, contentCharset));
        parsedData.setHtml(this.decodeHtml(page, contentCharset));
        return parsedData;
    }

    /**
     * Decodes the page bytes into a string.
     *
     * <p>A charset declared on the page is used as-is, and an unsupported one
     * is reported as a {@link ParseException}. When the page declares none,
     * the charset chosen by {@link #chooseEncoding} is used (UTF-8 if that is
     * also missing or unusable) instead of the JVM's platform default, so the
     * result does not vary between machines and agrees with the charset
     * reported in the parse data.
     */
    private String decodeHtml(Page page, String contentCharset) throws ParseException {
        String pageCharset = page.getContentCharset();
        if (pageCharset == null || pageCharset.isEmpty()) {
            return new String(page.getContentData(), resolveCharset(contentCharset));
        }
        try {
            return new String(page.getContentData(), pageCharset);
        } catch (UnsupportedEncodingException e) {
            logger.error("error parsing the html: " + page.getWebURL().getURL(), e);
            throw new ParseException();
        }
    }

    /**
     * Builds the set of outgoing links found by the content handler.
     *
     * <p>Links are resolved against the document's base URL when it has one,
     * otherwise against {@code contextURL}. At most
     * {@code config.getMaxOutgoingLinksToFollow()} accepted links are collected.
     */
    private Set<WebURL> getOutgoingUrls(String contextURL, HtmlContentHandler contentHandler, String contentCharset) {
        Set<WebURL> outgoingUrls = new HashSet<>();
        String baseURL = contentHandler.getBaseUrl();
        if (baseURL != null) {
            contextURL = baseURL;
        }
        // Resolved once: the charset is the same for every link of the page.
        Charset hrefCharset = resolveCharset(contentCharset);
        int urlCount = 0;
        for (ExtractedUrlAnchorPair urlAnchorPair : contentHandler.getOutgoingUrls()) {
            // Check the limit before adding, so that no more than the
            // configured number of links is ever collected.
            if (urlCount >= this.config.getMaxOutgoingLinksToFollow()) {
                break;
            }
            String href = urlAnchorPair.getHref();
            if (!isFollowableHref(href)) {
                continue;
            }
            String url = URLCanonicalizer.getCanonicalURL(href, contextURL, hrefCharset);
            if (url == null) {
                continue;
            }
            WebURL webURL = new WebURL();
            webURL.setURL(url);
            webURL.setTag(urlAnchorPair.getTag());
            webURL.setAnchor(urlAnchorPair.getAnchor());
            webURL.setAttributes(urlAnchorPair.getAttributes());
            outgoingUrls.add(webURL);
            urlCount++;
        }
        return outgoingUrls;
    }

    /**
     * Returns {@code true} when {@code href} is non-blank and does not contain
     * {@code javascript:}, {@code mailto:} or {@code @} (case-insensitively).
     */
    private static boolean isFollowableHref(String href) {
        if (href == null) {
            return false;
        }
        String trimmed = href.trim();
        if (trimmed.isEmpty()) {
            return false;
        }
        // Locale.ROOT keeps the comparison independent of the default locale
        // (e.g. the Turkish dotless-i mapping of 'I').
        String lowered = trimmed.toLowerCase(Locale.ROOT);
        return !(lowered.contains("javascript:") || lowered.contains("mailto:") || lowered.contains("@"));
    }

    /**
     * Converts a charset name into a {@link Charset}, falling back to UTF-8
     * when the name is missing, syntactically illegal or not supported by
     * this JVM.
     */
    private static Charset resolveCharset(String charsetName) {
        if (charsetName == null || charsetName.isEmpty()) {
            return StandardCharsets.UTF_8;
        }
        try {
            return Charset.forName(charsetName);
        } catch (IllegalArgumentException e) {
            // Both IllegalCharsetNameException and UnsupportedCharsetException
            // are IllegalArgumentExceptions.
            logger.warn("Unusable charset '{}', falling back to UTF-8", charsetName);
            return StandardCharsets.UTF_8;
        }
    }

    /**
     * Picks the charset name to report for the page: the page's own content
     * charset when it has one, otherwise the value stored under the
     * {@code Content-Encoding} metadata key (which may be {@code null}).
     */
    private String chooseEncoding(Page page, Metadata metadata) {
        String pageCharset = page.getContentCharset();
        if (pageCharset == null || pageCharset.isEmpty()) {
            return metadata.get("Content-Encoding");
        }
        return pageCharset;
    }
}

