/*
 * Decompiled with CFR 0.152.
 */
package org.apache.cocoon.components.crawler;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import org.apache.avalon.excalibur.pool.Recyclable;
import org.apache.avalon.framework.activity.Disposable;
import org.apache.avalon.framework.configuration.Configurable;
import org.apache.avalon.framework.configuration.Configuration;
import org.apache.avalon.framework.configuration.ConfigurationException;
import org.apache.cocoon.Constants;
import org.apache.cocoon.components.crawler.CocoonCrawler;
import org.apache.cocoon.util.AbstractLogEnabled;
import org.apache.commons.lang.StringUtils;
import org.apache.regexp.RE;
import org.apache.regexp.RESyntaxException;

public class SimpleCocoonCrawlerImpl
extends AbstractLogEnabled
implements CocoonCrawler,
Configurable,
Disposable,
Recyclable {
    public static final String LINK_CONTENT_TYPE_CONFIG = "link-content-type";
    public final String LINK_CONTENT_TYPE_DEFAULT = "application/x-cocoon-links";
    public static final String LINK_VIEW_QUERY_CONFIG = "link-view-query";
    public static final String LINK_VIEW_QUERY_DEFAULT = "cocoon-view=links";
    public static final String EXCLUDE_CONFIG = "exclude";
    public static final String INCLUDE_CONFIG = "include";
    public static final String USER_AGENT_CONFIG = "user-agent";
    public static final String USER_AGENT_DEFAULT = Constants.COMPLETE_NAME;
    public static final String ACCEPT_CONFIG = "accept";
    public static final String ACCEPT_DEFAULT = "*/*";
    private String linkViewQuery = "cocoon-view=links";
    private String linkContentType = "application/x-cocoon-links";
    private HashSet excludeCrawlingURL = null;
    private HashSet includeCrawlingURL = null;
    private String userAgent = USER_AGENT_DEFAULT;
    private String accept = "*/*";
    private HashSet crawled;
    protected int depth;
    protected HashSet urlsToProcess;
    protected HashSet urlsNextDepth;

    public void configure(Configuration configuration) throws ConfigurationException {
        String value;
        Configuration child;
        String tokenized_pattern;
        int index;
        String[] params;
        String pattern;
        int i;
        Configuration[] children = configuration.getChildren(INCLUDE_CONFIG);
        if (children.length > 0) {
            this.includeCrawlingURL = new HashSet();
            for (i = 0; i < children.length; ++i) {
                pattern = children[i].getValue();
                try {
                    params = StringUtils.split((String)pattern, (String)", ");
                    for (index = 0; index < params.length; ++index) {
                        tokenized_pattern = params[index];
                        this.includeCrawlingURL.add(new RE(tokenized_pattern));
                    }
                    continue;
                }
                catch (RESyntaxException rese) {
                    this.getLogger().error((Object)("Cannot create including regular-expression for " + pattern), (Throwable)rese);
                }
            }
        } else if (this.getLogger().isDebugEnabled()) {
            this.getLogger().debug((Object)"Include all URLs");
        }
        if ((children = configuration.getChildren(EXCLUDE_CONFIG)).length > 0) {
            this.excludeCrawlingURL = new HashSet();
            for (i = 0; i < children.length; ++i) {
                pattern = children[i].getValue();
                try {
                    params = StringUtils.split((String)pattern, (String)", ");
                    for (index = 0; index < params.length; ++index) {
                        tokenized_pattern = params[index];
                        this.excludeCrawlingURL.add(new RE(tokenized_pattern));
                    }
                    continue;
                }
                catch (RESyntaxException rese) {
                    this.getLogger().error((Object)("Cannot create excluding regular-expression for " + pattern), (Throwable)rese);
                }
            }
        } else {
            this.excludeCrawlingURL = new HashSet();
            this.setDefaultExcludeFromCrawling();
            if (this.getLogger().isDebugEnabled()) {
                this.getLogger().debug((Object)"Exclude default URLs only");
            }
        }
        if ((child = configuration.getChild(LINK_CONTENT_TYPE_CONFIG, false)) != null && (value = child.getValue()) != null && value.length() > 0) {
            this.linkContentType = value.trim();
        }
        if ((child = configuration.getChild(LINK_VIEW_QUERY_CONFIG, false)) != null && (value = child.getValue()) != null && value.length() > 0) {
            this.linkViewQuery = value.trim();
        }
        if ((child = configuration.getChild(USER_AGENT_CONFIG, false)) != null && (value = child.getValue()) != null && value.length() > 0) {
            this.userAgent = value;
        }
        if ((child = configuration.getChild(ACCEPT_CONFIG, false)) != null && (value = child.getValue()) != null && value.length() > 0) {
            this.accept = value;
        }
    }

    public void dispose() {
        this.crawled = null;
        this.urlsToProcess = null;
        this.urlsNextDepth = null;
        this.excludeCrawlingURL = null;
        this.includeCrawlingURL = null;
    }

    public void recycle() {
        this.crawled = null;
        this.urlsToProcess = null;
        this.urlsNextDepth = null;
        this.depth = -1;
    }

    public void crawl(URL url) {
        this.crawl(url, -1);
    }

    public void crawl(URL url, int maxDepth) {
        this.crawled = new HashSet();
        this.urlsToProcess = new HashSet();
        this.urlsNextDepth = new HashSet();
        this.depth = maxDepth;
        if (this.getLogger().isDebugEnabled()) {
            this.getLogger().debug((Object)("crawl URL " + url + " to depth " + maxDepth));
        }
        this.urlsToProcess.add(url);
    }

    public Iterator iterator() {
        return new CocoonCrawlerIterator(this);
    }

    private void setDefaultExcludeFromCrawling() {
        String[] EXCLUDE_FROM_CRAWLING_DEFAULT = new String[]{".*\\.gif(\\?.*)?$", ".*\\.png(\\?.*)?$", ".*\\.jpe?g(\\?.*)?$", ".*\\.js(\\?.*)?$", ".*\\.css(\\?.*)?$"};
        for (int i = 0; i < EXCLUDE_FROM_CRAWLING_DEFAULT.length; ++i) {
            String pattern = EXCLUDE_FROM_CRAWLING_DEFAULT[i];
            try {
                this.excludeCrawlingURL.add(new RE(pattern));
                continue;
            }
            catch (RESyntaxException rese) {
                this.getLogger().error((Object)("Cannot create excluding regular-expression for " + pattern), (Throwable)rese);
            }
        }
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     * Loose catch block
     * Enabled force condition propagation
     * Lifted jumps to return sites
     */
    private List getLinks(URL url) {
        String contentType;
        BufferedReader br;
        ArrayList<URL> url_links;
        block23: {
            String sURL = url.toString();
            if (!this.isIncludedURL(sURL) || this.isExcludedURL(sURL)) {
                return null;
            }
            if (this.crawled.contains(sURL)) {
                return null;
            }
            this.crawled.add(sURL);
            if (this.getLogger().isDebugEnabled()) {
                this.getLogger().debug((Object)("Getting links of URL " + sURL));
            }
            url_links = null;
            br = null;
            sURL = url.getFile();
            URL links = new URL(url, sURL + (sURL.indexOf("?") == -1 ? "?" : "&") + this.linkViewQuery);
            URLConnection links_url_connection = links.openConnection();
            links_url_connection.setRequestProperty("Accept", this.accept);
            links_url_connection.setRequestProperty("User-Agent", this.userAgent);
            links_url_connection.connect();
            InputStream is = links_url_connection.getInputStream();
            br = new BufferedReader(new InputStreamReader(is));
            contentType = links_url_connection.getContentType();
            if (contentType != null) break block23;
            if (this.getLogger().isDebugEnabled()) {
                this.getLogger().debug((Object)("Ignoring " + sURL + " (no content type)"));
            }
            List list = null;
            Object var15_12 = null;
            if (br == null) return list;
            try {
                br.close();
                return list;
            }
            catch (IOException ignored) {
                // empty catch block
            }
            return list;
        }
        int index = contentType.indexOf(59);
        if (index != -1) {
            contentType = contentType.substring(0, index);
        }
        if (this.getLogger().isDebugEnabled()) {
            this.getLogger().debug((Object)("Content-type: " + contentType));
        }
        if (contentType.equals(this.linkContentType)) {
            String line;
            url_links = new ArrayList<URL>();
            while ((line = br.readLine()) != null) {
                URL newUrl = new URL(url, line);
                String sNewUrl = newUrl.toString();
                boolean add_url = true;
                if (add_url) {
                    add_url &= !url_links.contains(sNewUrl);
                }
                if (add_url) {
                    add_url &= !this.crawled.contains(sNewUrl);
                }
                if (add_url) {
                    add_url &= this.isIncludedURL(sNewUrl);
                }
                if (add_url) {
                    add_url &= !this.isExcludedURL(sNewUrl);
                }
                if (!add_url) continue;
                if (this.getLogger().isDebugEnabled()) {
                    this.getLogger().debug((Object)("Add URL: " + sNewUrl));
                }
                url_links.add(newUrl);
            }
        }
        Object var15_13 = null;
        if (br == null) return url_links;
        try {
            br.close();
            return url_links;
        }
        catch (IOException ignored) {}
        return url_links;
        {
            catch (IOException ioe) {
                this.getLogger().warn((Object)("Problems get links of " + url), (Throwable)ioe);
                Object var15_14 = null;
                if (br == null) return url_links;
                try {
                    br.close();
                    return url_links;
                }
                catch (IOException ignored) {}
                return url_links;
            }
        }
        catch (Throwable throwable) {
            Object var15_15 = null;
            if (br == null) throw throwable;
            try {
                br.close();
                throw throwable;
            }
            catch (IOException ignored) {
                // empty catch block
            }
            throw throwable;
        }
    }

    private boolean isExcludedURL(String url) {
        if (this.excludeCrawlingURL == null) {
            return false;
        }
        Iterator i = this.excludeCrawlingURL.iterator();
        while (i.hasNext()) {
            RE pattern = (RE)i.next();
            if (!pattern.match(url)) continue;
            if (this.getLogger().isDebugEnabled()) {
                this.getLogger().debug((Object)("Excluded URL " + url));
            }
            return true;
        }
        if (this.getLogger().isDebugEnabled()) {
            this.getLogger().debug((Object)("Not excluded URL " + url));
        }
        return false;
    }

    private boolean isIncludedURL(String url) {
        if (this.includeCrawlingURL == null) {
            return true;
        }
        Iterator i = this.includeCrawlingURL.iterator();
        while (i.hasNext()) {
            RE pattern = (RE)i.next();
            if (!pattern.match(url)) continue;
            if (this.getLogger().isDebugEnabled()) {
                this.getLogger().debug((Object)("Included URL " + url));
            }
            return true;
        }
        if (this.getLogger().isDebugEnabled()) {
            this.getLogger().debug((Object)("Not included URL " + url));
        }
        return false;
    }

    public static class CocoonCrawlerIterator
    implements Iterator {
        private SimpleCocoonCrawlerImpl cocoonCrawler;

        CocoonCrawlerIterator(SimpleCocoonCrawlerImpl cocoonCrawler) {
            this.cocoonCrawler = cocoonCrawler;
        }

        public boolean hasNext() {
            return this.cocoonCrawler.urlsToProcess.size() > 0 || this.cocoonCrawler.urlsNextDepth.size() > 0;
        }

        public Object next() {
            if (this.cocoonCrawler.urlsToProcess.size() == 0 && this.cocoonCrawler.urlsNextDepth.size() > 0) {
                this.cocoonCrawler.urlsToProcess = this.cocoonCrawler.urlsNextDepth;
                this.cocoonCrawler.urlsNextDepth = new HashSet();
                if (this.cocoonCrawler.depth > 0) {
                    --this.cocoonCrawler.depth;
                }
            }
            URL theNextUrl = null;
            Iterator i = this.cocoonCrawler.urlsToProcess.iterator();
            while (i.hasNext() && theNextUrl == null) {
                List url_links;
                URL url = (URL)i.next();
                i.remove();
                if (this.cocoonCrawler.depth != -1 && this.cocoonCrawler.depth <= 0 || (url_links = this.cocoonCrawler.getLinks(url)) == null) continue;
                this.cocoonCrawler.urlsNextDepth.addAll(url_links);
                theNextUrl = url;
            }
            return theNextUrl;
        }

        public void remove() {
            throw new UnsupportedOperationException("remove is not implemented");
        }
    }
}

