#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
A script to pull licenses/notices/source code for Java dependencies.
It generates a CSV file with [dependency_name, url_to_license, license_type, source_included]
"""

import csv
import json
import os
import shutil
import traceback
import yaml

from bs4 import BeautifulSoup
from future.moves.urllib.request import urlopen
from future.moves.urllib.request import URLError, HTTPError
from tenacity import retry
from tenacity import stop_after_attempt
from tenacity import wait_exponential

# Directory (relative path) that holds the Gradle-generated index.json and
# receives the pulled license/notice/source files and the generated CSV.
LICENSE_DIR = 'java_third_party_licenses'
# Directory (relative path) containing dep_urls_java.yaml.
LICENSE_SCRIPT_DIR = 'sdks/java/container/license_scripts/'
# Lower-case substrings matched against a dependency's license type; a match
# means the dependency's source jars are pulled too.
# NOTE: the second entry used to be the typo 'glp', which never matched GPL.
SOURCE_CODE_REQUIRED_LICENSES = ['lgpl', 'gpl', 'cddl', 'mpl']


@retry(reraise=True,
       wait=wait_exponential(multiplier=2),
       stop=stop_after_attempt(5))
def pull_from_url(file_name, url, dep, no_list):
    """Download `url` into `file_name`, recording `dep` on URL/HTTP errors.

    Args:
        file_name: path of the local file to write (opened in binary mode).
        url: address to download; the literal 'skip' makes this a no-op.
        dep: identifier added to `no_list` when the download fails with an
            HTTPError or URLError.
        no_list: set collecting the dependencies whose download failed.

    Raises:
        Exception: any error other than HTTPError/URLError is re-raised after
            its traceback is printed, so the retry decorator can try again
            (at most 5 attempts in total).
    """
    if url == 'skip':
        return
    try:
        url_read = urlopen(url)
        try:
            with open(file_name, 'wb') as temp_write:
                shutil.copyfileobj(url_read, temp_write)
        finally:
            # Explicit close() instead of `with`, so this also works with
            # response objects that are not context managers.
            url_read.close()
        print('Successfully pulled {file_name} from {url}'.format(
            url=url, file_name=file_name))
    except HTTPError as e:
        # HTTPError is a subclass of URLError, so it has to be handled first;
        # otherwise this branch is unreachable and the status code is lost.
        no_list.add(dep)
        print('Received {code} from {url}'.format(code=e.code, url=url))
    except URLError:
        no_list.add(dep)
        print('Invalid url: {url}'.format(url=url))
    except Exception:
        print('Error occurred when pull {file_name} from {url}.'.format(
            url=url, file_name=file_name))
        traceback.print_exc()
        raise


def pull_source_code(base_url, dir_name, dep, incorrect_source_url):
    """Download every non-javadoc jar linked from a repository listing page.

    Args:
        base_url: URL of the listing page, e.g.
            https://repo1.maven.org/maven2/org/mortbay/jetty/jsp-2.1/6.1.14/
        dir_name: local directory the jar files are written into.
        dep: identifier passed to pull_from_url, which adds it to
            `incorrect_source_url` when a jar download hits a URL/HTTP error.
        incorrect_source_url: set collecting dependencies whose jars failed.

    Errors raised while fetching or parsing the listing page itself are not
    caught here and propagate to the caller.
    """
    listing = urlopen(base_url)
    try:
        soup = BeautifulSoup(listing.read(), "html.parser")
    finally:
        listing.close()

    # base_url may or may not end with '/'; normalise so the jar URL never
    # contains a doubled slash.
    url_prefix = base_url.rstrip('/') + '/'
    for href in (a["href"] for a in soup.select("a[href]")):
        # Download jar files only, and leave out the javadoc jars.
        if href.endswith('.jar') and 'javadoc' not in href:
            file_name = dir_name + '/' + href
            pull_from_url(file_name, url_prefix + href, dep,
                          incorrect_source_url)


@retry(reraise=True, stop=stop_after_attempt(3))
def write_to_csv(csv_dict):
    """Write the dependency summary to {LICENSE_DIR}/beam_java_dependency_list.csv.

    Args:
        csv_dict: mapping of dependency name to a dict holding the
            'url_to_license', 'license_type' and 'source_included' values.
            The mapping and its values are left unmodified.

    Raises:
        Exception: any error while writing is re-raised after its traceback
            is printed; the retry decorator makes at most 3 attempts.
    """
    csv_columns = [
        'dependency_name', 'url_to_license', 'license_type', 'source_included'
    ]
    csv_file = "{license_dir}/beam_java_dependency_list.csv".format(
        license_dir=LICENSE_DIR)
    try:
        with open(csv_file, 'w') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=csv_columns)
            writer.writeheader()
            for dep, data in csv_dict.items():
                # Write a copy so the caller's dicts do not silently gain a
                # 'dependency_name' key.
                row = dict(data)
                row['dependency_name'] = dep
                writer.writerow(row)
    except Exception:
        traceback.print_exc()
        raise


def _config_value(dep_config, name, version, key):
    """Return dep_config[name][version][key], or None when it is not set.

    None is also returned when dep_config or one of the intermediate levels
    is not a mapping.
    """
    try:
        return dep_config[name][version][key]
    except (KeyError, TypeError):
        return None


def main():
    """Pull license, notice and (when required) source files per dependency.

    Reads {LICENSE_DIR}/index.json and dep_urls_java.yaml, downloads the
    files into {LICENSE_DIR}/{name}-{version}.jar/ and writes the summary CSV.

    Raises:
        RuntimeError: after the CSV is written, if a license/notice could not
            be pulled, a license type is unknown, or a source jar download
            failed. The second argument of the error lists the details.
    """
    no_licenses = set()
    no_license_type = set()
    incorrect_source_url = set()
    csv_dict = dict()

    # index.json is generated by Gradle plugin.
    with open('{license_dir}/index.json'.format(license_dir=LICENSE_DIR)) as f:
        dependencies = json.load(f)

    yaml_file = LICENSE_SCRIPT_DIR + 'dep_urls_java.yaml'
    with open(yaml_file) as config_file:
        dep_config = yaml.full_load(config_file)

    maven_url_temp = 'https://repo1.maven.org/maven2/{module}/{version}'

    for dep in dependencies['dependencies']:
        # An example of a Json blob.
        # {
        #     "moduleName": "antlr:antlr",
        #     "moduleUrl": "http://www.antlr.org/",
        #     "moduleVersion": "2.7.7",
        #     "moduleLicense": "BSD License",
        #     "moduleLicenseUrl": "http://www.antlr.org/license.html"
        # }
        name = dep['moduleName'].split(':')[1].lower()
        version = dep['moduleVersion']
        name_version = name + '-' + version
        dir_name = '{license_dir}/{name_version}.jar'.format(
            license_dir=LICENSE_DIR, name_version=name_version)

        # If the license was pulled automatically, the directory already
        # exists under {license_dir}.
        if not os.path.isdir(dir_name):
            # skip self dependencies
            if dep['moduleName'].startswith('beam'):
                print('Skipping {name_version}'.format(
                    name_version=name_version))
                continue
            os.mkdir(dir_name)

            # pull license: the yaml config takes precedence over index.json
            license_url = _config_value(dep_config, name, version, 'license')
            if license_url is None:
                license_url = dep.get('moduleLicenseUrl')
            if license_url:
                pull_from_url(dir_name + '/LICENSE', license_url,
                              name_version, no_licenses)
            else:
                # No url anywhere: report it at the end instead of crashing.
                no_licenses.add(name_version)
                license_url = ''

            # pull notice (only configured through the yaml file)
            notice_url = _config_value(dep_config, name, version, 'notice')
            if notice_url is not None:
                try:
                    pull_from_url(dir_name + '/NOTICE', notice_url,
                                  name_version, no_licenses)
                except Exception:
                    # pull_from_url already printed the traceback; keep
                    # processing the other dependencies but make sure the
                    # failure shows up in the final report.
                    no_licenses.add(name_version)
        else:
            license_url = dep.get('moduleLicenseUrl', '')
            print(
                'License/notice for {name_version} were pulled automatically.'.
                format(name_version=name_version))

        # get license_type to decide if pull source code.
        license_type = dep.get('moduleLicense')
        if license_type is None:
            license_type = _config_value(dep_config, name, version, 'type')
            if license_type is None:
                # Unknown type: reported at the end, and the dependency is
                # left out of the CSV.
                no_license_type.add(name_version)
                continue

        # pull source code if license_type is one of SOURCE_CODE_REQUIRED_LICENSES.
        if any(x in license_type.lower()
               for x in SOURCE_CODE_REQUIRED_LICENSES):
            base_url = _config_value(dep_config, name, version, 'source')
            if base_url is None:
                # Derive the Maven Central directory from the group id.
                module = dep['moduleName'].split(':')[0].replace('.', '/')
                base_url = maven_url_temp.format(module=module + '/' + name,
                                                 version=version)
            pull_source_code(base_url, dir_name, name_version,
                             incorrect_source_url)
            source_included = True
        else:
            source_included = False

        csv_dict[name_version] = {
            'url_to_license': license_url,
            'license_type': license_type,
            'source_included': source_included
        }

    # write csv file
    write_to_csv(csv_dict)

    error_msg = []
    if no_licenses:
        how_to = '**************************************** ' \
                 'Licenses were not able to be pulled ' \
                 'automatically for some dependencies. Please search source ' \
                 'code of the dependencies on the internet and add "license" ' \
                 'and "notice" (if available) field to {yaml_file} for each ' \
                 'missing license. Dependency List: [{dep_list}]'.format(
            dep_list=','.join(sorted(no_licenses)), yaml_file=yaml_file)
        error_msg.append(how_to)

    if no_license_type:
        how_to = '**************************************** ' \
                 'License type of some dependencies were not ' \
                 'identified. The license type is used to decide whether the ' \
                 'source code of the dependency should be pulled or not. ' \
                 'Please add "type" field to {yaml_file} for each dependency. ' \
                 'Dependency List: [{dep_list}]'.format(
            dep_list=','.join(sorted(no_license_type)), yaml_file=yaml_file)
        error_msg.append(how_to)

    if incorrect_source_url:
        how_to = '**************************************** ' \
                 'Urls to maven repo for some dependencies ' \
                 'were not able to be generated automatically. Please add ' \
                 '"source" field to {yaml_file} for each dependency. ' \
                 'Dependency List: [{dep_list}]'.format(
            dep_list=','.join(sorted(incorrect_source_url)),
            yaml_file=yaml_file)
        error_msg.append(how_to)

    if error_msg:
        raise RuntimeError('{n} error(s) occurred.'.format(n=len(error_msg)),
                           error_msg)


if __name__ == "__main__":
    main()
