# Changes required to address EULA for the origen hwpacks

#!/usr/bin/env python

import argparse
import os
import pycurl
import re
import urlparse
import html2text
from BeautifulSoup import BeautifulSoup


class LicenseProtectedFileFetcher:
    """Fetch a file from the web that may be protected by a license redirect

    This is designed to run on snapshots.linaro.org. License HTML files
    are in the form:

        <file>.html has a link to <file>-accepted.html

    If self.get is pointed at a file that has to go through one of these
    licenses, it should be able to automatically accept the license and
    download the file.

    Once a license has been accepted, it will be used for all following
    downloads.

    If self.close() is called before the object is deleted, cURL will store
    the license accept cookie to cookies.txt, so it can be used for later
    downloads.
    """

    def __init__(self, cookie_file="cookies.txt"):
        """Set up cURL"""
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.WRITEFUNCTION, self._write_body)
        self.curl.setopt(pycurl.HEADERFUNCTION, self._write_header)
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.COOKIEFILE, cookie_file)
        self.curl.setopt(pycurl.COOKIEJAR, cookie_file)
        self.file_out = None

    def _get(self, url):
        """Clear out header and body storage, fetch URL, filling them in."""
        url = url.encode("ascii")
        self.curl.setopt(pycurl.URL, url)

        self.body = ""
        self.header = ""

        if self.file_name:
            self.file_out = open(self.file_name, 'w')
        else:
            self.file_out = None

        self.curl.perform()
        self._parse_headers(url)

        if self.file_out:
            self.file_out.close()

    def _parse_headers(self, url):
        header = {}
        for line in self.header.splitlines():
            # Header lines typically are of the form thing: value...
            test_line = re.search(r"^(.*?)\s*:\s*(.*)$", line)

            if test_line:
                header[test_line.group(1)] = test_line.group(2)

        # The Location attribute is sometimes relative, but we would
        # like to have it always absolute...
        if 'Location' in header:
            parsed_location = urlparse.urlparse(header["Location"])

            # If not an absolute location...
            if not parsed_location.netloc:
                parsed_source_url = urlparse.urlparse(url)
                new_location = ["", "", "", "", ""]

                new_location[0] = parsed_source_url.scheme
                new_location[1] = parsed_source_url.netloc
                new_location[2] = header["Location"]

                # Update location with absolute URL
                header["Location"] = urlparse.urlunsplit(new_location)

        self.header_text = self.header
        self.header = header

    def get_headers(self, url):
        url = url.encode("ascii")
        self.curl.setopt(pycurl.URL, url)

        self.body = ""
        self.header = ""

        # Setting NOBODY causes cURL to just fetch the header.
        self.curl.setopt(pycurl.NOBODY, True)
        self.curl.perform()
        self.curl.setopt(pycurl.NOBODY, False)

        self._parse_headers(url)

        return self.header

    def get_or_return_license(self, url, file_name=None):
        """Get the file at the requested URL or, if behind a license, return that.

        If the URL provided does not redirect us to a license, then return
        the body of that file. If we are redirected to a license click
        through, then return (the license as plain text, URL to accept
        the license).

        If the user of this function accepts the license, then they should
        call get_protected_file."""
        self.file_name = file_name

        # Get the license details. If this returns None, the file isn't
        # license protected and we can just return the file we started to
        # get in the function (self.body).
        license_details = self._get_license(url)

        if license_details:
            return license_details

        return self.body

    def get(self, url, file_name=None):
        """Fetch the requested URL, accepting licenses; return the file body.

        Fetches the file at url.
        If a redirect is encountered, it is expected to be to a license
        that has an accept link. Follow that link, then download the
        original file.
        """
        self.file_name = file_name
        license_details = self._get_license(url)

        if license_details:
            # Found a license. Accept the license without looking at it and
            # start fetching the file we originally wanted.
            accept_url = license_details[1]
            self.get_protected_file(accept_url, url)
        else:
            # If we got here, there wasn't a license protecting the file,
            # so we just fetch it.
            self._get(url)

        return self.body

    def _get_license(self, url):
        """Return (license, accept URL) if found, else return None."""
        self.get_headers(url)

        if "Location" in self.header and self.header["Location"] != url:
            # We have been redirected to a new location - the license file
            location = self.header["Location"]

            # Fetch the license HTML
            self._get(location)

            # Get the file path from the URL
            file = urlparse.urlparse(location).path

            # Get the file name without the rest of the path
            file = os.path.split(file)[-1]

            # Look for a link with -accepted.html in the page name. Follow it.
            for line in self.body.splitlines():
                link_search = re.search(
                    r"""href=.*?["'](.*?-accepted\.html)""", line)

                if link_search:
                    # Have found the license accept URL!
                    new_file = link_search.group(1)
                    accept_url = re.sub(file, new_file, location)

                    # Parse the HTML using BeautifulSoup
                    soup = BeautifulSoup(self.body)

                    # The license is in a div with the ID license-text, so we
                    # use this to pull just the license out of the HTML.
                    html_license = u""
                    for chunk in soup.findAll(id="license-text"):
                        # Output of chunk.prettify is UTF-8, but comes back
                        # as a str, so convert it here.
                        html_license += chunk.prettify().decode("utf-8")

                    text_license = html2text.html2text(html_license)

                    return text_license, accept_url

        return None

    def get_protected_file(self, accept_url, url):
        """Get the file redirected to by the accept_url."""
        self._get(accept_url)  # Accept the license

        if not ("Location" in self.header and
                self.header["Location"] == url):
            # If we got here, we don't have the file yet (we weren't
            # redirected to it). Fetch our target file. This should work
            # now that we have the right cookie.
            self._get(url)  # Download the target file

        return self.body

    def _write_body(self, buf):
        """Used by cURL as a sink for body content"""

        # If we have a target file to write to, write to it
        if self.file_out:
            self.file_out.write(buf)

        # Only buffer the first 1MB of the body. This should be plenty for
        # anything we wish to parse internally.
        if len(self.body) < 1024 * 1024:
            self.body += buf

    def _write_header(self, buf):
        """Used by cURL as a sink for header content"""
        self.header += buf

    def register_progress_callback(self, callback):
        """Register a progress callback with cURL for transfer reporting."""
        self.curl.setopt(pycurl.NOPROGRESS, 0)
        self.curl.setopt(pycurl.PROGRESSFUNCTION, callback)

    def close(self):
        """Wrapper to close curl - this will allow curl to write out cookies"""
        self.curl.close()
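
# For reference, a minimal usage sketch of the class above. This is an
# illustration added for clarity, not part of the original script: the URL
# and output file name below are hypothetical placeholders, but the call
# sequence follows the class API as defined above.
if __name__ == '__main__':
    fetcher = LicenseProtectedFileFetcher()

    # Hypothetical example artifact URL.
    url = "http://snapshots.linaro.org/some/path/hwpack_origen.tar.gz"

    result = fetcher.get_or_return_license(url,
                                           file_name="hwpack_origen.tar.gz")

    if isinstance(result, tuple):
        # The file is license protected: result is (license text, accept
        # URL). Show the license, then accept it and fetch the target file.
        license_text, accept_url = result
        print license_text
        fetcher.get_protected_file(accept_url, url)

    # Close cURL so the license-accept cookie is written out to cookies.txt
    # for reuse by later downloads.
    fetcher.close()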