path: root/download_content_yes_to_lic.py
blob: d576914f8c59cae0deef048a00eaf99fcd456956
#!/usr/bin/env python

# Changes required to address EULA for the origen hwpacks

import argparse
import os
import re
import urlparse

import html2text
import pycurl
from BeautifulSoup import BeautifulSoup

class LicenseProtectedFileFetcher:
    """Fetch a file from the web that may be protected by a license redirect

    This is designed to run on snapshots.linaro.org. License HTML files are
    of the form:

    <vendor>.html has a link to <vendor>-accepted.html

    If self.get is pointed at a file that has to go through one of these
    licenses, it should be able to automatically accept the license and
    download the file.

    Once a license has been accepted, it will be used for all following
    downloads.

    If self.close() is called before the object is deleted, cURL will write
    the license-accept cookie to cookies.txt, so it can be reused for later
    downloads.

    """
    def __init__(self, cookie_file="cookies.txt"):
        """Set up cURL"""
        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.WRITEFUNCTION, self._write_body)
        self.curl.setopt(pycurl.HEADERFUNCTION, self._write_header)
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)
        self.curl.setopt(pycurl.COOKIEFILE, cookie_file)
        self.curl.setopt(pycurl.COOKIEJAR, cookie_file)
        self.file_out = None

    def _get(self, url):
        """Clear out header and body storage, fetch URL, filling them in."""
        url = url.encode("ascii")
        self.curl.setopt(pycurl.URL, url)

        self.body = ""
        self.header = ""

        if self.file_name:
            self.file_out = open(self.file_name, 'wb')
        else:
            self.file_out = None

        self.curl.perform()
        self._parse_headers(url)

        if self.file_out:
            self.file_out.close()

    def _parse_headers(self, url):
        header = {}
        for line in self.header.splitlines():
            # Header lines typically are of the form thing: value...
            test_line = re.search(r"^(.*?)\s*:\s*(.*)$", line)

            if test_line:
                header[test_line.group(1)] = test_line.group(2)

        # The Location header is sometimes relative, but we always want it
        # as an absolute URL...

        if 'Location' in header:
            parsed_location = urlparse.urlparse(header["Location"])

            # If not an absolute location...
            if not parsed_location.netloc:
                parsed_source_url = urlparse.urlparse(url)

                # (scheme, netloc, path, query, fragment), as expected
                # by urlunsplit.
                new_location = [parsed_source_url.scheme,
                                parsed_source_url.netloc,
                                header["Location"], "", ""]

                # Update location with absolute URL
                header["Location"] = urlparse.urlunsplit(new_location)

        self.header_text = self.header
        self.header = header

    def get_headers(self, url):
        url = url.encode("ascii")
        self.curl.setopt(pycurl.URL, url)

        self.body = ""
        self.header = ""

        # Setting NOBODY causes CURL to just fetch the header.
        self.curl.setopt(pycurl.NOBODY, True)
        self.curl.perform()
        self.curl.setopt(pycurl.NOBODY, False)

        self._parse_headers(url)

        return self.header

    def get_or_return_license(self, url, file_name=None):
        """Get file at the requested URL or, if behind a license, return that.

        If the URL provided does not redirect us to a license, then return the
        body of that file. If we are redirected to a license click through
        then return (the license as plain text, url to accept the license).

        If the user of this function accepts the license, then they should
        call get_protected_file."""

        self.file_name = file_name

        # Get the license details. If this returns None, the file isn't
        # license protected.
        license_details = self._get_license(url)

        if license_details:
            return license_details

        # _get_license only fetched the headers, so we still have to
        # download the file itself before returning its body.
        self._get(url)
        return self.body

    def get(self, url, file_name=None):
        """Fetch the requested URL, accepting licenses, returns file body

        Fetches the file at url. If a redirect is encountered, it is
        expected to be to a license that has an accept link. Follow that link,
        then download the original file.

        """

        self.file_name = file_name
        license_details = self._get_license(url)

        if license_details:
            # Found a license. Accept the license without looking at it and
            # start fetching the file we originally wanted.
            accept_url = license_details[1]
            self.get_protected_file(accept_url, url)

        else:
            # If we got here, there wasn't a license protecting the file
            # so we just fetch it.
            self._get(url)

        return self.body

    def _get_license(self, url):
        """Return (license, accept URL) if found, else return None"""

        self.get_headers(url)

        if "Location" in self.header and self.header["Location"] != url:
            # We have been redirected to a new location - the license file
            location = self.header["Location"]

            # Fetch the license HTML
            self._get(location)

            # Get the name of the license page, without the rest of its path
            page_name = os.path.split(urlparse.urlparse(location).path)[-1]

            # Look for a link with -accepted.html in the page name. Follow it.
            for line in self.body.splitlines():
                link_search = re.search(r"""href=.*?["'](.*?-accepted\.html)""",
                                        line)
                if link_search:
                    # Have found license accept URL!
                    new_file = link_search.group(1)

                    # Use plain string replacement here: the page name may
                    # contain regex metacharacters such as ".".
                    accept_url = location.replace(page_name, new_file)

                    # Parse the HTML using BeautifulSoup
                    soup = BeautifulSoup(self.body)

                    # The license is in a div with the ID license-text, so we
                    # use this to pull just the license out of the HTML.
                    html_license = u""
                    for chunk in soup.findAll(id="license-text"):
                        # Output of chunk.prettify is UTF8, but comes back
                        # as a str, so convert it here.
                        html_license += chunk.prettify().decode("utf-8")

                    text_license = html2text.html2text(html_license)

                    return text_license, accept_url

        return None

    def get_protected_file(self, accept_url, url):
        """Gets the file redirected to by the accept_url"""

        self._get(accept_url)  # Accept the license

        if not("Location" in self.header and self.header["Location"] == url):
            # If we got here, we don't have the file yet (weren't redirected
            # to it). Fetch our target file. This should work now that we have
            # the right cookie.
            self._get(url)  # Download the target file

        return self.body

    def _write_body(self, buf):
        """Used by curl as a sink for body content"""

        # If we have a target file to write to, write to it
        if self.file_out:
            self.file_out.write(buf)

        # Only buffer the first 1MB of the body. This should be plenty for
        # anything we wish to parse internally.
        if len(self.body) < 1024 * 1024:
            self.body += buf

    def _write_header(self, buf):
        """Used by curl as a sink for header content"""
        self.header += buf

    def register_progress_callback(self, callback):
        """Ask cURL to report download progress to callback.

        callback is called as callback(download_total, downloaded,
        upload_total, uploaded), per pycurl's PROGRESSFUNCTION.
        """
        self.curl.setopt(pycurl.NOPROGRESS, 0)
        self.curl.setopt(pycurl.PROGRESSFUNCTION, callback)

    def close(self):
        """Wrapper to close curl - this will allow curl to write out cookies"""
        self.curl.close()
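

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module). It puts the
# otherwise-unused argparse import to work: show the license (if any), ask
# before accepting it, and report progress while downloading. The command
# line interface and the progress format below are illustrative assumptions,
# not an established CLI for this script.

def _print_progress(download_total, downloaded, upload_total, uploaded):
    """Example callback matching pycurl's PROGRESSFUNCTION signature."""
    if download_total:
        print "\rDownloaded %d%%" % (100 * downloaded / download_total),


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Download a file that may sit behind a license page.")
    parser.add_argument("url", help="URL of the file to fetch")
    parser.add_argument("-o", "--out-file", default=None,
                        help="write the download to this file")
    args = parser.parse_args()

    fetcher = LicenseProtectedFileFetcher()
    fetcher.register_progress_callback(_print_progress)

    # Unlike fetcher.get, this shows the license and lets the user decide,
    # returning (license text, accept URL) when the file is protected.
    result = fetcher.get_or_return_license(args.url, file_name=args.out_file)

    if isinstance(result, tuple):
        text_license, accept_url = result
        print text_license
        if raw_input("Accept the license? [y/N] ").lower() == "y":
            fetcher.get_protected_file(accept_url, args.url)

    # Close so cURL writes the license-accept cookie to cookies.txt for
    # later downloads.
    fetcher.close()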