Adapt regexps to the new HTML markup of snapshots.linaro.org - LP: #1153464
Why do we parse HTML instead of having an API for it?
Signed-off-by: Marcin Juszkiewicz <marcin.juszkiewicz@linaro.org>
diff --git a/crawler.py b/crawler.py
index 5d7fe89..71787c9 100755
--- a/crawler.py
+++ b/crawler.py
@@ -27,7 +27,7 @@
def list_hwpack(url):
''' returns tuple of (buildate, url)
'''
- urls = list_links(url, r'<a\s*href=[\'|"].*[\'"]>(hwpack.*?\.tar\.gz)</a>')
+ urls = list_links(url, r'<a\s*href=[\'|"].*/(.*hwpack.*?\.tar\.gz)[\'"]')
for link in urls:
try:
build_date = re.compile('_(\d+)-').findall(link)
@@ -41,7 +41,7 @@
[ (20120210, http://foo.bar/hwpack.tar.gz), (20120209, blah.tar.gz) ]
'''
# only analyze the last few builds
- links = list_links(url, r'<a\s*href=[\'|"].*[\'"]>(\d+)/?</a>')
+ links = list_links(url, r'<a\s*href=[\'"].*/(\d+)[\'"]')
links = sorted(links, reverse=True, key=int)[:limit]
hwpacks = []
for link in links:
@@ -52,7 +52,7 @@
def list_rfs(url):
links = list_links(url,
- r'<a\s*href=[\'|"].*[\'"]>(.*\-\d+\.(?!config)(?:rootfs\.)?tar\.gz)</a>')
+ r'<a\s*href=[\'|"].*/(.*(?!config)(?:rootfs\.)?tar\.gz)[\'"]')
if len(links) is 1:
return "%s/%s" %(url,links[0])
return None
@@ -62,7 +62,7 @@
Returns a tuple of (builddate, url)
'''
# only analyze the last few builds
- links = list_links(url, r'<a\s*href=[\'"].*[\'"]>(\d+)/?</a>')
+ links = list_links(url, r'<a\s*href=[\'"].*/(\d+)[\'"]')
links = sorted(links, reverse=True, key=int)[:limit]
for link in links:
build = list_rfs('%s/%s' %(url, link))