remove dependency on fetch_image server index
This removes the dependency on the server_index.tar.bz2 file being
up to date on releases.linaro.org/fetch_image. It makes the build
process easier: we no longer have to hope that the index was refreshed
between the time the build completed and the time this script is
launched.
diff --git a/crawler.py b/crawler.py
new file mode 100755
index 0000000..d8cdd6c
--- /dev/null
+++ b/crawler.py
@@ -0,0 +1,69 @@
+#!/usr/bin/python
+
+import re
+import urllib2
+
+def list_links(url, regex=r'<a\s*href=[\'|"](.*?)[\'"].*?>'):
+    '''Fetch a page and return what regex captures from its body.
+
+    url is opened with urllib2.urlopen and the whole response is read.
+    regex is applied with findall, so the result is a list with one entry
+    per match, in page order; with the default pattern each entry is the
+    value of an href attribute.
+
+    Errors raised by urllib2.urlopen or read() are not caught here.
+    '''
+    response = urllib2.urlopen(url)
+    try:
+        msg = response.read()
+    finally:
+        # close the connection explicitly instead of leaving it open until
+        # the response object happens to be garbage collected
+        response.close()
+    return re.compile(regex).findall(msg)
+
+def list_hwpack_builds(url, date):
+    '''Return the hwpack urls published for one build date.
+
+    The listing at url/date is scanned for numeric build directories
+    (links of the form "<number>/"). For each of them, highest build
+    number first, the listing at url/date/<number>/images/hwpack/ is
+    scanned for links matching hwpack*.tar.gz. A build contributes its
+    hwpack url only when exactly one such link is found; any other build
+    is skipped.
+
+    Returns a list of url strings, possibly empty.
+    '''
+    url = "%s/%s" % (url, date)
+    # \d+ rather than \d, so build numbers of 10 and above are seen too
+    links = list_links(url, r'<a\s*href=[\'|"](\d+)\/[\'"].*?>')
+    builds = []
+    # compare build numbers as integers: as strings "9" sorts above "10"
+    for link in sorted(links, key=int, reverse=True):
+        hwurl = "%s/%s/images/hwpack/" % (url, link)
+        hwpack = list_links(hwurl, r'<a\s*href=[\'|"](hwpack.*?\.tar\.gz)[\'"].*?>')
+        # == rather than "is": a length must be compared by value
+        if len(hwpack) == 1:
+            builds.append("%s/%s" % (hwurl, hwpack[0]))
+    return builds
+
+def latest_hwpacks(url, limit=7):
+    '''returns an array of tuples (build-date, hwpack url) like:
+    [ ('20120210', 'http://foo.bar/hwpack.tar.gz'), ('20120209', 'blah.tar.gz') ]
+
+    The listing at url is scanned for numeric directory links; the limit
+    highest-sorting names are inspected, highest first, and each one that
+    has at least one hwpack build contributes a tuple with the first url
+    returned by list_hwpack_builds. Directories without a usable build are
+    left out, so fewer than limit tuples may come back.
+    '''
+    links = list_links(url, r'<a\s*href=[\'|"](\d+)\/[\'"].*?>')
+    hwpacks = []
+    # only analyze the last few builds. Sort before slicing: slicing the
+    # raw listing first would keep the *oldest* entries whenever the page
+    # lists directories in ascending order.
+    # NOTE(review): names are compared as strings, which is chronological
+    # for equal-length dates such as YYYYMMDD - confirm the directory naming.
+    for link in sorted(links, reverse=True)[:limit]:
+        builds = list_hwpack_builds(url, link)
+        if builds:
+            hwpacks.append((link, builds[0]))
+    return hwpacks
+
+def list_rfs_builds(url, date):
+    '''Return the root filesystem tarball urls published for one build date.
+
+    The listing at url/date is scanned for numeric build directories
+    (links of the form "<number>/"). For each of them, highest build
+    number first, the listing at url/date/<number>/images/tar/ is scanned
+    for links ending in .tar.gz. A build contributes its tarball url only
+    when exactly one such link is found; any other build is skipped.
+
+    Returns a list of url strings, possibly empty.
+    '''
+    url = "%s/%s" % (url, date)
+    # \d+ rather than \d, so build numbers of 10 and above are seen too
+    links = list_links(url, r'<a\s*href=[\'|"](\d+)\/[\'"].*?>')
+    builds = []
+    # compare build numbers as integers: as strings "9" sorts above "10"
+    for link in sorted(links, key=int, reverse=True):
+        tarurl = "%s/%s/images/tar/" % (url, link)
+        tarballs = list_links(tarurl, r'<a\s*href=[\'|"](.*?\.tar\.gz)[\'"].*?>')
+        # == rather than "is": a length must be compared by value
+        if len(tarballs) == 1:
+            builds.append("%s/%s" % (tarurl, tarballs[0]))
+    return builds
+
+def latest_rfs(url, limit=7):
+    '''
+    Returns a tuple of (builddate, url) for the newest root filesystem
+    build found, or None when none of the inspected dates has one.
+
+    The listing at url is scanned for numeric directory links; the limit
+    highest-sorting names are tried, highest first, and the first one for
+    which list_rfs_builds returns something wins.
+    '''
+    links = list_links(url, r'<a\s*href=[\'|"](\d+)\/[\'"].*?>')
+    # only analyze the last few builds. Sort before slicing: slicing the
+    # raw listing first would keep the *oldest* entries whenever the page
+    # lists directories in ascending order.
+    # NOTE(review): names are compared as strings, which is chronological
+    # for equal-length dates such as YYYYMMDD - confirm the directory naming.
+    for link in sorted(links, reverse=True)[:limit]:
+        builds = list_rfs_builds(url, link)
+        if builds:
+            return (link, builds[0])
+
+    return None
+
+if __name__ == '__main__':
+    # Manual check: print the recent hwpacks for every base url given on the
+    # command line, then the latest linaro-o-nano root filesystem.
+    import sys
+    for arg in sys.argv[1:]:
+        print "HWPACKS for: %s" % arg
+        hwpacks = latest_hwpacks(arg, 4)
+        for hwpack in hwpacks:
+            print " %s: %s" % hwpack
+
+    print "latest nano:"
+    nano = latest_rfs('http://snapshots.linaro.org/oneiric/linaro-o-nano')
+    # latest_rfs returns None when no build is found; feeding None to the
+    # two-placeholder format below would raise TypeError
+    if nano is None:
+        print " no build found"
+    else:
+        print " %s %s" % nano