#!/usr/bin/python
"""Run linkchecker and re-verify the URLs it reported as ConnectionError.

The script forwards its command-line arguments to ``linkchecker``, forcing
the ``text`` file output, then reads ``./linkchecker-out.txt``.  Every report
entry whose result is a ``ConnectionError`` is requested again with
``requests``:

* HTTP 200: the entry is removed from the final report;
* any other status: the entry is kept, with the ConnectionError text
  replaced by the HTTP status code and reason;
* the request itself fails: the entry is kept unchanged.

The rewritten report is saved to ``OUTPUT_FILE``.
"""
import re
import subprocess
import sys

OUTPUT_FILE = "./link_check_result.txt"
# File read back after linkchecker has run with ``-F text``.
LINKCHECKER_REPORT = "./linkchecker-out.txt"
# Upper bound, in seconds, for each re-check request so that one unresponsive
# host cannot block the script forever.
REQUEST_TIMEOUT = 30

# One report entry whose result is a ConnectionError.  The named groups
# capture the URL to re-request and the error text that may be replaced.
PATTERN = (
    '^URL.*$\n'
    '(^Name.*$\n)?'
    '(^Parent URL.*$\n)?'
    '^Real URL\\s+(?P<real_url>.*)$\n'
    '^Check time\\s+.*$\n'
    '(^Size.*$\n)?'
    '^Result\\s+Error:\\s+(?P<reason>ConnectionError:.*)$\n'
    '^$\n'
)
pattern = re.compile(PATTERN, re.MULTILINE)


def build_command(args):
    """Return the linkchecker command line for the wrapper's arguments.

    ``args`` has the shape of ``sys.argv``: element 0 is replaced by
    ``linkchecker`` and the remaining elements are forwarded.  The value
    following ``-F`` is forced to ``text``; when ``-F`` is absent, ``-F text``
    is inserted in front of the last argument.  ``args`` is not modified.
    """
    command = ['linkchecker'] + list(args[1:])
    if '-F' in command:
        value_index = command.index('-F') + 1
        if value_index < len(command):
            command[value_index] = 'text'
        else:
            # "-F" was the last argument: supply the missing value.
            command.append('text')
    else:
        # Keep the option in front of the last argument; with no forwarded
        # arguments at all it has to go after the program name.
        insert_at = len(command) - 1 if len(command) > 1 else 1
        command[insert_at:insert_at] = ['-F', 'text']
    return command


def _probe_outcome(url, probe):
    """Re-check one URL and return ``(resolved, new_reason)``.

    ``resolved`` is False when ``probe`` raised, i.e. the URL is still
    unreachable.  ``new_reason`` is None when the URL answered 200, otherwise
    the "<status> <reason>" text that replaces the ConnectionError message.
    """
    print("Checking %s" % url)
    try:
        status_code, reason = probe(url)
    except Exception as exc:
        # Best effort: any failure just means the original entry stays.
        print("Recheck failed: %s" % exc)
        return False, None
    print("Req status code: %d" % status_code)
    if status_code == 200:
        return True, None
    print("Got Error: %d %s" % (status_code, reason))
    return True, "%d %s" % (status_code, reason)


def recheck_report(report, probe):
    """Rewrite the ConnectionError entries of a linkchecker text report.

    ``probe`` is called once per distinct URL with the URL as its only
    argument; it returns ``(status_code, reason)`` or raises when the request
    fails.  Repeated entries for the same URL reuse the first outcome, so they
    are neither dropped from the report nor counted as fixed by accident.

    Returns ``(new_report, fixed_count)`` where ``fixed_count`` is the number
    of entries that were removed or rewritten.
    """
    chunks = []
    outcomes = {}
    fixed = 0
    cursor = 0
    for match in pattern.finditer(report):
        # Text between two matching entries is copied through untouched.
        chunks.append(report[cursor:match.start()])
        cursor = match.end()
        entry = match.group(0)
        url = match.group('real_url')
        if url not in outcomes:
            outcomes[url] = _probe_outcome(url, probe)
        resolved, new_reason = outcomes[url]
        if not resolved:
            chunks.append(entry)
            continue
        fixed += 1
        if new_reason is not None:
            chunks.append(entry.replace(match.group('reason'), new_reason))
    chunks.append(report[cursor:])
    return ''.join(chunks), fixed


def main(args):
    """Run linkchecker, post-process its report and return the exit status.

    Returns 1 when linkchecker exits with status 2, 0 when only help was
    requested, and linkchecker's own exit status otherwise.
    """
    # Imported here rather than at file level so the helpers above can be
    # imported without the third-party package; it is still the first thing
    # that happens, so a missing dependency fails before linkchecker runs.
    import requests

    command = build_command(args)
    ret = subprocess.call(command)
    if ret == 2:
        return 1
    if '-h' in command or '--help' in command:
        # Only usage information was printed; there is no report to process.
        return 0

    with open(LINKCHECKER_REPORT, 'r') as ifile:
        report = ifile.read()

    def probe(url):
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
        return response.status_code, response.reason

    new_report, fixed = recheck_report(report, probe)

    with open(OUTPUT_FILE, 'w') as ofile:
        ofile.write(new_report)

    print("--------------------------")
    print("Fix %d ConnectionError\nFinal report: %s" % (fixed, OUTPUT_FILE))
    return ret


if __name__ == '__main__':
    sys.exit(main(sys.argv))