""" A website link checker and validator BUGS: - Will do a whole validation for a page for every anchor referenced, eg: will check edit.php, edit.php#local, edit.php#foo - The python sgml parser seems to choke on some pages, and not provide very good error message. It is recommended that on these pages you try and validate using w3c which will often fix the problem. - Currently uses no proxy for checking base urls, and generic proxy setting for all others. This is a very CSE specific thing """ import sys import urllib2, sgmllib import htmllib, formatter import sets import urlparse import popen2 import optparse SUCCESS=0 NOT_FOUND=1 PARSE_ERROR=2 accepted_protocols = ["http", "ftp"] def relative_url(url, cur_url): protocol = urlparse.urlparse(url)[0] if protocol == "": return urlparse.urljoin(cur_url, url) elif protocol in accepted_protocols: return url else: return None class MyParser(htmllib.HTMLParser): def __init__(self, formatter, base): htmllib.HTMLParser.__init__(self, formatter) self.base = base self.links = sets.Set() def start_a(self, attrs): for attr in attrs: if attr[0] == "href": new_url = relative_url(attr[1], self.base) if not new_url is None: self.links.add(new_url) class UrlChecker: def __init__(self, url, validate=True, verbose=False, w3c_url=False, base_proxy=True): self.validate = validate self.verbose = verbose self.w3c_url = w3c_url self.base = url self.checked_pages = {} self.pending_urls = sets.Set() self.pending_urls.add(url) self.urlopener_others = urllib2.build_opener() if base_proxy: self.urlopener_base = self.urlopener_others else: self.urlopener_base = urllib2.build_opener(urllib2.ProxyHandler({})) while len(self.pending_urls): self.check_page(self.pending_urls.pop()) def do_report(self): for url in self.checked_pages.keys(): if url[:len(self.base)] == self.base: if self.checked_pages[url][0] == NOT_FOUND: pass # Might do something here in the future elif self.checked_pages[url][0] == PARSE_ERROR: print "%s: couldn't be parsed: %s" % (url, self.checked_pages[url][1]) else: links = len(self.checked_pages[url][1]) bads = [] for each in self.checked_pages[url][1]: if self.checked_pages[each][0] == NOT_FOUND: bads.append(each) if len(self.checked_pages[url][2]): print "%s didn't validate:\n--------------" % url if self.w3c_url: print "http://validator.w3.org/check?uri=%s" % url print self.checked_pages[url][2] if len(bads) > 0 or self.verbose: print "%s: %s/%s" % (url, len(bads), links) for bad in bads: print "\t%s" % bad def check_page(self, url): if url[:len(self.base)] == self.base: urlopen = self.urlopener_base else: urlopen = self.urlopener_others if self.verbose: print "Checking:", url try: x = urlopen.open(url) except: # FIXME: Correct error checking self.checked_pages[url] = (NOT_FOUND,) return if url[:len(self.base)] == self.base: links = [] output = "" conent = None if self.verbose: print "Downloading...", url data = x.read() if "Content-Type" in x.info(): content = x.info()["Content-Type"].split(";")[0] if content == "text/html": # Here we validate if self.validate: r, w = popen2.popen2("validate -") w.write(data) w.close() output = r.read() r.close() if self.verbose: print "Validator output:", output # Here we parse for links try: parser = MyParser(formatter.NullFormatter(), x.geturl()) parser.feed(data) parser.close() except sgmllib.SGMLParseError, x: self.checked_pages[url] = (PARSE_ERROR, x, output) return links = parser.links self.checked_pages[url] = (SUCCESS, list(links), output) while len(links): link = links.pop() if link not in self.checked_pages.keys(): if self.verbose: print "\tAdding... ", link self.pending_urls.add(link) else: self.checked_pages[url] = (SUCCESS, ) if __name__ == "__main__": parser = optparse.OptionParser() parser.add_option("-x", action="store_false", dest="validate", default=True) parser.add_option("-v", action="store_true", dest="verbose", default=False) parser.add_option("-w", action="store_true", dest="w3c_url", default=False) parser.add_option("-p", action="store_false", dest="base_proxy", default=True) (options, args) = parser.parse_args() if len(args) == 0: print "Need to provide a url" else: x = UrlChecker(args[0], options.validate, options.verbose, options.w3c_url, options.base_proxy) x.do_report()