#!/usr/bin/env python
# Parse html tables from a given URL and output the rows as
# pipe-delimited text.
# Note: To install a missing python module foo do "easy_install foo"
# (or the new way is "pip install foo" but you might have to do
# "easy_install pip" first)

from BeautifulSoup import BeautifulSoup
import urllib2
import re
import sys
import unicodedata
import htmlentitydefs   # needed by asciify2() for named-entity lookup
#from time import sleep

# from http://stackoverflow.com/questions/1197981/convert-html-entities
def asciify2(s):
    # numeric entities like &#8212; -> the corresponding unicode character
    matches = re.findall(r"&#\d+;", s)
    if len(matches) > 0:
        hits = set(matches)
        for hit in hits:
            name = hit[2:-1]
            try:
                entnum = int(name)
                s = s.replace(hit, unichr(entnum))
            except ValueError:
                pass
    # named entities like &nbsp; -- stripped rather than converted
    matches = re.findall(r"&\w+;", s)
    hits = set(matches)
    amp = "&amp;"
    if amp in hits:
        hits.remove(amp)   # handle &amp; last, after all other entities
    for hit in hits:
        name = hit[1:-1]
        if name in htmlentitydefs.name2codepoint:
            #s = s.replace(hit, unichr(htmlentitydefs.name2codepoint[name]))
            s = s.replace(hit, "")
    s = s.replace(amp, "&")
    return s

def opensoup(url):
    request = urllib2.Request(url)
    request.add_header("User-Agent", "Mozilla/5.0")
    # To mimic a real browser's user-agent string more exactly, if necessary:
    # Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.14)
    # Gecko/20080418 Ubuntu/7.10 (gutsy) Firefox/2.0.0.14
    pagefile = urllib2.urlopen(request)
    soup = BeautifulSoup(pagefile)
    pagefile.close()
    return soup

def asciify(s):
    #print "DEBUG[", type(s), "]"
    return unicodedata.normalize('NFKD', s).encode('ascii', 'ignore')

# remove extra whitespace, including stripping leading and trailing whitespace.
def condense(s):
    # \s already matches newlines, so no re.DOTALL is needed (and passing it
    # as the 4th positional argument to re.sub would set the count, not flags)
    s = re.sub(r"\s+", " ", s)
    #$s =~ s/\s+/\ /gs;
    return s.strip()

# this gets rid of tags and condenses whitespace
def striptags(s):
    #print "DEBUG[", type(s), "]"
    # drop <span> elements along with their contents
    s = re.sub(r"\<span[^\>]*\>[^\<]*\<\/span\>", "", s)
    s = re.sub(r"\&\#160\;", " ", s)   # non-breaking space -> plain space
    return condense(re.sub(r"\<[^\>]*\>", " ", s))

if len(sys.argv) == 1:   # called with no arguments
    print "Usage: ", sys.argv[0], " url [n]"
    print " (where n indicates which html table to parse)"
    sys.exit(1)

if len(sys.argv) == 2:   # called with just a URL: list the tables
    url = sys.argv[1]
    soup = opensoup(url)
    tables = soup.findAll("table")
    print "Number of html tables: ", len(tables)
    i = 0
    for t in tables:
        i += 1
        print str(i)+": ",
        j = 0
        hdr = t.find('tr')   # assume the first row is the header
        hdrl = []
        #print "DEBUGH: ", hdr.__class__, " [", hdr, "]"
        for h in hdr.findAll(re.compile('td|th')):
            j += 1
            hdrl.append(asciify2(striptags(h.renderContents()))[0:20])
        sys.stdout.write("[%3d cols, %3d rows] " % (j, len(t.findAll('tr'))))
        print " | ".join(hdrl)
    sys.exit(0)

# called with a URL and a table number: dump that table
url = sys.argv[1]
n = int(sys.argv[2])
soup = opensoup(url)
tables = soup.findAll("table")   #, {"class":"wikitable sortable"})
table = tables[n-1]   # n is 1-based, as printed by the listing above
for r in table.findAll('tr'):
    rl = []
    for c in r.findAll(re.compile('td|th')):
        rl.append(striptags(c.renderContents()))
    print " | ".join(rl)
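
# Example invocations, sketched from the argument handling above (the script
# name "parsetable.py" and the URL are hypothetical; any page containing html
# tables should behave the same way):
#
#   python parsetable.py http://en.wikipedia.org/wiki/List_of_sovereign_states
#       lists every <table> on the page with a 1-based index, its column and
#       row counts, and the first 20 characters of each header cell
#
#   python parsetable.py http://en.wikipedia.org/wiki/List_of_sovereign_states 2
#       dumps table 2 as pipe-delimited rows, one table row per output line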