#!/usr/bin/env python2.3
"""Scrape name-search results from whitepages.com.au.

Written for Python 2 (urllib2) and the BeautifulSoup 3 package.
"""

import re
import sys
import urllib
import urllib2
from xml.dom.minidom import parseString

from BeautifulSoup import BeautifulSoup

# Values substituted into the pstate / parea query parameters of WP_URL.
STATE = "n"
AREA = "capital"

# Number of listings per result page assumed by the paging arithmetic in wp().
RES_PER_PAGE = 20

# Search URL template.  The %s slots are filled in the order: name, street,
# state, suburb, initial, area.  wp() appends the 1-based page number to the
# end of the formatted URL.
WP_URL = "http://www.whitepages.com.au/wp/search/results.jhtml?pname=%s&pstreet=%s&pstate=%s&psuburb=%s&pinitial=%s&parea=%s&psearchType=res&_DARGS=%%2Fwp%%2Fsearch%%2Fresults.jhtml.7_A&_DAV="

# Each listing on a result page is located through a <form> posting to this
# address; the listing's data is read from that form's <input> elements.
_LISTING_FORM_ACTION = "http://www.whereis.com/mapping/geocodeAddress.do"

# (key in the result dict, name attribute of the <input> holding the value)
_LISTING_FIELDS = (
    ("name", "placeName"),
    ("phone", "phoneNumber"),
    ("locality", "locality"),
    ("state", "state"),
    ("address", "address"),
    ("stnum", "streetNumber"),
    ("stnam", "streetName"),
    ("sttype", "streetType"),
)


def _quote_optional(value):
    """Return *value* URL-quoted, or "" when it is None."""
    if value is None:
        return ""
    return urllib.quote(value)


def wp(name, initial=None, suburb=None, street=None, start=0, count=5):
    """Search whitepages.com.au and return a window of the listings found.

    The search is always made with the module-level STATE and AREA values.
    Result pages are fetched one at a time, and only the pages that overlap
    the requested window [start, start + count) are downloaded.

    Parameters:
        name    -- name to search for (required).
        initial -- optional initial; None is sent as an empty string.
        suburb  -- optional suburb; None is sent as an empty string.
        street  -- optional street; None is sent as an empty string.
        start   -- zero-based index of the first listing wanted; expected to
                   be non-negative.
        count   -- maximum number of listings to return; a negative value is
                   treated as 0.

    Returns:
        A tuple (total, results).  total is the integer parsed from the
        "of N listings" text on the first page fetched.  results is a list of
        dicts with the keys "name", "phone", "locality", "state", "address",
        "stnum", "stnam" and "sttype".  Fewer than count entries are returned
        when the fetched pages hold fewer matching forms.

    Raises:
        IndexError or AttributeError when the fetched page does not contain
        the expected "of N listings" cell (NOTE(review): possibly the case
        for a search with no matches -- not verifiable from this file), and
        whatever urllib2.urlopen raises on network failure.
    """
    url = WP_URL % (
        urllib.quote(name),
        _quote_optional(street),
        STATE,
        _quote_optional(suburb),
        _quote_optional(initial),
        AREA,
    )

    # Half-open window [start, stop) of listing indices the caller wants.
    stop = start + max(count, 0)
    first_page = start // RES_PER_PAGE
    # Always fetch at least one page so that total can be reported even when
    # count is 0; otherwise stop at the page holding listing (stop - 1) so no
    # page is downloaded only to contribute nothing.
    last_page = max(first_page, (stop - 1) // RES_PER_PAGE)

    total = None
    results = []
    for page in range(first_page, last_page + 1):
        # The site numbers its pages from 1.
        response = urllib2.urlopen(urllib2.Request(url=url + str(page + 1)))
        try:
            soup = BeautifulSoup(response)
        finally:
            response.close()

        if total is None:
            caption = soup('td', {'class': 'bold', 'colspan': '2'})[0].string.strip()
            match = re.search(r".*of (\d+) listings.*", caption)
            total = int(match.groups()[0])

        # Translate the global window into indices local to this page: the
        # first page starts part-way in, the last page stops part-way
        # through, and any page in between is taken whole.
        page_base = page * RES_PER_PAGE
        first = max(start - page_base, 0)
        last = min(stop - page_base, RES_PER_PAGE)

        forms = soup('form', {'action': _LISTING_FORM_ACTION})
        for form in forms[first:last]:
            res = {}
            for key, input_name in _LISTING_FIELDS:
                res[key] = form('input', {"name": input_name})[0]["value"]
            results.append(res)

    return total, results


if __name__ == "__main__":
    total, res = wp("Howard", "J", "North Sydney")
    # A single parenthesised argument prints identically under the Python 2
    # print statement.
    print("%d Results" % total)
    for each in res:
        print(each)