#!/usr/bin/env python """ A simple script that reads city names from standard input, one per line, and returns their populations according to Wikipedia. """ import requests import pyquery import re import urlparse from itertools import * def debug(string): #print string pass def get_population(name, url = None, country = None, disambiguate = True): """ Get population of city named *name* from Wikipedia. You can specify *url* of the exact page, or leave it *None* for the function to determine. You get a higher probability of correct answer if you specify the *country* too. Finally you can control if the function should try to *disambiguate*. """ if not url: url = "http://en.wikipedia.org/wiki/%s" % name debug('Trying %s' % url) req = requests.get(url) assert req.ok, "Fetching failed, url %s, code %d" % (url, req.status_code) query = pyquery.PyQuery(req.text.lower()) if country: correct = _correct_country(query, country) debug('Correct country: %s' % correct) if not correct: if disambiguate: return _disambiguate(name, country, query) else: return None return _population(query) def _correct_country(query, country): label = _get_label(query, 'country') if not label: debug('Not a geography page') return False countrytr = label.parent() return country in countrytr.text() def _disambiguate(name, country, query): for li in chain( query('li:contains("city"):contains("%s")' % country), query('li:contains("town"):contains("%s")' % country), query('li:contains("%s")' % country), query('li:contains("city")'), query('li:contains("town")'), ): # Limited by imagination... And common sense ;) try: a = li.getchildren()[0] url = urlparse.urljoin('http://en.wikipedia.org/', a.get('href')) res = get_population(name, url, country, disambiguate = False) if res: return res except: if debug: #import traceback #traceback.print_exc() #debug(li.getchildren()) #debug(li.text_content()) debug('Error while disambiguating') return None LABELS = ['adasdasdasd', 'metro', 'urban', 'agglo', 'city'] DEPTH = 9 def _population(query): popl = _get_label(query, 'population') if not popl: debug('No population label!') return None poptd = popl.next() if not poptd: debug('Nothing next to population label, multiple numbers?') best = -1 value = -1 ftr = popl.parent() for label in LABELS: #debug("Looking for label %s" % label) tr = ftr for _ in xrange(DEPTH): tr = tr.next() #debug(tr.text()) match = tr('td:contains("%s"), th:contains("%s")' % ( label, label)) if match: poptd = match.next() return _parse_number(poptd.text()) return None return _parse_number(poptd.text()) def _parse_number(text): res = re.search('[0-9,]+', text) if not res: return None return int(res.group(0).replace(',', '')) def _get_label(query, label): """ Return the cell table containing the *label* """ res = query('table[class *= "geography"] th:contains("%s")' % label) if not res: res = query('table[class *= "geography"] b:contains("%s")' %label).\ parent() if not res: res = query('table[class *= "geography"] td:first-child' + ':contains("%s:")' % label) return res if __name__ == '__main__': while True: try: name = raw_input() debug(name) except: debug('END') break url = "http://en.wikipedia.org/wiki/%s" % name try: population = get_population(name, url) except: population = None print name, population debug("\n\n")