#!/usr/bin/env python
"""
A simple script that reads city names from standard input, one per line, and
prints each name followed by its population according to Wikipedia.
"""

import requests
import pyquery
import re
import urlparse
from itertools import *

def debug(string):
    """
    Debugging hook, silent by default: *string* is discarded and None is
    returned. Uncomment the print statement below to trace what the script
    is doing.
    """
    #print string
    return None

def get_population(name, url = None, country = None, disambiguate = True):
    """
    Get population of city named *name* from Wikipedia. You can specify *url*
    of the exact page, or leave it *None* for the function to determine. You
    get a higher probability of correct answer if you specify the *country*
    too. Finally you can control if the function should try to *disambiguate*.

    The *country* is matched case-insensitively. Returns the population as an
    int, or None when it could not be determined. Raises AssertionError when
    the page cannot be fetched.
    """
    if not url:
        url = "http://en.wikipedia.org/wiki/%s" % name
    debug('Trying %s' % url)

    req = requests.get(url)
    # Raised explicitly instead of through an ``assert`` statement, which is
    # stripped when Python runs with -O. The exception type is unchanged.
    if not req.ok:
        raise AssertionError(
            "Fetching failed, url %s, code %d" % (url, req.status_code))

    # The markup is lower-cased as a whole so that all later text matching is
    # case-insensitive.
    # NOTE(review): this also lower-cases the link targets that _disambiguate
    # follows -- confirm that those URLs still resolve.
    query = pyquery.PyQuery(req.text.lower())

    if country:
        # Bring the country to the same case as the page, otherwise a
        # capitalised name such as "France" could never match.
        country = country.lower()
        correct = _correct_country(query, country)
        debug('Correct country: %s' % correct)
        if not correct:
            if disambiguate:
                return _disambiguate(name, country, query)
            return None

    return _population(query)


def _correct_country(query, country):
    """
    Tell whether the page in *query* belongs to *country*.

    The 'country' label is looked up with _get_label and *country* is searched
    for, case-insensitively, in the text of the label's parent element.
    Returns False when the page has no such label.
    """
    label = _get_label(query, 'country')
    if not label:
        debug('Not a geography page')
        return False
    countrytr = label.parent()
    # get_population lower-cases the page before parsing it, so the country
    # has to be lower-cased as well for the substring test to succeed.
    # ``or ''`` guards against text() yielding None for an empty selection.
    return country.lower() in (countrytr.text() or '')

def _disambiguate(name, country, query):
    """
    Try the links of a page that did not match *country* (for example a
    disambiguation page) and return the first population found.

    List items are tried from the most to the least specific: those
    mentioning both a settlement word and the country, then the country
    alone, then any city or town. For each item the link in its first child
    element is followed with disambiguation turned off, so the search is only
    one level deep. Returns None when no candidate yields a population.
    """
    # The page text was lower-cased by get_population, so the country has to
    # be lower-cased too for the :contains() selectors to match.
    needle = country.lower()
    # The selectors below overlap; remember what was fetched so that no page
    # is downloaded twice.
    seen = set()
    for li in chain(
            query('li:contains("city"):contains("%s")' % needle),
            query('li:contains("town"):contains("%s")' % needle),
            query('li:contains("%s")' % needle),
            query('li:contains("city")'),
            query('li:contains("town")'),
            ): # Limited by imagination... And common sense ;)
        try:
            a = li.getchildren()[0]
            href = a.get('href')
            if not href:
                # First child is not a link, nothing to follow.
                continue
            url = urlparse.urljoin('http://en.wikipedia.org/', href)
            if url in seen:
                continue
            seen.add(url)
            res = get_population(name, url, country, disambiguate = False)
            if res:
                return res
        except Exception:
            # Best effort: a broken candidate must not stop the search, but
            # KeyboardInterrupt and SystemExit are allowed to propagate.
            debug('Error while disambiguating')
    return None

# Substrings that _population looks for, in this order, in the rows following
# the population header when the header has no value cell right next to it.
# NOTE(review): 'adasdasdasd' looks like a leftover placeholder that is not
# expected to match any row -- confirm and remove.
LABELS = ['adasdasdasd', 'metro', 'urban', 'agglo', 'city']
# Number of rows after the population header that _population examines.
DEPTH = 9

def _population(query):
    """
    Extract the population from the geography table of the page in *query*.

    If the 'population' label has an element right next to it, the number is
    parsed from that element. Otherwise the next DEPTH rows are searched for
    a cell containing one of LABELS (earlier labels take precedence) and the
    number is parsed from the element following that cell.

    Returns an int, or None when no label or no number could be found.
    """
    popl = _get_label(query, 'population')
    if not popl:
        debug('No population label!')
        return None

    poptd = popl.next()
    if poptd:
        return _parse_number(poptd.text())

    debug('Nothing next to population label, multiple numbers?')
    # Walk the following rows once and reuse them for every label instead of
    # traversing the table again for each label.
    rows = []
    tr = popl.parent()
    for _ in range(DEPTH):
        tr = tr.next()
        rows.append(tr)

    for label in LABELS:
        for tr in rows:
            match = tr('td:contains("%s"), th:contains("%s")' % (
                label, label))
            if match:
                return _parse_number(match.next().text())
    return None

def _parse_number(text):
    res = re.search('[0-9,]+', text)
    if not res:
        return None
    return int(res.group(0).replace(',', ''))

def _get_label(query, label):
    """
    Return the cell table containing the *label*
    """
    res = query('table[class *= "geography"] th:contains("%s")' % label)
    if not res:
        res = query('table[class *= "geography"] b:contains("%s")' %label).\
                parent()
    if not res:
        res = query('table[class *= "geography"] td:first-child' +
                ':contains("%s:")' % label)
    return res

if __name__ == '__main__':
    # Read one city name per line until standard input is exhausted and print
    # each name followed by its population (None when it is unknown).
    while True:
        try:
            name = raw_input()
        except (EOFError, KeyboardInterrupt):
            # End of input, or the user interrupted the prompt: stop quietly.
            debug('END')
            break
        debug(name)
        url = "http://en.wikipedia.org/wiki/%s" % name
        try:
            population = get_population(name, url)
        except Exception:
            # Any lookup failure is reported as an unknown population. Ctrl-C
            # is deliberately not caught here so a running lookup can be
            # interrupted.
            population = None
        # A single parenthesised argument prints the same text as
        # ``print name, population``.
        print("%s %s" % (name, population))
        debug("\n\n")