Hi, I'm working on a unofficial app to get the movie listings from a webpage http://cinepolis.com.mx. In order to get the correct movie listings, the user must select his city.
Now, using HTMLParser I was able to get the list of cities, but because some of these have non english chars like ñ or á é í ó ú the HTMLParser just avoid them an returns the word Cancún as 'Canc' and 'n', when what it should do is give the 'Cancún' as a single word.
It wouldn't matter if it returns me the word in pieces, I could just join them, but it is avoiding chars and never giving them.
Here is the code:
'''
Created on Jun 14, 2011
@author: augusto
'''
from HTMLParser import HTMLParser
from urllib2 import urlopen
class Spider(HTMLParser):
def __init__(self, url):
self.this_is_the_tag = False
self.end_of_city = False
self.this_city = ""
self.cities = []
HTMLParser.__init__(self)
req = urlopen(url)
self.feed(req.read())
def checkAttr(self, dic, attr, value):
for pair in dic:
if pair[0] == attr and pair[1] == value:
return True
return False
def handle_starttag(self, tag, attrs):
if tag == 'select' and self.checkAttr(attrs, 'id', 'ctl00_ddlCiudad'):
print "Found div => "
print self.get_starttag_text()
self.this_is_the_tag = True
if self.this_is_the_tag and tag == 'option':
print "Found option value = ", attrs[-1][1]
def handle_endtag(self, tag):
if tag == 'select' and self.this_is_the_tag:
print "End of div => "
self.this_is_the_tag = False
print self.cities
if tag == 'option':
self.end_of_city = True
def handle_data(self, data):
if self.this_is_the_tag and not self.end_of_city:
print self.get_starttag_text()
print "-%s-" % data
self.this_city += data
elif self.end_of_city:
self.cities.append(self.this_city)
self.this_city = ""
self.end_of_city = False
Spider('http://cinepolis.com.mx/index.aspx')
This is the output i get (Cancn should be Cancún, Cd. Cuauhtmoc should be Cd. Cuauhtémoc and so on) :
Cancn[/B]', 'Cd. Acua', 'Cd. Cuauhtmoc ', 'Cd. Jurez', 'Cd. Obregn', 'Cd. Victoria', 'Celaya', 'Chetumal', 'Chihuahua', 'Chilpancingo', 'Coatzacoalcos', 'Colima', 'Comitn', 'Cozumel', 'Cuautla', 'Cuernavaca', 'Culiacn', 'D.F. y A.M. (Centro)', 'D.F. y A.M. (Norte)', 'D.F. y A.M. (Oriente)', 'D.F. y A.M. (Poniente)', 'D.F. y A.M. (Sur)', 'Durango', 'Ensenada', 'Guadalajara', 'Hermosillo', 'Hidalgo del Parral', 'Iguala', 'Irapuato', 'La Paz', 'Len', 'Manzanillo', 'Matamoros', 'Mrida', 'Mexicali', 'Minatitln', 'Monterrey', 'Morelia', 'Nogales', 'Nuevo Laredo', 'Oaxaca', 'Orizaba', 'Pachuca', 'Playa del Carmen', 'Puebla', 'Puerto Vallarta', 'Quertaro', 'Reynosa', 'Rosarito', 'Salamanca', 'Saltillo', 'San Cristbal de las C', 'San Jos del Cabo', 'San L Ro Colorado', 'San Luis Potos', 'Tampico', 'Tapachula', 'Taxco', 'Tecate', 'Tehuacn', 'Tepeji del Ro', 'Tijuana', 'Tlaxcala', 'Toluca', 'Torren', 'Tuxpan', 'Tuxtla Gutirrez', 'Uriangato', 'Uruapan', 'Veracruz', 'Villahermosa', 'Xalapa', 'Zamora', 'Ciudad']