On line 72 of the code I call findAll to retrieve every 'a' tag that has the 'horariosCarteleraUnderline' class and whose href URL contains ?ic=[code]&,
where code is a common code used to identify the movie's start time.
It should retrieve all of the movie times, but it misses some of them.
This is the full code:
from urllib2 import urlopen
from BeautifulSoup import BeautifulSoup
import re
class cuapi(object):
    '''Small scraper for the Cinepolis listings pages.

    Fetches a page, parses it with BeautifulSoup and exposes helpers that
    pull the cities, the complexes ("complejos") and the showtimes
    ("horarios") out of the parsed markup. The most recently parsed page is
    kept on the instance so that repeated requests for the same URL do not
    hit the network again.
    '''

    def __init__(self):
        # Landing page; getCities reads the city drop-down from it.
        self.url = "http://cinepolis.com.mx/index.aspx"
        # Listings page; a city code is appended to this prefix.
        self.urlCartelera = "http://www.cinepolis.com/_CARTELERA/cartelera.aspx?ic="
        # HTML id of the <select> element that lists the cities.
        self.citiesid = "ctl00_ddlCiudad"
        # Last parsed page (0 until the first successful fetch) and its URL.
        self.soup = 0
        self.currentUrl = ''

    def cureHTML(self):
        '''Return the replacement text used to repair broken target attributes.'''
        return 'target="_blank"'

    def getSoup(self, url):
        '''Return the parsed page for url, fetching it only when url changed.

        The page is downloaded and parsed when url differs from the URL of
        the cached soup; otherwise the cached soup is returned as is.
        Whatever urlopen or the parser raise propagates to the caller, and
        in that case the previous cache is left untouched.
        '''
        if url != self.currentUrl:
            print("new url %s" % url)
            page = urlopen(url)
            try:
                html = page.read()
            finally:
                # Release the connection even when the read fails.
                page.close()
            # Rewrites every occurrence of target"_blank" (no equals sign)
            # to the well-formed attribute before the markup is parsed.
            myMassage = [(re.compile(r'target\"_blank\"'), self.cureHTML())]
            soup = BeautifulSoup(html,
                                 convertEntities=BeautifulSoup.HTML_ENTITIES,
                                 markupMassage=myMassage)
            # Update the cache only after a successful fetch and parse, so
            # a failure cannot leave a stale soup filed under the new URL.
            self.soup = soup
            self.currentUrl = url
        return self.soup

    def getCities(self):
        '''Return a dictionary mapping each city name to its option value.

        The names are also printed in sorted order. Raises IndexError when
        the page has no <select> element with the expected id.
        '''
        soup = self.getSoup(self.url)
        citiesCombo = soup.findAll('select', {'id': self.citiesid})[0]
        citiesDict = {}
        for city in citiesCombo.findAll("option"):
            name = city.contents[0]
            # NOTE(review): takes the value of the FIRST attribute of the
            # <option>; presumably that is "value" - confirm against the page.
            citiesDict[name] = city.attrs[0][1]
        for city in sorted(citiesDict):
            print(city)
        return citiesDict

    def getComplejos(self, cityCode='43'):
        '''Return a dictionary mapping each complex name to a running index.

        cityCode is appended to the listings URL. The index is the position
        of the complex among the matching elements of the page, starting
        at 0. The dictionary is printed before it is returned.
        '''
        soup = self.getSoup(self.urlCartelera + str(cityCode))
        complejos = soup.findAll('span', {'class': 'TitulosBlanco'})
        compDict = {}
        for n, comp in enumerate(complejos):
            compDict[comp.contents[0]] = n
        print(compDict)
        return compDict

    def getHorarios(self, cityCode='43', compCode='152'):
        '''Return a dictionary mapping each movie name to its showtimes.

        cityCode selects the listings page. compCode is matched literally
        inside the href of every showtime link ("?ic=<compCode>&"). Movies
        without a matching link are left out. Each collected entry is
        printed as it is found.
        '''
        soup = self.getSoup(self.urlCartelera + str(cityCode))
        movies = soup.findAll('a', {'class': 'peliculaCartelera'})
        # Built once for every movie; re.escape keeps a code that contains
        # regex metacharacters from changing the pattern.
        regex = re.compile(r"\?ic=%s&" % re.escape(str(compCode)))
        moviesDict = {}
        for movie in movies:
            name = movie.contents[0]
            # The showtimes are looked up three ancestors above the title link.
            papi = movie.findParent().findParent().findParent()
            # NOTE(review): only <a> tags carrying this class AND a matching
            # href are collected; a showtime rendered any other way (plain
            # text, another class, no link) is not picked up here. Compare
            # with the raw HTML when entries appear to be missing.
            hor = papi.findAll('a', {'class': 'horariosCarteleraUnderline',
                                     'href': regex})
            if hor:
                moviesDict[name] = [h.string for h in hor]
                print("%s %s" % (name, moviesDict[name]))
        return moviesDict
# Run the scraper only when the file is executed as a script, so that
# importing the module does not trigger a network request.
if __name__ == "__main__":
    c = cuapi()
    c.getHorarios()
The first movie is OK, but the second one, which only returns 4 times, should return 5. It tends to skip the first one or two showtimes.
Output:
new url [url]http://www.cinepolis.com/_CARTELERA/cartelera.aspx?ic=43[/url]
¿Qué Pasó Ayer? 2 Dob [u'5:45pm', u'7:50pm', u'9:55pm']
<---- the entry below should have 5 times ---->
El Defensor [u'3:00pm', u'5:25pm', u'8:00pm', u'10:30pm']
Kung Fu Panda 2 Dig 3D Dob [u'4:00pm', u'6:10pm', u'8:20pm', u'10:25pm']
Kung Fu Panda 2 Dob [u'3:10pm', u'5:10pm', u'7:20pm', u'9:20pm']
La Chica de la Capa Roja [u'4:50pm', u'6:55pm']
La Noche del Demonio [u'3:50pm', u'6:00pm', u'8:10pm', u'10:20pm']
Nunca Me Abandones [u'4:10pm', u'6:20pm', u'8:30pm', u'10:40pm']
Piratas del Caribe 4 Dob [u'2:45pm']
X-Men: Primera Generación Dob [u'4:30pm', u'7:20pm', u'10:05pm']
X-Men: Primera Generación Sub [u'9:00pm']