I've written some code that downloads a web page and extracts all the links from that page into a list. Then I want to make a second pass over the list and repeat the process for each of the links found on the first page. But for some reason I end up with an infinite loop. I've added a bunch of debugging code and figured out what is happening, but I don't know why.
In the block of code below, the for loop (for eachlink in msulinks:) walks through all of the links that were found on the first page. The script runs some tests against each link to decide whether we want to parse it, and then the htmlparser.feed(data.read()) call inside the loop actually parses that page. At that point, for some reason, the list called msulinks gets longer... but I don't know why. The only place msulinks is assigned is the msulinks = htmlparser.get_links() line, and that is outside the for loop. Since the list keeps getting longer, the program ends up running forever, until it runs into some other problem that I'm not worried about right now. Here is the code I'm talking about (a stripped-down sketch of the symptom follows the script):
import sys
import MSULinkExtractor
import urllib2
import formatter
import socket
import htmllib
# This is a global setting that will set the timeout value for our
# web requests. Reference: http://www.voidspace.org.uk/python/articles/urllib2.shtml
socket.setdefaulttimeout(5)
# Here is a list of extensions that we don't want to parse with our
# htmlparser
dontparse = ['pdf','xls']
format = formatter.NullFormatter()
htmlparser = MSULinkExtractor.MSULinkExtractor(format)
# Here is the first pass
print 'Opening http://www.mnsu.edu/staff'
try:
    data = urllib2.urlopen('http://www.mnsu.edu/staff')
except urllib2.URLError:
    print 'I couldn\'t open the first page.'
    sys.exit(1)
htmlparser.feed(data.read())
msulinks = htmlparser.get_links()
print "### The first pass gave us",str(len(msulinks)),"to check."
for eachlink in msulinks:
    print 'Entering the for loop.'
    print '\tOpening ' + eachlink
    print '\tmsulinks is', str(len(msulinks))
    try:
        data = urllib2.urlopen(eachlink)
        print '\tAfter urllib2.urlopen msulinks is', str(len(msulinks))
    except urllib2.URLError:
        continue
    except urllib2.InvalidURL:
        htmlparser.del_link(eachlink)
        continue
    # There are a few file types that we don't want to parse with our htmlparser.
    # If we find them, we will keep them in the list and move on.
    if eachlink[-3:] in dontparse:
        print '\tWill not parse that link. Length of msulinks is', str(len(msulinks))
        continue
    try:
        print '\tSending data to htmlparser'
        print '\tLength of msulinks is', str(len(msulinks))
        htmlparser.feed(data.read())
        print '\tThe data has been read. Length of msulinks is', str(len(msulinks))
    except htmllib.HTMLParseError:
        sys.stderr.write('Couldn\'t parse ' + eachlink + '\n')
        htmlparser.del_link(eachlink)
        continue
    print " Length is", str(len(msulinks))
    raw_input("Press any key to continue")
# links = htmlparser.get_links()
# for eachlink in links:
# print eachlink
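Just to show the symptom by itself, here is a stripped-down sketch that has nothing to do with the real parser: if something keeps appending to a list while a for loop is walking over it, the loop never reaches the end, which is exactly the behaviour I'm seeing with msulinks.

# A toy example of the symptom only -- not the real crawler code.
# Appending to a list while a for loop is iterating over it means
# the end of the loop keeps moving away and the loop never finishes.
growing = ['a']
for item in growing:
    print 'length of growing is now', str(len(growing))
    growing.append(item + 'x')   # the list grows as fast as the loop advances
    if len(growing) > 10:        # safety valve so this sketch doesn't loop forever
        break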
Here is the code for the MSULinkExtractor class that is referenced above.
import sys
import urllib2
import htmllib
import formatter
mydomain = "mnsu.edu"
class MSULinkExtractor(htmllib.HTMLParser):

    def __init__(self, formatter):
        htmllib.HTMLParser.__init__(self, formatter)
        self.links = []

    def start_a(self, attrs):
        # Called by htmllib for every <a> tag; attrs is a list of
        # (attribute, value) tuples.
        if len(attrs) > 0:
            for attr in attrs:
                if attr[0] == "href" and attr[1].find(mydomain) > -1:
                    self.add_link(attr[1])

    def get_links(self):
        print '\tMSULinkExtractor.get_links has been called'
        return self.links

    def del_link(self, link):
        # Drop a link from the list.
        self.links.pop(self.links.index(link))

    def add_link(self, link):
        # Remove any mailto links
        if link.find("mailto") == 0:
            return 1
        # Remove any javascript links
        if link.find("javascript") == 0:
            return 1
        # Remove the smartedit links, just because I thought of it
        if link.find("http://www.mnsu.edu/smartedit/edit.php") == 0:
            return 1
        # If there is a link that goes to the same page, then the leading
        # http://www.whatever.com may be missing. We will put that back in place.
        if link.find("/") == 0:
            link = "http://www.mnsu.edu" + link
        # I want to weed out links that leave the mnsu.edu domain. I don't want to
        # spider the whole internet.
        if link.find(mydomain) == -1:
            return 1
        # We don't want to have duplicate links, so this will check if the link
        # already exists, and only add it if it doesn't.
        try:
            self.links.index(link)
        except ValueError:
            self.links.append(link)
if __name__ == "__main__":
    format = formatter.NullFormatter()
    htmlparser = MSULinkExtractor(format)
    data = urllib2.urlopen("http://www.mnsu.edu/staff")
    htmlparser.feed(data.read())
    links = htmlparser.get_links()
    for eachlink in links:
        print eachlink