Hey guys,
import sgmllib
class MyParser(sgmllib.SGMLParser):
def parse(self, s):
"Parse the given string 's'."
self.feed(s)
self.close()
def __init__(self, verbose=0):
sgmllib.SGMLParser.__init__(self, verbose)
self.hyperlinks = []
def start_a(self, attributes):
for name, value in attributes:
if name == "href":
self.hyperlinks.append(value)
self.newhyperlinks.append(name)
def get_hyperlinks(self):
return self.hyperlinks
import urllib, sgmllib
f = urllib.urlopen("http://www.python.org")
s = f.read()
myparser = MyParser()
myparser.parse(s)
print myparser.get_hyperlinks()
I'm having some trouble with my code here. What I've done so far is write an HTML parser that gives me all the links on a page. What I am trying to do now is search the array for a valid link, e.g. http://www.google.com, and then repeat the same process on that link (in a while loop, I assume?). Do you guys have any suggestions on how I should approach this?