I need to write a Python program which parses webpages and returns a dictionary of unique words and their frequenies. What I came up with was
#!/usr/bin/env python
from HTMLParser import HTMLParser, HTMLParseError
import urllib
import urlparse
import sys
import os
import MySQLdb
import re
class WordHarvester:
def __init__(self):
self.db = MySQLdb.connect(host="my.host.dom", user="user", passwd="passwd",db="db")
self.dbc = self.db.cursor()
self.mhp = MyHTMLParser()
def run(self,URL,limit=10):
print "Running URL:"
print URL
print " "
try:
sock = urllib.urlopen(URL)
h = sock.read()
self.mhp.feed(h)
except:
print "Error reading",URL
i=0
ur = URL
while i<limit:
sock.close()
local_list = self.mhp.urls
self.mhp.urls=[]
for u in local_list:
try:
split_url= urlparse.urlsplit(urlparse.urljoin(ur,u))
if split_url.scheme == "http":
u = urlparse.urlunsplit((split_url.scheme,split_url.netloc,split_url.path,"",""))
else:
continue
print u
sock = urllib.urlopen(u)
h = sock.read()
self.mhp.feed(h)
except:
print "Error reading",u
i=i+1
print "Your word counts:"
for word in self.mhp.word_dictionary.keys():
print word,": ",self.mhp.word_dictionary[word]
class MyHTMLParser(HTMLParser):
def __init__(self):
HTMLParser.__init__(self)
self.urls=[]
self.word_dictionary = {}
def handle_starttag(self,tag,attrs):
if tag=='script':
self.inscript=True
if tag=='body':
self.inbody=True
if tag=='a' and attrs:
#if attrs[0][1][:4]=='http':
self.urls.append(attrs[0][1])
def handle_endtag(self, tag):
if tag=='script':
self.inscript=False
if tag=='body':
self.inbody=False
def handle_data(self,data):
l = []
m = []
n = []
for s in data:
print s
l = s.split()
for j in l:
m.append(j)
for h in m:
h = h.strip(' ,.?!-')
h = h.lower()
n.append(h)
n.sort()
for i in n:
if i.isalpha():
if i in self.word_dictionary:
self.word_dictionary[i]=self.word_dictionary[i]+1
else:
self.word_dictionary[i]=1
if __name__ == "__main__":
print "Testing Word Harvester"
harvester = WordHarvester()
url = raw_input("Starting URL: ")
harvester.run(URL=url)
This appears to read all the web pages it is supposed to, but the handle_data only adds individual letters to the dictionary, not whole words like it should. Also, after about the first ten web pages, handle_data, and presumably the other handle_* methods of HTML Parser are not called, the print statement I added to handle_data is only called for the first few pages, and the amount of text that is printed is not the entire web page, usually only a couple of words from the beginning of the web page. I have very minimal knowledge of python and this was all I could get from scratch