Parse HTML to get text from webpages

Question

pocnib 0 Newbie Poster

16 Years Ago

I need to write a Python program which parses webpages and returns a dictionary of unique words and their frequenies. What I came up with was

#!/usr/bin/env python
from HTMLParser import HTMLParser, HTMLParseError
import urllib
import urlparse
import sys
import os
import MySQLdb
import re

class WordHarvester:
	def __init__(self):
		self.db = MySQLdb.connect(host="my.host.dom", user="user", passwd="passwd",db="db")
		self.dbc = self.db.cursor()
		self.mhp = MyHTMLParser()
	
	def run(self,URL,limit=10):
		print "Running URL:"
		print URL
		print " "
		try:
			sock = urllib.urlopen(URL)
			h = sock.read()
			self.mhp.feed(h)
		except:
			print "Error reading",URL
		i=0
		ur = URL
		while i<limit:
			sock.close()
			local_list = self.mhp.urls
			self.mhp.urls=[]
			for u in local_list:
				try:
					split_url= urlparse.urlsplit(urlparse.urljoin(ur,u))
					if split_url.scheme == "http":
						u = urlparse.urlunsplit((split_url.scheme,split_url.netloc,split_url.path,"",""))
					else:
						continue
					print u
					sock = urllib.urlopen(u)
					h = sock.read()
					self.mhp.feed(h)
				except:
					print "Error reading",u
			i=i+1
		print "Your word counts:"
		for word in self.mhp.word_dictionary.keys():
			print word,": ",self.mhp.word_dictionary[word]
		
		
		
class MyHTMLParser(HTMLParser):

	def __init__(self):
		HTMLParser.__init__(self)
		self.urls=[]
		self.word_dictionary = {}

	def handle_starttag(self,tag,attrs):
		if tag=='script':
			self.inscript=True
		if tag=='body':
			self.inbody=True
		if tag=='a' and attrs:
			#if attrs[0][1][:4]=='http':
			self.urls.append(attrs[0][1])
	
	def handle_endtag(self, tag):
		if tag=='script':
			self.inscript=False
		if tag=='body':
			self.inbody=False

	def handle_data(self,data):
		l = []
		m = []
		n = []

		for s in data:
			print s
			l = s.split()
		for j in l:
			m.append(j)
		for h in m:
    			h = h.strip(' ,.?!-')
    			h = h.lower()
    			n.append(h)
		n.sort()
		for i in n:
			if i.isalpha():
    				if i in self.word_dictionary:
        				self.word_dictionary[i]=self.word_dictionary[i]+1
    				else:
					self.word_dictionary[i]=1


if __name__ == "__main__":
	print "Testing Word Harvester"
	harvester = WordHarvester()
	url = raw_input("Starting URL: ")
	harvester.run(URL=url)

This appears to read all the web pages it is supposed to, but the handle_data only adds individual letters to the dictionary, not whole words like it should. Also, after about the first ten web pages, handle_data, and presumably the other handle_* methods of HTML Parser are not called, the print statement I added to handle_data is only called for the first few pages, and the amount of text that is printed is not the entire web page, usually only a couple of words from the beginning of the web page. I have very minimal knowledge of python and this was all I could get from scratch

html-css python

2 Contributors
1 Reply
447 Views
11 Hours Discussion Span
Latest Post 16 Years Ago Latest Post by woooee

Reply to this topic

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.

woooee 814 Nearly a Posting Maven · Answer 1 · 2008-12-12T03:20:06+00:00

First, if this is just a one-time thing for you, you can use Links to download and save the page as text only. http://www.jikos.cz/~mikulas/links/download/binaries/ Also, I assume you know about BeautifulSoup and that is more than you want. To answer your questions
1. Stops after 10 iterations (as the saying goes, this is too coincidental to be a coincidence)

def run(self,URL,limit=10):
                 ## and 
		while i<limit:

2. Single letters are in the dictionary. Difficult to tell but some print statements will add some light. I'm thinking that you want to use j, since it is each word from the split(), but the print should clarify that.

def handle_data(self,data):
     l = []
     m = []
     n = []
     for s in data:
          print "s in data", s
          l = s.split()
## fixed indentation problem
          for j in l:
               print "     j in l", j
               m.append(j)
## wouldn't h be the same as j, so m=l
          print "\n'l' =", l
          print "'m' =", m
          for h in m:
               print "          h in m", h
               h = h.strip(' ,.?!-')
               h = h.lower()
               n.append(h)
##   if "s" is one record from data, then indentation problem (fixed)
          n.sort()
          for i in n:
               if i.isalpha():
                    if i in self.word_dictionary:
                         self.word_dictionary[i]=self.word_dictionary[i]+1
                    else:
                           self.word_dictionary[i]=1
##
##--------This might easier to understand ------------------------
##           but I'm not sure what your data looks like
     for rec in data:
          word_list= rec.split()
          for word in word_list:
               word = word.strip(' ,.?!-')
               word = word.lower()
               if word in self.word_dictionary:
                    self.word_dictionary[word] += 1
               else:
                    if word.isalpha():
                         self.word_dictionary[word] = 1