I am a programming and Python beginner and thought this would be a fun exercise. I wrote this script to mine web pages. First it finds all of the hrefs on a page; then it takes those URLs and searches the linked pages for content. It is by no means perfect. For one, it only searches hrefs. And two, when I search a page for content I have to give it an offset to find the 'text' content I want, which is not always ideal. I know the code is long and few will read it all, but I was wondering if anyone has a better approach to doing this?
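To make the offset part concrete, here is a stripped-down version of the lookup that PageMiner.get_links does in the script below. The HTML snippet and the 'Price' label are made up, purely for illustration:

# Minimal illustration of the offset lookup used in PageMiner.get_links.
# With offset=0 the text node right after the matched label is returned;
# a larger offset skips that many additional text nodes first.
from BeautifulSoup import BeautifulSoup
import re

html = '<tr><td>Price</td><td>19.99</td><td>USD</td></tr>'
soup = BeautifulSoup(html)

text = soup.find(text=re.compile('Price'))  # the 'Price' text node
offset = 0
for x in range(offset):
    text = text.findNext(text=True)
text = text.findNext(text=True)             # one step past the offset
print text.strip()                          # -> 19.99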
''' Data Mining Script - pageminer.py'''
from BeautifulSoup import BeautifulSoup
import urllib2
from urllib2 import HTTPError
from urlparse import urljoin, urlsplit
import httplib
from optparse import OptionParser
import re
import sys
from tempfile import TemporaryFile
from xlwt import Workbook, easyxf
EXCLUDES = [' ', '\n', '\r', '\t']
OUTPUT = 'output.xls'
STYLE6 = easyxf('pattern: pattern solid, fore_color grey25;'
                'font: bold yes, height 160;'
                'border: top medium, bottom medium')
class Miner():
    ''' Data Miner '''
    def __init__(self, options, url):
        self.url = url
        self.options = options
        self.keys = options.keys1.split(',')
    def get_soup(self):
        ''' Parse HTML into BeautifulSoup '''
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        try:
            soup = BeautifulSoup(opener.open(self.url))
            self.get_links(doc=soup, keys=self.keys)
        except HTTPError:
            pass
    def get_links(self, doc, keys):
        ''' Use a search pattern provided by list(keys) to
        find hrefs that match. '''
        links = []
        for link in doc.findAll('a', href=True):
            for key in keys:
                if re.search(key, link['href']):
                    links.append(link['href'])
        self.build_links(links)
    def build_links(self, links):
        ''' Build proper URL '''
        paths = []
        for link in links:
            paths.append(urljoin(self.url, link))
        self.validate_links(paths)
    def validate_links(self, links):
        ''' Verify that the links we built are good '''
        self.goodlinks = []
        for link in links:
            if link.startswith('http://'):
                url = urlsplit(link)
                conn = httplib.HTTPConnection(url.netloc)
                conn.request('GET', url.path)
                response = conn.getresponse()
                if response.status == 200:
                    self.goodlinks.append(link)
    def run(self):
        ''' Start here '''
        if self.options.test:
            self.get_soup()
            for link in self.goodlinks:
                print link
            sys.exit(0)
        if self.options.download:
            import pyget
            self.get_soup()
            for link in self.goodlinks:
                pyget.start(link)
            sys.exit(0)
        self.get_soup()
class PageMiner(Miner):
    ''' Page Data Miner '''
    def __init__(self, options, url):
        self.url = url
        self.options = options
        self.keys = options.keys2.split(',')
        self.offsets = options.offsets.split(',')
        self.results = []
    def get_links(self, doc, keys):
        ''' Use a search pattern provided by list(keys) to
        find page content that matches. '''
        try:
            #self.doc = doc
            positions = zip(keys, self.offsets)
            self.positions = positions
            found = []
            found.append(self.url)
            for position in positions:
                key, offset = position
                offset = int(offset)
                text = doc.find(text=re.compile(key))
                for x in range(offset):
                    text = text.findNext(text=True)
                text = text.findNext(text=True)
                text = text.strip()
                for pat in EXCLUDES:
                    if pat in text:
                        text = text.replace(pat, '')
                text = re.sub('\s+', ' ', text) # Remove mul. spaces
                found.append(text)
            if found:
                self.results.append(tuple(found))
        except AttributeError:
            pass
    def report(self):
        ''' Display results on screen. '''
        for line in self.results:
            print line
    def save(self):
        ''' Save results to an Excel file defined by OUTPUT '''
        book = Workbook()
        sheet1 = book.add_sheet('Sheet1')
        line = 0
        col = 0
        row = sheet1.row(line)
        if self.options.headers:
            headers = self.options.headers.split(',')
            for col in range(len(headers)):
                row.write(col, headers[col], STYLE6)
            line += 1
            row = sheet1.row(line)
        for entry in self.results:
            try:
                entry = entry[0] #Remove outer list [()]
                for col in range(len(entry)):
                    row.write(col, entry[col])
                line += 1
                row = sheet1.row(line)
            except IndexError:
                pass
        book.save(OUTPUT)
        book.save(TemporaryFile())
        print "Results stored in '%s'" % (OUTPUT)
    def run(self):
        ''' Start here '''
        if self.options.test:
            self.get_soup()
            self.report()
            sys.exit(0)
        self.get_soup()
def cmd_opts():
    ''' Parse command line options. '''
    usage = '''usage: %prog [-u] [--level1-keys] [--level2-keys] [-o]
STEP 1: Test Level 1 scan to find the right pages.
$> %prog -u url --level1-keys pat1,pat2 -t
STEP 2: Test Level 2 scan to find the right content.
$> %prog -u url --level2-keys pat1,pat2 -o 0,4 -t
STEP 3: Run the whole shebang!
$> %prog -u url --level1-keys pat1,pat2 --level2-keys pat1,pat2 -o 4,0
** NOTE: All patterns can be regular expressions.
'''
    parser = OptionParser(usage=usage)
    parser.add_option('-u', dest='url',
                      help='base url')
    parser.add_option('-d', '--download', dest='download', default=False,
                      action='store_true',
                      help='download files')
    parser.add_option('--level1-keys', dest='keys1',
                      help='patterns in url to search for in Level 1 scan')
    parser.add_option('--level2-keys', dest='keys2',
                      help='patterns in url to search for in Level 2 scan')
    parser.add_option('-o', dest='offsets',
                      help='find text relative to this offset')
    parser.add_option('-t', '--test', dest='test', default=False,
                      action='store_true',
                      help='test scan')
    parser.add_option('--from-file', dest='fromfile',
                      help='run Level 2 scan using urls from file')
    parser.add_option('--header', dest='headers',
                      help='report headers')
    parser.add_option('-v', '--verbose', dest='verbose', default=False,
                      action='store_true',
                      help='verbose')
    (options, args) = parser.parse_args()
    return (options, args)
if __name__ == '__main__':
    options, args = cmd_opts()
    if options.fromfile:
        results = []
        urls = open(options.fromfile, 'r').readlines()
        for url in urls:
            url = url.strip()
            levelTwo = PageMiner(options, url=url)
            levelTwo.get_soup()
            results.append(levelTwo.results)
            if options.verbose:
                levelTwo.report()
        levelTwo.results = results
        levelTwo.save()
        sys.exit(0)
    if options.url and options.keys1:
        levelOne = Miner(options, url=options.url)
        levelOne.run()
    elif (options.keys2 and not options.keys1)\
            and (options.url and options.offsets):
        levelTwo = PageMiner(options, url=options.url)
        levelTwo.run()
        levelTwo.report()
        sys.exit(0)
    else:
        print "See 'pageminer.py -h' for help"
        sys.exit(1)
    results = []
    for url in levelOne.goodlinks:
        levelTwo = PageMiner(options, url=url)
        levelTwo.run()
        results.append(levelTwo.results)
        if options.verbose:
            levelTwo.report()
    levelTwo.results = results
    levelTwo.save()