ccandillo

I am a programming and Python beginner and thought this would be a fun exercise. I wrote this script to mine web pages: first it finds all of the hrefs on a page, then it takes those URLs and searches each of those pages for content. It is by no means perfect. For one, it only follows hrefs. For another, when I search a page for content I have to supply an offset to locate the 'text' node I want, which isn't always ideal. I know the code is long and few will read it, but I was wondering if anyone has a better approach?
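
To show what I mean by the offset, here is a stripped-down example of the findNext() trick the script relies on (the HTML and values are made up):

from BeautifulSoup import BeautifulSoup
import re

# Toy page: the value we want is the text node right after the label.
html = '<p><b>Price:</b> $9.99</p>'
soup = BeautifulSoup(html)

label = soup.find(text=re.compile('Price'))   # the 'Price:' text node
value = label.findNext(text=True)             # the next text node: ' $9.99'
print value.strip()

The offsets passed on the command line just tell the script how many extra text nodes to skip past the matching label before it grabs the value.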

''' Data Mining Script - pageminer.py'''
from BeautifulSoup import BeautifulSoup
import urllib2
from urllib2 import HTTPError
from urlparse import urljoin, urlsplit
import httplib
from optparse import OptionParser
import re
import sys
from tempfile import TemporaryFile
from xlwt import Workbook, easyxf

EXCLUDES = ['&nbsp;', '\n', '\r', '\t']
OUTPUT = 'output.xls'

STYLE6 = easyxf('pattern: pattern solid, fore_color grey25;'
                'font: bold yes, height 160;'
                'border: top medium, bottom medium')

class Miner(object):
    ''' Data Miner '''
    def __init__(self, options, url):
        self.url = url
        self.options = options
        self.keys = options.keys1.split(',')

    def get_soup(self):
        ''' Parse HTML into BeautifulSoup '''
        opener = urllib2.build_opener()
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        try:
            soup = BeautifulSoup(opener.open(self.url))
            self.get_links(doc=soup, keys=self.keys)

        except HTTPError:
            pass    # skip pages that return an HTTP error

    def get_links(self, doc, keys):
        ''' Use a search pattern provided by list(keys) to
            find hrefs that match. '''
        links = []
        for link in doc.findAll('a', href=True):
            for key in keys:
                if re.search(key, link['href']):
                    links.append(link['href'])
                    
        self.build_links(links)

    def build_links(self, links):
        ''' Build proper URL '''
        paths = []
        for link in links:
            paths.append(urljoin(self.url, link))

        self.validate_links(paths)

        
    def validate_links(self, links):
        ''' Verify that the links we built are good '''
        self.goodlinks = []
        for link in links:
            if link.startswith('http://'):
                url = urlsplit(link)
                conn = httplib.HTTPConnection(url.netloc)
                conn.request('GET', url.path or '/')  # use '/' if the path is empty
                response = conn.getresponse()
                if response.status == 200:
                    self.goodlinks.append(link)
                conn.close()

    def run(self):
        ''' Start here '''
        if self.options.test:
            self.get_soup()
            for link in self.goodlinks:
                print link
            sys.exit(0) 
               
        if self.options.download:
            import pyget
            self.get_soup()
            for link in self.goodlinks:
                pyget.start(link)
            sys.exit(0)

        self.get_soup()
        

class PageMiner(Miner):
    ''' Page Data Miner '''
    def __init__(self, options, url):
        self.url = url
        self.options = options
        self.keys = options.keys2.split(',')
        self.offsets = options.offsets.split(',')
        self.results = []

    def get_links(self, doc, keys):
        ''' Use a search pattern provided by list(keys) to
        find page content that matches. '''
        try: 
            #self.doc = doc
            positions = zip(keys, self.offsets)
            self.positions = positions
            found = []
            found.append(self.url)
            for key, offset in positions:
                offset = int(offset)
                # Find the text node that matches the key, then walk
                # offset + 1 text nodes forward to reach the value.
                text = doc.find(text=re.compile(key))
                for x in range(offset + 1):
                    text = text.findNext(text=True)
                text = text.strip()
                for pat in EXCLUDES:
                    if pat in text:
                        text = text.replace(pat, '')
                text = re.sub('\s+', ' ', text) # Remove mul. spaces
                found.append(text)
            if found:
                self.results.append(tuple(found))
            
        except AttributeError:
            pass    # key not found on this page; skip it

    def report(self):
        ''' Display results on screen. '''
        for line in self.results:
            print line

    def save(self):
        ''' Save results to an Excel file defined by OUTPUT '''
        book = Workbook()
        sheet1 = book.add_sheet('Sheet1')
        line = 0
        col = 0
        row = sheet1.row(line)
        
        if self.options.headers:
            headers = self.options.headers.split(',')
            for col in range(len(headers)):
                row.write(col, headers[col], STYLE6)

            line += 1
            row = sheet1.row(line)
                            
        for entry in self.results:
            try:
                entry = entry[0]    #Remove outer list [()]
                for col in range(len(entry)):
                    row.write(col, entry[col])
                line += 1
                row = sheet1.row(line)

            except IndexError:
                pass
                    
        book.save(OUTPUT)
        book.save(TemporaryFile())
        print "Results stored in '%s'" % (OUTPUT)

    def run(self):
        ''' Start here '''
        if self.options.test:
            self.get_soup()
            self.report()
            sys.exit(0)
            
        self.get_soup()

        
def cmd_opts():
    ''' Parse command line options. '''
    usage = '''usage: %prog [-u] [--level1-keys] [--level2-keys] [-o]

    STEP 1:  Test Level 1 scan to find the right pages.
    $> %prog -u url --level1-keys pat1,pat2 -t

    STEP 2:  Test Level 2 scan to find the right content.
    $> %prog -u url --level2-keys pat1,pat2 -o 0,4 -t

    STEP 3: Run the whole shebang!
    $> %prog -u url --level1-keys pat1,pat2 --level2-keys pat1,pat2 -o 4,0


    ** NOTE: All patterns can be regular expressions.
    '''
    parser = OptionParser(usage=usage)
    parser.add_option('-u', dest='url',
                      help='base url')
    parser.add_option('-d', '--download', dest='download', default=False,
                      action='store_true',
                      help='download files')
    parser.add_option('--level1-keys', dest='keys1',
                      help='patterns in url to search for in Level 1 scan')    
    parser.add_option('--level2-keys', dest='keys2',
                      help='patterns in page content to search for in Level 2 scan')
    parser.add_option('-o', dest='offsets',
                      help='find text relative to this offset')
    parser.add_option('-t', '--test', dest='test', default=False,
                      action='store_true',
                      help='test scan')
    parser.add_option('--from-file', dest='fromfile',
                      help='run Level 2 scan using urls from file')
    parser.add_option('--header', dest='headers',
                      help='report headers')
    parser.add_option('-v', '--verbose', dest='verbose', default=False,
                      action='store_true',
                      help='verbose')
    
    (options, args) = parser.parse_args()
    return (options, args)
    

if __name__ == '__main__':
    
    options, args = cmd_opts()
     
    if options.fromfile:
        results = []
        urls = open(options.fromfile, 'r').readlines()
        for url in urls:
            url = url.strip()
            levelTwo = PageMiner(options, url=url)
            levelTwo.get_soup()
            results.append(levelTwo.results)
            if options.verbose:
                levelTwo.report()
        levelTwo.results = results
        levelTwo.save()
        sys.exit(0)

        
    if options.url and options.keys1:     
        levelOne = Miner(options, url=options.url)
        levelOne.run()
    elif (options.keys2 and not options.keys1)\
         and (options.url and options.offsets):
        levelTwo = PageMiner(options, url=options.url)
        levelTwo.run()
        levelTwo.report()
        sys.exit(0)    # Level 2 only; there are no Level 1 links to follow
    else:
        print "See 'pageminer.py -h' for help"
        sys.exit(1)

    results = []
    for url in levelOne.goodlinks:
        levelTwo = PageMiner(options, url=url)
        levelTwo.run()
        results.append(levelTwo.results)
        if options.verbose:
            levelTwo.report()
    levelTwo.results = results
    levelTwo.save()
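
For reference, here is roughly how I run it; the URL, patterns, and column names below are made up:

# Step 1: find the right pages
python pageminer.py -u http://example.com/catalog --level1-keys item,detail -t

# Step 2: find the right content on one of those pages
python pageminer.py -u http://example.com/catalog/item1 --level2-keys Price,Weight -o 0,4 -t

# Step 3: run the whole shebang and write output.xls
python pageminer.py -u http://example.com/catalog --level1-keys item,detail --level2-keys Price,Weight -o 0,4 --header Page,Price,Weight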