Hi all,
I'm not a Python expert (I've only dabbled a little). A user has asked me to help with a short webcrawling script they have written. They are making use of Dale Hunscher's UrlNet library in the script.
They basically want the script to ignore overly repetitive URLs (so it doesn't spend lots of time crawling archives, etc.).
Any suggestions as to the best way to do this? Code below.
Any pointers greatly appreciated.
Thanks,
Mark
#!/usr/bin/env python
# $Id: urltree3.py 56 2009-10-11 21:03:43Z dalehunscher $
###################################################################
# #
# UrlNet Python Library #
# Copyright (c) Dale A. Hunscher, 2007-2009 #
# All rights reserved #
# #
# #
# UrlNet is free for non-commercial use. #
# For commercial uses, contact dale.hunscher@thenextroguewave.com #
# #
###################################################################
# faofishforest.py
from urlnet.urltree import UrlTree
from urlnet.urlutils import saveTree, loadTree
from urlnet.ignoreandtruncate import textToIgnore
from urlnet.ignoreandtruncate import textToTruncate
import urlnet.log
# Substrings that mark a URL as not worth crawling (search-engine result
# pages, ad/analytics servers, share/submit links, feeds, help/privacy
# pages, etc.).  UrlTree skips any URL containing one of these strings.
#
# BUG FIX: three entries were missing a trailing comma, so Python's
# implicit adjacent-string-literal concatenation silently fused them into
# single bogus entries (e.g. 'twitter.com/?statustwitter.com/home?status')
# that could never match a real URL.  The commas are restored below.
#
# NOTE(review): the list still contains duplicates ('blogsearch.google.com',
# 'feeddigest', 'sitemap', 'site-map', 'doubleclick'); harmless for
# substring matching, but they could be pruned.
ignorableText = [
    'video.google.com',
    'blogsearch.google.com',
    'google.com',
    'books.google.com',
    'news.google.com',
    'maps.google.com',
    'images.google.com',
    'blogsearch.google.com',
    'mail.google.com',
    'fusion.google.com',
    'google.com/intl',
    'google.com/search',
    'google.com/accounts',
    'google.com/preferences',
    'www.stumbleupon.com/submit',
    'cache',
    'google',
    '74.125.77.132',
    '209.85.229.132',
    '#',
    'statcounter.',
    '/analytics/',
    'onestat',
    'doubleclick',
    'swicki',
    'eurekster',
    'yahoo.com',
    'submit?',
    'quantcast',
    'ads2.',
    'overture.',
    '/rss/',
    '/rdf/',
    '/feed/',
    'feeddigest',
    'sitemeter',
    'clustrmaps',
    'adbureau',
    'zeus.com',
    'products/acrobat',
    'hon.ch',
    'feedburner.com',
    '://help.',
    'businesswire',
    '/faq.',
    'sys-con.com',
    'jigsaw.w3c.org',
    '/categories',
    'sitemap',
    'site-map',
    'site_map',
    'rss.xml',
    'misoso.com',
    'adjuggler.com',
    'skype.com',
    'validator.w3c.org',
    'digg.com/submit',
    'addthis.com',
    'feedblitz',
    'del.icio.us/post',
    'feeddigest',
    'feedster',
    '/about/',
    'careers',
    'employment',
    'sitemap',
    'site-map',
    'aolstore.com',
    'aolsyndication.com',
    '/privacy/',
    '/privacy.',
    'twitter.com/?status',  # BUG FIX: comma was missing here
    'twitter.com/home?status',
    '/help/',
    'phpbb',
    'crawlability',
    'w3.org',
    '4networking',
    'www.adtech.com',  # BUG FIX: comma was missing here
    'technorati',
    '/submit?',  # BUG FIX: comma was missing here
    '/share.php',
    'adserver',
    'invisionboard',
    'reddit.com/submit',
    'www.myspace.com/Modules/PostTo/Pages/',
    'www.facebook.com/share.php?',
    'www.facebook.com/sharer.php?',
    'www.linkedin.com/shareArticle?',
    'doubleclick',
]
# Seed URLs for the crawl: home pages of regional fishery bodies
# (FAO, CCAMLR, NOAA, tuna/whaling/salmon commissions, and others).
faofishurls = tuple([
    'http://www.fao.org/fishery/rfb/cacfish/en',
    'http://www.ccamlr.org/pu/e/gen-intro.htm',
    'http://www.afsc.noaa.gov/REFM/CBS/Default.htm',
    'http://www.ccsbt.org/site/',
    'http://www.gfcm.org/gfcm/en',
    'http://www.iattc.org/HomeENG.htm',
    'http://www.iccat.int/en/',
    'http://www.iotc.org/English/index.php',
    'http://www.iphc.int/',
    'http://www.iwcoffice.org/index.htm',
    'http://www.lvfo.org/index.php',
    'http://www.nafo.int/about/frames/about.html',
    'http://www.nasco.int/',
    'http://www.neafc.org/',
    'http://www.npafc.org/new/index.html',
    'http://www.fao.org/fishery/rfb/recofi/en',
    'http://www.seafo.org/',
    'http://www.southpacificrfmo.org/',
    'http://www.wcpfc.int/',
])
# Build the URL forest: crawl each seed URL to a depth of 4 levels,
# skipping URLs that contain any of the ignorable substrings above
# and truncating per UrlNet's stock truncation list.
net = UrlTree(_maxLevel=4, _ignorableText=ignorableText, _truncatableText=textToTruncate)
net.SetProperty('getTitles', True)  # fetch page titles for node labels
urlnet.log.logging = True  # enable UrlNet progress logging
success = net.BuildUrlForest(Urls=faofishurls)
if success:
    # Write Pajek-format network files (URL-level and domain-level).
    # NOTE(review): indentation restored — these calls belong inside the
    # 'if success:' guard; the original paste had lost it.
    net.WritePajekFile('faofishfornet4', 'faofishfornet4', useTitles=True)
    net.WritePajekNetworkFile('faofishfordom4', 'faofishfordom4', urlNet=False, useTitles=True)
    # BUG FIX: the original called this on 'net2', which is never defined
    # anywhere in the script and would raise NameError; the tree built
    # above is 'net'.
    net.WritePajekNetworkFile('faofish.1', 'faofish.1', urlNet=False, useTitles=True)