I'm using HTMLParser to find some thing among this page given below. The link i'm looking to find and follow is in red. I'm using the code, also provided below to find this link, but it isn't seeming to find it at all. The portion of code that isn't working correctly is in red. There is no error, so to speak, but I am not getting the output I believe I should. In fact, in the handle_starttag portion of GraphSpider, re.search(etc.etc.etc.) is coming up None. Help please!!
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta http-equiv="content-type" content="text/html;charset=UTF-8" />
<title>hamish's glazesorg-new at master - GitHub</title>
<link rel="search" type="application/opensearchdescription+xml" href="/opensearch.xml" title="GitHub" />
<link rel="fluid-icon" href="http://github.com/fluidicon.png" title="GitHub" />
<link href="http://assets1.github.com/stylesheets/bundle.css?c5a62b10ab8ad45bf9f3fa776adae8395d8222a4" media="screen" rel="stylesheet" type="text/css" />
<script type="text/javascript" src="http://ajax.googleapis.com/ajax/libs/jquery/1.3.2/jquery.min.js"></script>
<script src="http://assets3.github.com/javascripts/bundle.js?c5a62b10ab8ad45bf9f3fa776adae8395d8222a4" type="text/javascript"></script>
<link href="http://github.com/feeds/hamish/commits/glazesorg-new/master" rel="alternate" title="Recent Commits to glazesorg-new:master" type="application/atom+xml" />
<meta name="description" content="" />
<script type="text/javascript">
github_user = null
</script>
</head>
<body>
<div id="main">
<div id="header" class="">
<div class="site">
<div class="logo">
<a href="http://github.com"><img src="/images/modules/header/logov3.png" alt="github" /></a>
</div>
<div class="topsearch">
<form action="/search" id="top_search_form" method="get">
<input type="search" class="search" name="q" /> <input type="submit" value="Search" />
<input type="hidden" name="type" value="Everything" />
<input type="hidden" name="repo" value="" />
<input type="hidden" name="langOverride" value="" />
<input type="hidden" name="start_value" value="1" />
</form>
<div class="links">
<a href="/repositories">Browse</a> | <a href="/guides">Guides</a> | <a href="/search">Advanced</a>
</div>
</div>
<div class="actions">
<a href="http://github.com">Home</a>
<a href="/plans"><b><u>Pricing and Signup</u></b></a>
<a href="http://github.com/popular/forked">Repositories</a>
<a href="/blog">Blog</a>
<a href="https://github.com/login">Login</a>
</div>
</div>
</div>
<div id="repo_menu">
<div class="site">
<ul>
<li class="active"><a href="http://github.com/hamish/glazesorg-new/tree/master">Source</a></li>
<li class=""><a href="http://github.com/hamish/glazesorg-new/commits/master">Commits</a></li>
<li class=""><a href="/hamish/glazesorg-new/network">Network (2)</a></li>
<li class=""><a href="/hamish/glazesorg-new/issues">Issues (0)</a></li>
<li class=""><a href="/hamish/glazesorg-new/downloads">Downloads (0)</a></li>
<li class=""><a href="http://wiki.github.com/hamish/glazesorg-new">Wiki (1)</a></li>
<li class=""><a href="/hamish/glazesorg-new/graphs">Graphs</a></li>
</ul>
</div>
</div>
<div id="repo_sub_menu">
<div class="site">
<div class="joiner"></div>
<ul>
<li>
<a class="active" href="/hamish/glazesorg-new/tree/master">master</a>
</li>
<li>
<a href="#">all branches</a>
<ul>
<li><a href="/hamish/glazesorg-new/tree/master">master</a></li>
</ul>
</li>
<li>
<a href="#">all tags</a>
</li>
</ul>
</div>
</div>
<div class="site">
<div id="repos">
<script type="text/javascript">
GitHub.currentCommitRef = "master"
GitHub.currentRepoOwner = "hamish"
GitHub.currentRepo = "glazesorg-new"
</script>
<div class="repo public">
<div class="title">
<div class="path">
<a href="/hamish">hamish</a> / <b><a href="http://github.com/hamish/glazesorg-new/tree">glazesorg-new</a></b>
<a href="/signup" class="toggle_watch"><img alt="watch" class="button" src="http://assets3.github.com/images/modules/repos/watch_button.png?c5a62b10ab8ad45bf9f3fa776adae8395d8222a4" /></a><a href="/signup" class="toggle_watch" style="display:none;"><img alt="watch" class="button" src="http://assets3.github.com/images/modules/repos/unwatch_button.png?c5a62b10ab8ad45bf9f3fa776adae8395d8222a4" /></a>
<a href="#" id="download_button" rel="hamish/glazesorg-new"><img alt="download tarball" class="button" src="http://assets2.github.com/images/modules/repos/download_button.png?c5a62b10ab8ad45bf9f3fa776adae8395d8222a4" /></a>
</div>
<div class="security private_security" style="display:none">
<a href="#private_repo" rel="facebox"><img src="/images/icons/private.png" alt="private" /></a>
</div>
<div id="private_repo" class="hidden">
This repository is private.
All pages are served over SSL and all pushing and pulling is done over SSH.
No one may fork, clone, or view it unless they are added as a <a href="/hamish/glazesorg-new/edit">member</a>.
<br/>
<br/>
Every repository with this icon (<img src="/images/icons/private.png" alt="private" />) is private.
</div>
<div class="security public_security" style="">
<a href="#public_repo" rel="facebox"><img src="/images/icons/public.png" alt="public" /></a>
</div>
<div id="public_repo" class="hidden">
This repository is public.
Anyone may fork, clone, or view it.
<br/>
<br/>
Every repository with this icon (<img src="/images/icons/public.png" alt="public" />) is public.
</div>
<div class="flexipill">
<a href="/hamish/glazesorg-new/network">
<table cellpadding="0" cellspacing="0">
<tr><td><img alt="Forks" src="http://assets0.github.com/images/modules/repos/pills/forks.png?c5a62b10ab8ad45bf9f3fa776adae8395d8222a4" /></td><td class="middle"><span>2</span></td><td><img alt="Right" src="http://assets1.github.com/images/modules/repos/pills/right.png?c5a62b10ab8ad45bf9f3fa776adae8395d8222a4" /></td></tr>
</table>
</a>
</div>
<div class="flexipill">
<a href="/hamish/glazesorg-new/watchers">
<table cellpadding="0" cellspacing="0">
<tr><td><img alt="Watchers" src="http://assets0.github.com/images/modules/repos/pills/watchers.png?c5a62b10ab8ad45bf9f3fa776adae8395d8222a4" /></td><td class="middle"><span>2</span></td><td><img alt="Right" src="http://assets1.github.com/images/modules/repos/pills/right.png?c5a62b10ab8ad45bf9f3fa776adae8395d8222a4" /></td></tr>
</table>
</a>
</div>
</div>
<div class="meta">
<table>
<tr>
<td class="label">Clone URL:</td>
<td>
<a href="git://github.com/hamish/glazesorg-new.git" class="git_url_facebox" rel="#git-clone">git://github.com/hamish/glazesorg-new.git</a>
<object classid="clsid:d27cdb6e-ae6d-11cf-96b8-444553540000"
width="110"
height="14"
class="clippy"
id="clippy" >
<param name="movie" value="/flash/clippy.swf"/>
<param name="allowScriptAccess" value="always" />
<param name="quality" value="high" />
<param name="scale" value="noscale" />
<param NAME="FlashVars" value="text=git://github.com/hamish/glazesorg-new.git">
<param name="bgcolor" value="#F0F0F0">
<param name="wmode" value="opaque">
<embed src="/flash/clippy.swf"
width="110"
height="14"
name="clippy"
quality="high"
allowScriptAccess="always"
type="application/x-shockwave-flash"
pluginspage="http://www.macromedia.com/go/getflashplayer"
FlashVars="text=git://github.com/hamish/glazesorg-new.git"
bgcolor="#F0F0F0"
wmode="opaque"
/>
</object>
<div id="git-clone" style="display:none;">
Give this clone URL to anyone.
<br/>
<code>git clone git://github.com/hamish/glazesorg-new.git </code>
</div>
</td>
</tr>
</table>
</div>
</div>
</div>
<div id="commit">
<div class="group">
<div class="envelope commit">
<div class="human">
<div class="message"><pre><a href="/hamish/glazesorg-new/commit/864c0e9ce4116b728690f9375f718b0c75864eda">html prototype</a> </pre></div>
<div class="actor">
<div class="gravatar">
<img alt="" height="30" src="http://www.gravatar.com/avatar/8677b00eb466502659e185c0c82d30bf?s=30&d=http%3A%2F%2Fgithub.com%2Fimages%2Fgravatars%2Fgravatar-30.png" width="30" />
</div>
<div class="name">Hamish Currie <span>(author)</span></div>
<div class="date">
<abbr class="relatize" title="2009-06-05 17:21:57">Fri Jun 05 17:21:57 -0700 2009</abbr>
</div>
</div>
</div>
<div class="machine">
<span>c</span>ommit <a href="/hamish/glazesorg-new/commit/864c0e9ce4116b728690f9375f718b0c75864eda" hotkey="c">864c0e9ce4116b728690f9375f718b0c75864eda</a><br />
<span>t</span>ree <a href="/hamish/glazesorg-new/tree/864c0e9ce4116b728690f9375f718b0c75864eda" hotkey="t">de1513fdb688647a26c3596cdded10096808d22d</a><br />
</div>
</div>
</div>
</div>
<div id="path">
<b><a href="/hamish/glazesorg-new/tree">glazesorg-new</a></b> /
</div>
<script type="text/javascript">
GitHub.currentTreeSHA = "864c0e9ce4116b728690f9375f718b0c75864eda"
GitHub.commitSHA = "864c0e9ce4116b728690f9375f718b0c75864eda"
GitHub.currentPath = ""
</script>
<div id="browser">
<table cellpadding="0" cellspacing="0">
<tr>
<th></th>
<th>name</th>
<th>age</th>
<th>
<div class="history">
<a href="/hamish/glazesorg-new/commits/master/">history</a>
</div>
message
</th>
</tr>
<tr class="alt">
<td class="icon"> <img alt="directory" src="http://assets3.github.com/images/icons/dir.png?c5a62b10ab8ad45bf9f3fa776adae8395d8222a4" /> </td>
<td class="content"> <a href="/hamish/glazesorg-new/tree/864c0e9ce4116b728690f9375f718b0c75864eda/html" id="5ad1c2c573250354b486fa2345f1cd64f64193eb">html/</a></td>
<td class="age"> <span class="relatize">Fri Jun 05 17:21:57 -0700 2009</span> </td>
<td class="message"> <a href="/hamish/glazesorg-new/commit/864c0e9ce4116b728690f9375f718b0c75864eda" class="message">html prototype</a> [Hamish Currie] </td>
</tr>
</table>
</div>
</div>
<div class="push"></div>
</div>
<div id="footer">
<div class="site">
<div class="info">
<div class="links">
<a href="http://github.com/blog/148-github-shirts-now-available">Shirts</a> |
<a href="http://github.com/blog">Blog</a> |
<a href="http://support.github.com/">Support</a> |
<a href="http://github.com/training">Training</a> |
<a href="http://github.com/contact">Contact</a> |
<a href="http://groups.google.com/group/github/">Google Group</a> |
<a href="http://develop.github.com">API</a> |
<a href="http://twitter.com/github">Status</a>
</div>
<div class="company">
<span id="_rrt" title="0.03509s from xc88-s00008">GitHub</span>™
is <a href="http://logicalawesome.com/">Logical Awesome</a> ©2009 | <a href="/site/terms">Terms of Service</a> | <a href="/site/privacy">Privacy Policy</a>
</div>
</div>
<div class="sponsor">
<a href="http://engineyard.com"><img src="/images/modules/footer/ey-rubyhosting.png" alt="Engine Yard" /></a>
</div>
</div>
</div>
<div id="coming_soon" style="display:none;">
This feature is coming soon. Sit tight!
</div>
<script type="text/javascript">
var gaJsHost = (("https:" == document.location.protocol) ? "https://ssl." : "http://www.");
document.write(unescape("%3Cscript src='" + gaJsHost + "google-analytics.com/ga.js' type='text/javascript'%3E%3C/script%3E"));
</script>
<script type="text/javascript">
var pageTracker = _gat._getTracker("UA-3769691-2");
pageTracker._initData();
pageTracker._trackPageview();
</script>
</body>
</html>
'''
Created on Jun 5, 2009
@author: Steven Norris
This module provides the spider capability to be used to collect pages from github.com.
'''
import FLOSSmoleutils
from HTMLParser import HTMLParser
import httplib
import re
import time
import MySQLdb
BASE_SITE="github.com"
'''
This class is used to check every page of the repository for a projects list
'''
class GitHubSpider(HTMLParser):
#Used to store the links needing to be checked
check_links=[]
#Used to reset check_links after every feed()
def reset_link_list(self):
self.check_links=[]
#Used to handle the start tags of the main page
def handle_starttag(self,tag,attrs):
if tag=='a':
link=attrs[0][1]
if re.search('/tree', link)!=None:
self.check_links.append(link)
'''
This method finds the Next link on the project page
'''
class HasNextSpider(HTMLParser):
check_link=''
def reset_link(self):
self.check_link=''
def handle_starttag(self,tag,attrs):
if tag=='a':
link=attrs[0][1]
try:
hotkey=attrs[1][1]
if re.search("/repositories\?page",link)!=None and hotkey=='l':
self.check_link=link
except:
'''
do nothing
'''
'''
This method finds the Graph page of each project
'''
class GraphSpider(HTMLParser):
check_link=''
def reset_link(self):
self.check_link=''
def handle_starttag(self,tag,attrs):
if tag=='a':
link=attrs[0][1]
print (link)
'''
This method runs the spider sequence needed to collect the information from github.com
'''
def main():
#Declaring variables and creating spiders
page="/repositories"
hasNextPage=True
spider=GitHubSpider()
spider_Next=HasNextSpider()
spider_Graph=GraphSpider()
track_page=1
#Establish the connection and get the base_page
print("Setting up connection.")
conn=httplib.HTTPConnection(BASE_SITE)
#Begin loop through project pages
while(hasNextPage):
print("Beginning on page "+str(track_page))
print("Gathering base page.")
base_page=FLOSSmoleutils.get_page("http://"+BASE_SITE+page,conn)
#Create the spider and begin the feed
print("Gathering project links.")
spider.feed(base_page)
#Storing project links and preparing spider
redirect_links=spider.check_links
spider.reset_link_list()
#Gathering pages for each project link
for link in redirect_links:
print("Following link: "+link)
link_segments=link.split('/')
project_name=link_segments[2]
developer_name=link_segments[1]
home_page=FLOSSmoleutils.get_page("http://"+BASE_SITE+link,conn)
spider.feed(home_page)
#Following the redirect if project exists
if spider.check_links!=[]:
link2=spider.check_links[0]
spider.reset_link_list()
home_page=FLOSSmoleutils.get_page(link2,conn)
#Find and save the graph page
spider_Graph.feed(home_page)
print(spider_Graph.check_link)
graphlink=spider_Graph.check_link
spider_Graph.reset_link()
print (graphlink)
else:
home_page="There is no page for this project."
print("Collected page for: "+link)#add home page to database here.
time.sleep(3)
#Check for next link
spider_Next.feed(base_page)
next_link=spider_Next.check_link
spider_Next.reset_link()
if next_link!='':
print(next_link)
page=next_link
track_page+=1
print("\n")
else:
print("Final link reach.")
hasNextPage=False
print("Closing connection.")
conn.close()
main()