When the .request method is called, an error is happening. The page does exist, but for some reason the request is failing. Any idea as to why?
'''
Created on May 29, 2009
@author: snorris4
This program will spider the repo.or.cz site for information on their open source projects, gathering the html pages of each
project and adding them to a database.
'''
from http import client
import re
import time
INDEX_SITE='repo.or.cz/w?a=project_index'
BASE_SITE="http://repo.or.cs/w"
class RP_spider_projectslist():
check_links=[]
def get_page(self, site, page):
try:
conn=client.HTTPConnection(site)
try:
conn.request("GET","http://"+site+page) #error occurs here
resp=conn.getresponse()
html_page=resp.read()
return html_page
except:
print("The page request failed.")
except:
print ("The connection failed.")
def find_projects(self,page):
return re.findall("*.git")
def add_to_database(self,links):
for link in links:
page=self.get_page(BASE_SITE,link[3:len(link)])
#add page to database here.
def main():
spider=RP_spider_projectslist()
page=spider.get_page(BASE_SITE,'/')
page_string=str(page,"UTF-8")
spider.feed(page_string)
for i in spider.check_links:
print (i)
def test():
spider=RP_spider_projectslist()
page=spider.get_page(INDEX_SITE,'')
page_string=str(page,"UTF-8")
strings=find_projects(page_string)
print (strings)
test()