Page connection issue

poeticinsanity 2 Light Poster

16 Years Ago

When the .request method is called, an error is happening. The page does exist, but for some reason the request is failing. Any idea as to why?

'''
Created on May 29, 2009

@author: snorris4

This program will spider the repo.or.cz site for information on their open source projects, gathering the html pages of each 
project and adding them to a database.

'''
from http import client
import re
import time

INDEX_SITE='repo.or.cz/w?a=project_index'
BASE_SITE="http://repo.or.cs/w"

class RP_spider_projectslist():
    
    
    check_links=[]
    
    def get_page(self, site, page):
        try:
            conn=client.HTTPConnection(site)
            try:
                conn.request("GET","http://"+site+page) #error occurs here
                resp=conn.getresponse()
                html_page=resp.read()
                return html_page
            except:
                print("The page request failed.")
        except:
            print ("The connection failed.")
    
    def find_projects(self,page):
        return re.findall("*.git")
                
    
    def add_to_database(self,links):
        for link in links:
            page=self.get_page(BASE_SITE,link[3:len(link)])
            #add page to database here.

def main():
    spider=RP_spider_projectslist()
    page=spider.get_page(BASE_SITE,'/')
    page_string=str(page,"UTF-8")
    spider.feed(page_string)
    for i in spider.check_links:
        print (i)
        
def test():
   spider=RP_spider_projectslist()
   page=spider.get_page(INDEX_SITE,'')
   page_string=str(page,"UTF-8")
   strings=find_projects(page_string)
   print (strings)
   
test()

1 Contributor
0 Replies
54 Views

Be the first to reply

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.