Hey guys, I'm trying to get all links on a website using BeautifulSoup, Queue, Threading, and urllib2. I am specifically looking for links that lead to other pages of the same site. It runs for a few seconds, going through about 3 URLs before giving me the error:
Traceback (most recent call last):
File "/usr/lib/python2.7/threading.py", line 808, in __bootstrap_inner
self.run()
File "/home/john/Desktop/Python Projects/QtProjects/ThreadedDataMine.py", line 51, in run
if url[0:4] != "http" and url[0] != "/" and "#" not in url:
TypeError: 'NoneType' object has no attribute '__getitem__'
The program runs perfectly when it only goes over the main URL, but starts giving this error whenever I tell it to start adding the URLs it finds into the first thread's queue if they aren't already there.
Here's my code:
import Queue
import threading
import urllib2
import time
from BeautifulSoup import BeautifulSoup
# Seed URL(s) to crawl; this list also doubles as the "already seen" set so
# each page is only queued once.
# NOTE(review): appended to from multiple DatamineThread workers without a
# lock — the "not in"/append pair is racy, so duplicates are possible.
hosts = ["http://waoanime.tv"]
queue = Queue.Queue()  # URLs waiting to be fetched by ThreadUrl workers
out_queue = Queue.Queue()  # raw HTML chunks waiting to be parsed by DatamineThread workers
class ThreadUrl(threading.Thread):
    """Fetcher worker: pull a URL off the queue, download the page, and
    hand the raw HTML to the out queue for the parser threads."""

    def __init__(self, queue, out_queue):
        threading.Thread.__init__(self)
        self.queue = queue
        self.out_queue = out_queue

    def run(self):
        while True:
            # Block until a URL is available to fetch.
            host = self.queue.get()
            # Download the page, sending a custom User-Agent header.
            request = urllib2.Request(host, headers={'User-Agent':"Anime Browser"})
            response = urllib2.urlopen(request)
            page_html = response.read()
            # Pass the raw HTML along to the parser pool.
            self.out_queue.put(page_html)
            # Tell the queue this work item is finished (unblocks join()).
            self.queue.task_done()
class DatamineThread(threading.Thread):
"""Threaded Url Grab"""
def __init__(self, out_queue, queue):
threading.Thread.__init__(self)
self.out_queue = out_queue
self.queue = queue
def run(self):
while True:
#grabs host from queue
chunk = self.out_queue.get()
soup = BeautifulSoup(chunk)
#parse the chunk
for line in soup.findAll('a'):
url = (line.get('href'))
if url[0:4] != "http" and "#" not in url:
new_url = ""
if url[0] == "/":
new_url = ("http://waoanime.tv%s" % url)
else:
new_url = ("http://waoanime.tv/%s" % url)
if new_url not in hosts:
hosts.append(new_url)
#self.queue.put(new_url)
print new_url #debug
elif url[0:13] == "http://forums" and url not in hosts and "#" not in url:
hosts.append(url)
#put url in url queue
self.queue.put(url)
print url #debug
else:
pass
#signals to queue job is done
self.out_queue.task_done()
start = time.time()
def main():
    """Spawn the fetcher and parser thread pools, seed the URL queue, and
    block until both queues have been fully drained."""
    #spawn a pool of fetcher threads, and pass them both queue instances
    for i in range(5):
        t = ThreadUrl(queue, out_queue)
        t.setDaemon(True)  # daemon so these infinite-loop workers don't block exit
        t.start()
    #populate queue with the seed data
    for host in hosts:
        queue.put(host)
    # spawn a pool of parser threads feeding discovered URLs back into queue
    for i in range(5):
        dt = DatamineThread(out_queue, queue)
        dt.setDaemon(True)
        dt.start()
    #wait on both queues until everything has been fetched and parsed
    queue.join()
    out_queue.join()
main()
print "Elapsed Time: %s" % (time.time() - start)
Because this is my first time using Threading, I just modified the code found here.
I would really appreciate any help fixing this bug, or if you know of a better way I can do this.