import urlparse
import urllib
import urllib2
from bs4 import BeautifulSoup
from collections import Counter
import re
import Tkinter
from Tkinter import *
import ttk
from tkFileDialog import askopenfilename
import logging
import tkMessageBox
import tkFileDialog
import csv
import json
import os
import time
import collections
import shutil
import thread
from PIL import Image, ImageTk
from timeit import default_timer as timer
from functools import partial
# Identify all the possible links from the starting URL
# dedupe the URL list to get the unique list of URLs to scrape
url="http://www.dawsons.co.uk/finance-information/"
urls=[url] #stack of urls to scrape
visited=[url] #historic record of urls
while len(urls)>0:
try:
htmltext=urllib.urlopen(urls[0]).read()
except:
print urls[0]
soup=BeautifulSoup(htmltext)
urls.pop(0)
print len(urls)
for tag in soup.findAll('a',href=True):
tag['href']=urlparse.urljoin(url,tag['href'])
if url in tag['href'] and tag['href'] not in visited:
urls.append(tag['href'])
visited.append(tag['href'])
print visited
# Fetch the text from each of the pages and combine it into one document
text_ang = ""
for link in visited:
    print link
    try:
        page = urllib2.urlopen(link)
        soup = BeautifulSoup(page, 'html.parser')
        # Flatten whitespace and append this page's text to the running document
        text_ang = text_ang + soup.get_text().replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').encode('utf-8')
    except Exception:
        print link + " - not found"

print "********************************************\n" + text_ang + "\n********************************************\n"
# Look for specific phrases and count how often each one occurs
counter = Counter()
for phrase in ['APR', 'Interest free finance', 'Interest bearing finance', 'ROUTINES VIDEOS']:
    counter[phrase] += len(re.findall(r'\b%s\b' % phrase, text_ang))
print(counter)
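If the counts should also pick up lowercase variants such as "interest free finance" (an assumption; the post only shows the capitalised forms), a case-insensitive variant that also escapes any regex metacharacters in the phrases could look like this:

# Case-insensitive variant of the phrase count (reuses text_ang, re and Counter from above)
counter_ci = Counter()
for phrase in ['APR', 'Interest free finance', 'Interest bearing finance', 'ROUTINES VIDEOS']:
    counter_ci[phrase] += len(re.findall(r'\b%s\b' % re.escape(phrase), text_ang, re.IGNORECASE))
print(counter_ci)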
This code is used for web scraping, but it consumes a lot of time. Can someone please suggest code which takes less time?
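Almost all of the time goes into downloading the pages one at a time, so the biggest saving is usually to fetch them concurrently. Below is a minimal sketch of that idea (not the original poster's code), reusing the visited list built above; the thread-pool size of 10 and the 10-second timeout are arbitrary assumptions.

import urllib2
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool  # thread pool from the standard library

def fetch_text(link):
    # Download one page and return its visible text, or "" on failure
    try:
        page = urllib2.urlopen(link, timeout=10)  # assumed 10-second timeout
        soup = BeautifulSoup(page, 'html.parser')
        return soup.get_text().replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').encode('utf-8')
    except Exception:
        print link + " - not found"
        return ""

pool = Pool(10)                        # assumed pool size; tune to the site and your bandwidth
pages = pool.map(fetch_text, visited)  # fetch all pages concurrently
pool.close()
pool.join()
text_ang = " ".join(pages)

The crawl loop can also be tightened a little by keeping visited as a set, so the "not in visited" check is constant time, but the network round-trips dominate the overall runtime.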