import urlparse
import urllib
import urllib2
from bs4 import BeautifulSoup
from collections import Counter
import re
import Tkinter
from Tkinter import *
import ttk
from tkFileDialog import askopenfilename
import logging
import tkMessageBox
import tkFileDialog
import csv
import json
import os
import time
import collections
import shutil
import thread
from PIL import Image, ImageTk
from timeit import default_timer as timer
from functools import partial
# Identify all the possible links from the starting URL
# dedupe the URL list to get the unique list of URLs to scrape
url="http://www.dawsons.co.uk/finance-information/"
urls=[url] #stack of urls to scrape
visited=[url] #historic record of urls
while len(urls)>0:
try:
htmltext=urllib.urlopen(urls[0]).read()
except:
print urls[0]
soup=BeautifulSoup(htmltext)
urls.pop(0)
print len(urls)
for tag in soup.findAll('a',href=True):
tag['href']=urlparse.urljoin(url,tag['href'])
if url in tag['href'] and tag['href'] not in visited:
urls.append(tag['href'])
visited.append(tag['href'])
print visited
# Fetch the text from each of the pages and combine it into one document
text_ang = ""
for link in visited:
    print link
    try:
        page = urllib2.urlopen(link)
        soup = BeautifulSoup(page, 'html.parser')
        # Flatten whitespace and append this page's text to the running document
        text_ang = text_ang + soup.get_text().replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').encode('utf-8')
    except Exception:
        print link + " - not found"

print "********************************************\n" + text_ang + "\n********************************************\n"
# Look for specific phrases and count how often each one occurs
counter = Counter()
for phrase in ['APR', 'Interest free finance', 'Interest bearing finance', 'ROUTINES VIDEOS']:
    counter[phrase] += len(re.findall(r'\b%s\b' % phrase, text_ang))
print(counter)
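If the counts should also pick up lowercase variants such as "interest free finance" (an assumption; the post only shows the capitalised forms), a case-insensitive variant that also escapes any regex metacharacters in the phrases could look like this:

# Case-insensitive variant of the phrase count (reuses text_ang, re and Counter from above)
counter_ci = Counter()
for phrase in ['APR', 'Interest free finance', 'Interest bearing finance', 'ROUTINES VIDEOS']:
    counter_ci[phrase] += len(re.findall(r'\b%s\b' % re.escape(phrase), text_ang, re.IGNORECASE))
print(counter_ci)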
This code is used for web scraping, but it consumes a lot of time. Can someone please suggest code which takes less time?
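Almost all of the time goes into downloading the pages one at a time, so the biggest saving is usually to fetch them concurrently. Below is a minimal sketch of that idea (not the original poster's code), reusing the visited list built above; the thread-pool size of 10 and the 10-second timeout are arbitrary assumptions.

import urllib2
from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool  # thread pool from the standard library

def fetch_text(link):
    # Download one page and return its visible text, or "" on failure
    try:
        page = urllib2.urlopen(link, timeout=10)  # assumed 10-second timeout
        soup = BeautifulSoup(page, 'html.parser')
        return soup.get_text().replace('\n', ' ').replace('\t', ' ').replace('\r', ' ').encode('utf-8')
    except Exception:
        print link + " - not found"
        return ""

pool = Pool(10)                        # assumed pool size; tune to the site and your bandwidth
pages = pool.map(fetch_text, visited)  # fetch all pages concurrently
pool.close()
pool.join()
text_ang = " ".join(pages)

The crawl loop can also be tightened a little by keeping visited as a set, so the "not in visited" check is constant time, but the network round-trips dominate the overall runtime.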