hello
I am following this tutorial on how to scrap website information
http://www.newthinktank.com/2010/11/pyt ... -scraping/
this is my code:
EDIT: do not post off site, moved here
#! /usr/bin/python
from urllib import urlopen
from BeautifulSoup import BeautifulSoup
import re
def cleanHtml(i):
i = str(i) # Convert the Beautiful Soup Tag to a string
bS = BeautifulSoup(i) # Pass the string to Beautiful Soup to strip out html
# Find all of the text between paragraph tags and strip out the html
i = bS.find('p').getText()
# Strip ampersand codes and WATCH:
i = re.sub('&\w+;','',i)
i = re.sub('WATCH:','',i)
return i
def cleanHtmlRegex(i):
i = str(i)
regexPatClean = re.compile(r'<[^<]*?/?>')
i = regexPatClean.sub('', i)
# Strip ampersand codes and WATCH:
i = re.sub('&\w+;','',i)
return re.sub('WATCH:','',i)
# Copy all of the content from the provided web page
webpage = urlopen('http://supertalk.superfuture.com/index.php?/topic/95817-2-for-1-pics/page__st__500').read()
# Grab everything that lies between the title tags using a REGEX
titleString = "<span rel='lightbox'><img src='(.*)' alt='Posted Image' class='bbc_img' />"
patFinderTitle = re.compile(titleString)
# Store all of the titles and links found in 2 lists
findPatTitle = re.findall(patFinderTitle,webpage)
# Print out the results to screen
for i in listIterator:
print findPatTitle[i] # The title
print "\n"
the only parts of the code i've changed is this
webpage = urlopen('http://supertalk.superfuture.com/index.php?/topic/95817-2-for-1-pics/page__st__500').read()
titleString = '<span rel='lightbox'><img src='(.*)' alt='Posted Image' class='bbc_img' />'
patFinderTitle = re.compile(titleString)
I would like to create something that will be able to extract all pictures from every page but for now, im just trying to pull any jpgs but i cant seem to figure it out.
can someone help? i am new to coding.