My code is:
import re
import urllib
import urllib2
webURL="http://www.sc.iitb.ac.in/~bijnan/personal-details.htm" #the website is
connect=urllib.urlopen(webURL) #connect to this website
htmlDoc=connect.read()#get the html document from this website
patternIN="Permanent Address" # Where to begin to keep the text
patternOUT="</tr>" # Where to end to keep the text (after the begining)
keepText=False # Do we keep the text ?
address="" # We init the address
# Now, we read the file to keep the text
for line in htmlDoc:
if keepText:
address+=line.strip() # We store the line, stripping the \n
if patternOUT in line: # Next line won't be kept any more
keepText=False
if patternIN in line: # Starting from next line, we keep the text
keepText=True
# Now, it's time to clean all this
rTags=re.compile("<.*?>") # the regexp to recognise any tag
address=rTags.sub(":", address) # we replace the tags with ":" (I could have chosen anything else,
# especially if there is some ":" in the address
rSep=re.compile(":+") # Now, we replace any number of ":" with a \n
address=rSep.sub("\n", address)
print address
For line 15...whats wrong there?why i cannot do the for loop in the html file?