I am running into some issue with scraping data. If I hardcode value for key "lbo race" in the code below it is able to scrape the data but if I try to set key "lbo race" to a variable which is being read in it doesn't seem to scrape the data correctly. I tried to put a time to slow it down but that doesn't seem to be the issue. Would I use threading to solve this problem?
Thanks!
import urllib.parse
import urllib.request
import csv
import time
def parseTable(html):
#Each "row" of the HTML table will be a list, and the items
#in that list will be the TD data items.
ourTable = []
#We keep these set to NONE when not actively building a
#row of data or a data item.
ourTD = None #Stores one table data item
ourTR = None #List to store each of the TD items in.
#State we keep track of
inTable = False
inTR = False
inTD = False
#Start looking for a start tag at the beginning!
tagStart = html.find("<", 0)
while( tagStart != -1):
tagEnd = html.find(">", tagStart)
if tagEnd == -1: #We are done, return the data!
return ourTable
tagText = html[tagStart+1:tagEnd]
#only look at the text immediately following the <
tagList = tagText.split()
tag = tagList[0]
tag = tag.lower()
#Watch out for TABLE (start/stop) tags!
if tag == "table": #We entered the table!
inTable = True
if tag == "/table": #We exited a table.
inTable = False
#Detect/Handle Table Rows (TR's)
if tag == "tr":
inTR = True
ourTR = [] #Started a new Table Row!
#If we are at the end of a row, add the data we collected
#so far to the main list of table data.
if tag == "/tr":
inTR = False
ourTable.append(ourTR)
ourTR = None
#We are starting a Data item!
if tag== "td" or tag== "th":
inTD = True
ourTD = "" #Start with an empty item!
#We are ending a data item!
if tag == "/td" or tag=="/th":
inTD = False
if ourTD != None and ourTR != None:
cleanedTD = ourTD.strip() #Remove extra spaces
ourTR.append( ourTD.strip() )
ourTD = None
#Look for the NEXT start tag. Anything between the current
#end tag and the next Start Tag is potential data!
tagStart = html.find("<", tagEnd+1)
#If we are in a Table, and in a Row and also in a TD,
# Save anything that's not a tag! (between tags)
#
#Note that this may happen multiple times if the table
#data has tags inside of it!
#e.g. <td>some <b>bold</b> text</td>
#
#Because of this, we need to be sure to put a space between each
#item that may have tags separating them. We remove any extra
#spaces (above) before we append the ourTD data to the ourTR list.
if inTable and inTR and inTD:
ourTD = ourTD + html[tagEnd+1:tagStart] + " "
#print("td:", ourTD) #for debugging
#If we end the while loop looking for the next start tag, we
#are done, return ourTable of data.
return(ourTable)
url = "http://elections.sos.state.tx.us/elchist.exe"
files = open('1992DemocraticPrimaryElection.txt', 'r')
values = {'election' : "1992 Democratic Primary Election",
'lboRace' : "",
'btnSubmit' : "Submit"}
for line in files:
linenew = line
linenew = linenew.replace(' ','')
linenew = linenew.replace('\n','')
linenew = linenew.replace('"', '')
file = open('1992DemocraticPrimaryElection.'+linenew+'.csv', 'w')
for k, v in values.items():
values['lboRace'] = line
print(k, v)
data = urllib.parse.urlencode(values)
data = data.encode('ascii')
req = urllib.request.Request(url, data)
response = urllib.request.urlopen(req)
html_bytes = response.read()
html = str(html_bytes)
dataTable = parseTable(html)
writer = csv.writer(file)
for item in dataTable:
writer.writerow(item)
file.close()
files.close()