I'd like to post the code of an app I wrote to split large files into pieces, sort the pieces, and finally reassemble them. I'm new to Python, and to the object-oriented way of doing things in general.
If you feel like it, would you please tell me how it should have been written 'properly'? For instance, I don't get the whole 'self' thing, and I don't quite get the modularity. I'm not looking for a lesson so much as just 'you could have done this here' or 'this is sloppy, it should be done like this'.
Please look at it, and if you have the time, let me know how I 'should' have done it, or at least a tip or two. I'm glad I found this group, you guys are awesome.
Code:
import csv
import time
import sys
import os
def main():
    """Placeholder entry point that performs no work.

    It exists so the ``__main__`` guard at the bottom of the file has
    something to call; the splitting and sorting are carried out by the
    module-level statements, not by this function.
    """
    return None
def sortAndWrite(fname, ziploc):
    """Sort one CSV file by zip code and append it to the master output file.

    The rows of ``fname`` are read into memory, sorted on the first five
    characters of column ``ziploc``, and appended to ``'Sorted_' + fn``.

    Module-level names used:
        fn      -- name of the original input file (read only).
        h       -- the header row captured by the caller (read only).
        header  -- 1 while the header row still has to be written; this
                   function sets it to 0 after writing ``h`` so the header
                   appears only once in the output.

    Parameters:
        fname   -- path of the CSV file to sort.
        ziploc  -- zero-based index of the zip code column.

    Errors are reported on the console rather than raised, so that the
    caller can carry on with the remaining files.
    """
    # Without this declaration the assignment below would create a local
    # variable and the header would be written again on every call.
    global header

    def zip_key(row):
        # Rows too short to contain the zip column sort first instead of
        # aborting the whole sort with an IndexError.
        if len(row) > ziploc:
            return row[ziploc][0:5]
        return ''

    try:
        # Read everything into a local list; a failure part-way through can
        # then never leave stale rows behind for the next file.
        source = open(fname, 'rb')
        try:
            rows = list(csv.reader(source, quoting=csv.QUOTE_ALL))
        finally:
            source.close()

        # Sort the data (string comparison of the 5-character zip prefix)
        rows.sort(key=zip_key)

        # Write the table
        print("Building Sorted file, adding: " + fname)
        target = open('Sorted_' + fn, 'ab')
        try:
            writer = csv.writer(target, quoting=csv.QUOTE_ALL)
            if header == 1:
                writer.writerow(h)  # Header not written yet, so it goes first
                header = 0          # Now it's written
            writer.writerows(rows)
        finally:
            # Always flush and close, not just after the last temp file.
            target.close()
    except Exception:
        # Best effort: report the problem and let the caller continue.
        # KeyboardInterrupt and SystemExit are deliberately not caught.
        print("Unexpected error: %s: %s" % sys.exc_info()[:2])
# Start App ====================================================================
# Command-line driver: splits the input CSV into six temp files by zip code
# range, sorts each one with sortAndWrite(), and appends them in order to
# 'Sorted_<filename>'.
#
# NOTE: print is always used with a single parenthesized argument below; that
# form produces the same output as the Python 2 print statement.
os.system("cls")  # Clear the screen
# Define the help menu should they type Zipsort.py --help
helptext="""\nUsage: Zipsort.py [filename -h | -c ]
Zipsort.py is is a small program that will sort your file by Zip Code.
-h No header
[default, assumes file has a header]
-c New Zip Code column (zero based)
[default is column 76]
Example: Zipsort.py MyFile.csv -h -c
In the above example, the file to be sorted is 'MyFile.csv', the
file does not have a header record and the column that contains
the zip code needs to be overridden.
"""
print("")  # Force a print line so the text isn't wedged against the top of
           # the DOS window.

# Do they need help?
if "--help" in sys.argv:
    print(helptext)
    sys.exit()

# The filename is the first argument on the command line. Only the missing
# argument is caught here; a bare except would also swallow the SystemExit
# raised for a non-existent file and print both error messages.
try:
    fn = sys.argv[1]
except IndexError:
    print("You must define a valid file to sort. Ex: Zipsort.py MyFile.cvs")
    sys.exit()
print("Filename is: " + fn)
if not os.path.exists(fn):
    print("File does not exist, try again.")
    sys.exit()

# Is there no header?
if "-h" in sys.argv:
    header = 0
    print("File has no header")
else:
    header = 1
    print("File has a header")

# Do they want to change the zip code location?
if "-c" in sys.argv:
    # The column is used as a list index, so it must be an integer.
    try:
        z = int(raw_input("What is the new Zip Code column?: "))
    except ValueError:
        print("The Zip Code column must be a whole number.")
        sys.exit(1)
    print("Zip code is now located at: " + str(z))
else:
    z = 76
    print("Zip code is located at: " + str(z))

# Define the working table we will use to hold the temp file(s) records
# for sorting, and other working variables
tmpTable = []
h = ''  # This will hold the header for later

# Delete the Sorted out file before we start
sorted_name = 'Sorted_' + fn
if os.path.exists(sorted_name):
    YN = raw_input("Sorted file already exists, delete it?: ")
    if YN in ('y', 'Y'):
        os.remove(sorted_name)
        print("Sorted file removed")
    else:
        Q = raw_input( "Quit?, or Continue (Q/C)?")
        if Q in ('q', 'Q'):
            print("Exiting..")
            sys.exit()

# Open input file and split it into (6) temp files for processing.
# If there's a header, we capture it here.
start = time.clock()  # start the timer
temp_names = ['temp%d.tmp' % n for n in range(1, 7)]
# Inclusive upper zip bound of temp files 1-5; anything higher goes to file 6.
upper_bounds = (20000, 35000, 45000, 65000, 85000)

source = open(fn, 'rb')
temp_files = []
try:
    reader = csv.reader(source, quoting=csv.QUOTE_ALL)
    for name in temp_names:
        temp_files.append(open(name, 'w+b'))
    writers = [csv.writer(t, quoting=csv.QUOTE_ALL) for t in temp_files]

    if header == 1:
        h = reader.next()  # store the header

    print("")
    print("Splitting out the input file")
    for row in reader:
        # A row too short to hold the zip column is treated like a bad zip.
        if len(row) > z:
            zip_text = row[z][0:5]
        else:
            zip_text = ''
        # The try covers one record only: a bad zip sends that record to the
        # last temp file and the split carries on with the remaining rows.
        try:
            zip_code = int(zip_text)
        except ValueError:
            writers[-1].writerow(row)
            print("Error in this record, bad zip: " + zip_text)
            continue
        # Find the first range whose upper bound is not exceeded.
        bucket = 0
        while bucket < len(upper_bounds) and zip_code > upper_bounds[bucket]:
            bucket += 1
        writers[bucket].writerow(row)
finally:
    # Close everything so we don't have contention issues later
    source.close()
    for t in temp_files:
        t.close()

# Once the files are separated, we need to sort them
for name in temp_names:
    print("Sorting: " + name)
    sortAndWrite(name, z)
    os.remove(name)

# End of app....
end = time.clock()
print("Finished")
# Two spaces after '=' reproduce the output of the original print statement.
print("Time elapsed =  %s seconds" % (end - start))

if __name__ == '__main__':
    main()