I'm just going to put this out there. I was asked to create a spreadsheet with data extracted from MARC records. I have looked at pymarc and cannot get this to work. To be honest I'd rather do this myself and lean something in the process. I have another similar project that I need to do and I'd prefer not to do it like this again.
The '.mrk' file contains records that looks like this:
=LDR 01123wam 1234349Ia 4500
=001 ocm30134404\
=003 Blah
=005 19950425141028.0
=008 940621s1789\\\\enka\\\jb\\\\\000\1\eng\d
=007 he\amt345baca
=007 hd\bft455008baaa
=040 \\$aSME$cSEP
=037 \\$aCA0012340002$bSome company somewhere
=041 1\$aeng$hfqwerr
=090 \\$aPE44.C35$bR62 1789
=049 \\$aURGA
=100 1\$aSomeauthor, John$d1746-1818.
=245 13$aSome title$hmicrosomething :$bla bla /$ctranslated from some language.
=260 \\$aSomewhere :$bPrinted for Someone, opposite Some House, Somewhere,$cM,DCC,LXXXIX [1789]
=265 \\$aUniversity Something International, 300 Somewhere Road, Somewhere 3428106
=300 \\$aiv, [5]-183, [7] p. :$bill.
=500 \\$aOriginal attributed to Someone else
=500 \\$aIllustrated by Someone.--Cf. NUC pre-1956 imprints.
=500 \\$a"Entered at Someone's Hall."
=500 \\$aIncludes publisher's advertisements: [7] p. at end.
=500 \\$aReproduction of original in the Some Library.
=510 4\$aSomething$cA 221
=533 \\$aMicrofiche.$bSomewhere $cUniversity Something International,$d1991.$e3 microfiche.$f(Some collection).$7s1991 miun b
=650 \0$aSomething$vSomething fiction.
=650 \0$aSurvival (after some accident.)$vSomething fiction.
=700 10$aSomeone, John,$d1660-1725,$eill.
=700 10$aSomeone-else, John,$d1640-1745,$eill.
=830 \0$aSomething collection ;$v002:005.
Records are separated by a blank line.
My script(oh the shame) looks like this:
#!/usr/local/bin/python2.6
import re, os, sys, os.path
from datetime import datetime
from pymarc import MARCReader
import csv
sys.path.insert(0, "/packages/dsol/lib/python")
from lutbuilder import buildLut
def walk(dir):
""" walks a directory, and meth on each file! """
dir = os.path.abspath(dir)
for file in [file for file in os.listdir(dir) if not file in [".",".."]]:
nfile = os.path.join(dir,file)
if os.path.isfile(nfile):
meth(nfile)
else:
print "%s" % (nfile)
walk(nfile)
def meth(nfile):
lutplace_ = buildLut("/dc/pao/pcift0/data/Publishing/lutcountry.txt", delimiter="|")
lutfreq_ = buildLut("/dc/pao/pcift0/data/Publishing/lutfrequency.txt", delimiter="|")
lutreg_ = buildLut("/dc/pao/pcift0/data/Publishing/lutregularity.txt", delimiter="|")
lutform_ = buildLut("/dc/pao/pcift0/data/Publishing/lutformofitem.txt", delimiter=" = ")
csv_out = csv.writer(open('/dc/pao/pcift0/data/Publishing/marc_records.csv', 'w'), delimiter = ',', quotechar = '"', quoting = csv.QUOTE_MINIMAL)
counts = 0
list_ = {}
list500_ = []
list510_ = []
list510a_ = []
list650_ = []
list650a_ = []
list700_ = []
list700a_ = []
list700b_ = []
list700c_ = []
list700d_ = []
list700e_ = []
thelist_ = []
fileExt = nfile[-3:]
if not fileExt in ["MRK", "mrk"]: return
theFile = '%s\n' % nfile
with open(nfile, "r") as thefile_:
for line_ in thefile_:
match_100 = re.compile(r'^=100....(.*?)\n').search(line_)
match_245 = re.compile(r'^=245....(.*?)\n').search(line_)
match_260 = re.compile(r'^=260....(.*?)\n').search(line_)
match_300 = re.compile(r'^=300....(.*?)\n').search(line_)
match_500 = re.compile(r'^=500....(.*?)\n').search(line_)
match_510 = re.compile(r'^=510....(.*?)\n').search(line_)
match_533 = re.compile(r'^=533....(.*?)\n').search(line_)
match_650 = re.compile(r'^=650....(.*?)\n').search(line_)
match_700 = re.compile(r'^=700....(.*?)\n').search(line_)
match_830 = re.compile(r'^=830....(.*?)\n').search(line_)
if match_100 != None:
authormatch = re.compile(r'\$a(.*?)[\$\n]').search(line_)
authtitlematch = re.compile(r'\$c(.*?)[\$\n]').search(line_)
relatormatch = re.compile(r'\$q(.*?)[\$\n]').search(line_)
affiliationmatch = re.compile(r'\$d(.*?)[\$\n]').search(line_)
if authormatch != None:
list_['100a'] = authormatch.group(1)
if authtitlematch != None:
list_['100c'] = authtitlematch.group(1)
if relatormatch != None:
list_['100q'] = relatormatch.group(1)
if affiliationmatch != None:
list_['100d'] = affiliationmatch.group(1)
if match_245 != None:
titlematch = re.compile(r'\$a(.*?)[\$\n]').search(line_)
typematch = re.compile(r'\$h(.*?)[\$\n]').search(line_)
bmatch = re.compile(r'\$b(.*?)[\$\n]').search(line_)
c_match = re.compile(r'\$c(.*?)[\$\n]').search(line_)
if titlematch != None:
list_['245a'] = titlematch.group(1)
if typematch != None:
list_['245h'] = typematch.group(1)
if bmatch != None:
list_['245b'] = bmatch.group(1)
if c_match != None:
list_['245c'] = c_match.group(1)
if match_260 != None:
titlematcha = re.compile(r'\$a(.*?)[\$\n]').search(line_)
titlematchb = re.compile(r'\$b(.*?)[\$\n]').search(line_)
titlematchc = re.compile(r'\$c(.*?)[\$\n]').search(line_)
titlematchd = re.compile(r'\$e(.*?)[\$\n]').search(line_)
titlematche = re.compile(r'\$f(.*?)[\$\n]').search(line_)
if titlematcha != None:
list_['260a'] = titlematcha.group(1)
if titlematchb != None:
list_['260b'] = titlematchb.group(1)
if titlematchc != None:
list_['260c'] = titlematchc.group(1)
if titlematchd != None:
list_['260d'] = titlematchd.group(1)
if titlematche != None:
list_['260e'] = titlematche.group(1)
if match_300 != None:
titlematcha = re.compile(r'\$a(.*?)[\$\n]').search(line_)
titlematchb = re.compile(r'\$b(.*?)[\$\n]').search(line_)
if titlematcha != None:
list_['300a'] = titlematcha.group(1)
if titlematchb != None:
list_['300b'] = titlematchb.group(1)
if match_500 != None:
match500_ = re.compile(r'\$a(.*?)[\$\n]').search(line_)
if match500_ != None:
list500_.append(match500_.group(1))
if match_510 != None:
match510_ = re.compile(r'\$a(.*?)[\$\n]').search(line_)
match510a_ = re.compile(r'\$c(.*?)[\$\n]').search(line_)
if match510_ != None:
list510_.append(match510_.group(1))
if match510a_ != None:
list510a_.append(match510a_.group(1))
if match_533 != None:
match533a_ = re.compile(r'\$a(.*?)[\$\n]').search(line_)
match533b_ = re.compile(r'\$b(.*?)[\$\n]').search(line_)
match533c_ = re.compile(r'\$c(.*?)[\$\n]').search(line_)
match533d_ = re.compile(r'\$d(.*?)[\$\n]').search(line_)
match533e_ = re.compile(r'\$e(.*?)[\$\n]').search(line_)
match533f_ = re.compile(r'\$f(.*?)[\$\n]').search(line_)
match533g_ = re.compile(r'\$7s(.*?)[\$\n]').search(line_)
if match533a_ != None:
list_['533a'] = match533a_.group(1)
if match533b_ != None:
list_['533b'] = match533b_.group(1)
if match533c_ != None:
list_['533c'] = match533c_.group(1)
if match533d_ != None:
list_['533d'] = match533d_.group(1)
if match533e_ != None:
list_['533e'] = match533e_.group(1)
if match533f_ != None:
list_['533f'] = match533f_.group(1)
if match533g_ != None:
g533 = match533g_.group(1)
g533 = g533[8:-3]
if g533 in lutplace_.keys():
g533 = lutplace_[g533]
list_['533g'] = g533
h533 = match533g_.group(1)
h533 = h533[11:-2]
if h533 in lutfreq_.keys():
h533 = lutfreq_[h533]
list_['533h'] = h533
i533 = match533g_.group(1)
i533 = i533[12:]
if i533 in lutreg_.keys():
i533 = lutreg_[i533]
list_['533i'] = i533
j533 = match533g_.group(1)
j533 = j533[13:]
if j533 in lutform_.keys():
j533 = lutform_[j533]
list_['533j'] = j533
if match_650 != None:
match650_ = re.compile(r'\$a(.*?)[\$\n]').search(line_)
match650a_ = re.compile(r'\$v(.*?)[\$\n]').search(line_)
if match650_ != None:
list650_.append(match650_.group(1))
if match650a_ != None:
list650a_.append(match650a_.group(1))
if match_700 != None:
match700_ = re.compile(r'\$a(.*?)[\$\n]').search(line_)
match700a_ = re.compile(r'\$c(.*?)[\$\n]').search(line_)
match700b_ = re.compile(r'\$q(.*?)[\$\n]').search(line_)
match700c_ = re.compile(r'\$d(.*?)[\$\n]').search(line_)
match700d_ = re.compile(r'\$t(.*?)[\$\n]').search(line_)
match700e_ = re.compile(r'\$l(.*?)[\$\n]').search(line_)
if match700_ != None:
list700_.append(match700_.group(1))
if match700a_ != None:
list700a_.append(match700a_.group(1))
if match700b_ != None:
list700b_.append(match700b_.group(1))
if match700c_ != None:
list700c_.append(match700c_.group(1))
if match700d_ != None:
list700d_.append(match700d_.group(1))
if match700e_ != None:
list700e_.append(match700e_.group(1))
no500 = 0
no510 = 0
no510a = 0
no650 = 0
no650a = 0
no700 = 0
no700a = 0
no700b = 0
no700c = 0
no700d = 0
no700e = 0
if match_830 != None:
for line500_ in list500_:
no500 += 1
the500name_ = "500%s" % no500
list_[the500name_] = line500_
for line510_ in list510_:
no510 += 1
the510name_ = "510%s" % no510
list_[the510name_] = line510_
for line510a_ in list510a_:
no510a += 1
the510aname_ = "510a%s" % no510a
list_[the510aname_] = line510a_
for line650_ in list650_:
no650 += 1
the650name_ = "650%s" % no650
list_[the650name_] = line650_
for line650a_ in list650a_:
no650a += 1
the650aname_ = "650a%s" % no650a
list_[the650aname_] = line650a_
for line700_ in list700_:
no700 += 1
the700name_ = "700%s" % no700
list_[the700name_] = line700_
for line700a_ in list700a_:
no700a += 1
the700aname_ = "700a%s" % no700a
list_[the700aname_] = line700a_
for line700b_ in list700b_:
no700b += 1
the700bname_ = "700b%s" % no700b
list_[the700bname_] = line700b_
for line700c_ in list700c_:
no700c += 1
the700cname_ = "700c%s" % no700c
list_[the700cname_] = line700c_
for line700d_ in list700d_:
no700d += 1
the700dname_ = "700d%s" % no700d
list_[the700dname_] = line700d_
for line700e_ in list700e_:
no700e += 1
the700ename_ = "700e%s" % no700e
list_[the700ename_] = line700e_
thelist_.append(list_)
list_ = {}
list500_ = []
list510_ = []
list510a_ = []
list650_ = []
list650a_ = []
list700_ = []
list700a_ = []
list700b_ = []
list700c_ = []
list700d_ = []
list700e_ = []
csv_out.writerow(["Author", "Authortitle", "Authordate", "Title", "Place of publication", "Name of publisher", "Date of publication", "Place of manufacture", "Manufacturer", "Physical Description", "Physical Description other", "Type of reproduction", "Place of reproduction", "Agency responsible for reproduction", "Date of reproduction", "Physical description of reproduction", "Series statement of reproduction", "Co author 1", "Co author 2", "Co author 3", "Co author 4", "General notes 1", "General notes 2", "General notes 3", "General notes 4", "Citation/References Notes 1", "Citation/References Notes 1 - Location within source", "Citation/References Notes 2", "Citation/References Notes 2 - Location within source", "Citation/References Notes 3", "Citation/References Notes 3 - Location within source", "Citation/References Notes 4", "Citation/References Notes 4 - Location within source", "650 field notes 1", "650 field notes 1a", "650 field notes 2", "650 field notes 2a", "650 field notes 3", "650 field notes 3a", "650 field notes 4", "650 field notes 4a"])
for line in thelist_:
if line.has_key('100a'):
Author = line['100a']
else:
Author = ""
if line.has_key('100c'):
Authortitle = line['100c']
else:
Authortitle = ""
if line.has_key('100d'):
Authordate = line['100d']
else:
Authordate = ""
if line.has_key('245a'):
Title = line['245a']
else:
Title = ""
if line.has_key('260a'):
a260 = line['260a']
else:
a260 = ""
if line.has_key('260b'):
b260 = line['260b']
else:
b260 = ""
if line.has_key('260c'):
c260 = line['260c']
else:
c260 = ""
if line.has_key('260d'):
d260 = line['260d']
else:
d260 = ""
if line.has_key('260e'):
e260 = line['260e']
else:
e260 = ""
if line.has_key('300a'):
a300 = line['300a']
else:
a300 = ""
if line.has_key('300b'):
b300 = line['300b']
else:
b300 = ""
if line.has_key('5001'):
the5001 = line['5001']
else:
the5001 = ""
if line.has_key('5002'):
the5002 = line['5002']
else:
the5002 = ""
if line.has_key('5003'):
the5003 = line['5003']
else:
the5003 = ""
if line.has_key('5004'):
the5004 = line['5004']
else:
the5004 = ""
if line.has_key('5101'):
the5101 = line['5101']
else:
the5101 = ""
if line.has_key('5102'):
the5102 = line['5102']
else:
the5102 = ""
if line.has_key('5103'):
the5103 = line['5103']
else:
the5103 = ""
if line.has_key('5104'):
the5104 = line['5104']
else:
the5104 = ""
if line.has_key('510a1'):
the510a1 = line['510a1']
else:
the510a1 = ""
if line.has_key('510a2'):
the510a2 = line['510a2']
else:
the510a2 = ""
if line.has_key('510a3'):
the510a3 = line['510a3']
else:
the510a3 = ""
if line.has_key('510a4'):
the510a4 = line['510a4']
else:
the510a4 = ""
if line.has_key('533a'):
a533 = line['533a']
else:
a533 = ""
if line.has_key('533b'):
b533 = line['533b']
else:
b533 = ""
if line.has_key('533c'):
c533 = line['533c']
else:
c533 = ""
if line.has_key('533d'):
d533 = line['533d']
else:
d533 = ""
if line.has_key('533e'):
e533 = line['533e']
else:
e533 = ""
if line.has_key('533f'):
f533 = line['533f']
else:
f533 = ""
if line.has_key('533g'):
g533 = line['533g']
else:
g533 = ""
if line.has_key('533h'):
h533 = line['533h']
else:
h533 = ""
if line.has_key('533i'):
i533 = line['533i']
else:
i533 = ""
if line.has_key('533j'):
j533 = line['533j']
else:
j533 = ""
if line.has_key('6501'):
the6501 = line['6501']
else:
the6501 = ""
if line.has_key('6502'):
the6502 = line['6502']
else:
the6502 = ""
if line.has_key('6503'):
the6503 = line['6503']
else:
the6503 = ""
if line.has_key('6504'):
the6504 = line['6504']
else:
the6504 = ""
if line.has_key('650a1'):
the650a1 = line['650a1']
else:
the650a1 = ""
if line.has_key('650a2'):
the650a2 = line['650a2']
else:
the650a2 = ""
if line.has_key('650a3'):
the650a3 = line['650a3']
else:
the650a3 = ""
if line.has_key('650a4'):
the650a4 = line['650a4']
else:
the650a4 = ""
if line.has_key('7001'):
the7001 = line['7001']
else:
the7001 = ""
if line.has_key('7002'):
the7002 = line['7002']
else:
the7002 = ""
if line.has_key('7003'):
the7003 = line['7003']
else:
the7003 = ""
if line.has_key('7004'):
the7004 = line['7004']
else:
the7004 = ""
if line.has_key('700a1'):
the700a1 = line['700a1']
else:
the700a1 = ""
if line.has_key('700a2'):
the700a2 = line['700a2']
else:
the700a2 = ""
if line.has_key('700a3'):
the700a3 = line['700a3']
else:
the700a3 = ""
if line.has_key('700a4'):
the700a4 = line['700a4']
else:
the700a4 = ""
if line.has_key('700b1'):
the700b1 = line['700b1']
else:
the700b1 = ""
if line.has_key('700b2'):
the700b2 = line['700b2']
else:
the700b2 = ""
if line.has_key('700b3'):
the700b3 = line['700b3']
else:
the700b3 = ""
if line.has_key('700b4'):
the700b4 = line['700b4']
else:
the700b4 = ""
if line.has_key('700c1'):
the700c1 = line['700c1']
else:
the700c1 = ""
if line.has_key('700c2'):
the700c2 = line['700c2']
else:
the700c2 = ""
if line.has_key('700c3'):
the700c3 = line['700c3']
else:
the700c3 = ""
if line.has_key('700c4'):
the700c4 = line['700c4']
else:
the700c4 = ""
if line.has_key('700d1'):
the700d1 = line['700d1']
else:
the700d1 = ""
if line.has_key('700d2'):
the700d2 = line['700d2']
else:
the700d2 = ""
if line.has_key('700d3'):
the700d3 = line['700d3']
else:
the700d3 = ""
if line.has_key('700d4'):
the700d4 = line['700d4']
else:
the700d4 = ""
if line.has_key('700e1'):
the700e1 = line['700e1']
else:
the700e1 = ""
if line.has_key('700e2'):
the700e2 = line['700e2']
else:
the700e2 = ""
if line.has_key('700e3'):
the700e3 = line['700e3']
else:
the700e3 = ""
if line.has_key('700e4'):
the700e4 = line['700e4']
else:
the700e4 = ""
csv_out.writerow([Author, Authortitle, Authordate, Title, a260, b260, c260, d260, e260, a300, b300, a533, b533, c533, d533, e533, f533, the7001, the7002, the7003, the7004, the5001, the5002, the5003, the5004, the5101, the510a1, the5102, the510a2, the5103, the510a3, the5104, the510a4, the6501, the650a1, the6502, the650a2, the6503, the650a3, the6504, the650a4])
print counts
if __name__ == "__main__":
path_ = "/dc/pao/pcift0/data/Publishing/in"
startTime = datetime.now()
walk(path_)
print(datetime.now()-startTime)
This script worked fine. It reads everything into one big list. Some of the fields(650, 700) that can appear more than once gets read into more lists and then at the end the list gets sorted(manually) and then gets written into a csv file.
I will be re-thinking this in the next few days(starting now) but I feel a bit out of my depth. If anyone has any suggestions on how to make this a bit more presentable please, please let me know.