I have a large GFF file (15.6 GB).
I want to retrieve the features from each record,
and for each SubjectOrganism value in the feature qualifiers, compare it against a sorted list of species and get the index of that organism in the sorted list.
The code works on a small subset of the GFF file, but on the full file it fails with a MemoryError.
from BCBio import GFF

in_file = "All_Pred_Profils.gff"  # huge database, 15.6 GB
fo = open("foo.txt", "a")  # output file

# Build the species -> line-number lookup once, instead of
# re-reading speciess.gff for every single feature.
species_index = {}
with open("speciess.gff") as load_profile:
    for i, line in enumerate(load_profile, start=1):
        # setdefault keeps the first occurrence, like the original break
        species_index.setdefault(line.rstrip("\n"), i)

keyss = ("clst_id", "SubjectScore", "SubjectOrganism")

in_handle = open(in_file)
for rec in GFF.parse(in_handle):
    for record in rec.features:
        elem_metadatas = []
        for key in sorted(record.qualifiers):
            if key in keyss:
                if key == "SubjectOrganism":
                    # 1-based positions of each organism in the sorted species list
                    myLine = [species_index[val]
                              for val in record.qualifiers[key]
                              if val in species_index]
                    elem_metadatas.append(str(key) + "=" + str(myLine))
                else:
                    elem_metadatas.append(str(key) + "=" + str(record.qualifiers[key]))
        print(rec.id)
        print(elem_metadatas)
        fo.write(rec.id + "\n")
        fo.writelines(elem_metadatas)
        fo.write("\n \n \n \n \n")
        print("\n \n")
in_handle.close()
fo.close()
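
In case it's relevant: the bcbio-gff docs describe a target_lines argument to GFF.parse that assembles records from chunks of roughly that many lines, instead of holding everything in memory at once. A minimal sketch of what I mean (the 1000 chunk size is an arbitrary guess, not something I have tuned):

from BCBio import GFF

with open("All_Pred_Profils.gff") as in_handle:
    # Yield records built from chunks of about 1000 lines each,
    # rather than parsing the entire 15.6 GB file in one pass.
    for rec in GFF.parse(in_handle, target_lines=1000):
        for feature in rec.features:
            print(rec.id, feature.type)

Would switching to this chunked parsing be the right way to avoid the MemoryError, or does the problem come from somewhere else in my loop?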