def index_dir(self, base_path):
num_files_indexed = 0
allfiles = os.listdir(base_path)
#print allfiles
num_files_indexed = len(allfiles)
#print num_files_indexed
docnumber = 0
self._inverted_index = {} #dictionary
for file in allfiles:
self.documents = [base_path+file] #list of all text files
f = open(base_path+file, 'r')
lines = f.read()
# Tokenize the file into words
tokens = self.tokenize(lines)
docnumber = docnumber + 1
print 'docnumber', docnumber
for term in tokens:
# check if the key already exists in the dictionary, if yes,
# just add a new value for the key
#if self._inverted_index.has_key(term)
if term in sorted(self._inverted_index.keys()):
docnumlist = self._inverted_index.get(term)
docnumlist = docnumlist.append(docnumber)
else:
# if the key doesn't exist in dictionary, add the key (term)
# and associate the docnumber value with it.
self._inverted_index = self._inverted_index.update({term: docnumber})
#self._inverted_index[term] = docnumber
f.close()
print 'dictionary', self._inverted_index
print 'keys', self._inverted_index.keys()
return num_files_indexed
I'm working on an information retrieval project where we are supposed to crawl through multiple text file, tokenize the files and store the words in an inverted list (dictionary) data structure.
ex:
doc1.txt: "the dog ran"
doc2.txt: "the cat slept"
_inverted_index = {
'the': [0,1],
'dog': [0],
'ran': [0],
'cat': [1],
'slept': [1]
}
where 0,1 are docIDs
I'm getting the following error:
'Nontype' object has no attribute keys. line#95
All help is highly appreciated.