Hi!
I have used the os.walk function to traverse the many files in the directory. Now the problem is that it is listing same words from different files as different keys. How should I go about it?

Some code???


Cheers and Happy coding

It is messy.Real messy! Thats why I didn't put it in the first place.It is my first program in python so please bear with me :)

import os,sys,operator
#import autoviv
#counter = 0
def getlocaldata(sms,dr,flst):
 #sms[0] += 1
 for f in flst:
  fullf = os.path.join(dr,f)
  #print fullf
  if os.path.islink(fullf): continue
  if os.path.isfile(fullf):
   os.path.split(fullf) 
   mata = os.path.dirname(fullf)
   #print mata
   os.path.split(fullf)
   filename = os.path.basename(fullf)
   #print filename
   cata = (filename[4] + filename[5])
   docindex = int(float(cata)) #to index which doc does the current word 
   
   data = open(fullf,'r')
   data_dict = {}
   #dict_dict = {}
   perm_list = {}
   #doc_freq = {}
   for line in data: #Count the number of words in the given file
    for word in line.split():
     perm_list[word] = perm_list.get(word,0) + 1
   perm_list.items()
   
   def remove_dup(perm_list): #remove duplicate entries from the given file
    x = {}
    for y in perm_list:
     x[y]=1
    u = x.keys()
    perm_list =  u
   #per_list = remove_dup(perm_list)
   #print per_list
   number_in_file = 0
   dict_list = []
   data.seek(0)
   doc_freq = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
   for line in data:
    for word in line.split():
    
     if (word != ('the'and'on'and'and'and'are'and'years'and'the')):
      doc_list = {}
      doc_freq[docindex] = 1
      data_dict[word] = doc_list,doc_freq # mapping wordlist : doclist
      doc_list[filename] = perm_list[word] # mapping doclist :key(filename) :: value(term frequency)
      dict_list.append(data_dict)
      data_dict = {}
   
    
   #dict_list[0] = 3
   #print(dict_list)    
   if dict_list:
    dict_list.sort()
    last = dict_list[-1]
    for i in range(len(dict_list)-2 , -1, -1):
     if last == dict_list[i]: del dict_list[i]
     else: last =  dict_list[i]

   sms.append(dict_list)
 
def trim(tresult):# sort the wrds accross all files
 final = {}
 for i in tresult:
  for j in i:
   final.update(j)
 #print final
 for key in sorted(final):
  print "%s : %s" % (key,final[key])
 #print fina
 #print fina['was']
 
#for i in tresult[0][0].keys():
   #print tresult[0][0][i]
 #print tresult[0]

def dtstat (dtroot):
 temp = []
 #sums = [0,0,0]
 os.path.walk(dtroot,getlocaldata,temp)
 return temp

def main():
 
 try:
  root = sys.argv[1]
 except:
  root ='.'
 result = dtstat(root)
 result1 =  trim(result)
  
if __name__ == '__main__':
 main()

For starters, break your code into smaller pieces and test each piece before moving on. As you have found out, debugging 96 lines of code is not easy. This line should be tested first as it does not do what you think.

##--- test this with a known value (word='the')
if (word != ('the'and'on'and'and'and'are'and'years'and'the')):
##
##--- I think you want something like (but can't tell from the code)
if (word not in ['the', 'on', 'and', 'are', 'years']):

Thanks! woooee ! Really appreciate that. I have another query . I have used the word as the key. So when the program walks through the other files and finds the same word it replaces the previous word entry(since the whole thing is imlemented as a dictionary) . Now I want to keep all the entries for a specific word (since I need to know which docs did the word occur in). Is there any way around this ? without changing the underlying implementation .

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.