Hello,
I'm doing some simple text processing in Python, which includes indexing, splitting, and tokenizing text from the files in a folder. I want to improve the stemming and tagging steps using MontyLingua, by importing the Monty library for tokenizing and tagging, but I don't know how to edit my code to call them. Does anybody know how to use and call theMontyTagger and theMontyTokenizer? Please provide a code example.
This is what my existing code looks like in the IDLE (Python) GUI:
import re
import os
import sqlobject
import re, math
import sgmllib, string
from numpy import*
import adodb
from nltk_lite.corpora import stopwords
from nltk_lite import stem
import MySQLdb
# Shared text-processing resources: the Porter stemmer and the English
# stopword list, both provided by nltk_lite.
stemmer = stem.Porter()
stopwords_list = list(stopwords.raw('english'))
class SearchEngine:
    """Tokenize every text file in a folder and index its terms in MySQL.

    For each file, textProcessing() writes word statistics and one line per
    indexed term to 'output_kira.txt', writes the tokens (with a blank line
    after each token ending in '.') to 'ayat.txt', and inserts one row per
    indexed term into table 'tabletest1' of the local 'test1' database.
    """

    # Folder scanned when textProcessing() is called without an argument.
    DEFAULT_FOLDER = ("C:/Users/abc1234/Documents/Google Talk Received Files"
                      "/installer/installer/ayat")

    # Double quotes are stripped from tokens before counting.  The original
    # pattern was 'r[""]' (raw prefix inside the quotes), which matched the
    # two-character sequence r" instead of the quote character.
    _PUNCTUATION = re.compile(r'["]')

    # Cleaned tokens that are never indexed.
    # NOTE: the last entry is a non-ASCII literal; under Python 2 the file
    # needs a source-encoding declaration (PEP 263) for it to compile.
    SKIPPED_TERMS = ('', '-', '<', 'Dr.', ':', '&', '—')

    # Placeholders are filled in by the driver, so terms containing quotes
    # can neither break nor inject into the statement.
    _INSERT_SQL = ("INSERT INTO tabletest1(term,id,idx,frq,filename) "
                   "VALUES(%s,%s,%s,%s,%s)")

    def __init__(self,filePath,files,filename,words,word,freq,wordst,wordss,no_id):
        """Store the given values as attributes of the same names.

        textProcessing() overwrites filename, words, word, freq, wordst and
        no_id while it runs; filePath, files and wordss are only stored.
        """
        self.filePath=filePath
        self.files=files
        self.filename=filename
        self.words=words
        self.word=word
        self.freq=freq
        self.wordst=wordst
        self.wordss=wordss
        self.no_id=no_id

    def textProcessing(self, folder=None):
        """Process every regular file in *folder* and index its terms.

        folder -- directory to scan; defaults to DEFAULT_FOLDER.

        Both report files are (re)created in the current working directory
        and are closed even when processing fails part-way through.
        """
        if folder is None:
            folder = self.DEFAULT_FOLDER
        self.no_id = 0
        outfile = open('output_kira.txt', 'w')
        try:
            outfile3 = open('ayat.txt', 'w')
            try:
                for name in os.listdir(folder):
                    fullpath = os.path.join(folder, name)
                    # Sub-directories cannot be read as text; skip them
                    # instead of failing in open().
                    if not os.path.isfile(fullpath):
                        continue
                    self.filename = name
                    self.filenames = name
                    self.no_id = self.no_id + 1
                    self._process_file(fullpath, outfile, outfile3)
            finally:
                outfile3.close()
        finally:
            outfile.close()

    def _process_file(self, fullpath, outfile, outfile3):
        """Tokenize one file, write its report sections and index its terms."""
        print(self.filename)
        outfile.write("\n%s\n" % (self.filename))
        outfile3.write("\n%s\n" % (self.filename))

        infile = open(fullpath, 'r')
        try:
            content = infile.read()
        finally:
            infile.close()

        # Tokenizer: plain whitespace split.
        self.words = content.split()
        print('Words in text: %d' % len(self.words))
        outfile.write("Words in text:%d\n" % (len(self.words)))

        freq_dic, first_index = self._count_terms(self.words)
        print("Unique words:%d\n" % (len(freq_dic)))
        outfile.write("Unique words:%d\n" % (len(freq_dic)))

        # NOTE(review): tokens are written without a separator, exactly as
        # the original did -- confirm whether a space was intended.
        for token in self.words:
            outfile3.write("%s" % (token))
            if token.endswith('.') and token != 'Dr.':
                outfile3.write("\n\n")

        if freq_dic:
            self._index_terms(sorted(freq_dic.items()), first_index, outfile)

    def _count_terms(self, words):
        """Return (frequency, first position) dicts keyed by cleaned token."""
        freq_dic = {}
        first_index = {}
        for position, raw in enumerate(words):
            term = self._PUNCTUATION.sub("", raw)
            freq_dic[term] = freq_dic.get(term, 0) + 1
            # Remember where the term first occurs.  Looking the cleaned term
            # up in the raw token list later would raise ValueError whenever
            # cleaning changed the token, and costs O(n) per lookup.
            first_index.setdefault(term, position)
        return freq_dic, first_index

    def _index_terms(self, freq_list, first_index, outfile):
        """Report each indexable term of the current file and store it."""
        # The second token of a file is reported as its URL and is not
        # indexed; files with fewer than two tokens simply have none.
        url = ''
        if len(self.words) > 1:
            url = self.words[1]
        stopword_set = set(stopwords_list)

        rows = []
        for term, freq in freq_list:
            # These attributes mirror the loop state, as the original loop
            # did, for callers that inspect them afterwards.
            self.word, self.freq = term, freq
            if term in stopword_set or term == url:
                continue
            self.wordst = term
            if term in self.SKIPPED_TERMS:
                continue
            print(term)
            position = first_index[term]
            outfile.write("WORD=%s INDEX=%d FREKUENSI=%d URL=%s ID=%d FILE NAME=%s\n"%(term,position,freq,url,self.no_id,self.filename))
            # Column order is (term, id, idx, frq, filename).  The original
            # statement passed the frequency as id and the document number as
            # frq, contradicting the labels of the report line above.
            rows.append((term, self.no_id, position, freq, self.filename))

        if rows:
            self._store_rows(rows)

    def _store_rows(self, rows):
        """Insert *rows* into tabletest1 using one connection and one commit."""
        connection = MySQLdb.connect(host="localhost",
                                     user="root",
                                     passwd="",
                                     db="test1")
        try:
            cursor = connection.cursor()
            try:
                cursor.executemany(self._INSERT_SQL, rows)
            finally:
                cursor.close()
            connection.commit()
        finally:
            connection.close()
# Build an engine with empty placeholder attribute values and run the indexer.
i = SearchEngine(filePath='', files='', filename='', words='', word='',
                 freq='', wordst='', wordss='', no_id='')
i.textProcessing()