Stemming and stop word removal

Question

sandhya1202 0 Newbie Poster

14 Years Ago

Actually, I would like to do stemming, stop word removal, word counting and punctuation mark removal on my text data. I found code for this on DaniWeb:

import string

def RemovePunc():
    """Read lines from standard input and strip their punctuation.

    Lines are read with ``raw_input`` until a line consisting of a single
    "." is entered; that terminator line is not part of the result.  Every
    character found in ``string.punctuation`` is replaced by a space and the
    line is lower-cased.

    Returns:
        A list with one cleaned, lower-cased string per line entered.
    """
    line = []
    while True:
        text_input = raw_input("")
        if text_input == ".":
            break
        # Build the cleaned line in a single pass instead of growing a
        # string one character at a time.
        cleaned = "".join(
            " " if char in string.punctuation else char
            for char in text_input
        )
        line.append(cleaned.lower())
    return line

def RemoveStopWords(line):
    """Drop stop words from every sentence in *line*.

    Args:
        line: An iterable of strings; each string is split on whitespace.

    Returns:
        A list with one entry per input string.  Each entry is the list of
        that string's words that are not stop words, in their original
        order.  The comparison is case-sensitive and the stop words are
        lower-case.
    """
    stop_words = frozenset((
        "a", "i", "it", "am", "at", "on", "in", "of", "to", "is", "so",
        "too", "my", "the", "and", "but", "are", "very", "here", "even",
        "from", "them", "then", "than", "this", "that", "though",
    ))
    # str.split() replaces the deprecated string.split() module function,
    # which does not exist in Python 3.
    return [
        [word for word in sent.split() if word not in stop_words]
        for sent in line
    ]


def StemWords(line_stop_words):
    """Strip common English suffixes from every word, in place.

    The suffixes "s", "es", "ed", "er", "ly" and "ing" are tried in that
    order over the whole input, so a word may lose more than one suffix.

    Args:
        line_stop_words: A list of word lists; the inner lists are modified.

    Returns:
        The same ``line_stop_words`` object that was passed in.
    """
    for suffix in ("s", "es", "ed", "er", "ly", "ing"):
        cut = len(suffix)
        for words in line_stop_words:
            for pos, word in enumerate(words):
                if word.endswith(suffix):
                    words[pos] = word[:-cut]
    return line_stop_words

def indexDupe(lineCount,occur):
    """Return True when the text form of *lineCount* is contained in *occur*."""
    return str(lineCount) in occur

def Indexing(line_stop_words):
    """Build a word index listing the lines on which each word occurs.

    Args:
        line_stop_words: A sequence of word lists, one list per input line.

    Returns:
        A list of ``[word, line_numbers]`` pairs in order of first
        appearance.  ``line_numbers`` is a list of 1-based line numbers
        stored as strings, each line listed once per word.
    """
    index = []
    # Maps a word to the line-number list stored in its ``index`` entry, so
    # a word is found without scanning the whole index.
    lines_by_word = {}
    for line_number, words in enumerate(line_stop_words, 1):
        label = str(line_number)
        for word in words:
            occurrences = lines_by_word.get(word)
            if occurrences is None:
                occurrences = [label]
                lines_by_word[word] = occurrences
                index.append([word, occurrences])
            elif occurrences[-1] != label:
                # Lines are visited in order, so a repeat on the current
                # line can only be the last entry.  append() is required
                # here: ``list += "10"`` would add "1" and "0" separately.
                occurrences.append(label)
    return index


def OutputIndex(index):
    """Print the word index.

    Prints the header "Index:" and then, for every entry of *index*, the
    entry's first element followed by each item of its second element,
    with a "," printed between items and the line ended after the last one.

    Args:
        index: A sequence of ``[word, occurrences]`` pairs.
    """
    
    print "Index:"
    count = 0
    indexLength = len(index)
    while count < indexLength:
        # The trailing comma keeps the following output on the same line.
        print index[count][0],
        count2 = 0
        lineOccur = len(index[count][1])
        # An entry with no occurrences skips this loop, so no line break is
        # printed for it.
        while count2 < lineOccur:
            print index[count][1][count2],
            if count2 == lineOccur -1:
                # Last occurrence: finish the line.
                print ""
                break
            else:
                print ",",
            count2 += 1
            
        count += 1

# Read the text typed at the console; a line containing only "." ends input.
line = RemovePunc()   
# Remove stop words, then strip suffixes from the remaining words.
line_stop_words = RemoveStopWords(line)
line_stop_words = StemWords(line_stop_words)    
# Build the word -> line-number index and print it.
index = Indexing(line_stop_words)
OutputIndex(index)

But I want to take the input from a text file, and I am not able to run this program as it is. Could you please help me with this?

python

4 Contributors
8 Replies
639 Views
4 Days Discussion Span
Latest Post 14 Years Ago Latest Post by Beat_Slayer

woooee 814 Nearly a Posting Maven

14 Years Ago

If I understand correctly, you would modify the RemovePunc() function to receive an individual record from the file, and then execute the rest of the code as is

# file_record is one record read from the input file (it is not defined in
# this snippet); RemovePunc() must first be changed to take it as a parameter.
line = RemovePunc(file_record)
line_stop_words = RemoveStopWords(line)
line_stop_words = StemWords(line_stop_words)
index = Indexing(line_stop_words)
OutputIndex(index)

See here http://www.greenteapress.com/thinkpython/html/book010.html#wordlist

Edited 14 Years Ago by woooee because: n/a

TrustyTony 888 ex-Moderator

14 Years Ago

This code is very unpythonic. Could you explain the form of the output a little? Maybe this should produce a concordance of the occurrences of words in the file?

My understanding from the lines:

# The driver statements from the question: each step consumes the result of
# the previous one.
line = RemovePunc()   
line_stop_words = RemoveStopWords(line)
line_stop_words = StemWords(line_stop_words)    
index = Indexing(line_stop_words)
OutputIndex(index)

is that total effect for each line is like this:

def process_line(line):
    """Clean, filter, stem and index *line*, then print the resulting index."""
    without_punctuation = RemovePunc(line)
    without_stop_words = RemoveStopWords(without_punctuation)
    stemmed = StemWords(without_stop_words)
    OutputIndex(Indexing(stemmed))

To me it looks like it only counts words, plus the stop-word and stemming steps. Counting the words can be done simply:

import string
from collections import Counter

# Read the whole file; the with-block closes the handle even on error.
with open('test.txt') as infile:
    inputstring = infile.read()

# Lower-case the text, treat "--" as a word separator and trim punctuation
# and digits from both ends of every whitespace-separated token.  Tokens
# made only of punctuation/digits strip down to '' and are skipped so that
# the empty string is not counted as a word.
counts = Counter(
    word
    for word in (
        token.strip(string.punctuation + string.digits)
        for token in inputstring.lower().replace('--', ' ').split()
    )
    if word
)

print('The counts of %i different words are: %s' % (len(counts), sorted(counts.items())))
# most_common(10) yields the ten (word, count) pairs with the highest counts.
print('Most popular words were: %s' % (counts.most_common(10),))

Edited 14 Years Ago by TrustyTony because: n/a

Reply to this topic

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.

Beat_Slayer 17 Posting Pro in Training · Answer 1 · 2010-08-09T03:25:42+00:00

Maybe like this?

import string


def RemovePunc(text_input):
    """Replace punctuation in *text_input* with spaces and lower-case it.

    Args:
        text_input: The text to clean.

    Returns:
        A one-element list holding the cleaned text.  Every character found
        in ``string.punctuation`` becomes a single space, so the length of
        the text is unchanged.
    """
    # One pass with join() instead of growing a string per character.
    cleaned = "".join(
        " " if char in string.punctuation else char
        for char in text_input
    )
    return [cleaned.lower()]

def RemoveStopWords(line):
    """Filter the stop words out of each string in *line*.

    Args:
        line: An iterable of strings, each split on whitespace.

    Returns:
        A list of word lists, one per input string, holding the words that
        are not stop words in their original order.  Matching is
        case-sensitive against a lower-case stop-word list.
    """
    stop_words = frozenset((
        "a", "i", "it", "am", "at", "on", "in", "of", "to", "is", "so",
        "too", "my", "the", "and", "but", "are", "very", "here", "even",
        "from", "them", "then", "than", "this", "that", "though",
    ))
    line_stop_words = []
    for sent in line:
        # sent.split() is the method form of the deprecated string.split(),
        # which was removed in Python 3.
        kept = [word for word in sent.split() if word not in stop_words]
        line_stop_words.append(kept)
    return line_stop_words


def StemWords(line_stop_words):
    """Remove the suffixes s, es, ed, er, ly and ing from the words in place.

    Each suffix is applied to the whole input before the next one is tried,
    so one word can be shortened several times.

    Args:
        line_stop_words: A list of word lists; the inner lists are mutated.

    Returns:
        The ``line_stop_words`` object itself.
    """
    leaf_words = ("s", "es", "ed", "er", "ly", "ing")
    for leaf in leaf_words:
        trim = len(leaf)
        for row in line_stop_words:
            for position in range(len(row)):
                if row[position].endswith(leaf):
                    row[position] = row[position][:-trim]
    return line_stop_words

def indexDupe(lineCount,occur):
    """Tell whether ``str(lineCount)`` is already present in *occur*."""
    already_listed = str(lineCount) in occur
    return already_listed

def Indexing(line_stop_words):
    """Index every word by the lines it appears on.

    Args:
        line_stop_words: A sequence of word lists, one list per input line.

    Returns:
        A list of ``[word, line_numbers]`` pairs ordered by first
        appearance; ``line_numbers`` holds 1-based line numbers as strings,
        with each line recorded once per word.
    """
    index = []
    # word -> the line-number list inside that word's ``index`` entry; this
    # avoids rescanning ``index`` for every word.
    seen = {}
    for line_number, words in enumerate(line_stop_words, 1):
        label = str(line_number)
        for word in words:
            if word not in seen:
                seen[word] = [label]
                index.append([word, seen[word]])
            elif seen[word][-1] != label:
                # Lines are processed in order, so the current line can only
                # be the last entry.  Use append(): ``+=`` with a string
                # would split a number such as "10" into "1" and "0".
                seen[word].append(label)
    return index


def OutputIndex(index):
    """Print the header "Index:" followed by one line per index entry.

    Each line shows the entry's first element and then the items of its
    second element, separated by "," and terminated after the last item.

    Args:
        index: A sequence of ``[word, occurrences]`` pairs.
    """
    
    print "Index:"
    count = 0
    indexLength = len(index)
    while count < indexLength:
        # Trailing comma: stay on the same output line for the occurrences.
        print index[count][0],
        count2 = 0
        lineOccur = len(index[count][1])
        # With an empty occurrence list this loop is skipped and the line
        # is not terminated.
        while count2 < lineOccur:
            print index[count][1][count2],
            if count2 == lineOccur -1:
                # Last occurrence: end the line.
                print ""
                break
            else:
                print ",",
            count2 += 1
            
        count += 1

# Read the whole file; the with-block closes the handle instead of leaking it.
with open('readme.txt') as source:
    f_in = source.read()

# Every '.'-delimited fragment is echoed and then indexed on its own, so
# each fragment produces a separate "Index:" listing.
for item in f_in.split('.'):
    print(item)
    line = RemovePunc(item)
    line_stop_words = RemoveStopWords(line)
    line_stop_words = StemWords(line_stop_words)
    index = Indexing(line_stop_words)
    OutputIndex(index)

Cheers and Happy Coding

TrustyTony 888 ex-Moderator Team Colleague Featured Poster · Answer 2 · 2010-08-10T00:04:17+00:00

Beat_Slayer's code applied to a long file gives strange results (for example, "however" becomes "howev"): a dict that always has one as its value (why not a set instead?), with each line or paragraph producing its own output. What is really your goal? Can you give an example input and the desired output? Where does that input/output fit in your use case?

TrustyTony 888 ex-Moderator Team Colleague Featured Poster · Answer 3 · 2010-08-10T00:35:31+00:00

Maybe you would find this interesting:

description: TextSTAT is a simple programme for the analysis of texts. It reads ASCII/ANSI texts and HTML files (directly from the internet) and it produces word frequency lists and concordances from these files. The programme runs on MS Windows and is distributed as freeware. Source code in Python is also available for free. User interface in German (default), English, and French.

TextSTAT 2.8g Sourcecode

The user interface is also available in Finnish, so this description must be a little out of date.

sandhya1202 0 Newbie Poster · Answer 4 · 2010-08-10T17:24:17+00:00

Actually, I would like to give a text file as input, take its contents, remove stop words and punctuation, apply stemming, and save the filtered output in a file, say "Output.txt". A word is then read from the "Output.txt" file and compared to the words in the Excel sheet. If it is present in the Excel sheet, the corresponding value is checked: if the cell is empty, the value is considered to be zero and the cell value is updated to 1; otherwise, if the cell has some value, it is incremented. If the word is not present in the list, we have to add it to the Excel sheet and set the corresponding cell value to 1. We follow the same procedure until all words in the Output.txt file have been checked, and the count values are maintained in the Excel sheet for each file. I have many text files and have to do the same for all of them, maintaining a spreadsheet for each file, so that I can access these values when applying the KNN algorithm for classification.

Part of doc sheet containing word occurrences of the input document

    Doc 1
Word 1  1
Word 2  3
Word 3  
Word 4  4
Word 5  
Word 6  1
Word 7  2
Word 8  
Word 9

TrustyTony 888 ex-Moderator Team Colleague Featured Poster · Answer 5 · 2010-08-11T02:49:12+00:00

Something like this?:
http://blog.josephwilk.net/projects/building-a-vector-space-search-engine-in-python.html

Beat_Slayer 17 Posting Pro in Training · Answer 6 · 2010-08-13T04:16:30+00:00

Texts K-Nearest Neighbor (KNN) using the Euclidean algorithm

Cheers and Happy coding.

import cPickle
import re
from math import sqrt

class Words_Works():
    """Word-count based comparison of texts against named categories.

    Files are reduced to ``{stem: count}`` dictionaries by lower-casing,
    tokenising, removing stop words and stripping common suffixes.  Each
    text is then scored against each category with a Euclidean distance
    taken over the words the two have in common.
    """

    def __init__(self):
        # file name -> {stem: count} for every text added with add_text()
        self.all_texts = {}
        # category name -> {stem: count}
        self.categories = {}
        # file name -> category name ->
        #     {'KNN distance': float, 'KNN iterations': int}
        self.knn_results = {}
        # Suffixes stripped by unleafify(), applied in this order.
        self.leaf_words = ['s', 'es', 'ed', 'er', 'ly', 'ing']
        self.stop_words = ['a', 'i', 'it', 'am', 'at', 'on', 'in', 'of', 'to', 'is', 'so', 'too', 'my', 'the', 'and', 'but', 'are', 'very', 'here', 'even', 'from', 'them', 'then', 'than', 'this', 'that', 'though']

    def load_categories(self):
        """Load ``self.categories`` from 'categories.pkl'.

        On any failure a message is printed and ``self.categories`` is left
        unchanged.
        """
        # NOTE: unpickling can execute arbitrary code; only load a
        # categories.pkl that was written by save_categories().
        try:
            with open('categories.pkl', 'rb') as cat_db:
                self.categories = cPickle.load(cat_db)
        except Exception:
            # Best effort by design: a missing or corrupt file must not
            # abort the program, but KeyboardInterrupt/SystemExit still
            # propagate (the original bare ``except`` swallowed them too).
            print('Load of categories file failed')

    def _count_words(self, f):
        """Read file *f* and return a ``{stem: count}`` dict of its words."""
        with open(f) as f_in:
            self.text = f_in.read().lower()
        self.wordify()
        self.unstopify()
        self.unleafify()
        counts = {}
        for item in self.unleaf:
            counts[item] = counts.get(item, 0) + 1
        return counts

    def add_category(self, f, cat_name):
        """Store the word counts of file *f* as category *cat_name*."""
        self.categories[cat_name] = self._count_words(f)

    def save_categories(self):
        """Pickle ``self.categories`` to 'categories.pkl'."""
        with open('categories.pkl', 'wb') as cat_db:
            # -1 selects the highest pickle protocol available.
            cPickle.dump(self.categories, cat_db, -1)

    def add_text(self, f):
        """Store the word counts of file *f* in ``self.all_texts[f]``."""
        # The former call to self.indexify() was removed: the class defines
        # no such method, so add_text() always raised AttributeError.
        self.all_texts[f] = self._count_words(f)

    def wordify(self):
        """Split ``self.text`` into ``self.words`` (runs of word characters)."""
        words_pat = re.compile(r'\w+')
        self.words = words_pat.findall(self.text)

    def unstopify(self):
        """Set ``self.unstop`` to ``self.words`` without the stop words."""
        stop_words = set(self.stop_words)
        self.unstop = [item for item in self.words if item not in stop_words]

    def unleafify(self):
        """Set ``self.unleaf`` to ``self.unstop`` with suffixes stripped.

        Every suffix in ``self.leaf_words`` is applied to all words before
        the next suffix is tried, so a word can lose several suffixes.
        """
        stems = self.unstop[:]
        for leaf in self.leaf_words:
            leaf_len = len(leaf)
            stems = [
                word[:-leaf_len] if word.endswith(leaf) else word
                for word in stems
            ]
        self.unleaf = stems

    def knn_calc(self):
        """Fill ``self.knn_results`` with a score for every text/category pair.

        'KNN distance' is the square root of the summed squared count
        differences and 'KNN iterations' is the number of words compared.
        """
        for text, text_counts in self.all_texts.items():
            results = self.knn_results[text] = {}
            for category, category_counts in self.categories.items():
                # NOTE(review): only words present in BOTH the text and the
                # category contribute, so a pair with no common words gets
                # distance 0 -- confirm this is the intended metric.
                shared = [word for word in text_counts
                          if word in category_counts]
                squared = sum(
                    (text_counts[word] - category_counts[word]) ** 2
                    for word in shared
                )
                results[category] = {
                    'KNN distance': sqrt(squared),
                    'KNN iterations': len(shared),
                }

    def knn(self):
        """Print, for every text, the category with the smallest distance.

        Reads ``self.knn_results``, so knn_calc() must have been run first.
        On equal distances the category encountered first wins; with no
        categories the printed values are ``None``.
        """
        for text in self.all_texts:
            knn = distance = iterations = None
            for category in self.categories:
                scores = self.knn_results[text][category]
                # ``is None`` rather than truthiness: a best distance of
                # 0.0 is a valid result and must not be overwritten.
                if knn is None or scores['KNN distance'] < distance:
                    knn = category
                    distance = scores['KNN distance']
                    iterations = scores['KNN iterations']
            print('File: %s' % (text,))
            print('KNN: %s' % (knn,))
            print('Distance: %s' % (distance,))
            print('Iterations: %s' % (iterations,))
            print('')


mywork = Words_Works()

# Build one word-count profile per category from its reference file.
lit = 'literature.txt'

mywork.add_category(lit, 'Literature')

comp = 'computers.txt'

mywork.add_category(comp, 'Computers')

phy = 'physics.txt'

mywork.add_category(phy, 'Physics')

# Persist the category profiles to categories.pkl.
mywork.save_categories()

print mywork.categories
print

# Files to compare against the categories.
txts = ('sample1.txt', 'sample2.txt')

for text in txts:
    mywork.add_text(text)

print mywork.all_texts
print

# Score every text against every category; knn() reads these results.
mywork.knn_calc()

print mywork.knn_results
print

mywork.knn()