Character counting...

Question

Ghostenshell 0 Light Poster

15 Years Ago

Can anyone simplify this for me?
I don't know if it is going to work; I tried to adjust the code to fit on the page. You might have to delete the extra +\ line continuations.

def main(filename='text.txt', pause=True):
    """Report the character statistics of a text file.

    Reads *filename* and prints how many ASCII uppercase letters (A-Z),
    ASCII lowercase letters (a-z), digits (0-9) and plain space
    characters (' ') it contains.

    filename -- path of the file to analyse (defaults to 'text.txt')
    pause    -- when true, wait for the user to hit enter before returning

    Returns None.  If the file cannot be read, 'An error occured.' is
    printed and nothing else is reported.
    """
    try:
        # The with-block guarantees the file is closed, even on a read error.
        with open(filename, 'r') as infile:
            text = infile.read()
    except (IOError, OSError):
        print('An error occured.')
        return

    upper = lower = dig = blank = 0
    # A single pass over the text replaces one str.count() call per letter.
    for ch in text:
        if 'a' <= ch <= 'z':
            lower += 1
        elif 'A' <= ch <= 'Z':
            upper += 1
        elif '0' <= ch <= '9':
            dig += 1
        elif ch == ' ':
            blank += 1

    # Single-argument print() calls behave the same on Python 2 and Python 3.
    print('There are %d uppercase letters.' % upper)
    print('')
    print('I also have %d lowercase letters.' % lower)
    print('')
    print('The number of digits in the file = %d' % dig)
    print('')
    print('I found %d whitespaces in this file' % blank)
    print('')

    if pause:
        try:
            wait_for_enter = raw_input      # Python 2
        except NameError:
            wait_for_enter = input          # Python 3
        try:
            wait_for_enter('Hit enter to quit')
        except EOFError:
            # No interactive stdin available; there is nothing to wait for.
            pass


if __name__ == '__main__':
    main()

python

7 Contributors
16 Replies
963 Views
1 Day Discussion Span
Latest Post 15 Years Ago Latest Post by Stefano Mtangoo

mn_kthompson 3 Junior Poster

15 Years Ago

I'm not going to go through all of your code because I'm in a hurry, but this should simplify it quite a bit. At a minimum it will make it easier to read.

# The upper case letters to look for, one list entry per letter.
alphabet = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ')
count = 0
# The with-block closes the file once it has been read.
with open('infile.txt','r') as infile:
    # read() returns the whole text, so the loop sees one character at a
    # time; looping over readlines() would compare whole lines against the
    # single letters in alphabet and almost never match.
    for character in infile.read():
        if character in alphabet:
            count += 1

print(count)

mn_kthompson 3 Junior Poster

15 Years Ago

Here is another idea...

# Fragment: expects an open file `infile` and the three counters to exist.
# Iterate over the characters of the file, not its lines: isupper(),
# islower() and isdigit() called on a whole line describe the line as a
# whole instead of counting its characters.
for character in infile.read():
    if character.isupper():
        uppercasecount += 1
    elif character.islower():
        lowercasecount += 1
    elif character.isdigit():
        digitcount += 1

Gribouillis 1,391 Programming Explorer

15 Years Ago

Here is another one (faster ?)

#!/usr/bin/env python
import re

def main(filename="test.txt"):
    """Print the character statistics of the file *filename*.

    Each regular expression matches every run of characters that does NOT
    belong to one class (A-Z, a-z, 0-9, the space character).  Removing
    those runs leaves only the characters of that class, so the length of
    what remains is the count for the class.

    filename -- path of the file to analyse (defaults to "test.txt")

    Returns None; the report is written to standard output.
    """
    regexes = [ re.compile(x) for x in
         (r"[^A-Z]+", r"[^a-z]+", r"[^0-9]+", r"[^\ ]+")]
    # Close the file as soon as its content has been read.
    with open(filename) as infile:
        content = infile.read()
    counts = [len(r.sub("", content)) for r in regexes]
    print("""There are
%d uppercase letters
%d lowercase letters
%d digits
%d space characters
in file '%s'""" % (tuple(counts) + (filename,)))


# Only run the report when executed as a script, not when imported.
if __name__ == "__main__":
    main()

sneekula commented: nice +6

sneekula 969 Nearly a Posting Maven

15 Years Ago

I got curious, so I timed the 'just Python' and 'module re' approaches:

# timing character count by type
# compare 'module re' and 'just Python' approaches

import timeit
import re

def count_char1(text):
    """
    return (upper, lower, digit, space) character counts for a text

    every pattern matches the runs of characters outside one class;
    deleting those runs leaves only the members of the class, whose
    number is then simply the length of the remaining string
    """
    patterns = (r"[^A-Z]+", r"[^a-z]+", r"[^0-9]+", r"[^\ ]+")
    totals = []
    for pattern in patterns:
        remaining = re.compile(pattern).sub("", text)
        totals.append(len(remaining))
    return tuple(totals)

def count_char2(text):
    """
    count upper case char, lower case char, digits and spaces in a text

    walks the text once and classifies each character with the string
    methods isupper(), islower() and isdigit(); returns the tuple
    (upper, lower, digit, space)

    only the space character " " counts as a space, so that the result
    agrees with count_char1; isspace() would also count newlines, tabs
    and other whitespace
    """
    upper = lower = digit = space = 0
    for c in text:
        if c.isupper():
            upper += 1
        elif c.islower():
            lower += 1
        elif c.isdigit():
            digit += 1
        elif c == " ":
            space += 1
    return (upper, lower, digit, space)


# sample passage that both counting functions are timed against
text = """
There is one rule for the industrialist and that is: 
Make the best quality of goods possible at the lowest 
cost possible, paying the highest wages possible.

Henry Ford 1924
"""

# for longer text uncomment line below
#text = text*10

# time each implementation in turn; timeit() returns the total number of
# seconds for 10000 passes, so multiplying by 100 gives microseconds/pass
for func_name in ("count_char1", "count_char2"):
    stmt = "%s(text)" % func_name
    t = timeit.Timer(stmt, setup="from __main__ import %s, text" % func_name)
    elapsed = (100 * t.timeit(number=10000))
    print( "Function %s takes %0.3f micro-seconds/pass" % (stmt, elapsed) )

"""
my timing result -->
Function count_char1(text) takes 119.080 micro-seconds/pass
Function count_char2(text) takes 184.978 micro-seconds/pass
"""

Gribouillis commented: good idea +2

mn_kthompson commented: Damn fine analysis +2

Reply to this topic

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.

woooee 814 Nearly a Posting Maven · Answer 1 · 2009-01-14T09:04:50+00:00

Python strings have the built-in methods isupper() and islower() (also isdigit() and isspace()); the curses.ascii module additionally provides ispunct() and similar tests: http://www.python.org/doc/2.5.2/lib/module-curses.ascii.html

# Sample records whose characters are classified below.
test=["AabcdEFg.",
      "jKLmnopR?" ]

upper_total=0
lower_total=0
neither_total=0
for rec in test:
   # The loop variable is called char so that the builtin chr() stays usable.
   for char in rec:
      if char.isupper():
         upper_total += 1
      elif char.islower():
         lower_total += 1
      else:
         # Digits, punctuation and whitespace all end up here.
         neither_total += 1

print("%d Upper Case,  %d lower case,  and %d neither" %
      (upper_total, lower_total, neither_total))
#
# And if you wanted to go with your original idea, the test inside the
# inner loop would be
#    if (char >= "A") and (char <= "Z"):
#       upper_total += 1
# and so on for the other character classes.

Edit: and it looks like great minds think alike.

Stefano Mtangoo 455 Senior Poster · Answer 2 · 2009-01-14T11:08:52+00:00

Here is another idea...

Just polishing what Kthom has written

# Count the upper case letters, lower case letters and digits in test.txt.
uppercasecount, lowercasecount, digitcount = (0, 0, 0)
# The with-block closes the file when the loop is done.
with open("test.txt", "r") as infile:
    for line in infile:
        # Classify each character of the line; calling isupper() and friends
        # on the line itself would describe the whole line instead of
        # counting its characters.
        for character in line:
            if character.isupper():
                uppercasecount += 1
            elif character.islower():
                lowercasecount += 1
            elif character.isdigit():
                digitcount += 1
        # Running totals after each line of the file.
        print("%d %d %d" % (uppercasecount, lowercasecount, digitcount))
print("Total count is %d Upper case, %d Lower case and %d Digit(s)" %(uppercasecount, lowercasecount, digitcount))

Stefano Mtangoo 455 Senior Poster · Answer 3 · 2009-01-14T13:31:19+00:00

I would like to learn regular expressions. I have no idea what they are (I always see people here using them). Can someone explain a little and suggest a good tutorial?

Didn't mean to deviate the thread, so keep your first focus on the thread, then you can answer my question

Gribouillis 1,391 Programming Explorer Team Colleague · Answer 4 · 2009-01-14T14:10:04+00:00

I would like to learn regular expressions. I have no Idea what it is (I always see people here using). Can someone explain a litle and suggest good tutorial?
Didn't mean to deviate the thread, so keep your first focus on the thread, then you can answer my question

You could start with dive into python http://diveintopython.org/regular_expressions/index.html.

sneekula 969 Nearly a Posting Maven · Answer 5 · 2009-01-14T23:25:11+00:00

I would like to learn regular expressions. I have no Idea what it is (I always see people here using). Can someone explain a litle and suggest good tutorial?
Didn't mean to deviate the thread, so keep your first focus on the thread, then you can answer my question

There is also:
http://www.amk.ca/python/howto/regex/

Gribouillis 1,391 Programming Explorer Team Colleague · Answer 6 · 2009-01-15T03:43:31+00:00

Here is a function which is 10 times faster on my computer

try:
    maketrans = str.maketrans           # Python 3: static method of str
except AttributeError:
    from string import maketrans        # Python 2: function in module string

# Every lower case letter is mapped to "a", every upper case letter to "A"
# and every digit to "0"; all other characters are left unchanged.
fromst = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
tost = ("a" * 26) + ("A"*26) + ("0"*10)
assert (len(fromst) == 62) and (len(tost) == len(fromst))
# The table does not depend on the text, so it is built only once.
table = maketrans(fromst, tost)
def count_char3(text):
    """
    count upper case char, lower case char, digits and spaces in a text

    the text is translated so that each character class collapses onto a
    single representative character, then each representative is counted
    with str.count(); returns the tuple (upper, lower, digit, space)
    """
    text = text.translate(table)
    low = text.count("a")
    upp = text.count("A")
    dig = text.count("0")
    spa = text.count(" ")
    return upp, low, dig, spa

However, this algorithm only works with 8-bit characters. Also, in count_char2 you should replace c.isspace() with c == " ", because isspace() also matches whitespace characters other than the space character.

mn_kthompson 3 Junior Poster · Answer 7 · 2009-01-15T03:56:20+00:00

That was a fantastic analysis! I'm personally going to stick to using the pure python approach because I think it is more readable, and I'm willing to sacrifice 65 micro-seconds for that readability. But at least now I'm making an educated decision and I know what I'm giving up if I don't go the regex route. Big ups to Sneekula!

mn_kthompson 3 Junior Poster · Answer 8 · 2009-01-15T04:01:26+00:00

Here is a function which is 10 times faster on my computer

I don't know if you're comparing apples to apples here. With Sneekula's regex function he was able to feed in raw text and count the characters. Your function requires us to first translate the characters into lowercase a's, uppercase a's or zeros. That translation should be included in the function so that both functions take the same input and produce the same output. Then we can really compare the two objectively.

Gribouillis 1,391 Programming Explorer Team Colleague · Answer 9 · 2009-01-15T04:04:56+00:00

No the translation is done inside the function. Outside of the function, I only build the translation table which has nothing to do with the text.

mn_kthompson 3 Junior Poster · Answer 10 · 2009-01-15T04:11:11+00:00

Oh, I see. You're right. Wow, ten times faster? That's pretty sweet.

vegaseat 1,735 DaniWeb's Hypocrite Team Colleague · Answer 11 · 2009-01-15T05:15:16+00:00

Function calls are time expensive in Python, so I modified Sneekula's count_char2() approach by replacing all those calls to islower(), isupper(), isdigit() and isspace(), and also changing the order of if/elif to use the most common character test first. The result is promising ...

# timing character count by type
# compare 'module re' and 'improved just Python' approaches

import timeit
import re

def count_char1(text):
    """
    return (upper, lower, digit, space) character counts for a text

    every pattern matches the runs of characters outside one class;
    deleting those runs leaves only the members of the class, whose
    number is then simply the length of the remaining string
    """
    patterns = (r"[^A-Z]+", r"[^a-z]+", r"[^0-9]+", r"[^\ ]+")
    totals = []
    for pattern in patterns:
        remaining = re.compile(pattern).sub("", text)
        totals.append(len(remaining))
    return tuple(totals)

def count_char2(text):
    """
    count upper case char, lower case char, digits and spaces in a text

    walks the text once and classifies each character with the string
    methods isupper(), islower() and isdigit(); returns the tuple
    (upper, lower, digit, space)

    only the space character " " counts as a space, so that the result
    agrees with count_char1 and count_char4; isspace() would also count
    newlines, tabs and other whitespace
    """
    upper = lower = digit = space = 0
    for c in text:
        if c.isupper():
            upper += 1
        elif c.islower():
            lower += 1
        elif c.isdigit():
            digit += 1
        elif c == " ":
            space += 1
    return (upper, lower, digit, space)

def count_char4(text):
    """
    return (upper, lower, digit, space) character counts for a text

    membership tests against literal strings stand in for the method
    calls c.islower() etc.; the lower case test comes first because
    most characters of ordinary text are lower case
    """
    lower_chars = "abcdefghijklmnopqrstuvwxyz"
    upper_chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    digit_chars = "0123456789"
    n_upper = 0
    n_lower = 0
    n_digit = 0
    n_space = 0
    for ch in text:
        if ch in lower_chars:
            n_lower += 1
            continue
        if ch in " ":
            n_space += 1
            continue
        if ch in upper_chars:
            n_upper += 1
            continue
        if ch in digit_chars:
            n_digit += 1
    return (n_upper, n_lower, n_digit, n_space)


# sample passage that all three counting functions are timed against
text = """
There is one rule for the industrialist and that is:
Make the best quality of goods possible at the lowest
cost possible, paying the highest wages possible.

Henry Ford 1924
"""

# for longer text uncomment line below
#text = text*10

# time each implementation in turn; timeit() returns the total number of
# seconds for 10000 passes, so multiplying by 100 gives microseconds/pass
for func_name in ("count_char1", "count_char2", "count_char4"):
    stmt = "%s(text)" % func_name
    t = timeit.Timer(stmt, setup="from __main__ import %s, text" % func_name)
    elapsed = (100 * t.timeit(number=10000))
    print( "Function %s takes %0.3f micro-seconds/pass" % (stmt, elapsed) )

"""
my output -->
Function count_char1(text) takes 142.210 micro-seconds/pass
Function count_char2(text) takes 167.409 micro-seconds/pass
Function count_char4(text) takes 58.894 micro-seconds/pass
"""

Stefano Mtangoo 455 Senior Poster · Answer 12 · 2009-01-15T12:44:10+00:00

You could start with dive into python http://diveintopython.org/regular_expressions/index.html.

There is also:
http://www.amk.ca/python/howto/regex/

Thanks all, I'll dive in