Hi, I'm trying to count the number of unique IP addresses present in an apache log file. I am using a text file to store the unique IP addresses as this needs to scale to large numbers of IP addresses (upwards of 100 million) so using a dictionary or any other data structure stored in memory would make the machine run out of RAM.
import re
import os
import fileinput
def ipCounter(myline):
if ippattern.search(myline):
#search for all existing ips and store in list
iplist = ippattern.findall(myline)
#Open the IP file in read mode
ip_file = open("unique_ips.txt",'r')
for eachip in iplist:
for line in ip_file:
if line.find(eachip) > 0:
break
else:
print "adding to file..."
#Close file and open in append mode
ip_file.close()
ip_file = open("unique_ips.txt",'a')
#Now write the new IP address found to the file
ip_file.write(eachip)
#Close file again
ip_file.close()
ippattern = re.compile(r"\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b")
#Iterate through all files in directory
for fname in os.listdir(os.getcwd()):
if fname.find('LOG' or 'log')> 0:
print "Processing....."+fname
inputlog = fname
#Open the log and start going through it line by line
for line in fileinput.input(inputlog):
ipCounter(line)
fileinput.close()
My final output file does'nt seem to get populated by any addresses, I'm not sure about the part in ipCounter() where I check if the IP address is already present and if not, append it to the file. I would be grateful for any help.
Thanks,
Adi