Hi all,

I have been trying to parse a log file to plot some of the data in it. The format is something like this:

PyrateBufferManager: 2011-10-24 15:42:47.709684:
	CurrentAquiredBuffers: 0
	ReturnBufferCount: 11527
	CurrentAquiredBytes: 0

SimDataCollectorMonitor-00A: 2011-10-24 15:42:47.709684:
	SNProfile: {'SNLoop': 6133}

I was trying to use regex, but didn't get very far in making it flexible. I was recommended lepl, but to say that it was slow would be an understatement. What I would like to get out is a dictionary that has all the information in it, sorted by time stamp, i.e.

{2011-10-24 15:42:47: {"PyrateBufferManager": {"CurrentAquiredBuffers": 0, "ReturnBufferCount": 11527, "CurrentAquiredBytes": 0}, "SimDataCollectorMonitor-00A": {"SNLoop": 6133}, etc.}, etc.}

Now I have been trying to use regex, but I can't get it to be flexible, because there may be new tags added, etc., and I don't want to add something every time that happens.

Could somebody give me some input as to how to do this?

Thanks a bunch in advance.

Cheers.

Post your regex code, and give an example of the failure.

It isn't failing; it is just about as flexible as a brick wall. What I have so far looks something like:

#!/usr/bin/env python
import sys, os, re
import numpy as np
import pylab as py



#StringHub Variables
timestampssH={}
pBMtags={} #PyrateBufferManager tags
jvmsHtags={} #JVM string hub tags
mDsHtags={} #moniData
rdDsHtags={} #readoutData
rdRsHtags={} #readoutRequest
stags={} #sender 
snDsHtags={} #supernova data
sHsHtags={} #stringHit
shsHtags={} #stringhub
syssHtags={} #system
tDsHtags={} #tcaldata

#Event Builder variables
timestampseB={}
bEtags={} #backend
gTeBtags={} #global trigger 
jvmeBtags={} #jvm
rdDeBtags={} #readoutData
rdReBtags={} #readoutRequest
syseBtags={} #system

#Secondary builder
timestampssB={}
jvmsBtags={} #JVM
mBsBtags={} #Moni builder
mDsBtags={} #moni data
snBsBtags={} # supernova builder
snDsBtags={} # supernova data
syssBtags={} # system
tBsBtags={} # tcal builder
tDsBtags={} # tcal data

#GlobalTrigger
timestampsgT={}
gTgTtags={} #global trigger
jvmgTtags={} #JVM
mgTtags={} #manager
sysgTtags={} #system
tgTtags={} #trigger

#InIceTrigger
timestampsiT={}
jvmiTtags={} #JVM
miTtags={} #manager
sHiTtags={} #stringHit
sysiTtags={} #system
tiTtags={} #trigger


sH={"PyrateBufferManager":pBMtags,
    "jvm":jvmsHtags,
    "moniData":mDsHtags,
    "rdoutData":rdDsHtags,
    "rdoutReq":rdRsHtags,
    "sender":stags,
    "snData":snDsHtags,
    "stringHit":sHsHtags,
    "stringhub":shsHtags,
    "system":syssHtags,
    "tcalData":tDsHtags}
eB={"backEnd": bEtags,
    "glblTrig": gTeBtags,
    "jvm": jvmeBtags,
    "rdoutData": rdDeBtags,
    "rdoutReq": rdReBtags,
    "system": syseBtrags}
sB={"jvm":jvmsBtags,
    "moniBuilder":mBsBtags,
    "moniData":mDsBtags,
    "snBuilder":snBsBtags,
    "snData":snDsBtags,
    "system":syssBtags,
    "tcalBuilder":tBsBtags,
    "tcalData":tDsBtags}
gT={"glblTrig":gTgTtags,
    "jvm":jvmgTtags,
    "manager":mgTtags,
    "systemtrigger":sTgTtags}
iT={"jvm":jvmiTtags,
    "manager":miTtags,
    "stringHit":sHiTtags,
    "system":sysiTtags,
    "trigger":tiTtags}

data={"stringHub":timestampssH, "eventBuilder":timestampseB,"secondaryBuilder":timestampssB, "globalTrigger":timesstampgT, "inIceTrigger": timesstampiT} #Main Dict
for filename in sys.argv[1:]:
    name = os.path.splitext(filename)[0]
    ext = os.path.splitext(filename)[1]
    if 'stringHub' in name:
        if 'moni' in ext:
            f=open(filename, 'rU')
            c=f.readlines()
            count = -1
            for item in c:
                timestamp = str(re.findall(r"\d\d:\S+:\d\d\.\d\d", item))
                timestampssH[timestamp]=sH
                count += 1
                if "PyrateBufferManager" in item:
                    pBMtags["CurrentAquiredBuffers"] = re.findall(r"\s+CurrentAquiredBuffers:\s(\d+)", c[count+1])
                    pBMtags["ReturnBufferCount"] = re.findall(r"\s+ReturnBufferCount:\s(\d+)",c[count+2])
                    pBMtags["CurrentAquiredBytes"] = re.findall(r"\s+CurrentAquiredBytes:\s(\d+)", c[count+3])
                if "jvm" in item:
                    jvmsHtags["MemoryStatistics"] = re.findall(r"\s+MemoryStatistics:\s\[(\d+),(\d+)\]",c[count+1])
                if "moniData" in item:
                    mDsHtags["Depth"] = re.findall(r"\s+Depth:\s\[(\d+)\]", c[count+1])
                    mDsHtags["RecordsSent"] = re.findall(r"\s+RecordsSent:\s\[(\d+)\]", c[count+2])
                if "rdoutData" in item:
                    rdDsHtags["Depth"] = re.findall(r"\s+Depth:\s\[(\d+)\]", c[count+1])
                    rdDsHtags["RecordsSent"] = re.findall(r"\s+RecordsSent:\s\[(\d+)\]", c[count+2])
                if "rdoutReq" in item:
                    rdRsHtags["TotalRecordsReceived"] = re.findall(r"\s+TotalRecordsReceived:\s(\d+)", c[count+1])
                    rdRsHtags["RecordsReceived"] = re.findall(r"\s+RecordsReceived:\s\[(\d+)\]", c[count+2])
                    rdRsHtags["BytesReceived"] = re.findall(r"\s+BytesReceived:\s\[(\d+)\]", c[count+3])
                if "sender" in item:
                    stags["NumReadoutRequestsReceived"] = re.findall(r"\s+NumReadoutRequestsReceived:\s(\d+)", c[count+1])
                    stags["NumHitsReceived"] = re.findall(r"\s+NumHitsReceived:\s(\d+)", c[count+2])
                    stags["NumReadoutsSent"] = re.findall(r"\s+NumReadoutsSent:\s(\d+)", c[count+3])
                    stags["NumHitsCached"] = re.findall(r"\s+NumHitsCached:\s(\d+)", c[count+4])
                    stags["NumHitsQueued"] = re.findall(r"\s+NumHitsQueued:\s(\d+)", c[count+5])
                    stags["NumReadoutRequestsQueued"] = re.findall(r"\s+NumReadoutRequestsQueued:\s(\d+)", c[count+6])
                if "snData" in item:
                    snDsHtags["Depth"] = re.findall(r"\s+Depth:\s\[(\d+)\]", c[count+1])
                    snDsHtags["RecordsSent"] = re.findall(r"\s+RecordsSent:\s\[(\d+)\]", c[count+2])
                if "stringHit" in item:
                    sHsHtags["Depth"] = re.findall(r"\s+Depth:\s\[(\d+)\]", c[count+1])
                    sHsHtags["RecordsSent"] = re.findall(r"\s+RecordsSent:\s\[(\d+)\]", c[count+2])
                if "stringhub" in item:
                    shsHtags["TimeOfLastHitOutputFromHKN1"] = re.findall(r"\s+TimeOfLastHitOutputFromHKN1:\s(\d+)", c[count+1])
                    shsHtags["NumberOfActiveAndTotalChannels"] = re.findall(r"\s+NumberOfActiveAndTotalChannels:\s\[\d+,(\d+)\]", c[count+2])
                    shsHtags["NumberOfActiveChannels"] = re.findall(r"\s+NumberOfActiveChannels:\s(\d+)", c[count+3])
                    shsHtags["TimeOfLastHitInputToHKN1"] = re.findall(r"\s+TimeOfLastHitInputToHKN1:\s(\d+)", c[count+4])
                    shsHtags["HitRateLC"] = re.findall(r"\s+HitRateLC:\s(\d+)", c[count+5])
                    shsHtags["HitRate"] = re.findall(r"\s+HitRate:\s(\d+)", c[count+6])
                    shsHtags["TotalLBMOverflows"] = re.findall(r"\s+TotalLBMOverflows:\s(\d+)", c[count+7])           
                if "system" in item:
                    syssHtags["LoadAverage"] = re.findall(r"\s+LoadAverage:\[(\d+),(\d+),(\d+)\]", c[count+1])
                    syssHtags["AvailableDiskSpace"] = re.findall(r"\s+AvailableDiskSpace:\s{'/':(\d+),\s'/dev/shm':\s(\d+)}", c[count+3])
                if "tcalData" in item:
                    tDsHtags["Depth"] = re.findall(r"\s+Depth:\s\[(\d+)\]", c[count+1])
                    tDsHtags["RecordsSent"] = re.findall(r"\s+RecordsSent:\s\[(\d+)\]", c[count+2])
        if 'log' in ext:
            f=open(filename, 'rU')
            c=f.readlines()
            print c

It does not seem difficult: your file has a simple structure. It's a series of blocks of text separated by blank lines, so you can read the blocks one after the other:

from itertools import groupby

def read_blocks(filename):
    """read a file as a sequence of blocks of rstripped non blank lines"""
    with open(filename) as ifh:
        data = ((bool(x), x) for x in (line.rstrip() for line in ifh))
        for b, group in groupby(data, key=lambda z: z[0]):
            if b:
                yield (z[1] for z in group)
                
                
if __name__ == "__main__":
    for block in read_blocks("logfile.txt"):
        print list(block)

Then each block's first line has the form word: date:, from which you can extract the word and the date by splitting on the first ':'. Every other line has the form \tword: value, where again you can split on the first ':' to get the word and the value. The value looks like a Python literal, which you could eval() (or, more safely, ast.literal_eval()). You can use a regular expression to check that each line has the announced form.
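
For example, following that recipe, a small block parser could look like this (a sketch only: parse_block is an illustrative name, and ast.literal_eval() stands in for eval() as a safer way to turn the value strings into Python objects):

import ast

def parse_block(lines):
    """Parse one block into (word, datestr, {name: value})."""
    # first line: "PyrateBufferManager: 2011-10-24 15:42:47.709684:"
    word, _, rest = lines[0].partition(':')
    datestr = rest.strip().rstrip(':')
    values = {}
    # remaining lines: "\tCurrentAquiredBuffers: 0"
    for line in lines[1:]:
        name, _, value = line.strip().partition(':')
        try:
            values[name] = ast.literal_eval(value.strip())
        except (ValueError, SyntaxError):
            values[name] = value.strip()  # leave unparseable values as strings
    return word, datestr, values

Feeding it list(block) for each block from read_blocks() above (the blocks are generators) and keying the results by date gives the timestamp-keyed dictionary asked for at the top.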

Actually, the previous function read_blocks() can be simplified considerably:

from itertools import groupby

def read_blocks(filename):
    """read a file as a sequence of blocks of rstripped non blank lines"""
    with open(filename) as ifh:
        for b, g in groupby((line.rstrip() for line in ifh), key=bool):
            if b: yield g

if __name__ == "__main__":
    for block in read_blocks("logfile.txt"):
        print list(block)

It could even be turned into a one-liner.
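
For instance (just a sketch; note that it drops the with statement, so the file is only closed when the generator is reclaimed):

from itertools import groupby

def read_blocks(filename):
    return (g for b, g in groupby((line.rstrip() for line in open(filename)), key=bool) if b)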

Good you noticed it before I nagged you, Gribouillis, but here the groupby is a bit overkill:

def read_blocks(filename):
    """read a file as a sequence of blocks separated by empty line"""
    with open(filename) as ifh:
        for block in ifh.read().split('\n\n'):
            yield block.splitlines()           

if __name__ == "__main__":
    for group_number, block in enumerate(read_blocks("logfile.txt"), 1):
        print(group_number, block)
        print('-'*40)

Suppose that there are three consecutive \n, or that the file starts with \n: your code is less robust, since split('\n\n') then produces empty or misaligned blocks. Also, my function reads the file line by line, whereas ifh.read() loads the whole file into memory, and a log file can be very large.
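
The difference is easy to show on a small illustrative input:

from itertools import groupby

text = '\nblock one\n\n\n\nblock two\n'

print(text.split('\n\n'))
# ['\nblock one', '', 'block two\n'] -- an empty block and a stray leading '\n'

print([list(g) for b, g in groupby(text.splitlines(), key=bool) if b])
# [['block one'], ['block two']] -- blank lines are skipped cleanly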

As an alternative for big files (I was also considering it myself), I implemented isplit over a generator. Your code is nice, but the way it jumps over newlines through the groupby key check could be a lot for a newbie; even this code of mine uses yield, which could already be a challenge.

from __future__ import print_function

def isplit(iterable, sep=None):
    """Lazily split an iterable of strings on sep.

    With sep=None, an all-whitespace item ends the current piece and
    empty pieces are skipped (handy when iterating a string character
    by character); otherwise items accumulate until the buffer ends
    with sep, and empty pieces are passed through to the caller."""
    r = ''
    for c in iterable:
        r += c
        if sep is None:
            if not c.strip():
                r = r[:-1]
                if r:
                    yield r
                    r = ''
        elif r.endswith(sep):
            r = r[:-len(sep)]
            yield r
            r = ''
    if r:
        yield r
            

def read_blocks(filename):
    """read a file as a sequence of blocks separated by empty line"""
    with open(filename) as ifh:
        for block in isplit(ifh, '\n\n'):
            yield block.splitlines()           

if __name__ == "__main__":
    for blockno, block in enumerate(read_blocks("logfile.txt"), 1):
        print(blockno, ':')
        print('\n'.join(block))
        print('-'*40)

    print('Testing skip with None.')
    for word in isplit('\tTony   \t  Jarkko \n  Veijalainen\n'):
        print(word)

(I leave the skipping to the caller, who can choose to ignore empty pieces when sep is not None.)
(Another edit, sorry, this is fresh code)
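
For example, using the isplit() above, a caller who wants those empty pieces gone can filter them out:

# 'a' and 'b' separated by a run of blank lines; isplit yields 'a', '', 'b'
for piece in (p for p in isplit('a\n\n\n\nb', '\n\n') if p):
    print(piece)  # prints 'a' then 'b'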

Thanks for the thoughts. I got it solved this way:

import re, sys, os, datetime

class MonitorFile(object):
    # match a category/time line like "foo: 2011-02-28 09:10:11.123456:"
    #
    CATTIME_PAT = re.compile(r"^([^:]+):\s(\d+-\d+-\d+\s\d+:\d+:\d+\.\d{2})\d+\s*")

    # match a data line like "    name: value"
    #
    DATA_PAT = re.compile(r"^(\s+)([^:]+):\s+(.*\S)\s*")

    # list/dict separator
    #
    COMMA_SEP_PAT = re.compile(r"\s*,\s*")

    # dict key/value separator
    #
    COLON_SEP_PAT = re.compile(r"\s*:\s*")

    def __init__(self, filename):
        """
        Cache a pDAQ monitor file
        """
        self._data = self._read_file(filename)

    def _get_value(self, valstr):
        try:
            return int(valstr)
        except ValueError:
            try:
                return float(valstr)
            except ValueError:
                if valstr.startswith("["):
                    return self._parse_list(valstr)
                elif valstr.startswith("{"):
                    return self._parse_dict(valstr)
                else:
                    return valstr

    def _parse_list(self, listStr):
        newlist = []
        for val in self.COMMA_SEP_PAT.split(listStr[1:-1]):
            newlist.append(self._get_value(val))
        return newlist

    def _parse_dict(self, dictStr):
        newdict = {}
        try:
            for pair in self.COMMA_SEP_PAT.split(dictStr[1:-1]):
                (key, value) = self.COLON_SEP_PAT.split(pair)
                newdict[key] = self._get_value(value)
            return newdict
        except ValueError:
            # an empty dict like "{}" yields a single unpairable piece
            return newdict


    def _read_file(self, filename):
        "Read and parse a monitor file"
        data = {}

        curDict = None
        for line in open(filename, "r"):
            if not line.strip():
                curDict = None
                continue

            m = self.CATTIME_PAT.match(line)
            if m is not None:
                cat = m.group(1)
                time = datetime.datetime.strptime(m.group(2),
                                                  "%Y-%m-%d %H:%M:%S.%f")
                if time not in data:
                    data[time] = {}
                curDict = {}
                data[time][cat] = curDict
                continue

            m = self.DATA_PAT.match(line)
            if m is not None:
                curDict[m.group(2)] = self._get_value(m.group(3))
                continue

            print >>sys.stderr, "Bad line: " + line
        
        return data

    def dump_data(self):
        "Dump data in pDAQ .moni format"
        #self.write_data_to_file(sys.stdout, self._data)
        return self._data

    @classmethod
    def write_data_to_file(cls, out, data):
        "Write data to a file in pDAQ .moni format"
        times = data.keys()
        times.sort()

        need_nl = False

        for t in times:
            keys = data[t].keys()
            keys.sort()
            for k in keys:
                if need_nl:
                    print >>out
                else:
                    need_nl = True

                print >>out, "%s: %s" % (k, t)

                names = data[t][k].keys()
                names.sort()
                for n in names:
                    print >>out, "\t%s: %s" % (n, data[t][k][n])