Author: poeml Date: Mon Dec 14 23:25:09 2009 New Revision: 51 URL: http://svn.mirrorbrain.org/viewvc/mod_stats?rev=51&view=rev Log: - further work on the offline download stats collector script (0.91): - grab also the timestamp, which we'll need when processing logs afterwards - put the request parsing code into its own function; add a class for holding parsed data Modified: trunk/tools/dlcount.py trunk/tools/ooo.conf Modified: trunk/tools/dlcount.py URL: http://svn.mirrorbrain.org/viewvc/mod_stats/trunk/tools/dlcount.py?rev=51&r1=50&r2=51&view=diff ============================================================================== --- trunk/tools/dlcount.py (original) +++ trunk/tools/dlcount.py Mon Dec 14 23:25:09 2009 @@ -54,7 +54,7 @@ # -__version__='0.9' +__version__='0.91' __author__='Peter Poeml <poeml_at_cmdline.net>' __copyright__='Peter poeml <poeml_at_cmdline.net>' __license__='GPLv2' @@ -62,8 +62,11 @@ import sys +import os import re import hashlib +import time +from optparse import OptionParser try: set @@ -105,7 +108,7 @@ for line in lines: if patc.search(line): yield line -def gen_fragments(pat, lines): +def gen_fragments(lines, pat): """Generate a sequence of line fragments, according to a given regular expression""" for line in lines: @@ -226,7 +229,7 @@ # set defaults for directives that didn't occur in the config if not len(conf['statslogmask']): - regex = '^(\S+).+"GET (\S*) HTTP.*" (200|302) [^"]+ "([^"]*)" "([^"]*)".* \w\w:(\w\w) ASN:' + regex = '^(\S+).+\[(.*?)\] "GET (\S*) HTTP.*" (200|302) [^"]+ "([^"]*)" "([^"]*)".* \w\w:(\w\w) ASN:' regex_compiled = re.compile(regex) conf['statslogmask'] = [(regex_compiled, regex)] @@ -237,31 +240,33 @@ return conf - -def main(): - """ - Create a generator pipeline for the matching log file lines - and process them. 
- """ - - if not len(sys.argv[2:]): - sys.exit('Usage: dlcount CONFIGFILE LOGFILE [LOGFILE ...]') - - conf = readconf(sys.argv[1]) +class Req(): + def __init__(self): + # url_raw contains the original url, if needed + self.url_raw = None + self.tstamp = None + self.tstamp_raw = None + self.status = None + self.referer = None + self.ua = None + self.country = None + + self.url = None + + self.countable = False + + def __str__(self): + return '%-80s' % self.url + + +def gen_processreqs(reqs, conf): + """process a tuple of request data, and return the parsed in the form of a generator""" known = RingBuffer(conf['statsdupwindow']) - filenames = sys.argv[2:] - logfiles = gen_open(filenames) - loglines = gen_cat(logfiles) - - reqs = gen_fragments(conf['statslogmask'][0][0], loglines) - - - for req in reqs: - - (ip, url, status, referer, ua, country) = req - url_raw = url + for req in reqs: + rq = Req() + (ip, tstamp_raw, url, status, referer, ua, country) = req skip = False for r, mreg in conf['statsignoremask']: @@ -292,12 +297,19 @@ continue known.append(md) + rq.url_raw = url + rq.status = status + rq.referer = referer + rq.ua = ua + rq.country = country.lower() + + rq.tstamp = time.strptime(tstamp_raw, '%d/%b/%Y:%H:%M:%S +0100') + rq.tstamp_raw = tstamp_raw + # apply the prefiltering rules for r, s, mreg in conf['statsprefilter']: url = r.sub(s, url) - # url_raw still contains the original url, if needed - print '%-80s ' % url, matched = False for r, s, mreg in conf['statscount']: @@ -305,15 +317,63 @@ if matched: # FIXME: eventually, we want to allow multiple matches. But now we are debugging. 
sys.exit('warning: %r matches\n %r\nbut already matched a pevious regexp:\n %r' % (url, mreg, matched)) - print r.sub(s, url) + url = r.sub(s, url) matched = mreg if not matched: - print '-' + yield rq # apply postfiltering for r, s, mreg in conf['statspostfilter']: url = r.sub(s, url) + rq.url = url + + rq.countable = True + yield rq + + +def main(): + """ + Create a generator pipeline for the matching log file lines + and process them. + """ + + usage = 'usage: %prog [options] CONFIGFILE LOGFILE [LOGFILE ...]' + version = '%prog ' + __version__ + + parser = OptionParser(usage=usage, version=version) + #parser.disable_interspersed_args() + + parser.add_option("-q", "--quiet", + action="store_true", dest="quiet", default=False, + help="print only errors") + + parser.add_option("-v", "--verbose", + action="store_true", dest="verbose", default=False, + help="print debug messages to stderr") + + (options, args) = parser.parse_args() + + usage = usage.replace('%prog', os.path.basename(sys.argv[0])) + + + if len(args) < 2: + sys.exit(usage) + + conffile = args[0] + filenames = args[1:] + + conf = readconf(conffile) + + logfiles = gen_open(filenames) + loglines = gen_cat(logfiles) + reqs = gen_fragments(loglines, conf['statslogmask'][0][0]) + items = gen_processreqs(reqs, conf) + + for item in items: + if item.countable: + print item.country, item.url + sys.exit(0) Modified: trunk/tools/ooo.conf URL: http://svn.mirrorbrain.org/viewvc/mod_stats/trunk/tools/ooo.conf?rev=51&r1=50&r2=51&view=diff ============================================================================== --- trunk/tools/ooo.conf (original) +++ trunk/tools/ooo.conf Mon Dec 14 23:25:09 2009 @@ -10,7 +10,7 @@ # # 200 is returned for files that are not on mirrors, and for metalinks # -StatsLogMask "^(\S+).+\"GET (\S*) HTTP.*\" (200|302) [^\"]+ \"([^\"]*)\" \"([^\"]*)\".* \w\w:(\w\w) ASN:" +StatsLogMask "^(\S+).+\[(.*?)\] \"GET (\S*) HTTP.*\" (200|302) [^\"]+ \"([^\"]*)\" \"([^\"]*)\".* \w\w:(\w\w) ASN:" # 
define the size of a sliding window for remembering the last requests, @@ -102,6 +102,7 @@ # filtering to be applied after parsing (but before counting) #StatsPostfilter "foo" "bar" +StatsPostfilter "(prod|os|version|lang): " "" _______________________________________________ mirrorbrain-commits mailing list Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/ Note: To remove yourself from this list, send a mail with the content unsubscribe to the address mirrorbrain-commits-request_at_mirrorbrain.org
Received on Mon Dec 14 2009 - 22:25:13 GMT
This archive was generated by hypermail 2.2.0 : Tue Dec 15 2009 - 15:20:04 GMT