[mirrorbrain-commits] [mod_stats] r51 - in /trunk/tools: dlcount.py ooo.conf

From: <poeml_at_mirrorbrain.org>
Date: Mon, 14 Dec 2009 22:25:10 -0000
Author: poeml
Date: Mon Dec 14 23:25:09 2009
New Revision: 51

URL: http://svn.mirrorbrain.org/viewvc/mod_stats?rev=51&view=rev
Log:
- further work on the offline download stats collector script (0.91):
  - grab also the timestamp, which we'll need when processing logs afterwards
  - put the request parsing code into its own function; add a class for holding parsed data

Modified:
    trunk/tools/dlcount.py
    trunk/tools/ooo.conf

Modified: trunk/tools/dlcount.py
URL: http://svn.mirrorbrain.org/viewvc/mod_stats/trunk/tools/dlcount.py?rev=51&r1=50&r2=51&view=diff
==============================================================================
--- trunk/tools/dlcount.py (original)
+++ trunk/tools/dlcount.py Mon Dec 14 23:25:09 2009
@@ -54,7 +54,7 @@
 #
 
 
-__version__='0.9'
+__version__='0.91'
 __author__='Peter Poeml <poeml_at_cmdline.net>'
 __copyright__='Peter poeml <poeml_at_cmdline.net>'
 __license__='GPLv2'
@@ -62,8 +62,11 @@
 
 
 import sys
+import os
 import re
 import hashlib
+import time
+from optparse import OptionParser
 
 try:
     set
@@ -105,7 +108,7 @@
     for line in lines: 
         if patc.search(line): yield line 
 
-def gen_fragments(pat, lines): 
+def gen_fragments(lines, pat): 
     """Generate a sequence of line fragments, according to
     a given regular expression"""
     for line in lines: 
@@ -226,7 +229,7 @@
 
     # set defaults for directives that didn't occur in the config
     if not len(conf['statslogmask']):
-        regex = '^(\S+).+"GET (\S*) HTTP.*" (200|302) [^"]+ "([^"]*)" "([^"]*)".* \w\w:(\w\w) ASN:'
+        regex = '^(\S+).+\[(.*?)\] "GET (\S*) HTTP.*" (200|302) [^"]+ "([^"]*)" "([^"]*)".* \w\w:(\w\w) ASN:'
         regex_compiled = re.compile(regex)
         conf['statslogmask'] = [(regex_compiled, regex)]
 
@@ -237,31 +240,33 @@
     return conf
     
 
-
-def main():
-    """
-    Create a generator pipeline for the matching log file lines
-    and process them.
-    """
-
-    if not len(sys.argv[2:]):
-        sys.exit('Usage: dlcount CONFIGFILE LOGFILE [LOGFILE ...]')
-
-    conf = readconf(sys.argv[1])
+class Req():
+    def __init__(self):
+        # url_raw contains the original url, if needed
+        self.url_raw = None
+        self.tstamp = None
+        self.tstamp_raw = None
+        self.status = None
+        self.referer = None
+        self.ua = None
+        self.country = None
+
+        self.url = None
+
+        self.countable = False
+
+    def __str__(self):
+        return '%-80s' % self.url 
+
+
+def gen_processreqs(reqs, conf): 
+    """process a tuple of request data, and return the parsed in the form of a generator"""
 
     known = RingBuffer(conf['statsdupwindow'])
 
-    filenames = sys.argv[2:]
-    logfiles = gen_open(filenames)
-    loglines = gen_cat(logfiles)
-
-    reqs = gen_fragments(conf['statslogmask'][0][0], loglines)
-
-
-    for req in reqs:
-
-        (ip, url, status, referer, ua, country) = req
-        url_raw = url
+    for req in reqs: 
+        rq = Req()
+        (ip, tstamp_raw, url, status, referer, ua, country) = req
 
         skip = False
         for r, mreg in conf['statsignoremask']:
@@ -292,12 +297,19 @@
                 continue
             known.append(md)
 
+        rq.url_raw = url
+        rq.status = status
+        rq.referer = referer
+        rq.ua = ua
+        rq.country = country.lower()
+
+        rq.tstamp = time.strptime(tstamp_raw, '%d/%b/%Y:%H:%M:%S +0100')
+        rq.tstamp_raw = tstamp_raw
+
         # apply the prefiltering rules
         for r, s, mreg in conf['statsprefilter']:
             url = r.sub(s, url)
 
-        # url_raw still contains the original url, if needed
-        print '%-80s ' % url, 
 
         matched = False
         for r, s, mreg in conf['statscount']:
@@ -305,15 +317,63 @@
                 if matched:
                     # FIXME: eventually, we want to allow multiple matches. But now we are debugging.
                     sys.exit('warning: %r matches\n   %r\nbut already matched a pevious regexp:\n   %r' % (url, mreg, matched))
-                print r.sub(s, url)
+                url = r.sub(s, url)
                 matched = mreg
         if not matched:
-            print '-'
+            yield rq
 
         # apply postfiltering
         for r, s, mreg in conf['statspostfilter']:
             url = r.sub(s, url)
 
+        rq.url = url
+
+        rq.countable = True
+        yield rq
+
+
+def main():
+    """
+    Create a generator pipeline for the matching log file lines
+    and process them.
+    """
+
+    usage = 'usage: %prog [options] CONFIGFILE LOGFILE [LOGFILE ...]'
+    version = '%prog ' + __version__
+
+    parser = OptionParser(usage=usage, version=version)
+    #parser.disable_interspersed_args()
+
+    parser.add_option("-q", "--quiet",
+                      action="store_true", dest="quiet", default=False,
+                      help="print only errors")
+
+    parser.add_option("-v", "--verbose",
+                      action="store_true", dest="verbose", default=False,
+                      help="print debug messages to stderr")
+
+    (options, args) = parser.parse_args()
+
+    usage = usage.replace('%prog', os.path.basename(sys.argv[0]))
+
+
+    if len(args) < 2:
+        sys.exit(usage)
+
+    conffile = args[0]
+    filenames = args[1:]
+
+    conf = readconf(conffile)
+
+    logfiles = gen_open(filenames)
+    loglines = gen_cat(logfiles)
+    reqs = gen_fragments(loglines, conf['statslogmask'][0][0])
+    items = gen_processreqs(reqs, conf)
+
+    for item in items:
+        if item.countable:
+            print item.country, item.url
+
 
     sys.exit(0)
 

Modified: trunk/tools/ooo.conf
URL: http://svn.mirrorbrain.org/viewvc/mod_stats/trunk/tools/ooo.conf?rev=51&r1=50&r2=51&view=diff
==============================================================================
--- trunk/tools/ooo.conf (original)
+++ trunk/tools/ooo.conf Mon Dec 14 23:25:09 2009
@@ -10,7 +10,7 @@
 #
 # 200 is returned for files that are not on mirrors, and for metalinks
 #
-StatsLogMask "^(\S+).+\"GET (\S*) HTTP.*\" (200|302) [^\"]+ \"([^\"]*)\" \"([^\"]*)\".* \w\w:(\w\w) ASN:"
+StatsLogMask "^(\S+).+\[(.*?)\] \"GET (\S*) HTTP.*\" (200|302) [^\"]+ \"([^\"]*)\" \"([^\"]*)\".* \w\w:(\w\w) ASN:"
 
 
 # define the size of a sliding window for remembering the last requests,
@@ -102,6 +102,7 @@
 
 # filtering to be applied after parsing (but before counting)
 #StatsPostfilter "foo" "bar"
+StatsPostfilter "(prod|os|version|lang): " ""
 
 
 




_______________________________________________
mirrorbrain-commits mailing list
Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/

Note: To remove yourself from this list, send a mail with the content
 	unsubscribe
to the address mirrorbrain-commits-request_at_mirrorbrain.org
Received on Mon Dec 14 2009 - 22:25:13 GMT

This archive was generated by hypermail 2.3.0 : Mon Feb 20 2012 - 23:47:04 GMT