[mirrorbrain-commits] [mod_stats] r24 - /trunk/tools/dlcount.py

From: <poeml_at_mirrorbrain.org>
Date: Wed, 25 Nov 2009 15:31:45 -0000
Author: poeml
Date: Wed Nov 25 16:31:44 2009
New Revision: 24

URL: http://svn.mirrorbrain.org/viewvc/mod_stats?rev=24&view=rev
Log:
working on factoring out OOo specific regexps into a config file (that
will be readable by Apache as well)

Modified:
    trunk/tools/dlcount.py

Modified: trunk/tools/dlcount.py
URL: http://svn.mirrorbrain.org/viewvc/mod_stats/trunk/tools/dlcount.py?rev=24&r1=23&r2=24&view=diff
==============================================================================
--- trunk/tools/dlcount.py (original)
+++ trunk/tools/dlcount.py Wed Nov 25 16:31:44 2009
_at_@ -158,6 +158,39 @@
         return self.data
 
 
+def readconf(filename):
+    """we'd need Apache's config parser here..."""
+    known_directives = ['StatsDupWindow', 'StatsIgnoreIP', 'StatsPreFilter', 'StatsCount', 'StatsPostFilter']
+    known_directives_lower = [ i.lower() for i in known_directives ]
+
+    # dictionary to hold the config
+    # each item is a list
+    cf = {}
+    for i in known_directives_lower:
+        cf[i] = list()
+
+    for line in open(filename):
+        line = line.strip()
+        if line.startswith('#'):
+            continue
+
+        d = line.split(None, 1)
+        if not len(d):
+            continue
+        if d[0].lower() not in known_directives_lower:
+            print 'not found:', d[0]
+            continue
+        d, val = d
+        d = d.lower()
+
+        print d, val
+        cf[d].append(val)
+
+    cf['statsdupwindow'] = int(cf['statsdupwindow'][0])
+
+    return cf
+    
+
 
 def main():
     """
_at_@ -168,35 +201,18 @@
     import sys
     import hashlib
 
-    if not len(sys.argv[1:]):
-        sys.exit('Usage: dlcount LOGFILE [LOGFILE ...]')
-
-
-
-    # best reference about Python regexp: http://www.amk.ca/python/howto/regex/regex.html
-    #
-    # short intro to things that *may* be special: 
-    #   (?:   )         non-capturing group
-    #   (?P<foo>    )   named group
-    # (FIXME: need to check if all these are supported in Apache)
-    #
+    if not len(sys.argv[2:]):
+        sys.exit('Usage: dlcount CONFIGFILE LOGFILE [LOGFILE ...]')
+
+    conf = readconf(sys.argv[1])
+    print; print
+    import pprint
+    pprint.pprint(conf)
+
+
+
     matchlist = [ 
-        # stable/3.1.1/OOo_3.1.1_Win32Intel_install_en-US.exe
-        # stable/3.1.1/OOo_3.1.1_MacOSXIntel_install_en-US.dmg
-        # stable/3.1.1/OOo_3.1.1_Win32Intel_install_wJRE_en-US.exe
-        # extended/3.1.1rc2/OOo_3.1.1rc2_20090820_Win32Intel_langpack_en-ZA.exe      -
-        # extended/3.1.1rc2/OOo_3.1.1rc2_20090820_Win32Intel_langpack_en-ZA.exe      -
-        # extended/3.1.1rc2/OOo_3.1.1rc2_20090820_Win32Intel_langpack_en-ZA.exe      -
-        # extended/3.1.1rc2/OOo_3.1.1rc2_20090820_LinuxIntel_langpack_brx_deb.tar.gz
-        # extended/developer/DEV300_m65/OOo-Dev-SDK_DEV300_m65_Win32Intel_install_en-US.exe
-        ( r'^(?:stable|extended)/(?:developer/)?([^/]+)/(OOo|OOo-SDK|OOo-Dev|OOo-Dev-SDK)_(?P<realversion>[^_]+(?:_[0-9]+)?)_(.+)_(?P<lang>([a-zA-Z]{2}(-[a-zA-Z]{2})?|binfilter|core|l10n|extensions|system|testautomation|brx|dgo|kok|mai|mni|sat))(_deb|_rpm)?\.(exe|dmg|sh|tar\.gz|tar\.bz2)$', r'prod: \2  os: \4  version: \1  realversion: \g<realversion>  lang: \g<lang>'),
-
-
-        # extended/3.1.1rc2/OOo_3.1.1rc2_20090820_LinuxX86-64_langpack_zh-CN.tar.gz
-        # extended/3.1.1rc2/OOo_3.1.1rc2_20090820_LinuxX86-64_langpack_zh-CN_deb.tar.gz
-
-        # localized/ru/2.4.3/OOo_2.4.3_Win32Intel_install_ru.exe      -
-        # localized/es/2.4.3/OOo_2.4.3_Win32Intel_install_es.exe      -
+        # FIXME: grab list of regexp from config
 
     ]
     re_matchlist = []
_at_@ -204,11 +220,9 @@
         re_matchlist.append((re.compile(match), sub, match))
 
 
-
-    DUP_WINDOW = 200
-    known = RingBuffer(DUP_WINDOW)
-
-    filenames = sys.argv[1:]
+    known = RingBuffer(conf['statsdupwindow'])
+
+    filenames = sys.argv[2:]
     logfiles = gen_open(filenames)
     loglines = gen_cat(logfiles)
 
_at_@ -217,27 +231,24 @@
     pat = r'^(\S+).+"GET (\S*) HTTP.*" (200|302) [^"]+ "([^"]*)" "([^"]*)".* \w\w:(\w\w) ASN:'
     reqs = gen_fragments(pat, loglines)
 
-    re_strip_protocol = re.compile(r'^http://[^/]+/')
-    re_single_slashes = re.compile(r'/+')
-    re_strip_queries = re.compile(r'\?.*')
-    re_strip_prefix = re.compile(r'^/files/')
-    re_strip_metalink = re.compile(r'\.metalink$')
+    # pretreatment (filtering, fixups), applied in order
+    # FIXME: read prefilter expressions here
 
 
     for req in reqs:
 
         (ip, url, status, referer, ua, country) = req
 
-        # over a window of DUP_WINDOW last requests, the same request must
+        # over a window of StatsDupWindow last requests, the same request must
         # not have occured already
         m = hashlib.md5()
         m.update(repr(req))
         md = m.digest()
 
-        # FIXME
-        if ip == '140.211.167.212':
-            # that's osuosl.org's Bouncer host
-            continue
+        for i in conf['statsignoreip']:
+            if ip.startswith(i):
+                #print 'ignoring ip %s because it matches %s' %(ip, i)
+                continue
 
         # was the requests seen recently? If yes, ignore it.
         # otherwise, put it into the ring buffer.
_at_@ -246,14 +257,8 @@
         known.append(md)
 
 
-        # note that we could use .replace() for many of these, but for compatibility with
-        # an Apache module in C we'll follow a pure regex-based approach
-        url = re_strip_protocol.sub('', url)
-        url = re_single_slashes.sub('/', url)
-        # FIXME: should we rather ignore requests with query string?
-        url = re_strip_queries.sub('', url)
-        url = re_strip_prefix.sub('', url)
-        url = re_strip_metalink.sub('', url)
+        # apply prefiltering
+        # FIXME
 
         print '%-80s ' % url, 
 




_______________________________________________
mirrorbrain-commits mailing list
Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/

Note: To remove yourself from this list, send a mail with the content
 	unsubscribe
to the address mirrorbrain-commits-request_at_mirrorbrain.org
Received on Wed Nov 25 2009 - 15:31:47 GMT

This archive was generated by hypermail 2.3.0 : Mon Feb 20 2012 - 23:47:04 GMT