[mirrorbrain-commits] [mod_stats] r41 - in /trunk/tools: dlcount.py ooo.conf

From: <poeml_at_mirrorbrain.org>
Date: Thu, 26 Nov 2009 13:05:14 -0000
Author: poeml
Date: Thu Nov 26 14:05:12 2009
New Revision: 41

URL: http://svn.mirrorbrain.org/viewvc/mod_stats?rev=41&view=rev
Log:
- implement a StatsLogMask configuration directive, so the remaining hard-coded
  regex can be moved from the script into the config file. It still needs to be
  better documented
- comment the StatsDupWindow directive

Modified:
    trunk/tools/dlcount.py
    trunk/tools/ooo.conf

Modified: trunk/tools/dlcount.py
URL: http://svn.mirrorbrain.org/viewvc/mod_stats/trunk/tools/dlcount.py?rev=41&r1=40&r2=41&view=diff
==============================================================================
--- trunk/tools/dlcount.py (original)
+++ trunk/tools/dlcount.py Thu Nov 26 14:05:12 2009
@@ -112,10 +112,8 @@
 def gen_fragments(pat, lines): 
     """Generate a sequence of line fragments, according to
     a given regular expression"""
-    import re 
-    patc = re.compile(pat) 
     for line in lines: 
-        m = patc.match(line)
+        m = pat.match(line)
         if m:
             yield m.groups()
 
@@ -162,7 +160,13 @@
 
 def readconf(filename):
     """we'd need Apache's config parser here..."""
-    known_directives = ['StatsDupWindow', 'StatsIgnoreIP', 'StatsIgnoreMask', 'StatsPreFilter', 'StatsCount', 'StatsPostFilter']
+    known_directives = ['StatsLogMask', 
+                        'StatsIgnoreMask', 
+                        'StatsIgnoreIP', 
+                        'StatsDupWindow', 
+                        'StatsPreFilter', 
+                        'StatsCount', 
+                        'StatsPostFilter']
     known_directives_lower = [ i.lower() for i in known_directives ]
     # regular expressions to parse arguments
     parse_1_in_quotes = re.compile(r'"(.*)"')
@@ -200,7 +204,7 @@
             conf[directive] = int(val)
 
         # directives with one argument: a regexp
-        elif directive in ['statsignoremask']:
+        elif directive in ['statslogmask', 'statsignoremask']:
             m = parse_1_in_quotes.match(val)
             regex = m.group(1).replace('\\"', '"')
             regex_compiled = re.compile(regex)
@@ -222,6 +226,12 @@
         else:
             sys.exit('unparsed directive (implementation needed)', directive)
 
+    # set defaults for directives that didn't occur in the config
+    if len(conf['statslogmask']) == 0:
+        regex = '^(\S+).+"GET (\S*) HTTP.*" (200|302) [^"]+ "([^"]*)" "([^"]*)".* \w\w:(\w\w) ASN:'
+        regex_compiled = re.compile(regex)
+        conf['statslogmask'].append((regex_compiled, regex))
+
     return conf
     
 
@@ -246,10 +256,7 @@
     logfiles = gen_open(filenames)
     loglines = gen_cat(logfiles)
 
-    # 123.123.123.123 - - [23/Nov/2009:18:19:14 +0100] "GET /files/stable/3.1.1/OOo_3.1.1_MacOSXIntel_install_en-US.dmg HTTP/1.1" 302 399 "http://download.openoffice.org/all_rc.html" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 1.1.4322; .NET CLR 3.5.30729; .NET CLR 3.0.30618)" ftp.astral.ro r:country 913 844 EU:RO ASN:9050 P:92.81.0.0/16 size:24661382 -
-    # 200 is returned for files that are not on mirrors, and for metalinks
-    pat = r'^(\S+).+"GET (\S*) HTTP.*" (200|302) [^"]+ "([^"]*)" "([^"]*)".* \w\w:(\w\w) ASN:'
-    reqs = gen_fragments(pat, loglines)
+    reqs = gen_fragments(conf['statslogmask'][0][0], loglines)
 
 
     for req in reqs:

Modified: trunk/tools/ooo.conf
URL: http://svn.mirrorbrain.org/viewvc/mod_stats/trunk/tools/ooo.conf?rev=41&r1=40&r2=41&view=diff
==============================================================================
--- trunk/tools/ooo.conf (original)
+++ trunk/tools/ooo.conf Thu Nov 26 14:05:12 2009
@@ -2,16 +2,33 @@
 # the syntax is meant to be suitable for inclusion into Apache config
 # regexps must be in double quotes. Double quotes can be backslash-quoted.
 
+
+# Consider only log lines that match the following expression.
+# At the same time, its match groups determine how each matching log line is split into meaningful fragments.
+#
+# 123.123.123.123 - - [23/Nov/2009:18:19:14 +0100] "GET /files/stable/3.1.1/OOo_3.1.1_MacOSXIntel_install_en-US.dmg HTTP/1.1" 302 399 "http://download.openoffice.org/all_rc.html" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 1.1.4322; .NET CLR 3.5.30729; .NET CLR 3.0.30618)" ftp.astral.ro r:country 913 844 EU:RO ASN:9050 P:92.81.0.0/16 size:24661382 -
+#
+# 200 is returned for files that are not on mirrors, and for metalinks
+#
+#StatsLogMask "^(\S+).+\"GET (\S*) HTTP.*\" (200|302) [^\"]+ \"([^\"]*)\" \"([^\"]*)\".* \w\w:(\w\w) ASN:"
+
+
+# define the size of a sliding window for remembering the last requests,
+# while parsing the log. Keyed by (ip, url, status, referer, user-agent, country), 
+# every request is checked to see whether it has been seen in identical form before.
 StatsDupWindow 200
 
 # that's osuosl.org's Bouncer host
 StatsIgnoreIP 140.211.167.212
 
+
 # silently ignore all files matching this regular expression.
+# (the filter is applied to the requested url.)
 # (all others will be counted and need to match one of the StatsCount expressions.)
 StatsIgnoreMask "^.*\.(txt|list|html)$"
 
 # strip prefixed protocol (normally only sent to proxies, but can occur in the wild)
+# (the filter is applied to the requested url.)
 StatsPrefilter "^http://[^/]+/" ""
 
 # remove duplicated slashes




_______________________________________________
mirrorbrain-commits mailing list
Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/

Note: To remove yourself from this list, send a mail with the content
 	unsubscribe
to the address mirrorbrain-commits-request_at_mirrorbrain.org
Received on Thu Nov 26 2009 - 13:05:16 GMT

This archive was generated by hypermail 2.2.0 : Thu Nov 26 2009 - 13:45:25 GMT