Author: poeml Date: Thu Nov 26 14:05:12 2009 New Revision: 41 URL: http://svn.mirrorbrain.org/viewvc/mod_stats?rev=41&view=rev Log: - implement a StatsLogMask configuration directive, so the remaining hard-coded regex can be moved from the script into the config file. It still needs to be better documented - comment the StatsDupWindow directive Modified: trunk/tools/dlcount.py trunk/tools/ooo.conf Modified: trunk/tools/dlcount.py URL: http://svn.mirrorbrain.org/viewvc/mod_stats/trunk/tools/dlcount.py?rev=41&r1=40&r2=41&view=diff ============================================================================== --- trunk/tools/dlcount.py (original) +++ trunk/tools/dlcount.py Thu Nov 26 14:05:12 2009 @@ -112,10 +112,8 @@ def gen_fragments(pat, lines): """Generate a sequence of line fragments, according to a given regular expression""" - import re - patc = re.compile(pat) for line in lines: - m = patc.match(line) + m = pat.match(line) if m: yield m.groups() @@ -162,7 +160,13 @@ def readconf(filename): """we'd need Apache's config parser here...""" - known_directives = ['StatsDupWindow', 'StatsIgnoreIP', 'StatsIgnoreMask', 'StatsPreFilter', 'StatsCount', 'StatsPostFilter'] + known_directives = ['StatsLogMask', + 'StatsIgnoreMask', + 'StatsIgnoreIP', + 'StatsDupWindow', + 'StatsPreFilter', + 'StatsCount', + 'StatsPostFilter'] known_directives_lower = [ i.lower() for i in known_directives ] # regular expressions to parse arguments parse_1_in_quotes = re.compile(r'"(.*)"') @@ -200,7 +204,7 @@ conf[directive] = int(val) # directives with one argument: a regexp - elif directive in ['statsignoremask']: + elif directive in ['statslogmask', 'statsignoremask']: m = parse_1_in_quotes.match(val) regex = m.group(1).replace('\\"', '"') regex_compiled = re.compile(regex) @@ -222,6 +226,12 @@ else: sys.exit('unparsed directive (implementation needed)', directive) + # set defaults for directives that didn't occur in the config + if len(conf['statslogmask']) == 0: + regex = '^(\S+).+"GET 
(\S*) HTTP.*" (200|302) [^"]+ "([^"]*)" "([^"]*)".* \w\w:(\w\w) ASN:' + regex_compiled = re.compile(regex) + conf['statslogmask'].append((regex_compiled, regex)) + return conf @@ -246,10 +256,7 @@ logfiles = gen_open(filenames) loglines = gen_cat(logfiles) - # 123.123.123.123 - - [23/Nov/2009:18:19:14 +0100] "GET /files/stable/3.1.1/OOo_3.1.1_MacOSXIntel_install_en-US.dmg HTTP/1.1" 302 399 "http://download.openoffice.org/all_rc.html" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 1.1.4322; .NET CLR 3.5.30729; .NET CLR 3.0.30618)" ftp.astral.ro r:country 913 844 EU:RO ASN:9050 P:92.81.0.0/16 size:24661382 - - # 200 is returned for files that are not on mirrors, and for metalinks - pat = r'^(\S+).+"GET (\S*) HTTP.*" (200|302) [^"]+ "([^"]*)" "([^"]*)".* \w\w:(\w\w) ASN:' - reqs = gen_fragments(pat, loglines) + reqs = gen_fragments(conf['statslogmask'][0][0], loglines) for req in reqs: Modified: trunk/tools/ooo.conf URL: http://svn.mirrorbrain.org/viewvc/mod_stats/trunk/tools/ooo.conf?rev=41&r1=40&r2=41&view=diff ============================================================================== --- trunk/tools/ooo.conf (original) +++ trunk/tools/ooo.conf Thu Nov 26 14:05:12 2009 @@ -2,16 +2,33 @@ # the syntax is meant to be suitable for inclusion into Apache config # regexps must be in double quotes. Double quotes can be backslash-quoted. + +# consider only log lines that match the following expression +# at the same time, the match groups determine splitting of the log file into fragments with meaning. 
+# +# 123.123.123.123 - - [23/Nov/2009:18:19:14 +0100] "GET /files/stable/3.1.1/OOo_3.1.1_MacOSXIntel_install_en-US.dmg HTTP/1.1" 302 399 "http://download.openoffice.org/all_rc.html" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 1.1.4322; .NET CLR 3.5.30729; .NET CLR 3.0.30618)" ftp.astral.ro r:country 913 844 EU:RO ASN:9050 P:92.81.0.0/16 size:24661382 - +# +# 200 is returned for files that are not on mirrors, and for metalinks +# +#StatsLogMask "^(\S+).+\"GET (\S*) HTTP.*\" (200|302) [^\"]+ \"([^\"]*)\" \"([^\"]*)\".* \w\w:(\w\w) ASN:" + + +# define the size of a sliding window for remembering the last requests, +# while parsing the log. Keyed by (ip, url, status, referer, user-agent, country), +# every requests is checked whether it has been seen in identical form before. StatsDupWindow 200 # that's osuosl.org's Bouncer host StatsIgnoreIP 140.211.167.212 + # silently ignore all files matching this regular expression. +# (the filter is applied to the requested url.) # (all others will be counted and need to match one of the StatsCount expressions.) StatsIgnoreMask "^.*\.(txt|list|html)$" # strip prefixed protocol (normally only sent to proxies, but can occur in the wild) +# (the filter is applied to the requested url.) StatsPrefilter "^http://[^/]+/" "" # remove duplicated slashes _______________________________________________ mirrorbrain-commits mailing list Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/ Note: To remove yourself from this list, send a mail with the content unsubscribe to the address mirrorbrain-commits-request_at_mirrorbrain.org Received on Thu Nov 26 2009 - 13:05:16 GMT
This archive was generated by hypermail 2.2.0 : Thu Nov 26 2009 - 13:45:25 GMT