[mirrorbrain-commits] [mod_stats] r66 - /trunk/tools/ooo.conf

From: <poeml_at_mirrorbrain.org>
Date: Fri, 18 Dec 2009 02:09:05 -0000
Author: poeml
Date: Fri Dec 18 03:09:04 2009
New Revision: 66

URL: http://svn.mirrorbrain.org/viewvc/mod_stats?rev=66&view=rev
Log:
ooo.conf parsing regexps: 
- cleanups and documentation 
- order all directives in the order of processing

Modified:
    trunk/tools/ooo.conf

Modified: trunk/tools/ooo.conf
URL: http://svn.mirrorbrain.org/viewvc/mod_stats/trunk/tools/ooo.conf?rev=66&r1=65&r2=66&view=diff
==============================================================================
--- trunk/tools/ooo.conf (original)
+++ trunk/tools/ooo.conf Fri Dec 18 03:09:04 2009
@@ -3,36 +3,75 @@
 # regexps must be in double quotes. Double quotes can be backslash-quoted.
 
 
-# consider only log lines that match the following expression
-# at the same time, the match groups determine splitting of the log file into fragments with meaning.
+# good reference about Python regexp: http://www.amk.ca/python/howto/regex/regex.html
 #
+# short intro to things that may be special to Python: 
+#   (?:   )         non-capturing group
+#   (?P<foo>    )   named group
+# (FIXME: need to check if all these are supported in Apache)
+#
+
+
+
+# This directive applies only to the "offline parsing" script. Apache doesn't see
+# the log line before it constructs and writes it at the end of request processing.
+# Thus, Apache ignores this directive.
+#
+# It serves to 
+#   1) split a line of the log file into the relevant fragments
+#   2) ignore log lines that don't match
+#
+# The expression needs to result in the following seven match groups:
+# (IP, timestamp, url, status, referer, ua, country)
+# FIXME: country should be optional, because it occurs only in a MirrorBrain logfile
+# 
 # 123.123.123.123 - - [23/Nov/2009:18:19:14 +0100] "GET /files/stable/3.1.1/OOo_3.1.1_MacOSXIntel_install_en-US.dmg HTTP/1.1" 302 399 "http://download.openoffice.org/all_rc.html" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 1.1.4322; .NET CLR 3.5.30729; .NET CLR 3.0.30618)" ftp.astral.ro r:country 913 844 EU:RO ASN:9050 P:92.81.0.0/16 size:24661382 -
 #
 # 200 is returned for files that are not on mirrors, and for metalinks
 #
-# the expression needs to result into the following six match groups:
-# (ip, timestamp, url, status, referer, ua, country)
 StatsLogMask "^(\S+).+\[(.*?)\] \"GET (\S*) HTTP.*\" (200|302) [^\"]+ \"([^\"]*)\" \"([^\"]*)\".* \w\w:(\w\w) ASN:"
 
+# FIXME: we should have a separate directive that determines which status codes
+#        are considered for logging, which is read both by the script and by
+#        Apache
+#
+#StatsLogStatus 200
+#StatsLogStatus 302
+
+
+#
+# Before doing anything else, silently ignore all files matching the following
+# regular expressions.  (All others will be considered for counting.)
+# 
+
+# ignore all requests that come with query string
+StatsIgnoreMask ".*\?.*"
+# 
+# ignore files with these endings
+StatsIgnoreMask "^.*\.(txt|list|html)$"
+
+
+# Ignore all requests from this host (string prefix match), and don't process
+# the log lines at all.
+StatsIgnoreIP 140.211.167.212
+
+
+#
+# Drop recurring identical requests
+#
 
 # define the size of a sliding window for remembering the last requests,
 # while parsing the log. Keyed by (ip, url, referer, user-agent), 
 # every requests is checked whether it has been seen in identical form before.
 StatsDupWindow 200
 
-# that's osuosl.org's Bouncer host
-StatsIgnoreIP 140.211.167.212
+# FIXME: Apache will have to filter on time instead of number of requests,
+#        for practical reasons (memcached automates this nicely)
 
 
-# silently ignore all files matching this regular expression.
-# (the filter is applied to the requested url.)
-# (all others will be counted and need to match one of the StatsCount expressions.)
-# 
-# ignore all requests that come with query string
-StatsIgnoreMask ".*\?.*"
-# 
-# ignore files with these endings
-StatsIgnoreMask "^.*\.(txt|list|html)$"
+#
+# Apply the following series of filters to the request URL
+#
 
 # strip prefixed protocol (normally only sent to proxies, but can occur in the wild)
 # (the filter is applied to the requested url.)
@@ -57,14 +96,10 @@
 StatsPrefilter "zh-cn" "zh-CN"
 StatsPrefilter "zh-tw" "zh-TW"
 
-
-# best reference about Python regexp: http://www.amk.ca/python/howto/regex/regex.html
 #
-# short intro to things that *may* be special: 
-#   (?:   )         non-capturing group
-#   (?P<foo>    )   named group
-# (FIXME: need to check if all these are supported in Apache)
+# StatsCount translates the remaining URL into the pieces to be logged
 #
+# FIXME: This assumes that there need to be 4 pieces. Fewer should work as well.
 
 # stable/3.1.1/OOo_3.1.1_Win32Intel_install_en-US.exe
 # stable/3.1.1/OOo_3.1.1_MacOSXIntel_install_en-US.dmg
@@ -102,9 +137,13 @@
 
 
 
-# filtering to be applied after parsing (but before counting)
+#
+# Filters to be applied after parsing (but still before counting)
+#
+
 #StatsPostfilter "foo" "bar"
 StatsPostfilter "(prod|os|version|lang): " ""
+# FIXME: we didn't need to add those words in the first place; not needed at all.
 
 
 




_______________________________________________
mirrorbrain-commits mailing list
Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/

Note: To remove yourself from this list, send a mail with the content
 	unsubscribe
to the address mirrorbrain-commits-request_at_mirrorbrain.org
Received on Fri Dec 18 2009 - 02:09:07 GMT

This archive was generated by hypermail 2.3.0 : Mon Feb 20 2012 - 23:47:04 GMT