[mirrorbrain-commits] [mod_stats] r20 - /trunk/tools/dlcount.py

From: <poeml_at_mirrorbrain.org> Date: Tue, 24 Nov 2009 17:16:06 -0000 · This archive was generated by hypermail 2.2.0 : Tue Nov 24 2009 - 17:45:10 GMT

Author: poeml
Date: Tue Nov 24 18:16:05 2009
New Revision: 20

URL: http://svn.mirrorbrain.org/viewvc/mod_stats?rev=20&view=rev
Log:
I'm adding a script (w.i.p.) that parses a MirrorBrain-enhanced access_log and
does the following:

- a little ring buffer filters requests recurring within a certain time window (keyed by ip+url+referer+user-agent)
- strip trailing http://... cruft
- remove duplicated slashes
- remove accidental query strings
- remove a possible .metalink suffix
- remove the /files/ prefix

it filters the requests, keeping only
- GET requests
- with status code 200 or 302
- that do not come from bouncer's IP, which keeps coming back to download all files

It also captures the country where the client requests originate from.

I baked a first regexp which is able to parse most (OpenOffice.org) requests
from /stable and /extended. There are some exceptions (language code with 3
letters) and I didn't take care of /localized yet.

The script should serve as a model implementation for the Apache module which
does the same live.


Added:
    trunk/tools/dlcount.py   (with props)

Added: trunk/tools/dlcount.py
URL: http://svn.mirrorbrain.org/viewvc/mod_stats/trunk/tools/dlcount.py?rev=20&view=auto
==============================================================================

--- trunk/tools/dlcount.py (added)
+++ trunk/tools/dlcount.py Tue Nov 24 18:16:05 2009
@@ -1,0 +1,244 @@
+#!/usr/bin/python
+
+# Analyze Apache logfiles without hogging memory
+# 
+# This script uses Python generators, which means that it never holds a whole
+# log file in memory. It rather works like a Unix pipe.
+# 
+# It transparently opens uncompressed, gzip or bzip2 compressed files.
+# 
+# The implementation is based on David Beazley's great PyCon UK 08 talk about
+# generator tricks for systems programmers.
+#
+#
+#
+# Copyright 2008,2009 Peter Poeml
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License version 2
+# as published by the Free Software Foundation;
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
+
+
+__version__ = '0.9'
+__author__ = 'Peter Poeml <poeml_at_cmdline.net>'
+__copyright__ = 'Peter Poeml <poeml_at_cmdline.net>'     # name capitalised as in __author__
+__license__ = 'GPLv2'
+__url__ = 'http://mirrorbrain.org/'
+
+
+import re
+
+try:
+    set
+except NameError:
+    from sets import Set as set     # Python 2.3 has no builtin set type
+
+try:
+    sorted
+except NameError:
+    def sorted(in_value):           # Python 2.3 has no builtin sorted()
+        "Minimal stand-in: return the items of in_value as a new ascending list"
+        items = list(in_value)
+        items.sort()
+        return items
+
+
+def gen_open(filenames):
+    """Yield an open file object per name, in order (.gz/.bz2 are decompressed
+    transparently); a file is closed once the consumer asks for the next one."""
+    import gzip, bz2
+    for name in filenames:
+        f = (gzip.open if name.endswith(".gz") else bz2.BZ2File if name.endswith(".bz2") else open)(name)
+        try:                # yield inside try/finally needs Python >= 2.5
+            yield f
+        finally:            # also runs when the generator is closed or collected
+            f.close()
+
+def gen_cat(sources):
+    """Flatten an iterable of iterables lazily: yield every item of the
+    first source, then every item of the next one, and so on"""
+    for source in sources:
+        for entry in source:
+            yield entry
+
+
+def gen_grep(pat, lines):
+    """Yield only those lines in which the regular expression pat is found anywhere"""
+    search = re.compile(pat).search     # 're' is imported at module level
+    for candidate in lines:
+        if search(candidate): yield candidate
+
+def gen_fragments(pat, lines):
+    """For every line that the regular expression pat matches from its
+    start (re.match), yield the tuple of captured groups; lines that
+    do not match are dropped silently"""
+    import re
+    match = re.compile(pat).match
+    for hit in (match(line) for line in lines):
+        if hit is not None:
+            yield hit.groups()
+
+
+class RingBuffer:
+    """A simple circular buffer, or ring buffer: a first-in, first-out (FIFO)
+    buffer with a fixed size. Slots that were not written yet hold None.
+
+    Here is an example where the buffer size is 4. Ten integers, 0-9, are appended
+    one at a time; each append removes the first element from the front of the buffer.
+
+    buf = RingBuffer(4)
+    for i in xrange(10):
+        buf.append(i)
+        print buf.get()
+
+    Here are the results:
+
+    [None, None, None, 0]
+    [None, None, 0, 1]
+    [None, 0, 1, 2]
+    [0, 1, 2, 3]
+    [1, 2, 3, 4]
+    [2, 3, 4, 5]
+    [3, 4, 5, 6]
+    [4, 5, 6, 7]
+    [5, 6, 7, 8]
+    [6, 7, 8, 9]
+
+    A buffer of size 0 never stores anything: append() is a no-op and get() returns [].
+    from http://www.saltycrane.com/blog/2007/11/python-circular-buffer/
+    """
+    def __init__(self, size):
+        self.data = [None] * size       # oldest entry first, newest entry last
+
+    def append(self, x):
+        if self.data:                   # size 0: pop(0) would raise IndexError
+            self.data.pop(0)
+            self.data.append(x)
+
+    def get(self):
+        return self.data                # the list itself, not a copy
+
+
+
+def main():
+    """
+    Create a generator pipeline for the matching log file lines
+    and process them.
+
+    The log files named in sys.argv[1:] are read. For each GET request with
+    status 200 or 302 that was not seen among the last DUP_WINDOW accepted
+    requests, the cleaned-up URL is printed, followed by the fields extracted
+    by the regexp in matchlist that matches it, or by '-' if none matches.
+    Exits with a usage message when no log file is given, and with a warning
+    as soon as a URL matches more than one regexp in matchlist.
+    """
+    import re
+    import sys
+    import hashlib
+
+    if not sys.argv[1:]:
+        sys.exit('Usage: dlcount LOGFILE [LOGFILE ...]')
+
+    # best reference about Python regexp: http://www.amk.ca/python/howto/regex/regex.html
+    #
+    # short intro to things that *may* be special:
+    #   (?:   )         non-capturing group
+    #   (?P<foo>    )   named group
+    # (FIXME: need to check if all these are supported in Apache)
+    #
+    # each entry: (regexp for a cleaned-up URL, substitution template to print)
+    matchlist = [
+        # stable/3.1.1/OOo_3.1.1_Win32Intel_install_en-US.exe
+        # stable/3.1.1/OOo_3.1.1_MacOSXIntel_install_en-US.dmg
+        # stable/3.1.1/OOo_3.1.1_Win32Intel_install_wJRE_en-US.exe
+        # extended/3.1.1rc2/OOo_3.1.1rc2_20090820_Win32Intel_langpack_en-ZA.exe      -
+        ( r'^(?:stable|extended)/([^/]+)/(OOo|OOo-SDK)_(?P<realversion>[^_]+(?:_[0-9]+)?)_(.+)_(?P<lang>([a-zA-Z]{2}(-[a-zA-Z]{2})?|binfilter|core|l10n|extensions|system|testautomation))(_deb|_rpm)?\.(exe|dmg|sh|tar\.gz|tar\.bz2)$', r'prod: \2  os: \4  version: \1  realversion: \g<realversion>  lang: \g<lang>'),
+
+        # extended/3.1.1rc2/OOo_3.1.1rc2_20090820_LinuxX86-64_langpack_zh-CN.tar.gz
+        # extended/3.1.1rc2/OOo_3.1.1rc2_20090820_LinuxX86-64_langpack_zh-CN_deb.tar.gz
+
+        # localized/ru/2.4.3/OOo_2.4.3_Win32Intel_install_ru.exe      -
+        # localized/es/2.4.3/OOo_2.4.3_Win32Intel_install_es.exe      -
+    ]
+    # (compiled regexp, substitution template, regexp source for messages)
+    re_matchlist = [(re.compile(match), sub, match) for match, sub in matchlist]
+
+    # number of most recently accepted requests that are checked for duplicates
+    DUP_WINDOW = 200
+    known = RingBuffer(DUP_WINDOW)
+    # FIXME: hard-coded; that's osuosl.org's Bouncer host
+    BOUNCER_IP = '140.211.167.212'
+
+    filenames = sys.argv[1:]
+    logfiles = gen_open(filenames)
+    loglines = gen_cat(logfiles)
+
+    # 123.123.123.123 - - [23/Nov/2009:18:19:14 +0100] "GET /files/stable/3.1.1/OOo_3.1.1_MacOSXIntel_install_en-US.dmg HTTP/1.1" 302 399 "http://download.openoffice.org/all_rc.html" "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 1.1.4322; .NET CLR 3.5.30729; .NET CLR 3.0.30618)" ftp.astral.ro r:country 913 844 EU:RO ASN:9050 P:92.81.0.0/16 size:24661382 -
+    # 200 is returned for files that are not on mirrors, and for metalinks
+    pat = r'^(\S+).+"GET (\S*) HTTP.*" (200|302) [^"]+ "([^"]*)" "([^"]*)".* \w\w:(\w\w) ASN:'
+    reqs = gen_fragments(pat, loglines)
+
+    re_strip_protocol = re.compile(r'^http://[^/]+/')
+    re_single_slashes = re.compile(r'/+')
+    re_strip_queries = re.compile(r'\?.*')
+    re_strip_prefix = re.compile(r'^/files/')
+    re_strip_metalink = re.compile(r'\.metalink$')
+
+    for req in reqs:
+
+        (ip, url, status, referer, ua, country) = req
+
+        if ip == BOUNCER_IP:
+            continue
+
+        # over a window of DUP_WINDOW last requests, the same request must
+        # not have occurred already. The key is a digest of all captured
+        # fields (status and country included), taken before the URL cleanup.
+        md = hashlib.md5(repr(req)).digest()
+
+        # was the request seen recently? If yes, ignore it.
+        # otherwise, put it into the ring buffer.
+        if md in known.data:
+            continue
+        known.append(md)
+
+        # note that we could use .replace() for many of these, but for compatibility with
+        # an Apache module in C we'll follow a pure regex-based approach
+        url = re_strip_protocol.sub('', url)
+        url = re_single_slashes.sub('/', url)
+        # FIXME: should we rather ignore requests with query string?
+        url = re_strip_queries.sub('', url)
+        url = re_strip_prefix.sub('', url)
+        url = re_strip_metalink.sub('', url)
+
+        # trailing comma: stay on this line, the match result is appended below
+        print '%-80s ' % url,
+
+        matched = False
+        for regex, template, source in re_matchlist:
+            # test the URL first: only a further regexp that really matches
+            # is an ambiguity worth aborting for
+            if not regex.match(url):
+                continue
+            if matched:
+                sys.exit('warning: %r matches\n   %r\nbut already matched a pevious regexp:\n   %r' % (url, source, matched))
+            print regex.sub(template, url)
+            matched = source
+        if not matched:
+            print '-'
+
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()
+




_______________________________________________
mirrorbrain-commits mailing list
Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/

Note: To remove yourself from this list, send a mail with the content
 	unsubscribe
to the address mirrorbrain-commits-request_at_mirrorbrain.org