Author: poeml Date: Fri Dec 18 02:31:45 2009 New Revision: 65 URL: http://svn.mirrorbrain.org/viewvc/mod_stats?rev=65&view=rev Log: improved performance of saving data to the database. Before, every line to be counted meant a database hit. Now, counting happens internally and the results are saved to the database at the end. The script parses ~3000 lines/s, and saving takes a constant time (about 10 ms / counter, which isn't bad for an ORM). Modified: trunk/tools/dlcount.py Modified: trunk/tools/dlcount.py URL: http://svn.mirrorbrain.org/viewvc/mod_stats/trunk/tools/dlcount.py?rev=65&r1=64&r2=65&view=diff ============================================================================== --- trunk/tools/dlcount.py (original) +++ trunk/tools/dlcount.py Fri Dec 18 02:31:45 2009 @@ -241,24 +241,43 @@ return conf +#class Countable(): +# """This holds a result from a parsed log line +# which consists of a date and 5 attributes""" +# #def __init__(self, date, a0, a1, a2, a3, a4): +# def __init__(self, (date, a0, a1, a2, a3, a4, a5)): +# self.date = date +# self.a0 = a0 +# self.a1 = a1 +# self.a2 = a2 +# self.a3 = a3 +# self.a4 = a4 +# self.a5 = a5 class Req(): + """This helps us in housekeeping while parsing a log line""" def __init__(self): # url_raw contains the original url, if needed self.url_raw = None self.tstamp = None self.tstamp_raw = None + self.date = None self.status = None self.referer = None self.ua = None self.country = None + # this is the processed URL, after running through all the regexps self.url = None self.countable = False def __str__(self): return '%-80s' % self.url + def as_tuple(self): + return self.tuple +# def as_obj(self): +# return Countable(self.tuple) def gen_processreqs(reqs, conf): @@ -331,6 +350,15 @@ rq.url = url + # would time.strftime("%Y-%m-%d", ...) be faster? 
+ rq.date = datetime(rq.tstamp[0], rq.tstamp[1], rq.tstamp[2]) + + rq.tuple = [rq.date] + rq.tuple.extend(rq.url.split()) + # the country is our fifth attribute + rq.tuple.append(rq.country) + rq.tuple = tuple(rq.tuple) + rq.countable = True yield rq @@ -401,32 +429,47 @@ # see below, in the loop # http://docs.djangoproject.com/en/dev/faq/models/#why-is-django-leaking-memory + + start = time.time() + + counterdict = {} + n = 0 + get = counterdict.get for item in items: - if item.countable: - - - #print item.country, item.url - (product, osname, version, lang) = item.url.split() - - # d = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(event_epoch)) - d = datetime(item.tstamp[0], item.tstamp[1], item.tstamp[2]) - #print d, (product, osname, version, lang), item.country - if options.db: - - if downloadstats.settings.DEBUG: - db.reset_queries() - - c, created = Counter.objects.get_or_create(date=d, - product=product, osname=osname, version=version, lang=lang, - country=item.country) - if created: - # count is 1 for a new item - pass - else: - # item existed already - increase its counter - c.count += 1 - c.save() - + if not item.countable: + continue + + t = item.as_tuple() + n += 1 + counterdict[t] = get(t, 0) + 1 + + delta = time.time() - start + print 'processed %s lines in %s seconds' % (n, delta) + print 'found %s countables' % len(counterdict) + start = time.time() + + + if options.db: + for key, val in counterdict.iteritems(): + + (date, a0, a1, a2, a3, a4) = key + + if downloadstats.settings.DEBUG: + db.reset_queries() + + counter, created = Counter.objects.get_or_create(date=date, + product=a0, osname=a1, version=a2, lang=a3, + country=a4) + if created: + # count is 1 for a new item + counter.count = val + else: + # item existed already - increase its counter + counter.count += val + counter.save() + + delta = time.time() - start + print 'saved data in %s seconds' % delta sys.exit(0) _______________________________________________ mirrorbrain-commits 
mailing list Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/ Note: To remove yourself from this list, send a mail with the content unsubscribe to the address mirrorbrain-commits-request_at_mirrorbrain.org
Received on Fri Dec 18 2009 - 01:31:48 GMT
This archive was generated by hypermail 2.2.0 : Fri Dec 18 2009 - 01:45:29 GMT