Author: poeml Date: Sat Mar 27 03:11:31 2010 New Revision: 8015 URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain?rev=8015&view=rev Log: mb makehashes: - add code for generating and storing zsync hashes Modified: trunk/mirrordoctor/mb/hashes.py Modified: trunk/mirrordoctor/mb/hashes.py URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/mirrordoctor/mb/hashes.py?rev=8015&r1=8014&r2=8015&view=diff ============================================================================== --- trunk/mirrordoctor/mb/hashes.py (original) +++ trunk/mirrordoctor/mb/hashes.py Sat Mar 27 03:11:31 2010 @@ -4,6 +4,8 @@ import os import os.path import stat +import zsync +import binascii try: import hashlib @@ -21,6 +23,9 @@ PIECESIZE = 262144 +# must be a multiple of 2048 and 4096 for zsync checksumming +assert PIECESIZE % 4096 == 0 + class Hasheable: """represent a file and its metadata""" @@ -48,7 +53,7 @@ self.dst_basename = '%s.size_%s' % (self.basename, self.size) self.dst = os.path.join(self.dst_dir, self.dst_basename) - self.hb = HashBag(src = self.src) + self.hb = HashBag(src=self.src, parent=self) def islink(self): return stat.S_ISLNK(self.mode) @@ -105,6 +110,7 @@ conn.mycursor = conn.Hash._connection.getConnection().cursor() c = conn.mycursor + c.execute("SELECT id FROM filearr WHERE path = %s LIMIT 1", [self.src_rel]) res = c.fetchone() @@ -134,18 +140,23 @@ c.execute("""INSERT INTO hash (file_id, mtime, size, md5, sha1, sha256, sha1piecesize, - sha1pieces, pgp) + sha1pieces, pgp, zblocksize, + zhashlens, zsums) VALUES (%s, %s, %s, decode(%s, 'hex'), decode(%s, 'hex'), decode(%s, 'hex'), %s, decode(%s, 'hex'), - %s )""", + %s, %s, %s, decode(%s, 'hex'))""", [file_id, int(self.mtime), self.size, self.hb.md5hex, self.hb.sha1hex, self.hb.sha256hex or '', PIECESIZE, ''.join(self.hb.pieceshex), - self.hb.pgp or '']) + self.hb.pgp or '', + self.hb.zblocksize, + '%s,%s,%s' % (self.hb.zseq_matches, self.hb.zrsum_len, self.hb.zchecksum_len), +
binascii.hexlify(''.join(self.hb.zsums))] + ) if verbose: print 'Hash was not present yet in database - inserted' else: @@ -165,25 +176,24 @@ sha256 = decode(%s, 'hex'), sha1piecesize = %s, sha1pieces = decode(%s, 'hex'), - pgp = %s + pgp = %s, + zblocksize = %s, + zhashlens = %s, + zsums = decode(%s, 'hex') WHERE file_id = %s""", [int(self.mtime), self.size, self.hb.md5hex, self.hb.sha1hex, self.hb.sha256hex or '', PIECESIZE, ''.join(self.hb.pieceshex), self.hb.pgp or '', + self.hb.zblocksize, + '%s,%s,%s' % (self.hb.zseq_matches, self.hb.zrsum_len, self.hb.zchecksum_len), + binascii.hexlify(''.join(self.hb.zsums)), file_id]) if verbose: print 'Hash updated in database for %r' % self.src_rel c.execute('commit') - - - #def __eq__(self, other): - # return self.basename == other.basename - #def __eq__(self, basename): - # return self.basename == basename - def __str__(self): return self.basename @@ -191,9 +201,10 @@ class HashBag(): - def __init__(self, src): + def __init__(self, src, parent=None): self.src = src self.basename = os.path.basename(src) + self.h = parent self.md5 = None self.sha1 = None @@ -207,6 +218,8 @@ self.pieces = [] self.pieceshex = [] + self.zsums = [] + self.empty = True def fill(self, verbose=False): @@ -215,6 +228,8 @@ sys.stdout.write('Hashing %r... 
' % self.src) sys.stdout.flush() + self.zs_guess_zsync_params() + m = md5.md5() s1 = sha1.sha1() if sha256: @@ -241,6 +256,8 @@ self.npieces += 1 self.pieces.append(sha1.sha1(buf).digest()) self.pieceshex.append(sha1.sha1(buf).hexdigest()) + + self.zs_get_block_sums(buf) f.close() @@ -255,6 +272,8 @@ # if present, grab PGP signature if os.path.exists(self.src + '.asc'): self.pgp = open(self.src + '.asc').read() + + #print len(self.zsums) self.empty = False @@ -306,3 +325,68 @@ return '\n'.join(r) + def zs_guess_zsync_params(self): + import math + + size = self.h.size + if size < 100000000: + blocksize = 2048 + else: + blocksize = 4096 + + # Decide how long a rsum hash and checksum hash per block we need for this file + if size > blocksize: + seq_matches = 2 + else: + seq_matches = 1 + + rsum_len = math.ceil(((math.log(size) + math.log(blocksize)) / math.log(2) - 8.6) / seq_matches / 8) + + # min and max lengths of rsums to store + if rsum_len > 4: + rsum_len = 4 + if rsum_len < 2: + rsum_len = 2 + + # Now the checksum length; min of two calculations + checksum_len = math.ceil( + (20 + (math.log(size) + math.log(1 + size / blocksize)) / math.log(2)) + / seq_matches / 8) + checksum_len2 = (7.9 + (20 + math.log(1 + size / blocksize) / math.log(2))) / 8 + + if checksum_len < checksum_len2: + checksum_len = checksum_len2 + + self.zblocksize = blocksize + self.zseq_matches = seq_matches + self.zrsum_len = int(rsum_len) + self.zchecksum_len = int(checksum_len) + + #print '%s: %s,%s,%s' % (self.zblocksize, self.zseq_matches, self.zrsum_len, self.zchecksum_len) + + + + def zs_get_block_sums(self, buf): + + offset = 0 + while 1: + block = buf[ offset : offset + self.zblocksize ] + offset += self.zblocksize + if not block: + #print 'last.' 
+ break + + # padding + if len(block) < self.zblocksize: + block = block + ( '\x00' * ( self.zblocksize - len(block) ) ) + + md4 = hashlib.new('md4') + md4.update(block) + c = md4.digest() + + r = zsync.rsum06(block) + + self.zsums.append( r[-self.zrsum_len:] ) # save only some trailing bytes + self.zsums.append( c[0:self.zchecksum_len] ) # save only some leading bytes + + _______________________________________________ mirrorbrain-commits mailing list Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/ Note: To remove yourself from this list, send a mail with the content unsubscribe to the address mirrorbrain-commits-request_at_mirrorbrain.org Received on Sat Mar 27 2010 - 02:11:33 GMT
This archive was generated by hypermail 2.3.0 : Mon Feb 20 2012 - 23:47:04 GMT