Author: poeml Date: Tue Mar 9 21:25:30 2010 New Revision: 7967 URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain?rev=7967&view=rev Log: mb makehashes: - implement saving hashes to the database - add --force option to force refreshing all hashes - better --base-dir example Modified: trunk/mirrordoctor/mb/conn.py trunk/mirrordoctor/mb/hashes.py trunk/mirrordoctor/mirrordoctor.py trunk/sql/schema-postgresql.sql Modified: trunk/mirrordoctor/mb/conn.py URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/mirrordoctor/mb/conn.py?rev=7967&r1=7966&r2=7967&view=diff ============================================================================== --- trunk/mirrordoctor/mb/conn.py (original) +++ trunk/mirrordoctor/mb/conn.py Tue Mar 9 21:25:30 2010 _at_@ -162,9 +162,19 @@ # to be installed as well pass + try: + class Hash(SQLObject): + """the hashes table""" + class sqlmeta: + fromDatabase = True + idName = 'file_id' + self.Hash = Hash + except psycopg2.ProgrammingError: + # this is raised if the table hasn't been installed yet + pass + if debug: self.Server._connection.debug = True - def servertext2dict(s): Modified: trunk/mirrordoctor/mb/hashes.py URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/mirrordoctor/mb/hashes.py?rev=7967&r1=7966&r2=7967&view=diff ============================================================================== --- trunk/mirrordoctor/mb/hashes.py (original) +++ trunk/mirrordoctor/mb/hashes.py Tue Mar 9 21:25:30 2010 _at_@ -1,5 +1,6 @@ #!/usr/bin/python +import sys import os import os.path import stat _at_@ -22,7 +23,8 @@ class Hasheable: """represent a file and its metadata""" - def __init__(self, basename, src_dir=None, dst_dir=None): + def __init__(self, basename, src_dir=None, dst_dir=None, + base_dir=None): self.basename = basename if src_dir: self.src_dir = src_dir _at_@ -30,6 +32,8 @@ self.src_dir = os.path.dirname(self.basename) self.src = os.path.join(src_dir, self.basename) + self.base_dir = base_dir + self.src_rel = 
os.path.join(src_dir[len(base_dir):], self.basename).lstrip('/') self.finfo = os.lstat(self.src) self.atime = self.finfo.st_atime _at_@ -43,6 +47,8 @@ self.dst_basename = '%s.size_%s' % (self.basename, self.size) self.dst = os.path.join(self.dst_dir, self.dst_basename) + self.hb = HashBag(src = self.src) + def islink(self): return stat.S_ISLNK(self.mode) def isreg(self): _at_@ -50,7 +56,9 @@ def isdir(self): return stat.S_ISDIR(self.mode) - def do_hashes(self, verbose=False, dry_run=False, copy_permissions=True): + + def check_file(self, verbose=False, dry_run=False, force=False, copy_permissions=True): + """check whether the hashes stored on disk are up to date""" try: dst_statinfo = os.stat(self.dst) dst_mtime = dst_statinfo.st_mtime _at_@ -58,26 +66,24 @@ except OSError: dst_mtime = dst_size = 0 # file missing - if int(dst_mtime) == int(self.mtime) and dst_size != 0: + if int(dst_mtime) == int(self.mtime) and dst_size != 0 and not force: if verbose: - print 'Up to date: %r' % self.dst + print 'Up to date hash file: %r' % self.dst return if dry_run: print 'Would make hashes for: ', self.src return - digests = Digests(src = self.src) - - # if present, grab PGP signature - if os.path.exists(self.src + '.asc'): - digests.pgp = open(self.src + '.asc').read() - - digests.read() + if self.hb.empty: + self.hb.fill(verbose=verbose) d = open(self.dst, 'wb') - d.write(digests.dump_2_12_template()) + d.write(self.hb.dump_2_12_template()) d.close() + + if verbose: + print 'Hash file updated: %r' % self.dst os.utime(self.dst, (self.atime, self.mtime)) _at_@ -85,6 +91,77 @@ os.chmod(self.dst, self.mode) else: os.chmod(self.dst, 0644) + + + def check_db(self, conn, verbose=False, dry_run=False, force=False): + """check if the hashes that are stored in the database are up to date + + for performance, this function talks very low level to the database""" + # get a database cursor, but make it persistent which is faster + try: + conn.mycursor + except AttributeError: + 
conn.mycursor = conn.Hash._connection.getConnection().cursor() + c = conn.mycursor + + c.execute("SELECT id FROM filearr WHERE path = %s LIMIT 1", + [self.src_rel]) + res = c.fetchone() + if not res: + print 'file %r not found (no mirror has it?)' % self.src_rel + ### XXX we'd need to insert it, if we want to support hashes for files that are not on any mirror... + return + file_id = res[0] + + c.execute("SELECT file_id, mtime, size FROM hash WHERE file_id = %s LIMIT 1", + [file_id]) + res = c.fetchone() + + if not res: + + if self.hb.empty: + self.hb.fill(verbose=verbose) + + c.execute("""INSERT INTO hash (file_id, mtime, size, md5, + sha1, sha256, sha1piecesize, + sha1pieces, pgp) + VALUES (%s, %s, %s, + decode(%s, 'hex'), decode(%s, 'hex'), + decode(%s, 'hex'), %s, decode(%s, 'hex'), + %s )""", + [file_id, self.mtime, self.size, + self.hb.md5hex, + self.hb.sha1hex, + self.hb.sha256hex or '', + PIECESIZE, + ''.join(self.hb.pieceshex), + self.hb.pgp or '']) + print 'hash was not present yet in database - inserted' + else: + mtime, size = res[1], res[2] + if int(self.mtime) == mtime and self.size == size and not force: + if verbose: + print 'Up to date in db: %r' % self.src_rel + return + c.execute("""UPDATE hash set mtime = %s, size = %s, + md5 = decode(%s, 'hex'), + sha1 = decode(%s, 'hex'), + sha256 = decode(%s, 'hex'), + sha1piecesize = %s, + sha1pieces = decode(%s, 'hex'), + pgp = %s + WHERE file_id = %s""", + [int(self.mtime), self.size, + self.hb.md5hex, self.hb.sha1hex, self.hb.sha256hex or '', + PIECESIZE, ''.join(self.hb.pieceshex), + self.hb.pgp or '', + file_id]) + if verbose: + print 'Hash updated in database for %r' % self.src_rel + + c.execute('commit') + + #def __eq__(self, other): # return self.basename == other.basename _at_@ -96,7 +173,8 @@ -class Digests(): +class HashBag(): + def __init__(self, src): self.src = src self.basename = os.path.basename(src) _at_@ -104,13 +182,23 @@ self.md5 = None self.sha1 = None self.sha256 = None + self.md5hex = 
None + self.sha1hex = None + self.sha256hex = None self.pgp = None self.npieces = 0 self.pieces = [] - - - def read(self): + self.pieceshex = [] + + self.empty = True + + def fill(self, verbose=False): + verbose = True # XXX + if verbose: + sys.stdout.write('Hashing %r... ' % self.src) + sys.stdout.flush() + m = md5.md5() s1 = sha1.sha1() s256 = sha256.sha256() _at_@ -133,22 +221,36 @@ s256.update(buf) self.npieces += 1 - self.pieces.append(hashlib.sha1(buf).hexdigest()) + self.pieces.append(hashlib.sha1(buf).digest()) + self.pieceshex.append(hashlib.sha1(buf).hexdigest()) f.close() - self.md5 = m.hexdigest() - self.sha1 = s1.hexdigest() - self.sha256 = s256.hexdigest() + self.md5 = m.digest() + self.sha1 = s1.digest() + self.sha256 = s256.digest() + self.md5hex = m.hexdigest() + self.sha1hex = s1.hexdigest() + self.sha256hex = s256.hexdigest() + + # if present, grab PGP signature + if os.path.exists(self.src + '.asc'): + self.pgp = open(self.src + '.asc').read() + + self.empty = False + + if verbose: + sys.stdout.write('done.\n') + def dump_raw(self): r = [] - for i in self.pieces: + for i in self.pieceshex: r.append('piece %s' % i) - r.append('md5 %s' % self.md5) - r.append('sha1 %s' % self.sha1) + r.append('md5 %s' % self.md5hex) + r.append('sha1 %s' % self.sha1hex) if sha256: - r.append('sha256 %s' % self.sha256) + r.append('sha256 %s' % self.sha256hex) return '\n'.join(r) _at_@ -164,9 +266,9 @@ r.append(""" <verification> <hash type="md5">%s</hash> - <hash type="sha1">%s</hash>""" % (self.md5, self.sha1)) + <hash type="sha1">%s</hash>""" % (self.md5hex, self.sha1hex)) if self.sha256: - r.append(' <hash type="sha256">%s</hash>' % (self.sha256)) + r.append(' <hash type="sha256">%s</hash>' % (self.sha256hex)) if self.pgp: r.append(' <signature type="pgp" file="%s.asc">' % self.basename) _at_@ -176,7 +278,7 @@ r.append(' <pieces length="%s" type="sha1">' % (PIECESIZE)) n = 0 - for piece in self.pieces: + for piece in self.pieceshex: r.append(' <hash 
piece="%s">%s</hash>' % (n, piece)) n += 1 Modified: trunk/mirrordoctor/mirrordoctor.py URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/mirrordoctor/mirrordoctor.py?rev=7967&r1=7966&r2=7967&view=diff ============================================================================== --- trunk/mirrordoctor/mirrordoctor.py (original) +++ trunk/mirrordoctor/mirrordoctor.py Tue Mar 9 21:25:30 2010 _at_@ -827,6 +827,8 @@ + _at_cmdln.option('--force', action='store_true', + help='force refreshing all cached hashes') _at_cmdln.option('-n', '--dry-run', action='store_true', help='don\'t actually do anything, just show what would be done') _at_cmdln.option('--copy-permissions', action='store_true', _at_@ -842,7 +844,8 @@ 'If matching a directory, the directory is ignored and ' 'deleted in the target tree.') _at_cmdln.option('-b', '--base-dir', metavar='PATH', - help='set the base directory (so that you can work on a subdirectory)') + help='set the base directory (so that you can work on a ' + 'subdirectory -- see examples)') _at_cmdln.option('-t', '--target-dir', metavar='PATH', help='set a different target directory') _at_cmdln.option('-v', '--verbose', action='store_true', _at_@ -860,9 +863,14 @@ -i '^.*/repoview/.*$' mb makehashes \\ + -t /srv/metalink-hashes/samba/srv/mirrors/samba \\ + -b /srv/mirrors/samba \\ + /srv/mirrors/samba/pub/samba/xfertest + + mb makehashes \\ -f '.*.(torrent|iso)$' \\ - -t /var/lib/apache2/metalink-hashes/srv/ftp/pub/opensuse/distribution/11.0/iso \\ - -b /srv/ftp-stage/pub/opensuse/distribution/11.0/iso \\ + -t /var/lib/apache2/metalink-hashes/srv/ftp/pub/opensuse \\ + -b /srv/ftp-stage/pub/opensuse \\ /srv/ftp-stage/pub/opensuse/distribution/11.0/iso \\ -n _at_@ -874,6 +882,7 @@ import fcntl import errno import re + import shutil import mb.hashes if not opts.target_dir: _at_@ -937,7 +946,7 @@ src_basenames = set(os.listdir(src_dir)) if opts.verbose: - print 'looking at', src_dir + print 'Examining directory', src_dir dst_keep = set() 
dst_keep.add('LOCK') _at_@ -975,7 +984,8 @@ try: hasheable = mb.hashes.Hasheable(src_basename, src_dir=src_dir, - dst_dir=dst_dir) + dst_dir=dst_dir, + base_dir=opts.base_dir) except OSError, e: if e.errno == errno.ENOENT: sys.stderr.write('File vanished: %r\n' % src) _at_@ -990,9 +1000,14 @@ if not opts.file_mask or re.match(opts.file_mask, src_basename): #if opts.verbose: # print 'dst:', dst - hasheable.do_hashes(verbose=opts.verbose, + hasheable.check_file(verbose=opts.verbose, dry_run=opts.dry_run, + force=opts.force, copy_permissions=opts.copy_permissions) + hasheable.check_db(conn=self.conn, + verbose=opts.verbose, + dry_run=opts.dry_run, + force=opts.force) dst_keep.add(hasheable.dst_basename) elif hasheable.isdir(): Modified: trunk/sql/schema-postgresql.sql URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/sql/schema-postgresql.sql?rev=7967&r1=7966&r2=7967&view=diff ============================================================================== --- trunk/sql/schema-postgresql.sql (original) +++ trunk/sql/schema-postgresql.sql Tue Mar 9 21:25:30 2010 _at_@ -20,6 +20,32 @@ ); -- -------------------------------------------------------- + + +CREATE TABLE "hash" ( + "id" INTEGER REFERENCES filearr PRIMARY KEY, + "mtime" INTEGER NOT NULL, + "size" INTEGER NOT NULL, + "md5" BYTEA NOT NULL, + "sha1" BYTEA NOT NULL, + "sha256" BYTEA NOT NULL, + "sha1piecesize" INTEGER NOT NULL, + "sha1pieces" BYTEA, + "pgp" TEXT NOT NULL +); + +CREATE VIEW hexhash AS + SELECT file_id, mtime, size, + encode(md5, 'hex') AS md5, + encode(sha1, 'hex') AS sha1, + encode(sha256, 'hex') AS sha256, + sha1piecesize, + encode(sha1pieces, 'hex') AS sha1pieces, + pgp + FROM hash; + +-- -------------------------------------------------------- + CREATE TABLE "server" ( "id" serial NOT NULL PRIMARY KEY, _______________________________________________ mirrorbrain-commits mailing list Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/ Note: To remove yourself from this list, send a 
mail with the content "unsubscribe" to the address mirrorbrain-commits-request@mirrorbrain.org. Received on Tue Mar 09 2010 - 20:25:34 GMT
This archive was generated by hypermail 2.3.0 : Mon Feb 20 2012 - 23:47:04 GMT