[mirrorbrain-commits] r7967 - in /trunk: mirrordoctor/ mirrordoctor/mb/ sql/

From: <poeml_at_mirrorbrain.org>
Date: Tue, 09 Mar 2010 20:25:32 -0000
Author: poeml
Date: Tue Mar  9 21:25:30 2010
New Revision: 7967

URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain?rev=7967&view=rev
Log:
mb makehashes: 
- implement saving hashes to the database
- add --force option to force refreshing all hashes
- better --base-dir example

Modified:
    trunk/mirrordoctor/mb/conn.py
    trunk/mirrordoctor/mb/hashes.py
    trunk/mirrordoctor/mirrordoctor.py
    trunk/sql/schema-postgresql.sql

Modified: trunk/mirrordoctor/mb/conn.py
URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/mirrordoctor/mb/conn.py?rev=7967&r1=7966&r2=7967&view=diff
==============================================================================
--- trunk/mirrordoctor/mb/conn.py (original)
+++ trunk/mirrordoctor/mb/conn.py Tue Mar  9 21:25:30 2010
@@ -162,9 +162,19 @@
             # to be installed as well
             pass
 
+        try:
+            class Hash(SQLObject):
+                """the hashes table"""
+                class sqlmeta:
+                    fromDatabase = True
+                    idName = 'file_id'
+            self.Hash = Hash
+        except psycopg2.ProgrammingError:
+            # this is raised if the table hasn't been installed yet
+            pass
+
         if debug:
             self.Server._connection.debug = True
-
 
 
 def servertext2dict(s):

Modified: trunk/mirrordoctor/mb/hashes.py
URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/mirrordoctor/mb/hashes.py?rev=7967&r1=7966&r2=7967&view=diff
==============================================================================
--- trunk/mirrordoctor/mb/hashes.py (original)
+++ trunk/mirrordoctor/mb/hashes.py Tue Mar  9 21:25:30 2010
@@ -1,5 +1,6 @@
 #!/usr/bin/python
 
+import sys
 import os
 import os.path
 import stat
@@ -22,7 +23,8 @@
 
 class Hasheable:
     """represent a file and its metadata"""
-    def __init__(self, basename, src_dir=None, dst_dir=None):
+    def __init__(self, basename, src_dir=None, dst_dir=None,
+                 base_dir=None):
         self.basename = basename
         if src_dir:
             self.src_dir = src_dir
@@ -30,6 +32,8 @@
             self.src_dir = os.path.dirname(self.basename)
 
         self.src = os.path.join(src_dir, self.basename)
+        self.base_dir = base_dir
+        self.src_rel = os.path.join(src_dir[len(base_dir):], self.basename).lstrip('/')
 
         self.finfo = os.lstat(self.src)
         self.atime = self.finfo.st_atime
@@ -43,6 +47,8 @@
         self.dst_basename = '%s.size_%s' % (self.basename, self.size)
         self.dst = os.path.join(self.dst_dir, self.dst_basename)
 
+        self.hb = HashBag(src = self.src)
+
     def islink(self):
         return stat.S_ISLNK(self.mode)
     def isreg(self):
@@ -50,7 +56,9 @@
     def isdir(self):
         return stat.S_ISDIR(self.mode)
 
-    def do_hashes(self, verbose=False, dry_run=False, copy_permissions=True):
+
+    def check_file(self, verbose=False, dry_run=False, force=False, copy_permissions=True):
+        """check whether the hashes stored on disk are up to date"""
         try:
             dst_statinfo = os.stat(self.dst)
             dst_mtime = dst_statinfo.st_mtime
@@ -58,26 +66,24 @@
         except OSError:
             dst_mtime = dst_size = 0 # file missing
 
-        if int(dst_mtime) == int(self.mtime) and dst_size != 0:
+        if int(dst_mtime) == int(self.mtime) and dst_size != 0 and not force:
             if verbose:
-                print 'Up to date: %r' % self.dst
+                print 'Up to date hash file: %r' % self.dst
             return 
 
         if dry_run: 
             print 'Would make hashes for: ', self.src
             return
 
-        digests = Digests(src = self.src)
-
-        # if present, grab PGP signature
-        if os.path.exists(self.src + '.asc'):
-            digests.pgp = open(self.src + '.asc').read()
-
-        digests.read()
+        if self.hb.empty:
+            self.hb.fill(verbose=verbose)
 
         d = open(self.dst, 'wb')
-        d.write(digests.dump_2_12_template())
+        d.write(self.hb.dump_2_12_template())
         d.close()
+
+        if verbose:
+            print 'Hash file updated: %r' % self.dst
 
         os.utime(self.dst, (self.atime, self.mtime))
 
@@ -85,6 +91,77 @@
             os.chmod(self.dst, self.mode)
         else:
             os.chmod(self.dst, 0644)
+
+
+    def check_db(self, conn, verbose=False, dry_run=False, force=False):
+        """check if the hashes that are stored in the database are up to date
+        
+        for performance, this function talks very low level to the database"""
+        # get a database cursor, but make it persistent which is faster
+        try:
+            conn.mycursor
+        except AttributeError:
+            conn.mycursor = conn.Hash._connection.getConnection().cursor()
+        c = conn.mycursor
+
+        c.execute("SELECT id FROM filearr WHERE path = %s LIMIT 1",
+                  [self.src_rel])
+        res = c.fetchone()
+        if not res:
+            print 'file %r not found (no mirror has it?)' % self.src_rel
+            ### XXX we'd need to insert it, if we want to support hashes for files that are not on any mirror...
+            return
+        file_id = res[0]
+
+        c.execute("SELECT file_id, mtime, size FROM hash WHERE file_id = %s LIMIT 1",
+                  [file_id])
+        res = c.fetchone()
+
+        if not res:
+
+            if self.hb.empty:
+                self.hb.fill(verbose=verbose)
+
+            c.execute("""INSERT INTO hash (file_id, mtime, size, md5, 
+                                           sha1, sha256, sha1piecesize, 
+                                           sha1pieces, pgp) 
+                         VALUES (%s, %s, %s, 
+                                 decode(%s, 'hex'), decode(%s, 'hex'), 
+                                 decode(%s, 'hex'), %s, decode(%s, 'hex'),
+                                 %s )""",
+                      [file_id, self.mtime, self.size,
+                       self.hb.md5hex,
+                       self.hb.sha1hex,
+                       self.hb.sha256hex or '',
+                       PIECESIZE,
+                       ''.join(self.hb.pieceshex),
+                       self.hb.pgp or ''])
+            print 'hash was not present yet in database - inserted'
+        else:
+            mtime, size = res[1], res[2]
+            if int(self.mtime) == mtime and self.size == size and not force:
+                if verbose:
+                    print 'Up to date in db: %r' % self.src_rel
+                return
+            c.execute("""UPDATE hash set mtime = %s, size = %s, 
+                                         md5 = decode(%s, 'hex'), 
+                                         sha1 = decode(%s, 'hex'), 
+                                         sha256 = decode(%s, 'hex'), 
+                                         sha1piecesize = %s,
+                                         sha1pieces = decode(%s, 'hex'), 
+                                         pgp = %s
+                         WHERE file_id = %s""",
+                      [int(self.mtime), self.size,
+                       self.hb.md5hex, self.hb.sha1hex, self.hb.sha256hex or '',
+                       PIECESIZE, ''.join(self.hb.pieceshex),
+                       self.hb.pgp or '', 
+                       file_id])
+            if verbose:
+                print 'Hash updated in database for %r' % self.src_rel
+
+        c.execute('commit')
+
+
 
     #def __eq__(self, other):
     #    return self.basename == other.basename
@@ -96,7 +173,8 @@
 
 
 
-class Digests():
+class HashBag():
+
     def __init__(self, src):
         self.src = src
         self.basename = os.path.basename(src)
@@ -104,13 +182,23 @@
         self.md5 = None
         self.sha1 = None
         self.sha256 = None
+        self.md5hex = None
+        self.sha1hex = None
+        self.sha256hex = None
         self.pgp = None
 
         self.npieces = 0
         self.pieces = []
-
-
-    def read(self):
+        self.pieceshex = []
+
+        self.empty = True
+
+    def fill(self, verbose=False):
+        verbose = True # XXX
+        if verbose:
+            sys.stdout.write('Hashing %r... ' % self.src)
+            sys.stdout.flush()
+
         m = md5.md5()
         s1 = sha1.sha1()
         s256 = sha256.sha256()
@@ -133,22 +221,36 @@
             s256.update(buf)
 
             self.npieces += 1
-            self.pieces.append(hashlib.sha1(buf).hexdigest())
+            self.pieces.append(hashlib.sha1(buf).digest())
+            self.pieceshex.append(hashlib.sha1(buf).hexdigest())
 
         f.close()
 
-        self.md5 = m.hexdigest()
-        self.sha1 = s1.hexdigest()
-        self.sha256 = s256.hexdigest()
+        self.md5 = m.digest()
+        self.sha1 = s1.digest()
+        self.sha256 = s256.digest()
+        self.md5hex = m.hexdigest()
+        self.sha1hex = s1.hexdigest()
+        self.sha256hex = s256.hexdigest()
+
+        # if present, grab PGP signature
+        if os.path.exists(self.src + '.asc'):
+            self.pgp = open(self.src + '.asc').read()
+
+        self.empty = False
+
+        if verbose:
+            sys.stdout.write('done.\n')
+
 
     def dump_raw(self):
         r = []
-        for i in self.pieces:
+        for i in self.pieceshex:
             r.append('piece %s' % i)
-        r.append('md5 %s' % self.md5)
-        r.append('sha1 %s' % self.sha1)
+        r.append('md5 %s' % self.md5hex)
+        r.append('sha1 %s' % self.sha1hex)
         if sha256:
-            r.append('sha256 %s' % self.sha256)
+            r.append('sha256 %s' % self.sha256hex)
         return '\n'.join(r)
 
 
@@ -164,9 +266,9 @@
 
         r.append("""      <verification>
         <hash type="md5">%s</hash>
-        <hash type="sha1">%s</hash>""" % (self.md5, self.sha1))
+        <hash type="sha1">%s</hash>""" % (self.md5hex, self.sha1hex))
         if self.sha256:
-            r.append('        <hash type="sha256">%s</hash>' % (self.sha256))
+            r.append('        <hash type="sha256">%s</hash>' % (self.sha256hex))
 
         if self.pgp:
             r.append('        <signature type="pgp" file="%s.asc">' % self.basename)
@@ -176,7 +278,7 @@
         r.append('        <pieces length="%s" type="sha1">' % (PIECESIZE))
 
         n = 0
-        for piece in self.pieces:
+        for piece in self.pieceshex:
             r.append('            <hash piece="%s">%s</hash>' % (n, piece))
             n += 1
 

Modified: trunk/mirrordoctor/mirrordoctor.py
URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/mirrordoctor/mirrordoctor.py?rev=7967&r1=7966&r2=7967&view=diff
==============================================================================
--- trunk/mirrordoctor/mirrordoctor.py (original)
+++ trunk/mirrordoctor/mirrordoctor.py Tue Mar  9 21:25:30 2010
@@ -827,6 +827,8 @@
 
 
 
+    @cmdln.option('--force', action='store_true',
+                        help='force refreshing all cached hashes')
     @cmdln.option('-n', '--dry-run', action='store_true',
                         help='don\'t actually do anything, just show what would be done')
     @cmdln.option('--copy-permissions', action='store_true',
@@ -842,7 +844,8 @@
                              'If matching a directory, the directory is ignored and '
                              'deleted in the target tree.')
     @cmdln.option('-b', '--base-dir', metavar='PATH',
-                        help='set the base directory (so that you can work on a subdirectory)')
+                        help='set the base directory (so that you can work on a '
+                             'subdirectory -- see examples)')
     @cmdln.option('-t', '--target-dir', metavar='PATH',
                         help='set a different target directory')
     @cmdln.option('-v', '--verbose', action='store_true',
@@ -860,9 +863,14 @@
             -i '^.*/repoview/.*$'
 
         mb makehashes \\
+            -t /srv/metalink-hashes/samba/srv/mirrors/samba \\
+            -b /srv/mirrors/samba \\
+            /srv/mirrors/samba/pub/samba/xfertest
+
+        mb makehashes \\
             -f '.*.(torrent|iso)$' \\
-            -t /var/lib/apache2/metalink-hashes/srv/ftp/pub/opensuse/distribution/11.0/iso \\
-            -b /srv/ftp-stage/pub/opensuse/distribution/11.0/iso \\
+            -t /var/lib/apache2/metalink-hashes/srv/ftp/pub/opensuse \\
+            -b /srv/ftp-stage/pub/opensuse \\
             /srv/ftp-stage/pub/opensuse/distribution/11.0/iso \\
             -n
 
@@ -874,6 +882,7 @@
         import fcntl
         import errno
         import re
+        import shutil
         import mb.hashes
 
         if not opts.target_dir:
@@ -937,7 +946,7 @@
             src_basenames = set(os.listdir(src_dir))
 
             if opts.verbose:
-                print 'looking at', src_dir
+                print 'Examining directory', src_dir
 
             dst_keep = set()
             dst_keep.add('LOCK')
@@ -975,7 +984,8 @@
                 try:
                     hasheable = mb.hashes.Hasheable(src_basename, 
                                                     src_dir=src_dir, 
-                                                    dst_dir=dst_dir)
+                                                    dst_dir=dst_dir,
+                                                    base_dir=opts.base_dir)
                 except OSError, e:
                     if e.errno == errno.ENOENT:
                         sys.stderr.write('File vanished: %r\n' % src)
@@ -990,9 +1000,14 @@
                     if not opts.file_mask or re.match(opts.file_mask, src_basename):
                         #if opts.verbose:
                         #    print 'dst:', dst
-                        hasheable.do_hashes(verbose=opts.verbose, 
+                        hasheable.check_file(verbose=opts.verbose, 
                                             dry_run=opts.dry_run, 
+                                            force=opts.force, 
                                             copy_permissions=opts.copy_permissions)
+                        hasheable.check_db(conn=self.conn,
+                                           verbose=opts.verbose, 
+                                           dry_run=opts.dry_run,
+                                           force=opts.force)
                         dst_keep.add(hasheable.dst_basename)
 
                 elif hasheable.isdir():

Modified: trunk/sql/schema-postgresql.sql
URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/sql/schema-postgresql.sql?rev=7967&r1=7966&r2=7967&view=diff
==============================================================================
--- trunk/sql/schema-postgresql.sql (original)
+++ trunk/sql/schema-postgresql.sql Tue Mar  9 21:25:30 2010
@@ -20,6 +20,32 @@
 );
 
 -- --------------------------------------------------------
+
+
+CREATE TABLE "hash" (
+        "id" INTEGER REFERENCES filearr PRIMARY KEY,
+        "mtime" INTEGER NOT NULL,
+        "size"  INTEGER NOT NULL,
+        "md5"    BYTEA NOT NULL,
+        "sha1"   BYTEA NOT NULL,
+        "sha256" BYTEA NOT NULL,
+        "sha1piecesize" INTEGER NOT NULL,
+        "sha1pieces" BYTEA,
+        "pgp" TEXT NOT NULL
+);
+
+CREATE VIEW hexhash AS 
+  SELECT file_id, mtime, size, 
+         encode(md5, 'hex') AS md5, 
+         encode(sha1, 'hex') AS sha1, 
+         encode(sha256, 'hex') AS sha256, 
+         sha1piecesize, 
+         encode(sha1pieces, 'hex') AS sha1pieces,
+         pgp 
+  FROM hash;
+
+-- --------------------------------------------------------
+
 
 CREATE TABLE "server" (
         "id" serial NOT NULL PRIMARY KEY,




_______________________________________________
mirrorbrain-commits mailing list
Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/

Note: To remove yourself from this list, send a mail with the content
 	unsubscribe
to the address mirrorbrain-commits-request_at_mirrorbrain.org
Received on Tue Mar 09 2010 - 20:25:34 GMT

This archive was generated by hypermail 2.3.0 : Mon Feb 20 2012 - 23:47:04 GMT