[mirrorbrain-commits] r8053 - in /trunk/mb: mb.py mb/files.py mb/hashes.py

From: <poeml_at_mirrorbrain.org>
Date: Thu, 06 May 2010 00:07:13 -0000
Author: poeml
Date: Thu May  6 02:07:11 2010
New Revision: 8053

URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain?rev=8053&view=rev
Log:
mb makehashes:
- working on cleanups of obsolete hashes in the database. Basic functionality
  is there. What's missing is: cleanup of entire directory hierarchies;
  re-implement the locking per directory we had so far; possibly more.

Modified:
    trunk/mb/mb.py
    trunk/mb/mb/files.py
    trunk/mb/mb/hashes.py

Modified: trunk/mb/mb.py
URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/mb/mb.py?rev=8053&r1=8052&r2=8053&view=diff
==============================================================================
--- trunk/mb/mb.py (original)
+++ trunk/mb/mb.py Thu May  6 02:07:11 2010
@@ -932,6 +932,7 @@
         import re
         import shutil
         import mb.hashes
+        import mb.files
 
         if not opts.target_dir:
             sys.exit('You must specify the target directory (-t)')
@@ -971,6 +972,8 @@
                     continue
 
             dst_dir = os.path.join(opts.target_dir, src_dir[len(opts.base_dir):].lstrip('/'))
+            dst_dir_db = src_dir[len(opts.base_dir):].lstrip('/')
+            #print dst_dir_db
 
             if not opts.dry_run:
                 if not os.path.isdir(dst_dir):
@@ -983,6 +986,11 @@
             try:
                 dst_names = os.listdir(dst_dir)
                 dst_names.sort()
+                dst_names_db = [ (os.path.basename(i), j) 
+                                 for i, j in mb.files.dir_filelist(self.conn, dst_dir_db)]
+                dst_names_db_dict = dict(dst_names_db)
+                dst_names_db_keys = dst_names_db_dict.keys()
+                #print dst_names_db_keys
             except OSError, e:
                 if e.errno == errno.ENOENT:
                     sys.exit('\nSorry, cannot really continue in dry-run mode, because directory %r does not exist.\n'
@@ -996,9 +1004,11 @@
             if opts.verbose:
                 print 'Examining directory', src_dir
 
+            dst_keep_db = set()
             dst_keep = set()
             dst_keep.add('LOCK')
 
+            # FIXME: given that we don't need -t parameter anymore... can we create a lock hierarchy in /tmp instead??
             lockfile = os.path.join(dst_dir, 'LOCK')
             try:
                 if not opts.dry_run:
@@ -1057,13 +1067,18 @@
                                            dry_run=opts.dry_run,
                                            force=opts.force)
                         dst_keep.add(hasheable.dst_basename)
+                        dst_keep_db.add(hasheable.basename)
 
                 elif hasheable.isdir():
                     directories_todo.append(src)  # It's a directory, store it.
                     dst_keep.add(hasheable.basename)
+                    dst_keep_db.add(hasheable.basename)
 
 
             dst_remove = set(dst_names) - dst_keep
+            #print 'old', dst_remove
+            dst_remove_db = set(dst_names_db_keys) - dst_keep_db
+            #print 'new', dst_remove_db
 
             # print 'files to keep:'
             # print dst_keep
@@ -1104,6 +1119,19 @@
                                 sys.stderr.write('Unlink failed for %r: %s\n' \
                                                     % (i_path, os.strerror(e.errno)))
                     unlinked_files += 1
+            ids_to_delete = []
+            for i in sorted(dst_remove_db):
+                relpath = os.path.join(dst_dir_db, i)
+                dbid = dst_names_db_dict.get(i)
+                if dbid:
+                    print 'Obsolete hash in db: %r (id %s)' % (relpath, dbid)
+                    ids_to_delete.append(dbid)
+                else:
+                    print 'hm:', relpath
+            if len(ids_to_delete):
+                print 'Deleting %s obsolete hashes from hash table' % len(ids_to_delete)
+                if not opts.dry_run:
+                    mb.files.hash_list_delete(self.conn, ids_to_delete)
 
             if opts.verbose:
                 print 'unlocking', lockfile 

Modified: trunk/mb/mb/files.py
URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/mb/mb/files.py?rev=8053&r1=8052&r2=8053&view=diff
==============================================================================
--- trunk/mb/mb/files.py (original)
+++ trunk/mb/mb/files.py Thu May  6 02:07:11 2010
@@ -42,6 +42,15 @@
 
 
 def ls(conn, path):
+    """If path contains a wildcard (* or %): 
+    
+    Return all paths known to the database that start match the given path
+    argument (containing wildcards).
+
+    If path doesn't contain wildcards:
+
+    Return the exact match on the path argument."""
+
     if path.find('*') >= 0 or path.find('%') >= 0:
         pattern = True
         oprtr = 'like'
@@ -136,3 +145,29 @@
 
     return result
 
+
+def dir_filelist(conn, path):
+    """Returns tuples of (id, name) for all files that reside in a directory
+    
+    The returned filenames include their path."""
+
+    query = """SELECT filearr.path, hash.file_id
+                   FROM filearr 
+               LEFT JOIN hash 
+                   ON hash.file_id = filearr.id 
+               WHERE filearr.path ~ '^%s/[^/]*$'""" % path
+
+    result = conn.Server._connection.queryAll(query)
+    return result
+
+def hash_list_delete(conn, idlist):
+    """deletes all rows from the hash table with ids contained in the id list
+    which is passed as argument"""
+
+    if not len(idlist):
+        return
+
+    query = """BEGIN; DELETE FROM hash 
+               WHERE file_id IN ( %s ); COMMIT""" % ', '.join([ str(i) for i in idlist])
+    print query
+    conn.Filearr._connection.query(query)

Modified: trunk/mb/mb/hashes.py
URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/mb/mb/hashes.py?rev=8053&r1=8052&r2=8053&view=diff
==============================================================================
--- trunk/mb/mb/hashes.py (original)
+++ trunk/mb/mb/hashes.py Thu May  6 02:07:11 2010
@@ -78,7 +78,7 @@
             return 
 
         if dry_run: 
-            print 'Would make hash file', self.dst
+            print 'Would create hash file', self.dst
             return
 
         if self.hb.empty:
@@ -117,7 +117,7 @@
         if res:
             file_id = res[0]
         else:
-            print 'File %r not found. Not on mirrors yet? Inserting.' % self.src_rel
+            print 'File %r not in database. Not on mirrors yet? Inserting.' % self.src_rel
             c.execute("INSERT INTO filearr (path, mirrors) VALUES (%s, '{}')",
                       [self.src_rel])
             c.execute("SELECT currval('filearr_id_seq')")




_______________________________________________
mirrorbrain-commits mailing list
Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/

Note: To remove yourself from this list, send a mail with the content
 	unsubscribe
to the address mirrorbrain-commits-request_at_mirrorbrain.org
Received on Thu May 06 2010 - 00:07:16 GMT

This archive was generated by hypermail 2.3.0 : Mon Feb 20 2012 - 23:47:04 GMT