Author: poeml Date: Thu May 6 02:07:11 2010 New Revision: 8053 URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain?rev=8053&view=rev Log: mb makehashes: - working on cleanups of obsolete hashes in the database. Basic functionality is there. What's missing is: cleanup of entire directory hierarchies; re-implement the locking per directory we had so far; possibly more. Modified: trunk/mb/mb.py trunk/mb/mb/files.py trunk/mb/mb/hashes.py Modified: trunk/mb/mb.py URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/mb/mb.py?rev=8053&r1=8052&r2=8053&view=diff ============================================================================== --- trunk/mb/mb.py (original) +++ trunk/mb/mb.py Thu May 6 02:07:11 2010 @@ -932,6 +932,7 @@ import re import shutil import mb.hashes + import mb.files if not opts.target_dir: sys.exit('You must specify the target directory (-t)') @@ -971,6 +972,8 @@ continue dst_dir = os.path.join(opts.target_dir, src_dir[len(opts.base_dir):].lstrip('/')) + dst_dir_db = src_dir[len(opts.base_dir):].lstrip('/') + #print dst_dir_db if not opts.dry_run: if not os.path.isdir(dst_dir): @@ -983,6 +986,11 @@ try: dst_names = os.listdir(dst_dir) dst_names.sort() + dst_names_db = [ (os.path.basename(i), j) + for i, j in mb.files.dir_filelist(self.conn, dst_dir_db)] + dst_names_db_dict = dict(dst_names_db) + dst_names_db_keys = dst_names_db_dict.keys() + #print dst_names_db_keys except OSError, e: if e.errno == errno.ENOENT: sys.exit('\nSorry, cannot really continue in dry-run mode, because directory %r does not exist.\n' @@ -996,9 +1004,11 @@ if opts.verbose: print 'Examining directory', src_dir + dst_keep_db = set() dst_keep = set() dst_keep.add('LOCK') + # FIXME: given that we don't need -t parameter anymore... can we create a lock hierarchy in /tmp instead??
lockfile = os.path.join(dst_dir, 'LOCK') try: if not opts.dry_run: @@ -1057,13 +1067,18 @@ dry_run=opts.dry_run, force=opts.force) dst_keep.add(hasheable.dst_basename) + dst_keep_db.add(hasheable.basename) elif hasheable.isdir(): directories_todo.append(src) # It's a directory, store it. dst_keep.add(hasheable.basename) + dst_keep_db.add(hasheable.basename) dst_remove = set(dst_names) - dst_keep + #print 'old', dst_remove + dst_remove_db = set(dst_names_db_keys) - dst_keep_db + #print 'new', dst_remove_db # print 'files to keep:' # print dst_keep @@ -1104,6 +1119,19 @@ sys.stderr.write('Unlink failed for %r: %s\n' \ % (i_path, os.strerror(e.errno))) unlinked_files += 1 + ids_to_delete = [] + for i in sorted(dst_remove_db): + relpath = os.path.join(dst_dir_db, i) + dbid = dst_names_db_dict.get(i) + if dbid: + print 'Obsolete hash in db: %r (id %s)' % (relpath, dbid) + ids_to_delete.append(dbid) + else: + print 'hm:', relpath + if len(ids_to_delete): + print 'Deleting %s obsolete hashes from hash table' % len(ids_to_delete) + if not opts.dry_run: + mb.files.hash_list_delete(self.conn, ids_to_delete) if opts.verbose: print 'unlocking', lockfile Modified: trunk/mb/mb/files.py URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/mb/mb/files.py?rev=8053&r1=8052&r2=8053&view=diff ============================================================================== --- trunk/mb/mb/files.py (original) +++ trunk/mb/mb/files.py Thu May 6 02:07:11 2010 @@ -42,6 +42,15 @@ def ls(conn, path): + """If path contains a wildcard (* or %): + + Return all paths known to the database that start match the given path + argument (containing wildcards).
+ + If path doesn't contain wildcards: + + Return the exact match on the path argument.""" + if path.find('*') >= 0 or path.find('%') >= 0: pattern = True oprtr = 'like' @@ -136,3 +145,29 @@ return result + +def dir_filelist(conn, path): + """Returns tuples of (id, name) for all files that reside in a directory + + The returned filenames include their path.""" + + query = """SELECT filearr.path, hash.file_id + FROM filearr + LEFT JOIN hash + ON hash.file_id = filearr.id + WHERE filearr.path ~ '^%s/[^/]*$'""" % path + + result = conn.Server._connection.queryAll(query) + return result + +def hash_list_delete(conn, idlist): + """deletes all rows from the hash table with ids contained in the id list + which is passed as argument""" + + if not len(idlist): + return + + query = """BEGIN; DELETE FROM hash + WHERE file_id IN ( %s ); COMMIT""" % ', '.join([ str(i) for i in idlist]) + print query + conn.Filearr._connection.query(query) Modified: trunk/mb/mb/hashes.py URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/mb/mb/hashes.py?rev=8053&r1=8052&r2=8053&view=diff ============================================================================== --- trunk/mb/mb/hashes.py (original) +++ trunk/mb/mb/hashes.py Thu May 6 02:07:11 2010 @@ -78,7 +78,7 @@ return if dry_run: - print 'Would make hash file', self.dst + print 'Would create hash file', self.dst return if self.hb.empty: @@ -117,7 +117,7 @@ if res: file_id = res[0] else: - print 'File %r not found. Not on mirrors yet? Inserting.' % self.src_rel + print 'File %r not in database. Not on mirrors yet? Inserting.'
% self.src_rel c.execute("INSERT INTO filearr (path, mirrors) VALUES (%s, '{}')", [self.src_rel]) c.execute("SELECT currval('filearr_id_seq')") _______________________________________________ mirrorbrain-commits mailing list Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/ Note: To remove yourself from this list, send a mail with the content unsubscribe to the address mirrorbrain-commits-request_at_mirrorbrain.org Received on Thu May 06 2010 - 00:07:16 GMT
This archive was generated by hypermail 2.3.0 : Mon Feb 20 2012 - 23:47:04 GMT