[mirrorbrain-commits] r7859 - in /branches/experimental-duptree: mirrordoctor/ tools/

From: <poeml_at_mirrorbrain.org>
Date: Mon, 23 Nov 2009 14:22:00 -0000
Author: poeml
Date: Mon Nov 23 15:22:00 2009
New Revision: 7859

URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain?rev=7859&view=rev
Log:
- experimental support for creation of a local "shadow" file tree with all
  files scanned on a particular mirror. The locally created files are empty
  (and sparse).

Added:
    branches/experimental-duptree/tools/create_sparse.py   (with props)
Modified:
    branches/experimental-duptree/mirrordoctor/mirrordoctor.py
    branches/experimental-duptree/tools/scanner.pl

Modified: branches/experimental-duptree/mirrordoctor/mirrordoctor.py
URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/branches/experimental-duptree/mirrordoctor/mirrordoctor.py?rev=7859&r1=7858&r2=7859&view=diff
==============================================================================
--- branches/experimental-duptree/mirrordoctor/mirrordoctor.py (original)
+++ branches/experimental-duptree/mirrordoctor/mirrordoctor.py Mon Nov 23 15:22:00 2009
@@ -667,6 +667,9 @@
     @cmdln.option('-d', '--directory', metavar='DIR',
                   help='Scan only in dir under mirror\'s baseurl. '
                        'Default: start at baseurl. Does not delete files, only add.')
+    @cmdln.option('-D', '--duplicate-tree', metavar='DIR',
+                  help='Create a local file tree, duplicating the file tree '
+                       'of the scanned mirror with (sparse) pseudo files.')
     def do_scan(self, subcmd, opts, *args):
         """${cmd_name}: scan mirrors
 
@@ -697,6 +700,8 @@
             cmd.append('-e')
         if opts.directory:
             cmd.append('-d %s' % opts.directory)
+        if opts.duplicate_tree:
+            cmd.append('-D %s' % opts.duplicate_tree)
         if opts.jobs:
             cmd += [ '-j', opts.jobs ]
         if opts.enable or args:
@@ -708,6 +713,12 @@
                  self.config.dbconfig.get('scan_exclude', '').split() ]
         cmd += [ '--exclude-rsync %s' % i for i in 
                  self.config.dbconfig.get('scan_exclude_rsync', '').split() ]
+
+        i = self.config.dbconfig.get('dup_tree_from', None)
+        j = self.config.dbconfig.get('dup_tree_dest', None)
+        if i and j:
+            cmd.append('--dup-tree-from %s' % i)
+            cmd.append('--dup-tree-dest %s' % j)
 
         if not opts.all and not args:
             sys.exit('No mirrors specified for scanning. Either give identifiers, or use -a [-j N].')

Added: branches/experimental-duptree/tools/create_sparse.py
URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/branches/experimental-duptree/tools/create_sparse.py?rev=7859&view=auto
==============================================================================
--- branches/experimental-duptree/tools/create_sparse.py (added)
+++ branches/experimental-duptree/tools/create_sparse.py Mon Nov 23 15:22:00 2009
@@ -1,0 +1,72 @@
+#!/usr/bin/python
+"""Create an empty, sparse placeholder file below a base directory.
+
+Usage: create_sparse.py BASEDIR NAME SIZE MTIME
+
+BASEDIR  root of the local shadow tree
+NAME     path of the file, relative to BASEDIR
+SIZE     apparent size of the file in bytes
+MTIME    modification time, in seconds since the epoch
+
+The file is given the requested apparent size by writing a single NUL
+byte at its last offset, and its atime and mtime are both set to MTIME.
+Any existing file at that location is removed first.
+"""
+
+import sys
+import os
+
+if len(sys.argv) < 5:
+    sys.exit('usage: %s BASEDIR NAME SIZE MTIME' % sys.argv[0])
+
+(basedir, name, size, mtime) = sys.argv[1:5]
+
+print 'Creating %s/%s (%s bytes, mtime %s)' % (basedir, name, size, mtime)
+size = int(size)
+mtime = int(mtime)
+atime = mtime
+
+path = os.path.join(basedir, name)
+canonical_path = os.path.realpath(path)
+canonical_base = os.path.realpath(basedir)
+
+# for safety: the resolved target must live strictly below the resolved
+# base directory. Both sides are canonicalized, and the separator is
+# appended to the base so that "/srv/tree-other" is not accepted for a
+# base of "/srv/tree".
+if not canonical_path.startswith(canonical_base.rstrip(os.sep) + os.sep):
+    sys.exit("canonical path (%r) doesn't start with the basedir (%r)"
+             % (canonical_path, canonical_base))
+
+parent = os.path.dirname(canonical_path)
+try:
+    os.makedirs(parent)
+except OSError:
+    # an already existing directory is fine; anything else is a real error
+    if not os.path.isdir(parent):
+        raise
+
+# FIXME: it is inefficient to delete and recreate the files all the time
+# but for now it allows this prototype to get forward
+# the removal (or a check in general) is needed because the file size
+# could shrink
+
+try:
+    os.unlink(canonical_path)
+except OSError:
+    # best effort: usually the file simply does not exist yet
+    pass
+
+fd = open(canonical_path, 'w')
+try:
+    if size == 0:
+        fd.truncate()
+    else:
+        # seek to the last byte and write it, so the file gets its full
+        # apparent size (sparse where the filesystem supports holes)
+        fd.seek(size - 1)
+        fd.write('\0')
+finally:
+    fd.close()
+
+os.utime(canonical_path, (atime, mtime))
+

Modified: branches/experimental-duptree/tools/scanner.pl
URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/branches/experimental-duptree/tools/scanner.pl?rev=7859&r1=7858&r2=7859&view=diff
==============================================================================
--- branches/experimental-duptree/tools/scanner.pl (original)
+++ branches/experimental-duptree/tools/scanner.pl Mon Nov 23 15:22:00 2009
@@ -75,6 +75,8 @@
 my $rsync_muxbuf = '';
 my $all_servers = 0;
 my $start_dir = '/';
+my $dup_tree_from = '';
+my $dup_tree_dest = '';
 my $parallel = 1;
 my $list_only = 0;
 my $recursion_delay = 0;	# seconds delay per *_readdir recuursion
@@ -125,6 +127,8 @@
 	elsif ($arg =~ m{^-e})                 { $enable_after_scan++; }
 	elsif ($arg =~ m{^-f})                 { $force_scan++; }
 	elsif ($arg =~ m{^-d})                 { $start_dir = shift; }
+	elsif ($arg =~ m{^--dup-tree-from$})   { $dup_tree_from = shift; }
+	elsif ($arg =~ m{^--dup-tree-dest$})   { $dup_tree_dest = shift; }
 	elsif ($arg =~ m{^-b})                 { $brain_instance = shift; }
 	elsif ($arg =~ m{^-l})                 { $list_only++; 
 						 $list_only++ if $arg =~ m{ll}; 
@@ -243,6 +247,8 @@
   push @cmd, '-f' if $force_scan;
   push @cmd, '-e' if $enable_after_scan;
   push @cmd, '-d', $start_dir if length $start_dir;
+  push @cmd, '--dup-tree-from', $dup_tree_from if length $dup_tree_from;
+  push @cmd, '--dup-tree-dest', $dup_tree_dest if length $dup_tree_dest;
   # We must not propagate -j here.
   # All other options we should propagate.
 
@@ -267,6 +273,12 @@
 
 for my $row (@scan_list) {
   print localtime(time) . " $row->{identifier}: starting\n" if $verbose;
+
+  my $dup_tree_dest_this = '';
+  if ($dup_tree_dest and ($row->{identifier} eq $dup_tree_from)) { 
+    print localtime(time) . " $row->{identifier}: duplicating local tree from this mirror\n";
+    $dup_tree_dest_this = $dup_tree_dest;
+  }
 
   # already in a transaction? why??
   #if($do_transaction) {
@@ -308,14 +320,14 @@
 
 
   my $start = int(gettimeofday * 1000);
-  my $file_count = rsync_readdir($row->{identifier}, $row->{id}, $row->{baseurl_rsync}, $start_dir);
+  my $file_count = rsync_readdir($row->{identifier}, $row->{id}, $row->{baseurl_rsync}, $start_dir, $dup_tree_dest_this);
   if(!$file_count and $row->{baseurl_ftp}) {
     print localtime(time) . " $row->{identifier}: no rsync, trying ftp\n" if $verbose;
-    $file_count = scalar ftp_readdir($row->{identifier}, $row->{id}, $row->{baseurl_ftp}, time, $start_dir);
+    $file_count = scalar ftp_readdir($row->{identifier}, $row->{id}, $row->{baseurl_ftp}, time, $start_dir, $dup_tree_dest_this);
   }
   if(!$file_count and $row->{baseurl}) {
     print localtime(time) . " $row->{identifier}: no rsync, no ftp, trying http\n" if $verbose;
-    $file_count = scalar http_readdir($row->{identifier}, $row->{id}, $row->{baseurl}, $start_dir);
+    $file_count = scalar http_readdir($row->{identifier}, $row->{id}, $row->{baseurl}, $start_dir, $dup_tree_dest_this);
   }
 
   if($do_transaction) {
@@ -515,7 +527,7 @@
 # http://ftp1.opensuse.org/repositories/#@^@repositories/@@
 sub http_readdir
 {
-  my ($identifier, $id, $url, $name) = @_;
+  my ($identifier, $id, $url, $name, $dup_tree_dest) = @_;
 
   my $item;
 
@@ -582,7 +594,7 @@
           ## we must be really sure it is a directory, when we come here.
           ## otherwise, we'll retrieve the contents of a file!
           sleep($recursion_delay) if $recursion_delay;
-          push @r, http_readdir($identifier, $id, $urlraw, $t, 0);
+          push @r, http_readdir($identifier, $id, $urlraw, $t, $dup_tree_dest, 0);
         }
         else {
           ## it is a file.
@@ -597,7 +609,7 @@
           }
           elsif(largefile_check($identifier, $id, $t, $len)) {
             #save timestamp and file in database
-            if(save_file($t, $identifier, $id, $time, $re)) {
+            if(save_file($t, $identifier, $id, $time, $len, $re, $dup_tree_dest)) {
               push @r, [ $t , $time ];
             }
           }
@@ -633,11 +645,11 @@
 
 
 
-# $file_count = scalar ftp_readdir($row->{identifier}, $row->{id}, $row->{baseurl_ftp}, $ftp_timer, $start_dir);
+# $file_count = scalar ftp_readdir($row->{identifier}, $row->{id}, $row->{baseurl_ftp}, $ftp_timer, $start_dir, $dup_tree_dest);
 # first call: $ftp undefined
 sub ftp_readdir
 {
-  my ($identifier, $id, $url, $ftp_timer, $name, $ftp) = @_;
+  my ($identifier, $id, $url, $ftp_timer, $name, $ftp, $dup_tree_dest) = @_;
 
   my $ftp_age = (time() - $ftp_timer);
   print "$identifier: last command issued $ftp_age"."s ago\n" if $verbose > 2;
@@ -730,7 +742,7 @@
           next;
         }
         sleep($recursion_delay) if $recursion_delay;
-        push @r, ftp_readdir($identifier, $id, $urlraw, $ftp_timer, $t, $ftp);
+        push @r, ftp_readdir($identifier, $id, $urlraw, $ftp_timer, $t, $ftp, $dup_tree_dest);
       }
 
       if($type eq 'l') {
@@ -742,7 +754,7 @@
         }
         #save timestamp and file in database
         if(largefile_check($identifier, $id, $t, $size)) {
-          if(save_file($t, $identifier, $id, $time, $re)) {
+          if(save_file($t, $identifier, $id, $time, $size, $re, $dup_tree_dest)) {
             push @r, [ $t , $time ];
           }
         }
@@ -762,7 +774,7 @@
 
 sub save_file
 {
-  my ($path, $identifier, $serverid, $mod_re, $ign_re) = @_;
+  my ($path, $identifier, $serverid, $mod_re, $size, $ign_re, $dup_tree_dest) = @_;
 
   #
   # optional patch the file names by adding or removing components.
@@ -770,6 +782,15 @@
   #
 
   return undef if $ign_re and $path =~ m{$ign_re};
+
+
+  if ($dup_tree_dest) {
+    my $save_local = "$dup_tree_dest/$path";
+    printf "Duplicating locally: $save_local (size: $size; mtime: $mod_re)\n" 
+      if $verbose > 2;;
+    system("/tmp/create_sparse.py $dup_tree_dest $path $size $mod_re");
+  }
+
 
   if ($mod_re and $mod_re =~ m{@([^@]*)@([^@]*)}) {
     print "$identifier: save_file: $path + #$mod_re -> " if $verbose > 2;
@@ -914,7 +935,7 @@
         printf "$priv->{identifier}: warning: $name cannot be delivererd via HTTP! Skipping\n" if $verbose > 0;
       }
       else {
-        $name = save_file($name, $priv->{identifier}, $priv->{serverid}, $mtime, $priv->{re});
+        $name = save_file($name, $priv->{identifier}, $priv->{serverid}, $mtime, $len, $priv->{re}, $priv->{dup_tree_dest});
         $priv->{counter}++;
         if (($priv->{counter} % 50) == 0) {
           print "$priv->{identifier}: commit after 50 files\n" if $verbose > 2;
@@ -947,7 +968,7 @@
 #  d: base directory (can be 'undef'): parameter to the '-d' switch
 sub rsync_readdir
 {
-  my ($identifier, $serverid, $url, $d) = @_;
+  my ($identifier, $serverid, $url, $d, $dup_tree_dest) = @_;
   return 0 unless $url;
 
   $url =~ s{^rsync://}{}s; # trailing s: treat as single line, strip off protocol id
@@ -964,6 +985,7 @@
   $peer->{user} = $cred if $cred;
   $peer->{subdir} = $d if length $d;
   $peer->{counter} = 0;
+  $peer->{dup_tree_dest} = $dup_tree_dest;
   $path .= "/". $d if length $d;
   rsync_get_filelist($identifier, $peer, $path, 0, \&rsync_cb, $peer);
   return $peer->{counter};




_______________________________________________
mirrorbrain-commits mailing list
Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/

Note: To remove yourself from this list, send a mail with the content
 	unsubscribe
to the address mirrorbrain-commits-request_at_mirrorbrain.org
Received on Mon Nov 23 2009 - 14:22:02 GMT

This archive was generated by hypermail 2.2.0 : Mon Nov 23 2009 - 14:45:10 GMT