Author: poeml Date: 2009-02-24 17:21:04 -0700 (Tue, 24 Feb 2009) New Revision: 6568 Modified: trunk/tools/download-redirector-v2/scanner/scanner.pl Log: scanner: - another measure to save on database size: allow the definition of top-level directories that are scanned, and all others ignored. Implemented for HTTP and rsync scanning so far; FTP missing. - add stub for using a different database scheme (experimental, disabled) Modified: trunk/tools/download-redirector-v2/scanner/scanner.pl =================================================================== --- trunk/tools/download-redirector-v2/scanner/scanner.pl 2009-02-25 00:15:02 UTC (rev 6567) +++ trunk/tools/download-redirector-v2/scanner/scanner.pl 2009-02-25 00:21:04 UTC (rev 6568) @@ -136,12 +136,15 @@ # FIXME: use DBI functions transaction handling my $do_transaction = 0; +# experimental other database scheme +my $use_file_array = 0; # save prepared statements my $sth_update; my $sth_insert_rel; my $sth_select_file; my $sth_insert_file; +my $sth_mirr_addbyname; my $gig2 = 1<<31; # 2*1024*1024*1024 == 2^1 * 2^10 * 2^10 * 2^10 = 2^31 @@ -150,6 +153,9 @@ my $http_size_hint; my $http_slice_counter; +# directories to be included from top-level +my @top_include_list; + #my $global_ign_re = qr{( # /repoview/ | # /drpmsync/ | @@ -171,6 +177,7 @@ if ($arg !~ m{^-}) { unshift @ARGV, $arg; last; } elsif ($arg =~ m{^(-h|--help|-\?)}) { exit usage(); } elsif ($arg =~ m{^(-i|--ignore)}) { push @norecurse_list, shift; } + elsif ($arg =~ m{^(-I|--top-include)}) { push @top_include_list, shift; } elsif ($arg =~ m{^-q}) { $verbose = 0; } elsif ($arg =~ m{^-v}) { $verbose++; } elsif ($arg =~ m{^-S}) { $sqlverbose++; } @@ -329,6 +336,8 @@ } my $duration = (int(gettimeofday * 1000) - $start) / 1000; + if (!$duration) { $duration = 1; } + if (!$file_count) { $file_count = 0; } my $fpm = int(60*$file_count/$duration); @@ -339,20 +348,27 @@ unless ($keep_dead_files) { $start = time(); print localtime(time) . " $row->{identifier}: purging old files\n" if $verbose > 1; - my $sql = "DELETE FROM file_server WHERE serverid = $row->{id} - AND timestamp_scanner <= (SELECT extract(epoch from last_scan) FROM server - WHERE id = $row->{id} limit 1)"; - if(length $start_dir) { - ## let us hope subselects with paramaters work in mysql. - $sql .= " AND fileid IN (SELECT id FROM file WHERE path LIKE ?)"; + if ($use_file_array) { + + print "$row->{identifier}: FIXME cleanup \n"; + + } else { + my $sql = "DELETE FROM file_server WHERE serverid = $row->{id} + AND timestamp_scanner <= (SELECT extract(epoch from last_scan) FROM server + WHERE id = $row->{id} limit 1)"; + + if(length $start_dir) { + ## let us hope subselects with paramaters work in mysql. + $sql .= " AND fileid IN (SELECT id FROM file WHERE path LIKE ?)"; + } + + # Keep in sync with $start_dir setup above! + my $sth = $dbh->prepare( $sql ); + print "$row->{identifier}: $sql\n" if $sqlverbose; + $sth->execute(length($start_dir) ? "$start_dir/%" : ()) or die "$row->{identifier}: $DBI::errstr"; } - # Keep in sync with $start_dir setup above! - my $sth = $dbh->prepare( $sql ); - print "$row->{identifier}: $sql\n" if $sqlverbose; - $sth->execute(length($start_dir) ? "$start_dir/%" : ()) or die "$row->{identifier}: $DBI::errstr"; - $duration = time() - $start; print localtime(time) . " $row->{identifier}: purged old files in " . $duration . "s.\n" if $verbose > 0; } @@ -418,6 +434,7 @@ -i regexp Define regexp-pattern for path names to ignore. Use '-i 0' to disable any ignore patterns. Default: @norecurse_list + -T /dir/ Directory to be scanned at the top level; option can be repeated. Both, names(identifier) and numbers(id) are accepted as mirror_ids. }; @@ -506,11 +523,23 @@ my $urlraw = $url; my $re = ''; $re = $1 if $url =~ s{#(.*?)$}{}; - print "$identifier: http_readdir: url=$url re=$re\n" if $verbose > 1; + print "$identifier: http_readdir: url=$url re=$re\n" if $verbose > 2; $url =~ s{/+$}{}; # we add our own trailing slashes... $name =~ s{/+$}{}; - foreach my $item(@norecurse_list) { + my $item; + my $included = 0; + foreach my $item(@top_include_list) { + if ($name =~ $item) { + $included = 1; + } + } + if (("$name/" ne "/") && !$included) { + print "$identifier: not in top_include_list: $name\n";# if $verbose > 1; + return; + } + + foreach $item(@norecurse_list) { $item =~ s/([^.])(\*)/$1.$2/g; $item =~ s/^\*/.*/; #$item =~ s/[^.]\*/.\*/g; @@ -521,7 +550,8 @@ } my @r; - print "$identifier: http dir: $url/$name\n" if $verbose > 1; + print "$identifier: http dir: $url/$name\n" if $verbose > 2; + print "$identifier: http dir: $name\n" if $verbose == 2; my $contents = cont("$url/$name/?F=1"); if($contents =~ s{^.*<(PRE|pre|table)>.*<(a href|A HREF)="\?(N=A|C=.*;O=)[^"]*">}{}s) { ## good, we know that one. It is a standard apache dir-listing. @@ -546,7 +576,7 @@ $name1 =~ s{%([\da-fA-F]{2})}{pack 'c', hex $1}ge; $name1 =~ s{^\./}{}; my $dir = 1 if $pre =~ m{"\[DIR\]"}; - print "$identifier: $pre^$name1^$date^$size\n" if $verbose > 1; + #print "$identifier: $pre^$name1^$date^$size\n" if $verbose > 1; my $t = length($name) ? "$name/$name1" : $name1; if($size eq '-' and ($dir or $name1 =~ m{/$})) { ## we must be really sure it is a directory, when we come here. @@ -675,8 +705,6 @@ { my ($path, $identifier, $serverid, $mod_re, $ign_re) = @_; - my $fileid; - # # optional patch the file names by adding or removing components. # you never know what strange paths mirror admins choose. @@ -695,29 +723,43 @@ $path =~ s{//+}{/}g; # avoid double slashes. - $fileid = getfileid($path); + if ($use_file_array) { + my $sql = "SELECT mirr_add_byname(?, ?);"; + if (!defined $sth_mirr_addbyname) { + printf "\nPreparing add statement\n\n" if $sqlverbose; + $sth_mirr_addbyname = $dbh->prepare( $sql ) or die "$identifier: $DBI::errstr"; + } + printf "$sql <-- $serverid, $path \n" if $sqlverbose; + $sth_mirr_addbyname->execute( $serverid, $path ) or die "$identifier: $DBI::errstr"; + $sth_mirr_addbyname->finish; - if(checkfileserver_fileid($serverid, $fileid)) { - my $sql = "UPDATE file_server SET timestamp_scanner = ".time." WHERE fileid = ? AND serverid = ?;"; - if (!defined $sth_update) { - printf "\nPreparing update statement\n\n" if $sqlverbose; - $sth_update = $dbh->prepare( $sql ) or die $DBI::errstr; + } else { + + my $fileid = getfileid($path); + + if(checkfileserver_fileid($serverid, $fileid)) { + my $sql = "UPDATE file_server SET timestamp_scanner = ".time." WHERE fileid = ? AND serverid = ?;"; + if (!defined $sth_update) { + printf "\nPreparing update statement\n\n" if $sqlverbose; + $sth_update = $dbh->prepare( $sql ) or die $DBI::errstr; + } + + printf "$sql <-- $fileid, $serverid \n" if $sqlverbose; + $sth_update->execute( $fileid, $serverid ) or die $DBI::errstr; } + else { + my $sql = "INSERT INTO file_server (fileid, serverid, timestamp_scanner) VALUES (?, ?, ".time.");"; + if (!defined $sth_insert_rel) { + printf "\nPreparing insert statement\n\n" if $sqlverbose; + $sth_insert_rel = $dbh->prepare( $sql ); + } - printf "$sql <-- $fileid, $serverid \n" if $sqlverbose; - $sth_update->execute( $fileid, $serverid ) or die $DBI::errstr; + printf "$sql <-- $fileid, $serverid \n" if $sqlverbose; + $sth_insert_rel->execute( $fileid, $serverid ) or die "$identifier: $DBI::errstr"; + } } - else { - my $sql = "INSERT INTO file_server (fileid, serverid, timestamp_scanner) VALUES (?, ?, ".time.");"; - if (!defined $sth_insert_rel) { - printf "\nPreparing insert statement\n\n" if $sqlverbose; - $sth_insert_rel = $dbh->prepare( $sql ); - } - printf "$sql <-- $fileid, $serverid \n" if $sqlverbose; - $sth_insert_rel->execute( $fileid, $serverid ) or die "$identifier: $sth_insert_rel->errstr"; - } return $path; } @@ -834,7 +876,7 @@ if($mode & 004) { # readable for the world is good. # params for largefile check: url=$ary_ref->{$priv->{serverid}}/$name, size=$len if(largefile_check($priv->{identifier}, $priv->{serverid}, $name, $len) == 0) { - printf "$priv->{identifier}: ERROR: file $name cannot be delivererd via http! Skipping\n" if $verbose > 1; + printf "$priv->{identifier}: warning: $name cannot be delivererd via HTTP! Skipping\n" if $verbose > 0; } else { $name = save_file($name, $priv->{identifier}, $priv->{serverid}, $mtime, $priv->{re}); @@ -1011,12 +1053,20 @@ my @args = ('--server', '--sender', '-rl'); push @args, '--exclude=/*/*' if $norecurse; + if(@top_include_list) { + foreach my $item (@top_include_list) { + push @args, "--include=/$item"; + } + push @args, "--exclude=/*"; + } + # set exclude flag for all dirs specified by '-p' option: if(@norecurse_list) { foreach my $item (@norecurse_list) { push @args, "--exclude=$item"; } } + print "$identifier: rsync args: @args\n" if $verbose > 2; for my $arg (@args, '.', "$syncroot/.", '') { swrite(*S, "$arg\n"); _______________________________________________ Opensuse-svn mailing list Opensuse-svn_at_forge.novell.com http://forge.novell.com/mailman/listinfo/opensuse-svn _______________________________________________ mirrorbrain-commits mailing list Note: To remove yourself from this list, send a mail with the content unsubscribe to the address mirrorbrain-commits-request_at_mirrorbrain.orgReceived on 2009-02-25Z00:21:38
This archive was generated by hypermail 2.2.0 : 2009-07-10Z19:18:11 GMT