Author: poeml Date: 2009-03-29 16:39:32 -0600 (Sun, 29 Mar 2009) New Revision: 6939 Modified: trunk/tools/download-redirector-v2/scanner/scanner.pl Log: scanner: - Version 0.40 - Finally remove the hardcoded exclusions which were openSUSE-specific. - Implement new config directives that can be set in mirrorbrain.conf: scan_exclude = regexp ... scan_exclude_rsync = pattern ... Both are passed from "mb scan" to the scanner with the following meanings: --exclude regexp Define pattern(s) for path names to ignore. Paths matching this pattern will not be recursed into (thus saving resources) and also, when matching a file, not added into the database. This option is effective only for scans via HTTP/FTP. For rsync, use the --exclude-rsync option (due to different patterns used there). Here, regular expressions are used. Path names don't start with a slash; thus, if the regexp starts with a slash it will not match at the top-level directory. Option can be repeated. --exclude-rsync pattern Similar to --exclude, but used (only) for rsync scans. For HTTP/FTP, use the --exclude option (due to different patterns used there). The patterns are rsync(1) patterns. Option can be repeated. - Fix false insertion of ignored directories into the database. The files within those directories were not inserted, but the directory name was inserted as if it were a file. It is easier to debug the top-level includes, and the excludes, if no false insertion happens at all, though. - Fix exclusion of .~tmp~ directories (rsync's default temporary directory). The pattern never worked for rsync because patterns with a leading slash are anchored to the top of the tree. Note that this directory is excluded by default. - Fix bug where an incomplete URL was shown in debug mode when reporting unparsable HTML. - Adjust some verbosity levels. 
Modified: trunk/tools/download-redirector-v2/scanner/scanner.pl =================================================================== --- trunk/tools/download-redirector-v2/scanner/scanner.pl 2009-03-29 16:49:44 UTC (rev 6938) +++ trunk/tools/download-redirector-v2/scanner/scanner.pl 2009-03-29 22:39:32 UTC (rev 6939) @@ -74,6 +74,10 @@ # files to a temporary table at start, and uses it # after the scan to delete the remaining (unseen) # files. Various fixes for scanning. +# 2009-03-28, poeml - V0.40, Subdirectory scans with deletions are now implemented. +# Add --exclude and --exclude-rsync parameters. +# Hard-coded ignore patterns were removed. +# Bug fixes. # # # @@ -103,7 +107,7 @@ use Time::HiRes qw(gettimeofday); use Encode; -my $version = '0.30'; +my $version = '0.40'; my $scanner_email = 'poeml_at_suse.de'; my $verbose = 1; my $sqlverbose = 0; @@ -161,31 +165,19 @@ # directories to be included from top-level my @top_include_list; -#my $global_ign_re = qr{( -# /repoview/ | -# /drpmsync/ | -# /.~tmp~/ -#)}x; +my @exclude_list; +my @exclude_list_rsync; +# default excludes: +push @exclude_list, '/.~tmp~/'; +push @exclude_list_rsync, '*/.~tmp~/'; -# default ignores: -my @norecurse_list;# = (); -push @norecurse_list, '/repoview/'; -push @norecurse_list, '/drpmsync/'; -push @norecurse_list, '/.~tmp~/'; -# these are symlinks, which would (via HTTP) be crawled just like a directory, -# because itentical to directories in the directory listing HTML -push @norecurse_list, '/openSUSE-current/'; -push @norecurse_list, '/openSUSE-stable/'; -push @norecurse_list, '/SL-OSS-factory/'; -push @norecurse_list, '/SL-OSS-factory-debug/'; -push @norecurse_list, '/SL-10.1/'; - exit usage() unless @ARGV; while (defined (my $arg = shift)) { if ($arg !~ m{^-}) { unshift @ARGV, $arg; last; } elsif ($arg =~ m{^(-h|--help|-\?)}) { exit usage(); } - elsif ($arg =~ m{^(-i|--ignore)}) { push @norecurse_list, shift; } elsif ($arg =~ m{^(-I|--top-include)}) { push @top_include_list, shift; } + 
elsif ($arg =~ m{^--exclude$}) { push @exclude_list, shift; } + elsif ($arg =~ m{^--exclude-rsync$}) { push @exclude_list_rsync, shift; } elsif ($arg =~ m{^-q}) { $verbose = 0; } elsif ($arg =~ m{^-v}) { $verbose++; } elsif ($arg =~ m{^-S}) { $sqlverbose++; } @@ -303,6 +295,12 @@ foreach my $item(@top_include_list) { push @cmd, '-I', $item; } + foreach my $item(@exclude_list) { + push @cmd, '--exclude', $item; + } + foreach my $item(@exclude_list_rsync) { + push @cmd, '--exclude-rsync', $item; + } push @cmd, '-f' if $force_scan; push @cmd, '-e' if $enable_after_scan; push @cmd, '-d', $start_dir if length $start_dir; @@ -479,9 +477,24 @@ -j N Run up to N scanner queries in parallel. - -i regexp - Define regexp-pattern for path names to ignore. - Use '-i 0' to disable any ignore patterns. Default: @norecurse_list + --exclude regexp + Define pattern(s) for path names to ignore. Paths matching this pattern + will not be recursed into (thus saving resources) and also, when + matching a file, not added into the database. + This option is effective only for scans via HTTP/FTP. For rsync, + use the --exclude-rsync option (due to different patterns used there). + Here, regular expressions are used. + Path names don't start with a slash; thus, if the regexp starts with a slash + it will not match at the top-level directory. + Option can be repeated. + Default: @exclude_list + --exclude-rsync pattern + Similar like --exclude, but used (only) for rsync scans. + For HTTP/FTP, use the --exclude option (due to different patterns + used there). + The patterns are rsync(1) patterns. Option can be repeated. + Default: @exclude_list_rsync + -T dir Directory to be scanned at the top level; option can be repeated. Both, names(identifier) and numbers(id) are accepted as mirror_ids. 
@@ -594,10 +607,7 @@ } } - foreach $item(@norecurse_list) { - $item =~ s/([^.])(\*)/$1.$2/g; - $item =~ s/^\*/.*/; - #$item =~ s/[^.]\*/.\*/g; + foreach $item(@exclude_list) { if("$name/" =~ $item) { print "$identifier: ignore match: $name matches ignored item $item, skipped.\n" if $verbose > 1; return; @@ -648,7 +658,7 @@ # workaround: don't store files with broken times if(not defined($time)) { print "$identifier: Error: str2time returns undef on parsing \"$date\". Skipping file $name1\n"; - print "$identifier: current line was:\n$line\nat url $url\nname= $name1\n"; + print "$identifier: current line was:\n$line\nat url $url/$name\nname= $name1\n" if $verbose > 1; } elsif(largefile_check($identifier, $id, $t, $len)) { #save timestamp and file in database @@ -659,7 +669,7 @@ } } } - print "$identifier: committing http dir $name\n" if $verbose > 1; + print "$identifier: committing http dir $name\n" if $verbose > 2; if($do_transaction) { $dbh->commit or die "$DBI::errstr"; } @@ -700,14 +710,6 @@ my $item; - # ignore paths matching those in @norecurse-list: - for $item(@norecurse_list) { - if ($name =~ $item) { - print "$identifier: ignore match: $name matches ignored item $item, skipped.\n" if $verbose > 1; - return; - } - } - print "$identifier: ftp dir: $name\n" if $verbose > 1; my $urlraw = $url; @@ -715,24 +717,6 @@ $url =~ s{/+$}{}; # we add our own trailing slashes... - # are we looking at a top-level directory name? - # (we recognize it by not containing slashes) - my $attop = 0; - $attop = 1 if (length $name) && !($name =~ "/"); - if ($attop && scalar(@top_include_list)) { - my $included = 0; - foreach my $item(@top_include_list) { - if ($name =~ $item) { - $included = 1; - } - } - if (!$included) { - print "$identifier: not in top_include_list: $name\n";# if $verbose > 1; - return; - } - } - - my $toplevel = ($ftp) ? 
0 : 1; $ftp = ftp_connect($identifier, "$url/$name", "anonymous", $scanner_email) unless defined $ftp; return unless defined $ftp; @@ -771,6 +755,36 @@ my ($type, $mode, $size, $timestamp, $fname) = ($1, $2, $3, $4, $5); next if $fname eq "." or $fname eq ".."; + #print "$name / $fname\n"; + + # are we looking at a top-level directory name? + # (can be recognized by name being an empty string) + if (!length($name) && scalar(@top_include_list)) { + my $included = 0; + foreach my $item(@top_include_list) { + if ($fname =~ $item) { + $included = 1; + } + } + if (!$included) { + print "$identifier: not in top_include_list: $fname\n";# if $verbose > 1; + next; + } + } + + my $excluded = 0; + my $s = "$name/$fname"; + if($type eq "d") { + $s = "$s/"; + } + for $item(@exclude_list) { + if ($s =~ $item) { + print "$identifier: $s ignored (matches $item)\n" if $verbose > 0; + $excluded = 1; + } + } + next if ($excluded); + #convert to timestamp my $time = str2time($timestamp); my $t = length($name) ? "$name/$fname" : $fname; @@ -801,7 +815,7 @@ } } - print "$identifier: committing ftp dir $name\n" if $verbose > 1; + print "$identifier: committing ftp dir $name\n" if $verbose > 2; if($do_transaction) { $dbh->commit or die "$DBI::errstr"; } @@ -1155,11 +1169,9 @@ push @args, "--exclude=/*"; } - # set exclude flag for all dirs specified by '-p' option: - if(@norecurse_list) { - foreach my $item (@norecurse_list) { - push @args, "--exclude=$item"; - } + print "$identifier: rsync excludes: @exclude_list_rsync\n" if $verbose > 1; + foreach my $item (@exclude_list_rsync) { + push @args, "--exclude=$item"; } print "$identifier: rsync args: @args\n" if $verbose > 2; @@ -1243,7 +1255,7 @@ } $url =~ s{/.*$}{}; # no path components please $port = $1 if $url =~ s{:(\d+)$}{}; # port number? 
- my $ftp = Net::FTP->new($url, Timeout => 360, Port => $port, Debug => (($verbose||0)>1)?1:0, Passive => 1, Hash => 0); + my $ftp = Net::FTP->new($url, Timeout => 360, Port => $port, Debug => (($verbose||0)>2)?1:0, Passive => 1, Hash => 0); unless (defined $ftp) { warn "$identifier: ftp_connect($identifier, $url, $port) failed: $! $@\n"; return undef; @@ -1346,10 +1358,10 @@ } if($result->code() == 416) { - print "$identifier: Error: range error: filesize broken for file $url\n" if $verbose >= 2; + print "$identifier: Error: range error: filesize broken for file $url\n" if $verbose >= 1; } else { - print "$identifier: Error ".$result->code()." occured\n" if $verbose >= 2; + print "$identifier: Error ".$result->code()." occured\n" if $verbose >= 1; } error: _______________________________________________ Opensuse-svn mailing list Opensuse-svn_at_forge.novell.com http://forge.novell.com/mailman/listinfo/opensuse-svn _______________________________________________ mirrorbrain-commits mailing list Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/ Note: To remove yourself from this list, send a mail with the content unsubscribe to the address mirrorbrain-commits-request_at_mirrorbrain.org
Received on 2009-03-29Z22:40:44
This archive was generated by hypermail 2.2.0 : 2009-07-10Z19:18:12 GMT