Author: poeml Date: 2009-03-29 16:39:32 -0600 (Sun, 29 Mar 2009) New Revision: 6939 Modified: trunk/tools/download-redirector-v2/scanner/scanner.pl Log: scanner: - Version 0.40 - Finally remove the hardcoded exclusions which were openSUSE-specific. - Implement new config directives that can be set in mirrorbrain.conf: scan_exclude = regexp ... scan_exclude_rsync = pattern ... Both are passed from "mb scan" to the scanner with the following meanings: --exclude regexp Define pattern(s) for path names to ignore. Paths matching this pattern will not be recursed into (thus saving resources) and also, when matching a file, not added into the database. This option is effective only for scans via HTTP/FTP. For rsync, use the --exclude-rsync option (due to different patterns used there). Here, regular expressions are used. Path names don't start with a slash; thus, if the regexp starts with a slash it will not match at the top-level directory. Option can be repeated. --exclude-rsync pattern Similar to --exclude, but used (only) for rsync scans. For HTTP/FTP, use the --exclude option (due to different patterns used there). The patterns are rsync(1) patterns. Option can be repeated. - Fix false insertion of directories that are ignored into the database. The files within those directories were not inserted, but the directory name was inserted as if it were a file. It is easier to debug the top-level includes, and the excludes, if no false insertion happens at all, though. - Fix exclusion of .~tmp~ directories (rsync's default temporary directory). The pattern never worked for rsync because patterns with a leading slash are anchored to the top of the tree. Note that this directory is excluded by default. - Fix bug where an incomplete URL was shown in debug mode when reporting unparsable HTML. - Adjust some verbosity levels. 
Modified: trunk/tools/download-redirector-v2/scanner/scanner.pl =================================================================== --- trunk/tools/download-redirector-v2/scanner/scanner.pl 2009-03-29 16:49:44 UTC (rev 6938) +++ trunk/tools/download-redirector-v2/scanner/scanner.pl 2009-03-29 22:39:32 UTC (rev 6939) _at_@ -74,6 +74,10 @@ # files to a temporary table at start, and uses it # after the scan to delete the remaining (unseen) # files. Various fixes for scanning. +# 2009-03-28, poeml - V0.40, Subdirectory scans with deletions are now implemented. +# Add --exclude and --exclude-rsync parameters. +# Hard-coded ignore patterns were removed. +# Bug fixes. # # # _at_@ -103,7 +107,7 @@ use Time::HiRes qw(gettimeofday); use Encode; -my $version = '0.30'; +my $version = '0.40'; my $scanner_email = 'poeml_at_suse.de'; my $verbose = 1; my $sqlverbose = 0; _at_@ -161,31 +165,19 @@ # directories to be included from top-level my _at_top_include_list; -#my $global_ign_re = qr{( -# /repoview/ | -# /drpmsync/ | -# /.~tmp~/ -#)}x; +my _at_exclude_list; +my _at_exclude_list_rsync; +# default excludes: +push _at_exclude_list, '/.~tmp~/'; +push _at_exclude_list_rsync, '*/.~tmp~/'; -# default ignores: -my _at_norecurse_list;# = (); -push _at_norecurse_list, '/repoview/'; -push _at_norecurse_list, '/drpmsync/'; -push _at_norecurse_list, '/.~tmp~/'; -# these are symlinks, which would (via HTTP) be crawled just like a directory, -# because itentical to directories in the directory listing HTML -push _at_norecurse_list, '/openSUSE-current/'; -push _at_norecurse_list, '/openSUSE-stable/'; -push _at_norecurse_list, '/SL-OSS-factory/'; -push _at_norecurse_list, '/SL-OSS-factory-debug/'; -push _at_norecurse_list, '/SL-10.1/'; - exit usage() unless _at_ARGV; while (defined (my $arg = shift)) { if ($arg !~ m{^-}) { unshift _at_ARGV, $arg; last; } elsif ($arg =~ m{^(-h|--help|-\?)}) { exit usage(); } - elsif ($arg =~ m{^(-i|--ignore)}) { push _at_norecurse_list, shift; } elsif ($arg =~ 
m{^(-I|--top-include)}) { push _at_top_include_list, shift; } + elsif ($arg =~ m{^--exclude$}) { push _at_exclude_list, shift; } + elsif ($arg =~ m{^--exclude-rsync$}) { push _at_exclude_list_rsync, shift; } elsif ($arg =~ m{^-q}) { $verbose = 0; } elsif ($arg =~ m{^-v}) { $verbose++; } elsif ($arg =~ m{^-S}) { $sqlverbose++; } _at_@ -303,6 +295,12 @@ foreach my $item(_at_top_include_list) { push _at_cmd, '-I', $item; } + foreach my $item(_at_exclude_list) { + push _at_cmd, '--exclude', $item; + } + foreach my $item(_at_exclude_list_rsync) { + push _at_cmd, '--exclude-rsync', $item; + } push _at_cmd, '-f' if $force_scan; push _at_cmd, '-e' if $enable_after_scan; push _at_cmd, '-d', $start_dir if length $start_dir; _at_@ -479,9 +477,24 @@ -j N Run up to N scanner queries in parallel. - -i regexp - Define regexp-pattern for path names to ignore. - Use '-i 0' to disable any ignore patterns. Default: _at_norecurse_list + --exclude regexp + Define pattern(s) for path names to ignore. Paths matching this pattern + will not be recursed into (thus saving resources) and also, when + matching a file, not added into the database. + This option is effective only for scans via HTTP/FTP. For rsync, + use the --exclude-rsync option (due to different patterns used there). + Here, regular expressions are used. + Path names don't start with a slash; thus, if the regexp starts with a slash + it will not match at the top-level directory. + Option can be repeated. + Default: _at_exclude_list + --exclude-rsync pattern + Similar like --exclude, but used (only) for rsync scans. + For HTTP/FTP, use the --exclude option (due to different patterns + used there). + The patterns are rsync(1) patterns. Option can be repeated. + Default: _at_exclude_list_rsync + -T dir Directory to be scanned at the top level; option can be repeated. Both, names(identifier) and numbers(id) are accepted as mirror_ids. 
_at_@ -594,10 +607,7 @@ } } - foreach $item(_at_norecurse_list) { - $item =~ s/([^.])(\*)/$1.$2/g; - $item =~ s/^\*/.*/; - #$item =~ s/[^.]\*/.\*/g; + foreach $item(_at_exclude_list) { if("$name/" =~ $item) { print "$identifier: ignore match: $name matches ignored item $item, skipped.\n" if $verbose > 1; return; _at_@ -648,7 +658,7 @@ # workaround: don't store files with broken times if(not defined($time)) { print "$identifier: Error: str2time returns undef on parsing \"$date\". Skipping file $name1\n"; - print "$identifier: current line was:\n$line\nat url $url\nname= $name1\n"; + print "$identifier: current line was:\n$line\nat url $url/$name\nname= $name1\n" if $verbose > 1; } elsif(largefile_check($identifier, $id, $t, $len)) { #save timestamp and file in database _at_@ -659,7 +669,7 @@ } } } - print "$identifier: committing http dir $name\n" if $verbose > 1; + print "$identifier: committing http dir $name\n" if $verbose > 2; if($do_transaction) { $dbh->commit or die "$DBI::errstr"; } _at_@ -700,14 +710,6 @@ my $item; - # ignore paths matching those in _at_norecurse-list: - for $item(_at_norecurse_list) { - if ($name =~ $item) { - print "$identifier: ignore match: $name matches ignored item $item, skipped.\n" if $verbose > 1; - return; - } - } - print "$identifier: ftp dir: $name\n" if $verbose > 1; my $urlraw = $url; _at_@ -715,24 +717,6 @@ $url =~ s{/+$}{}; # we add our own trailing slashes... - # are we looking at a top-level directory name? - # (we recognize it by not containing slashes) - my $attop = 0; - $attop = 1 if (length $name) && !($name =~ "/"); - if ($attop && scalar(_at_top_include_list)) { - my $included = 0; - foreach my $item(_at_top_include_list) { - if ($name =~ $item) { - $included = 1; - } - } - if (!$included) { - print "$identifier: not in top_include_list: $name\n";# if $verbose > 1; - return; - } - } - - my $toplevel = ($ftp) ? 
0 : 1; $ftp = ftp_connect($identifier, "$url/$name", "anonymous", $scanner_email) unless defined $ftp; return unless defined $ftp; _at_@ -771,6 +755,36 @@ my ($type, $mode, $size, $timestamp, $fname) = ($1, $2, $3, $4, $5); next if $fname eq "." or $fname eq ".."; + #print "$name / $fname\n"; + + # are we looking at a top-level directory name? + # (can be recognized by name being an empty string) + if (!length($name) && scalar(_at_top_include_list)) { + my $included = 0; + foreach my $item(_at_top_include_list) { + if ($fname =~ $item) { + $included = 1; + } + } + if (!$included) { + print "$identifier: not in top_include_list: $fname\n";# if $verbose > 1; + next; + } + } + + my $excluded = 0; + my $s = "$name/$fname"; + if($type eq "d") { + $s = "$s/"; + } + for $item(_at_exclude_list) { + if ($s =~ $item) { + print "$identifier: $s ignored (matches $item)\n" if $verbose > 0; + $excluded = 1; + } + } + next if ($excluded); + #convert to timestamp my $time = str2time($timestamp); my $t = length($name) ? "$name/$fname" : $fname; _at_@ -801,7 +815,7 @@ } } - print "$identifier: committing ftp dir $name\n" if $verbose > 1; + print "$identifier: committing ftp dir $name\n" if $verbose > 2; if($do_transaction) { $dbh->commit or die "$DBI::errstr"; } _at_@ -1155,11 +1169,9 @@ push _at_args, "--exclude=/*"; } - # set exclude flag for all dirs specified by '-p' option: - if(_at_norecurse_list) { - foreach my $item (_at_norecurse_list) { - push _at_args, "--exclude=$item"; - } + print "$identifier: rsync excludes: _at_exclude_list_rsync\n" if $verbose > 1; + foreach my $item (_at_exclude_list_rsync) { + push _at_args, "--exclude=$item"; } print "$identifier: rsync args: _at_args\n" if $verbose > 2; _at_@ -1243,7 +1255,7 @@ } $url =~ s{/.*$}{}; # no path components please $port = $1 if $url =~ s{:(\d+)$}{}; # port number? 
- my $ftp = Net::FTP->new($url, Timeout => 360, Port => $port, Debug => (($verbose||0)>1)?1:0, Passive => 1, Hash => 0); + my $ftp = Net::FTP->new($url, Timeout => 360, Port => $port, Debug => (($verbose||0)>2)?1:0, Passive => 1, Hash => 0); unless (defined $ftp) { warn "$identifier: ftp_connect($identifier, $url, $port) failed: $! $_at_\n"; return undef; _at_@ -1346,10 +1358,10 @@ } if($result->code() == 416) { - print "$identifier: Error: range error: filesize broken for file $url\n" if $verbose >= 2; + print "$identifier: Error: range error: filesize broken for file $url\n" if $verbose >= 1; } else { - print "$identifier: Error ".$result->code()." occured\n" if $verbose >= 2; + print "$identifier: Error ".$result->code()." occured\n" if $verbose >= 1; } error: _______________________________________________ Opensuse-svn mailing list Opensuse-svn_at_forge.novell.com http://forge.novell.com/mailman/listinfo/opensuse-svn _______________________________________________ mirrorbrain-commits mailing list Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/ Note: To remove yourself from this list, send a mail with the content unsubscribe to the address mirrorbrain-commits-request_at_mirrorbrain.org -- Received on Sun Mar 29 2009 - 22:40:44 GMT
This archive was generated by hypermail 2.3.0 : Mon Feb 20 2012 - 23:47:04 GMT