Author: poeml
Date: 2009-03-29 18:09:11 -0600 (Sun, 29 Mar 2009)
New Revision: 6945

Added:
   trunk/tools/download-redirector-v2/tools/scanner.pl
Removed:
   trunk/tools/download-redirector-v2/scanner/

Log:
scanner:
- move the script into the tools directory; it doesn't need its own
  directory. It's a helper tool that's called in the background, like the
  other ones.

Copied: trunk/tools/download-redirector-v2/tools/scanner.pl (from rev 6944, trunk/tools/download-redirector-v2/scanner/scanner.pl)
===================================================================
--- trunk/tools/download-redirector-v2/tools/scanner.pl	                        (rev 0)
+++ trunk/tools/download-redirector-v2/tools/scanner.pl	2009-03-30 00:09:11 UTC (rev 6945)
@@ -0,0 +1,1313 @@
+#!/usr/bin/perl -w
+
+################################################################################
+# scanner.pl -- daemon for working through opensuse directories.
+#
+# Copyright 2006,2007,2008,2009 Martin Polster, Juergen Weigert,
+# Peter Poeml, Novell Inc.
+#
+# This program is free software; you can redistribute it and/or
+# modify it under the terms of the GNU General Public License version 2
+# as published by the Free Software Foundation;
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
+################################################################################
+
+#######################################################################
+# rsync protocol implementation
+#######################################################################
+#
+# Copyright (c) 2005 Michael Schroeder (mls_at_suse.de)
+#
+# This program is licensed under the BSD license, read LICENSE.BSD
+# for further information
+#
+#######################################################################
+
+use strict;
+
+use DBI;
+use Date::Parse;
+use LWP::UserAgent;
+use Net::FTP;
+use Net::Domain;
+use Data::Dumper;
+use Time::HiRes;
+use Socket;
+use bytes;
+use Config::IniFiles;
+use Time::HiRes qw(gettimeofday);
+use Encode;
+
+my $version = '0.40';
+my $scanner_email = 'poeml_at_suse.de';
+my $verbose = 1;
+my $sqlverbose = 0;
+
+#$DB::inhibit_exit = 0;
+
+$SIG{'PIPE'} = 'IGNORE';
+
+$SIG{__DIE__} = sub {
+  my @a = ((caller 3)[1..3], '=>', (caller 2)[1..3], '=>', (caller 1)[1..3]);
+  print "__DIE__: (@a)\n";
+  die @_;
+};
+
+$SIG{USR1} = sub { $verbose++; warn "sigusr1 seen. ++verbose = $verbose\n"; };
+$SIG{USR2} = sub { $verbose--; warn "sigusr2 seen. --verbose = $verbose\n"; };
+$SIG{ALRM} = sub { $verbose++; $verbose++; die "rsync timeout...\n" };
+
+$ENV{FTP_PASSIVE} = 1;  # used in LWP only, Net::FTP ignores this.
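+
+# Example sketch of the pieces this script expects at runtime. All values
+# here are made up; only the key names are the ones actually read by the
+# Config::IniFiles code below:
+#
+#   [general]
+#   instances = mirrorbrain
+#
+#   [mirrorbrain]
+#   dbdriver = postgresql
+#   dbhost   = localhost
+#   dbport   = 5432
+#   dbname   = mirrorbrain
+#   dbuser   = mirrorbrain
+#   dbpass   = secret
+#
+# A typical (hypothetical) invocation, using options documented in usage()
+# further down -- the mirror identifiers are placeholders:
+#
+#   scanner.pl -b mirrorbrain -j 4 -f -e mirror1 mirror2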
+ + +# Create a user agent object +my $ua = LWP::UserAgent->new; +$ua->agent("MirrorBrain Scanner/$version (See http://mirrorbrain.org/scanner_info)"); + +my $rsync_muxbuf = ''; +my $all_servers = 0; +my $start_dir = '/'; +my $parallel = 1; +my $list_only = 0; +my $keep_dead_files = 0; +my $recursion_delay = 0; # seconds delay per *_readdir recuursion +my $force_scan = 0; +my $enable_after_scan = 0; +my $cfgfile = '/etc/mirrorbrain.conf'; +my $brain_instance = ''; + +# FIXME: use DBI functions transaction handling +my $do_transaction = 1; + +# save prepared statements +my $sth_update; +my $sth_insert_rel; +my $sth_select_file; +my $sth_insert_file; +my $sth_mirr_addbypath; + +my $gig2 = 1<<31; # 2*1024*1024*1024 == 2^1 * 2^10 * 2^10 * 2^10 = 2^31 + +# these two vars are used in the largefile_check's http request callback to end +# transmission after a maximum amount of data (specified by $http_slice_counter) +my $http_size_hint; +my $http_slice_counter; + +# directories to be included from top-level +my @top_include_list; + +my @exclude_list; +my @exclude_list_rsync; +# default excludes: +push @exclude_list, '/.~tmp~/'; +push @exclude_list_rsync, '*/.~tmp~/'; + +exit usage() unless @ARGV; +while (defined (my $arg = shift)) { + if ($arg !~ m{^-}) { unshift @ARGV, $arg; last; } + elsif ($arg =~ m{^(-h|--help|-\?)}) { exit usage(); } + elsif ($arg =~ m{^(-I|--top-include)}) { push @top_include_list, shift; } + elsif ($arg =~ m{^--exclude$}) { push @exclude_list, shift; } + elsif ($arg =~ m{^--exclude-rsync$}) { push @exclude_list_rsync, shift; } + elsif ($arg =~ m{^-q}) { $verbose = 0; } + elsif ($arg =~ m{^-v}) { $verbose++; } + elsif ($arg =~ m{^-S}) { $sqlverbose++; } + elsif ($arg =~ m{^-a}) { $all_servers++; } + elsif ($arg =~ m{^-j}) { $parallel = shift; } + elsif ($arg =~ m{^-e}) { $enable_after_scan++; } + elsif ($arg =~ m{^-f}) { $force_scan++; } + elsif ($arg =~ m{^-d}) { $start_dir = shift; } + elsif ($arg =~ m{^-b}) { $brain_instance = shift; } + elsif ($arg =~ m{^-l}) { $list_only++; + $list_only++ if $arg =~ m{ll}; + $list_only++ if $arg =~ m{lll}; } + elsif ($arg =~ m{^-}) { exit usage("unknown option '$arg'"); } +} + + +# read the configuration +my $cfg = new Config::IniFiles( -file => $cfgfile ); +$cfg->SectionExists('general') or die 'no [general] section in config file'; + +# if the instance wasn't specified with -b, we use the first of the defined +# instances +my @brain_instances = split(/, /, $cfg->val('general', 'instances')); +$brain_instance = $brain_instances[0] unless $brain_instance; +$cfg->SectionExists($brain_instance) or die 'no [' . $brain_instance . '] section in config file'; + + +my $db_driver = 'mysql'; # backwards compatible default +$db_driver = $cfg->val($brain_instance, 'dbdriver') + if $cfg->val($brain_instance, 'dbdriver'); + +my $db_port = 'not set'; +if($db_driver eq 'Pg' or $db_driver eq 'postgres' or $db_driver eq 'postgresql') { + $db_port = '5432'; + $db_driver = 'Pg'; +} +elsif($db_driver eq 'mysql') { + $db_port = '3306'; +} +else { die 'unknown dbddriver "' . $db_driver . '" in config file'; } + +$db_port = $cfg->val($brain_instance, 'dbport') + if $cfg->val($brain_instance, 'dbport'); + +my $db_cred = { dbi => 'dbi:' . $db_driver + . ':dbname=' . $cfg->val( $brain_instance, 'dbname') + . ';host=' . $cfg->val( $brain_instance, 'dbhost') + . ';port=' . 
$db_port, + user => $cfg->val( $brain_instance, 'dbuser'), + pass => $cfg->val( $brain_instance, 'dbpass'), + opt => { PrintError => 0 } }; + + +my %only_server_ids = map { $_ => 1 } @ARGV; + +exit usage("Please specify list of server IDs (or -a for all) to scan\n") + unless $all_servers or %only_server_ids or $list_only; + +exit usage("-a takes no parameters (or try without -a ).\n") if $all_servers and %only_server_ids; + +exit usage("-e is useless without -f\n") if $enable_after_scan and !$force_scan; + +exit usage("-j requires a positive number") unless $parallel =~ m{^\d+$} and $parallel > 0; + +my $dbh = DBI->connect( $db_cred->{dbi}, $db_cred->{user}, $db_cred->{pass}, $db_cred->{opt}) or die $DBI::errstr; + +# we fetch last_scan timestamp as epoch, because below we want to sort by it. +my $sql = qq{SELECT id, identifier, baseurl, baseurl_ftp, baseurl_rsync, enabled, extract(epoch from last_scan) as last_scan FROM server WHERE country != '**' }; +print "$sql\n" if $sqlverbose; +my $ary_ref = $dbh->selectall_hashref($sql, 'id') + or die $DBI::errstr; + +my @scan_list; + +for my $row(sort { int(\$a->{last_scan}) <=> int(\$b->{last_scan}) } values %$ary_ref) { + if(keys %only_server_ids) { + next if !defined $only_server_ids{$row->{id}} and !defined $only_server_ids{$row->{identifier}}; + + # keep some keys in %only_server_ids! + undef $only_server_ids{$row->{id}}; + undef $only_server_ids{$row->{identifier}}; + } + + if($row->{enabled} == 1 or $force_scan or $list_only > 1) { + push @scan_list, $row; + } +} +#print Dumper \%only_server_ids, \@scan_list; + +if(scalar(keys %only_server_ids) > 2 * scalar(@scan_list)) { + # print Dumper \%only_server_ids, \@scan_list; + warn "You specified some disabled mirror_ids, use -f to scan them all.\n"; + sleep 2 if scalar @scan_list; +} + +my @missing = grep { defined $only_server_ids{$_} } keys %only_server_ids; +die sprintf "serverid not found: %s\n", @missing if @missing; + +exit mirror_list(\@scan_list, $list_only-1) if $list_only; + +################### +$start_dir =~ s{^/+}{}; # leading slash is implicit; leads to '' per default. +$start_dir =~ s{/+$}{}; # trailing slashes likewise. +################## + +# be sure not to parallelize if there is exactly one server to scan. +$parallel = 1 if scalar @scan_list == 1; + +if ($parallel > 1) { + my @worker; + my @cmd = ($0); + push @cmd, '-b', $brain_instance; + push @cmd, '-q' unless $verbose; + push @cmd, ('-v') x ($verbose - 1) if $verbose > 1; + foreach my $item(@top_include_list) { + push @cmd, '-I', $item; + } + foreach my $item(@exclude_list) { + push @cmd, '--exclude', $item; + } + foreach my $item(@exclude_list_rsync) { + push @cmd, '--exclude-rsync', $item; + } + push @cmd, '-f' if $force_scan; + push @cmd, '-e' if $enable_after_scan; + push @cmd, '-d', $start_dir if length $start_dir; + # We must not propagate -j here. + # All other options we should propagate. + + for my $row (@scan_list) { + # check if one of the workers is idle + my $worker_id = wait_worker(\@worker, $parallel); + $worker[$worker_id] = { identifier => $row->{identifier}, serverid => $row->{id}, pid => fork_child($worker_id, @cmd, $row->{identifier}) }; + } + + while (wait > -1) { + print "reap\n" if $verbose > 1; + ; # reap all children + } + exit 0; +} + + +if($do_transaction) { + $dbh->{AutoCommit} = 0; + #$dbh->{RaiseError} = 1; +} + +for my $row (@scan_list) { + print localtime(time) . " $row->{identifier}: starting\n" if $verbose; + + # already in a transaction? why?? 
+ #if($do_transaction) { + # $dbh->begin_work or die "$DBI::errstr"; + #} + + if(length $start_dir) { + $sql = "CREATE TEMPORARY TABLE temp1 AS + SELECT id FROM filearr + WHERE path LIKE '$start_dir%' + AND $row->{id} = ANY(mirrors)"; + } else { + $sql = "CREATE TEMPORARY TABLE temp1 AS + SELECT id FROM filearr + WHERE $row->{id} = ANY(mirrors)"; + } + print "$sql\n" if $sqlverbose; + $dbh->do($sql) or die "$sql: ".$DBI::errstr; + + $sql = "CREATE INDEX temp1_key ON temp1 (id); + ANALYZE temp1; + SELECT COUNT(*) FROM temp1"; + print "$sql\n" if $sqlverbose; + + my $ary_ref = $dbh->selectall_arrayref($sql) or die $dbh->errstr(); + my $initial_file_count = defined($ary_ref->[0]) ? $ary_ref->[0][0] : 0; + if(length $start_dir) { + print localtime(time) . " $row->{identifier}: files in '$start_dir' before scan: $initial_file_count\n"; + } else { + print localtime(time) . " $row->{identifier}: total files before scan: $initial_file_count\n"; + } + + if($do_transaction) { + $dbh->commit or die "$DBI::errstr"; + } + + #$sql = "SELECT COUNT(*) FROM filearr WHERE $row->{id} = ANY(mirrors)"; + #print "$sql\n" if $sqlverbose; + + + my $start = int(gettimeofday * 1000); + my $file_count = rsync_readdir($row->{identifier}, $row->{id}, $row->{baseurl_rsync}, $start_dir); + if(!$file_count and $row->{baseurl_ftp}) { + print localtime(time) . " $row->{identifier}: no rsync, trying ftp\n" if $verbose; + $file_count = scalar ftp_readdir($row->{identifier}, $row->{id}, $row->{baseurl_ftp}, time, $start_dir); + } + if(!$file_count and $row->{baseurl}) { + print localtime(time) . " $row->{identifier}: no rsync, no ftp, trying http\n" if $verbose; + $file_count = scalar http_readdir($row->{identifier}, $row->{id}, $row->{baseurl}, $start_dir); + } + + if($do_transaction) { + $dbh->commit or die "$DBI::errstr"; + } + my $duration = (int(gettimeofday * 1000) - $start) / 1000; + if (!$duration) { $duration = 1; } + if (!$file_count) { $file_count = 0; } + + my $fpm = int(60*$file_count/$duration); + + print localtime(time) . " $row->{identifier}: scanned $file_count files (" + . int($fpm/60) . "/s) in " + . int($duration) . "s\n" if $verbose; + + unless ($keep_dead_files) { + $start = time(); + print localtime(time) . " $row->{identifier}: purging old files\n" if $verbose > 1; + + + #$sql = "SELECT COUNT(*) FROM temp1"; + $sql = "SELECT COUNT(mirr_del_byid($row->{id}, id)) FROM temp1"; + print "$sql\n" if $sqlverbose; + $ary_ref = $dbh->selectall_arrayref($sql) or die $dbh->errstr(); + my $purge_file_count = defined($ary_ref->[0]) ? $ary_ref->[0][0] : 0; + print localtime(time) . " $row->{identifier}: files to be purged: $purge_file_count\n"; + + + $sql = "SELECT COUNT(*) FROM filearr WHERE $row->{id} = ANY(mirrors);"; + print "$sql\n" if $sqlverbose; + my $ary_ref = $dbh->selectall_arrayref($sql) or die $dbh->errstr(); + $file_count = defined($ary_ref->[0]) ? $ary_ref->[0][0] : 0; + print localtime(time) . " $row->{identifier}: total files after scan: $file_count\n"; + + + $duration = time() - $start; + print localtime(time) . " $row->{identifier}: purged old files in " . $duration . "s.\n" if $verbose > 0; + } + + # update the last_scan timestamp; but only if we did a complete scan. 
+ unless ($start_dir) { + $sql = "UPDATE server SET last_scan = NOW(), scan_fpm = $fpm WHERE id = $row->{id};"; + print "$sql\n" if $sqlverbose; + my $sth = $dbh->prepare( $sql ); + $sth->execute() or die "$row->{identifier}: $DBI::errstr"; + } + + if($enable_after_scan && $file_count > 1 && !$row->{enabled}) { + $sql = "UPDATE server SET enabled = '1' WHERE id = $row->{id};"; + print "$sql\n" if $sqlverbose; + my $sth = $dbh->prepare( $sql ); + $sth->execute() or die "$row->{identifier}: $DBI::errstr"; + print "$row->{identifier}: now enabled.\n" if $verbose > 0; + } + + $sql = "DROP TABLE temp1"; + print "$sql\n" if $sqlverbose; + $dbh->do($sql) or die "$sql: ".$DBI::errstr; + + if($do_transaction) { + $dbh->commit or die "$DBI::errstr"; + } + + print localtime(time) . " $row->{identifier}: done.\n" if $verbose > 0; +} + +$dbh->disconnect(); +exit 0; +################################################################################################### + + + +sub usage +{ + my ($msg) = @_; + + print STDERR qq{$0 V$version usage: + +scanner [options] [mirror_ids ...] + + -b MirrorBrain instance to use + Default: the first which is defined in the config. + -v Be more verbose (Default: $verbose). + -S Show SQL statements. + -q Be quiet. + -l Do not scan. List enabled mirrors only. + -ll As -l, but include disabled mirrors and print urls. + -lll As -ll, but all in one grep-friendly line. + + -a Scan all enabled mirrors. Alternative to providing a list of mirror_ids. + -e Enable mirror, after it was scanned. Useful with -f. + -f Force. Scan listed mirror_ids even if they are not enabled. + -d dir Scan only in dir under mirror's baseurl. + Default: start at baseurl. Consider using -x and or -k with -d . + -x Extra-Schedule run. Do not update 'server.last_scan' tstamp. + Default: 'server.last_scan' is updated after each run. + -k Keep dead files. Default: Entries not found again are removed. + + -j N Run up to N scanner queries in parallel. + + --exclude regexp + Define pattern(s) for path names to ignore. Paths matching this pattern + will not be recursed into (thus saving resources) and also, when + matching a file, not added into the database. + This option is effective only for scans via HTTP/FTP. For rsync, + use the --exclude-rsync option (due to different patterns used there). + Here, regular expressions are used. + Path names don't start with a slash; thus, if the regexp starts with a slash + it will not match at the top-level directory. + Option can be repeated. + Default: @exclude_list + --exclude-rsync pattern + Similar like --exclude, but used (only) for rsync scans. + For HTTP/FTP, use the --exclude option (due to different patterns + used there). + The patterns are rsync(1) patterns. Option can be repeated. + Default: @exclude_list_rsync + + -T dir Directory to be scanned at the top level; option can be repeated. + +Both, names(identifier) and numbers(id) are accepted as mirror_ids. +}; + print STDERR "\nERROR: $msg\n" if $msg; + return 0; +} + + + +sub mirror_list +{ + my ($list, $longflag) = @_; + print " id name scan_speed last_scan\n"; + print "---+-------------------------+-----------+-------------\n"; + my $nl = ($longflag > 1) ? 
"\t" : "\n"; + for my $row(@$list) { + printf "%3d %-30s %5d %s$nl", $row->{id}, $row->{identifier}||'--', $row->{scan_fpm}||0, $row->{last_scan}||''; + if($longflag) { + print "\t$row->{baseurl_rsync}$nl" if length($row->{baseurl_rsync}||'') > 0; + print "\t$row->{baseurl_ftp}$nl" if length($row->{baseurl_ftp}||'') > 0; + print "\t$row->{baseurl}$nl" if length($row->{baseurl}||'') > 0; + printf "\tscore=%d country=%s region=%s enabled=%d$nl", + $row->{score}||0, $row->{country}||'', $row->{region}||'', $row->{enabled}||0; + print "\n"; + } + } + return 0; +} + + + + + +sub wait_worker +{ + my ($a, $n) = @_; + die if $n < 1; + my %pids; + + for(;;) { + for(my $i = 0; $i < $n; $i++) { + return $i unless $a->[$i]; + my $p = $a->[$i]{pid}; + unless (kill(0, $p)) { # already dead? okay take him home. + print "kill(0, $p) returned 0. reusing $i!\n" if $verbose; + undef $a->[$i]; + return $i; + } + $pids{$p} = $i; # not? okay wait. + } + my $p = wait; + my $rc = $?; + if(defined(my $i = $pids{$p})) { + if (($verbose > 1) || ($rc != 0)) { + print "$a->[$i]{identifier}: [#$i, id=$a->[$i]{serverid} pid=$p exit: $?]\n"; + } + undef $a->[$i]; + return $i; # now, been there, done that. + } + # $p = -1 or other silly things... + warn "wait failed: $!, $?\n"; + die "wait failed" if $p < 0; + } +} + + + +sub fork_child +{ + my ($idx, @args) = @_; + if (my $p = fork()) { + # parent + print "worker $idx, pid=$p start.\n" if $verbose > 1; + return $p; + } + my $cmd = shift @args; + exec { $cmd } "scanner [#$idx]", @args; # ourselves with a false name and some data. +} + + + +# http://ftp1.opensuse.org/repositories/#@^@repositories/@@ +sub http_readdir +{ + my ($identifier, $id, $url, $name) = @_; + + my $item; + + my $urlraw = $url; + my $re = ''; $re = $1 if $url =~ s{#(.*?)$}{}; + print "$identifier: http_readdir: url=$url re=$re\n" if $verbose > 2; + $url =~ s{/+$}{}; # we add our own trailing slashes... + $name =~ s{/+$}{}; + + # are we looking at a top-level directory name? + # (we recognize it by not containing slashes) + my $attop = 0; + $attop = 1 if (length $name) && !($name =~ "/"); + if ($attop && scalar(@top_include_list)) { + my $included = 0; + foreach my $item(@top_include_list) { + if ($name =~ $item) { + $included = 1; + } + } + if (!$included) { + print "$identifier: not in top_include_list: $name\n";# if $verbose > 1; + return; + } + } + + foreach $item(@exclude_list) { + if("$name/" =~ $item) { + print "$identifier: ignore match: $name matches ignored item $item, skipped.\n" if $verbose > 1; + return; + } + } + + my @r; + print "$identifier: http dir: $url/$name\n" if $verbose > 2; + print "$identifier: http dir: $name\n" if $verbose == 2; + my $contents = cont("$url/$name/?F=1"); + if($contents =~ s{^.*<(PRE|pre|table)>.*<(a href|A HREF)="\?(N=A|C=.*;O=)[^"]*">}{}s) { + ## good, we know that one. It is a standard apache dir-listing. + ## + ## bad, apache shows symlinks as a copy of the file or dir they point to. + ## no way to avoid duplicate crawls. 
+ ## + $contents =~ s{</(PRE|pre|table)>.*$}{}s; + for my $line (split "\n", $contents) { + $line =~ s/<\/*t[rd].*?>/ /g; + print "$identifier: line: $line\n" if $verbose > 2; + if($line =~ m{^(.*)[Hh][Rr][Ee][Ff]="([^"]+)">([^<]+)</[Aa]>\s+([\w\s:-]+)\s+(-|[\d\.]+[KMG]?)}) { + my ($pre, $name1, $name2, $date, $size) = ($1, $2, $3, $4, $5); + next if $name1 =~ m{^/} or $name1 =~ m{^\.\.}; + if($verbose > 2) { + print "$identifier: pre $pre\n"; + print "$identifier: name1 $name1\n"; + print "$identifier: name2 $name2\n"; + print "$identifier: date $date\n"; + print "$identifier: size $size\n"; + } + $name1 =~ s{%([\da-fA-F]{2})}{pack 'c', hex $1}ge; + $name1 =~ s{^\./}{}; + my $dir = 1 if $pre =~ m{"\[DIR\]"}; + #print "$identifier: $pre^$name1^$date^$size\n" if $verbose > 1; + my $t = length($name) ? "$name/$name1" : $name1; + if($size eq '-' and ($dir or $name1 =~ m{/$})) { + ## we must be really sure it is a directory, when we come here. + ## otherwise, we'll retrieve the contents of a file! + sleep($recursion_delay) if $recursion_delay; + push @r, http_readdir($identifier, $id, $urlraw, $t, 0); + } + else { + ## it is a file. + my $time = str2time($date); + my $len = byte_size($size); + + # str2time returns undef in some rare cases causing KILL! FIXME + # workaround: don't store files with broken times + if(not defined($time)) { + print "$identifier: Error: str2time returns undef on parsing \"$date\". Skipping file $name1\n"; + print "$identifier: current line was:\n$line\nat url $url/$name\nname= $name1\n" if $verbose > 1; + } + elsif(largefile_check($identifier, $id, $t, $len)) { + #save timestamp and file in database + if(save_file($t, $identifier, $id, $time, $re)) { + push @r, [ $t , $time ]; + } + } + } + } + } + print "$identifier: committing http dir $name\n" if $verbose > 2; + if($do_transaction) { + $dbh->commit or die "$DBI::errstr"; + } + } + else { + ## we come here, whenever we stumble into an automatic index.html + $contents = substr($contents, 0, 500); + warn Dumper $contents, "$identifier: http_readdir: unknown HTML format"; + } + + return @r; +} + + + +sub byte_size +{ + my ($len) = @_; + return $len unless $len =~ m{(.*)([KMG])$}; + my ($n, $l) = ($1,$2); + return int($n*1024) if $l eq 'K'; + return int($1*1024*1024) if $l eq 'M'; + return int($1*1024*1024*1024) if $l eq 'G'; + die "byte_size: $len not impl\n"; +} + + + +# $file_count = scalar ftp_readdir($row->{identifier}, $row->{id}, $row->{baseurl_ftp}, $ftp_timer, $start_dir); +# first call: $ftp undefined +sub ftp_readdir +{ + my ($identifier, $id, $url, $ftp_timer, $name, $ftp) = @_; + + my $ftp_age = (time() - $ftp_timer); + print "$identifier: last command issued $ftp_age"."s ago\n" if $verbose > 2; + $ftp_timer = time; + + my $item; + + print "$identifier: ftp dir: $name\n" if $verbose > 1; + + my $urlraw = $url; + my $re = ''; $re = $1 if $url =~ s{#(.*?)$}{}; + $url =~ s{/+$}{}; # we add our own trailing slashes... + + + my $toplevel = ($ftp) ? 0 : 1; + $ftp = ftp_connect($identifier, "$url/$name", "anonymous", $scanner_email) unless defined $ftp; + return unless defined $ftp; + my $text = ftp_cont($ftp, "$url/$name"); + + if(!ref($text) && \$text =~ m/^(\d\d\d)\s/) { # some FTP status code? Not good. 
+ + # Bug: Net::FTP wrongly reports timeouts (421) as code 550: + # sunsite.informatik.rwth-aachen.de: ftp dir: ftp://sunsite.informatik.rwth-aachen.de/pub/linux/opensuse/distribution/11.0/repo/debug/suse/i686 + # Net::FTP=GLOB(0x112f480)>>> CWD /pub/linux/opensuse/distribution/11.0/repo/debug/suse/i686 + # Net::FTP=GLOB(0x112f480)<<< 421 Timeout. + # sunsite.informatik.rwth-aachen.de: ftp status code 550 (550 failed: ftp-cwd(/pub/linux/opensuse/distribution/11.0/repo/debug/suse/i686): ), closing. + # + # Thus, if the connection is older than 60 seconds, we attempt a reconnect. + # Otherwise we quit. + if ($ftp_age > 60) { + warn "$identifier: ftp status code $1. Last command " . $ftp_age . "s ago; attempting reconnect\n"; + print "$identifier: $text" if $verbose > 2; + ftp_close($ftp); + $ftp = ftp_connect($identifier, "$url/$name", "anonymous", $scanner_email); + return unless defined $ftp; + $text = ftp_cont($ftp, "$url/$name"); + } else { + warn "$identifier: ftp status code $1, closing.\n"; + print "$identifier: $text" if $verbose > 2; + ftp_close($ftp); + return; + } + } + + print "$identifier: ".join("\n", @$text)."\n" if $verbose > 2; + + my @r; + for my $i (0..$#$text) { + if($text->[$i] =~ m/^([dl-])(.........).*\s(\d+)\s(\w\w\w\s+\d\d?\s+\d\d:?\d\d)\s+([\S]+)$/) { + my ($type, $mode, $size, $timestamp, $fname) = ($1, $2, $3, $4, $5); + next if $fname eq "." or $fname eq ".."; + + #print "$name / $fname\n"; + + # are we looking at a top-level directory name? + # (can be recognized by name being an empty string) + if (!length($name) && scalar(@top_include_list)) { + my $included = 0; + foreach my $item(@top_include_list) { + if ($fname =~ $item) { + $included = 1; + } + } + if (!$included) { + print "$identifier: not in top_include_list: $fname\n";# if $verbose > 1; + next; + } + } + + my $excluded = 0; + my $s = "$name/$fname"; + if($type eq "d") { + $s = "$s/"; + } + for $item(@exclude_list) { + if ($s =~ $item) { + print "$identifier: $s ignored (matches $item)\n" if $verbose > 0; + $excluded = 1; + } + } + next if ($excluded); + + #convert to timestamp + my $time = str2time($timestamp); + my $t = length($name) ? "$name/$fname" : $fname; + + if($type eq "d") { + if($mode !~ m{r.[xs]r.[xs]r.[xs]}) { + print "$identifier: bad mode $mode, skipping directory $fname\n" if $verbose; + next; + } + sleep($recursion_delay) if $recursion_delay; + push @r, ftp_readdir($identifier, $id, $urlraw, $ftp_timer, $t, $ftp); + } + + if($type eq 'l') { + warn "symlink($t) not impl."; + } else { + if ($mode !~ m{r..r..r..}) { + print "$identifier: bad mode $mode, skipping file $fname\n" if $verbose; + next; + } + #save timestamp and file in database + if(largefile_check($identifier, $id, $t, $size)) { + if(save_file($t, $identifier, $id, $time, $re)) { + push @r, [ $t , $time ]; + } + } + } + } + } + + print "$identifier: committing ftp dir $name\n" if $verbose > 2; + if($do_transaction) { + $dbh->commit or die "$DBI::errstr"; + } + + ftp_close($ftp) if $toplevel; + return @r; +} + + +sub save_file +{ + my ($path, $identifier, $serverid, $mod_re, $ign_re) = @_; + + # + # optional patch the file names by adding or removing components. + # you never know what strange paths mirror admins choose. 
+ # + + return undef if $ign_re and $path =~ m{$ign_re}; + + if ($mod_re and $mod_re =~ m{@([^@]*)@([^@]*)}) { + print "$identifier: save_file: $path + #$mod_re -> " if $verbose > 2; + my ($m, $r) = ($1, $2); + $path =~ s{$m}{$r}; + print "$path\n" if $verbose > 2; + } + + $path =~ s{^/+}{}; # be sure we have no leading slashes. + $path =~ s{//+}{/}g; # avoid double slashes. + + # explicitely tell Perl that the filename is in UTF-8 encoding + $path = decode_utf8($path); + + my $sql = "SELECT mirr_add_bypath(?, ?);"; + if (!defined $sth_mirr_addbypath) { + printf "\nPreparing add statement\n\n" if $sqlverbose; + $sth_mirr_addbypath = $dbh->prepare( $sql ) or die "$identifier: $DBI::errstr"; + + } + + printf "$sql <-- $serverid, $path \n" if $sqlverbose; + $sth_mirr_addbypath->execute( $serverid, $path ) or die "$identifier: $DBI::errstr"; + + my @data = $sth_mirr_addbypath->fetchrow_array(); + #if ($sth_mirr_addbypath->rows > 0) { + my $fileid = $data[0]; + #print "fileid: $fileid\n"; + #} + $sth_mirr_addbypath->finish; + if (!$keep_dead_files) { + $sql = "DELETE FROM temp1 WHERE id = $fileid"; + print "$sql\n" if $sqlverbose; + $dbh->do($sql) or die "$sql: ".$DBI::errstr; + } + + return $path; +} + + + +sub delete_file +{ + my ($dbh, $serverid, $path) = @_; + warn "FIXME: delete_file() not impl.\n"; +} + + + +sub cont +{ + my $url = shift; + + # Create a request + my $req = HTTP::Request->new(GET => $url); + + # Pass request to the user agent and get a response back + my $res = $ua->request($req); + + # Check the outcome of the response + if ($res->is_success) { + return ($res->content); + } + else { + return ($res->status_line); + } +} + + +# getfileid returns the id as inserted in table file. +# +sub getfileid +{ + my $path = shift; + my @data; + my $id; + + # prepare statements once + my $sql_select_file = "SELECT id FROM file WHERE path = ? LIMIT 1;"; + if (!defined $sth_select_file) { + printf "\nPreparing select_file statement: $sql_select_file\n\n" if $sqlverbose; + $sth_select_file = $dbh->prepare( $sql_select_file ); + } + + my $sql_insert_file = "INSERT INTO file (path) VALUES (?);"; + if (!defined $sth_insert_file) { + printf "\nPreparing insert_file statement: $sql_insert_file\n\n" if $sqlverbose; + $sth_insert_file = $dbh->prepare( $sql_insert_file ); + } + + + printf "select_file: $sql_select_file <--- $path \n" if $sqlverbose; + + $sth_select_file->execute( $path ) or die $sth_select_file->errstr; + @data = $sth_select_file->fetchrow_array(); + if ($sth_select_file->rows > 0) { + $id = $data[0]; + + $sth_select_file->finish; + printf "select_id result: $id \n" if $sqlverbose; + return $id if defined $id; + } + + + $sth_insert_file->execute( $path ) or die $sth_insert_file->err; + + # now we still need the id + # FIXME: should use something like last_insert_id rather + printf "select_file (get the id after insertion): $sql_insert_file <--- $path \n" if $sqlverbose; + + $sth_select_file->execute( $path ) or die $sth_select_file->errstr; + @data = $sth_select_file->fetchrow_array(); + if ($sth_select_file->rows > 0) { + $id = $data[0]; + + $sth_select_file->finish; + printf "select_id result: $id \n" if $sqlverbose; + return $id; + } + die "insert of $path failed - could not get last id\n"; +} + + + +# callback function +sub rsync_cb +{ + my ($priv, $name, $len, $mode, $mtime, @info) = @_; + return 0 if $name eq '.' or $name eq '..'; + my $r = 0; + + if($priv->{subdir}) { + # subdir is expected not to start or end in slashes. + $name = $priv->{subdir} . '/' . 
$name; + + } + + + if($mode & 0x1000) { # directories have 0 here. + if($mode & 004) { # readable for the world is good. + # params for largefile check: url=$ary_ref->{$priv->{serverid}}/$name, size=$len + if(largefile_check($priv->{identifier}, $priv->{serverid}, $name, $len) == 0) { + printf "$priv->{identifier}: warning: $name cannot be delivererd via HTTP! Skipping\n" if $verbose > 0; + } + else { + $name = save_file($name, $priv->{identifier}, $priv->{serverid}, $mtime, $priv->{re}); + $priv->{counter}++; + if (($priv->{counter} % 50) == 0) { + print "$priv->{identifier}: commit after 50 files\n" if $verbose > 1; + if($do_transaction) { + $dbh->commit or die "$DBI::errstr"; + } + } + + $r = [$name, $len, $mode, $mtime, @info]; + printf "%s: rsync ADD: %03o %10d %-25s %-50s\n", $priv->{identifier}, ($mode & 0777), $len, scalar(localtime $mtime), $name if $verbose > 2; + } + } + else { + printf "%s: rsync skip: %03o %10d %-25s %-50s\n", $priv->{identifier}, ($mode & 0777), $len, scalar(localtime $mtime), $name if $verbose > 1; + } + } + elsif($verbose) { + printf "%s: rsync dir: %03o %10d %-25s %-50s\n", $priv->{identifier}, ($mode & 0777), $len, scalar(localtime $mtime), $name if $verbose > 1; + } + return $r; +} + + + +# example rsync address: +# rsync://user:passwd_at_ftp.sunet.se/pub/Linux/distributions/opensuse/#@^opensuse/@@ +# parameters: +# serverid: id field content from database row +# url: base url from database +# d: base directory (can be 'undef'): parameter to the '-d' switch +sub rsync_readdir +{ + my ($identifier, $serverid, $url, $d) = @_; + return 0 unless $url; + + $url =~ s{^rsync://}{}s; # trailing s: treat as single line, strip off protocol id + my $re = ''; $re = $1 if $url =~ s{#(.*?)$}{}; # after a hash can be a regexp, see example above + my $cred = $1 if $url =~ s{^(.*?)@}{}; # username/passwd if specified + die "$identifier: rsync_readdir: cannot parse url '$url'\n" unless $url =~ m{^([^:/]+)(:(\d*))?(.*)$}; + my ($host, $dummy, $port, $path) = ($1,$2,$3,$4); + $port = 873 unless $port; + $path =~ s{^/+}{}; + + my $peer = { identifier => $identifier, addr => inet_aton($host), port => $port, serverid => $serverid }; + $peer->{re} = $re if $re; + $peer->{pass} = $1 if $cred and $cred =~ s{:(.*)}{}; + $peer->{user} = $cred if $cred; + $peer->{subdir} = $d if length $d; + $peer->{counter} = 0; + $path .= "/". 
$d if length $d; + rsync_get_filelist($identifier, $peer, $path, 0, \&rsync_cb, $peer); + return $peer->{counter}; +} + + +####################################################################### +# rsync protocol +####################################################################### +# +# Copyright (c) 2005 Michael Schroeder (mls_at_suse.de) +# +# This program is licensed under the BSD license, read LICENSE.BSD +# for further information +# +sub sread +{ + local *SS = shift; + my $len = shift; + my $ret = ''; + while($len > 0) { + alarm 600; + my $r = sysread(SS, $ret, $len, length($ret)); + alarm 0; + die("read error") unless $r; + $len -= $r; + die("read too much") if $r < 0; + } + return $ret; +} + + + +sub swrite +{ + local *SS = shift; + my ($var, $len) = @_; + $len = length($var) unless defined $len; + return if $len == (syswrite(SS, $var, $len) || 0); + warn "syswrite: $!\n"; +} + + + +sub muxread +{ + my $identifier = shift; + local *SS = shift; + my $len = shift; + + #print "$identifier: muxread $len\n"; + while(length($rsync_muxbuf) < $len) { + #print "$identifier: muxbuf len now ".length($muxbuf)."\n"; + my $tag = ''; + $tag = sread(*SS, 4); + $tag = unpack('V', $tag); + my $tlen = 0+$tag & 0xffffff; + $tag >>= 24; + if ($tag == 7) { + $rsync_muxbuf .= sread(*SS, $tlen); + next; + } + if ($tag == 8 || $tag == 9) { + my $msg = sread(*SS, $tlen); + warn("$identifier: tag=8 $msg\n") if $tag == 8; + print "$identifier: info: $msg\n"; + next; + } + warn("$identifier: unknown tag: $tag\n"); + return undef; + } + my $ret = substr($rsync_muxbuf, 0, $len); + $rsync_muxbuf = substr($rsync_muxbuf, $len); + return $ret; +} + + + +sub rsync_get_filelist +{ + my ($identifier, $peer, $syncroot, $norecurse, $callback, $priv) = @_; + my $syncaddr = $peer->{addr}; + my $syncport = $peer->{port}; + + if(!defined($peer->{have_md4})) { + ## why not rely on %INC here? + $peer->{have_md4} = 0; + eval { + # this causes funny messages, if perl-Digest-MD4 is not installed: + # __DIE__: (/usr/bin/scanner 311 main::rsync_readdir => /usr/bin/scanner 961 main::rsync_get_filelist => /usr/bin/scanner 1046 (eval)) + # not sure whether it is worth installing it. + # we never had it on mirrordb.opensuse.org, the main openSUSE scan host. 
+ require Digest::MD4; + $peer->{have_md4} = 1; + }; + } + $syncroot =~ s/^\/+//; + my $module = $syncroot; + $module =~ s/\/.*//; + my $tcpproto = getprotobyname('tcp'); + socket(S, PF_INET, SOCK_STREAM, $tcpproto) || die("$identifier: socket: $!\n"); + setsockopt(S, SOL_SOCKET, SO_KEEPALIVE, pack("l",1)); + connect(S, sockaddr_in($syncport, $syncaddr)) || die("$identifier: connect: $!\n"); + my $hello = "\@RSYNCD: 28\n"; + swrite(*S, $hello); + my $buf = ''; + alarm 600; + sysread(S, $buf, 4096); + alarm 0; + die("$identifier: protocol error [$buf]\n") if $buf !~ /^\@RSYNCD: ([\d.]+)\n/s; + $peer->{rsync_protocol} = $1; + $peer->{rsync_protocol} = 28 if $peer->{rsync_protocol} > 28; + swrite(*S, "$module\n"); + while(1) { + alarm 600; + sysread(S, $buf, 4096); + alarm 0; + die("$identifier: protocol error [$buf]\n") if $buf !~ s/\n//s; + last if $buf eq "\@RSYNCD: OK"; + die("$identifier: $buf\n") if $buf =~ /^\@ERROR/s; + if($buf =~ /^\@RSYNCD: AUTHREQD /) { + die("$identifier: '$module' needs authentification, but Digest::MD4 is not installed\n") unless $peer->{have_md4}; + my $user = "nobody" if !defined($peer->{user}) || $peer->{user} eq ''; + my $password = '' unless defined $peer->{password}; + my $digest = "$user ".Digest::MD4::md4_base64("\0\0\0\0$password".substr($buf, 18))."\n"; + swrite(*S, $digest); + next; + } + } + my @args = ('--server', '--sender', '-rl'); + push @args, '--exclude=/*/*' if $norecurse; + + if(@top_include_list && !defined($peer->{subdir})) { + foreach my $item (@top_include_list) { + push @args, "--include=/$item"; + } + push @args, "--exclude=/*"; + } + + print "$identifier: rsync excludes: @exclude_list_rsync\n" if $verbose > 1; + foreach my $item (@exclude_list_rsync) { + push @args, "--exclude=$item"; + } + print "$identifier: rsync args: @args\n" if $verbose > 2; + + for my $arg (@args, '.', "$syncroot/.", '') { + swrite(*S, "$arg\n"); + } + sread(*S, 4); # checksum seed + swrite(*S, "\0\0\0\0"); + my @filelist; + my $name = ''; + my $mtime = 0; + my $mode = 0; + my $uid = 0; + my $gid = 0; + my $flags; + while(1) { + $flags = muxread($identifier, *S, 1); + $flags = ord($flags); + # printf "flags = %02x\n", $flags; + last if $flags == 0; + $flags |= ord(muxread($identifier, *S, 1)) << 8 if $peer->{rsync_protocol} >= 28 && ($flags & 0x04) != 0; + my $l1 = $flags & 0x20 ? ord(muxread($identifier, *S, 1)) : 0; + my $l2 = $flags & 0x40 ? 
unpack('V', muxread($identifier, *S, 4)) : ord(muxread($identifier, *S, 1)); + $name = substr($name, 0, $l1).muxread($identifier, *S, $l2); + my $len = unpack('V', muxread($identifier, *S, 4)); + if($len == 0xffffffff) { + $len = unpack('V', muxread($identifier, *S, 4)); + my $len2 = unpack('V', muxread($identifier, *S, 4)); + $len += $len2 * 4294967296; + } + $mtime = unpack('V', muxread($identifier, *S, 4)) unless $flags & 0x80; + $mode = unpack('V', muxread($identifier, *S, 4)) unless $flags & 0x02; + my @info = (); + my $mmode = $mode & 07777; + if(($mode & 0170000) == 0100000) { + $mmode |= 0x1000; + } elsif (($mode & 0170000) == 0040000) { + $mmode |= 0x0000; + } elsif (($mode & 0170000) == 0120000) { + $mmode |= 0x2000; + my $ln = muxread($identifier, *S, unpack('V', muxread($identifier, *S, 4))); + @info = ($ln); + } else { + print "$name: unknown mode: $mode\n"; + next; + } + if($callback) { + my $r = &$callback($priv, $name, $len, $mmode, $mtime, @info); + push @filelist, $r if $r; + } + else { + push @filelist, [$name, $len, $mmode, $mtime, @info]; + } + } + my $io_error = unpack('V', muxread($identifier, *S, 4)); + + # rsync_send_fin + swrite(*S, pack('V', -1)); # switch to phase 2 + swrite(*S, pack('V', -1)); # switch to phase 3 + if($peer->{rsync_protocol} >= 24) { + swrite(*S, pack('V', -1)); # goodbye + } + close(S); + return @filelist; +} + + + +sub ftp_connect +{ + my ($identifier, $url) = @_; + my $port = 21; + my $user ||= 'anonymous'; + my $pass ||= "$0@" . Net::Domain::hostfqdn; + + if($url =~ s{^(\w+)://}{}) { # no protocol prefix please + if(lc $1 ne 'ftp') { + warn "$identifier: ftp_connect: not an ftp url: '$1://$url'\n"; + return undef; + } + } + $url =~ s{/.*$}{}; # no path components please + $port = $1 if $url =~ s{:(\d+)$}{}; # port number? + my $ftp = Net::FTP->new($url, Timeout => 360, Port => $port, Debug => (($verbose||0)>2)?1:0, Passive => 1, Hash => 0); + unless (defined $ftp) { + warn "$identifier: ftp_connect($identifier, $url, $port) failed: $! $@\n"; + return undef; + } + $ftp->login($user, $pass) or warn "$identifier: ftp-login failed: $! $@\n"; + $ftp->type('I'); # binary mode please. + print STDERR "$identifier: connected to $url, ($user,$pass)\n" if $verbose > 1; + return $ftp; +} + + + +sub ftp_close +{ + my ($ftp) = @_; + $ftp->quit; +} + + + +sub ftp_cont +{ + my ($ftp, $path) = @_; + $path =~ s{^\w+://[^/:]+(:\d+)?/}{/}; # no proto host port prefix, please. + $ftp->cwd($path) or return "550 failed: ftp-cwd($path): $! $@"; + + $ftp->dir(); + # In an array context, returns a list of lines returned from the server. + # In a scalar context, returns a reference to a list. + # + ## should use File::Listing to parse this + # + # [ + # 'drwx-wx-wt 2 incoming 49 4096 Jul 03 23:00 incoming', + # '-rw-r--r-- 1 root root 16146417 Jul 04 23:12 ls-Ral.txt' + # ], +} + + + +# double check large files. +# some mirrors can't deliver large files via http. +# try a http range request for files larger than 2G/4G in http/ftp/rsync +sub largefile_check +{ + my ($identifier, $id, $path, $size, $recurse) = @_; + + if(not defined $recurse) { + $recurse = 0; + } + # don't follow more than three redirections + return if($recurse >= 3); + + $http_size_hint = 128; + $http_slice_counter = 2*$http_size_hint; + + if($size==0) { + if($path =~ m{.*\.iso$}) { + print "$identifier: Error: cd size is zero! 
Illegal file $path\n";
+      goto error;
+    }
+  }
+
+  goto all_ok if($size <= $gig2);
+
+  my $url = "$ary_ref->{$id}->{baseurl}/$path";
+  my $header = new HTTP::Headers('Range' => "bytes=".($gig2-$http_size_hint)."-".($gig2+1));
+  my $req = new HTTP::Request('GET', "$url", $header);
+
+  # turn off implicit redirects (handle manually):
+  $ua->max_redirect(0);
+
+  my $result = $ua->request(
+    $req,
+    sub {
+      my ($chunk, $result) = @_;
+      $http_slice_counter -= $http_size_hint;
+      die() if $http_slice_counter <= 0;
+      return $chunk;
+    },
+    $http_size_hint
+  );
+
+  my $code = $result->code();
+  goto all_ok if($code == 206 or $code == 200);
+  # check some redirect types:
+  # 301 - permanent redirect -> client is advised to remember redirected address
+  # 302 - temporary redirect -> client shall continue using this address
+  # 303 - redirect from POST command to another URI via GET command
+  # 307 - same as 302 except different caching behaviour
+  if($code == 301 or $code == 302 or $code == 303 or $code == 307) {
+    if($result->header('location') =~ m{^ftp:.*}) {
+      print "$identifier: Moved to ftp location, assuming success if followed\n";
+      goto all_ok;
+    }
+    if($result->header('location') =~ m{^http:.*}) {
+      print "$identifier: [RECURSE] Moved to other http location, recursing scan...\n";
+      return largefile_check($identifier, $id, $result->header('location'), $size, $recurse+1);
+    }
+  }
+
+  if($result->code() == 416) {
+    print "$identifier: Error: range error: filesize broken for file $url\n" if $verbose >= 1;
+  }
+  else {
+    print "$identifier: Error ".$result->code()." occurred\n" if $verbose >= 1;
+  }
+
+  error:
+  return 0;
+
+  all_ok:
+  return 1;
+}
+
+# vim: ai ts=2 sw=2 smarttab expandtab
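+
+# Example sketch of the '#@regexp@replacement@' URL suffix mentioned in the
+# comments above http_readdir() and rsync_readdir(): the readdir functions
+# split it off the base URL, and save_file() interprets it as a substitution
+# on each scanned path. The URL and path below are made up:
+#
+#   baseurl_rsync = rsync://ftp.example.org/pub/opensuse/#@^opensuse/@@
+#
+#   rule '@^opensuse/@@'  ->  s{^opensuse/}{}
+#   scanned path 'opensuse/distribution/11.1/iso/openSUSE-11.1-DVD-i586.iso'
+#   is stored as 'distribution/11.1/iso/openSUSE-11.1-DVD-i586.iso'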