Author: poeml Date: 2009-03-01 14:21:26 -0700 (Sun, 01 Mar 2009) New Revision: 6636 Modified: trunk/tools/download-redirector-v2/scanner/scanner.pl Log: scanner (v0.30): - more work on the new database scheme; now fully implemented - group database writes and commit them in chunks of useful size - fix criminal indentation - pass down -I to spawned scanners - added a few more hardcoded directories to ignore in the openSUSE tree. Of course, this needs to be made cleanly configurable. Modified: trunk/tools/download-redirector-v2/scanner/scanner.pl =================================================================== --- trunk/tools/download-redirector-v2/scanner/scanner.pl 2009-03-01 21:15:10 UTC (rev 6635) +++ trunk/tools/download-redirector-v2/scanner/scanner.pl 2009-03-01 21:21:26 UTC (rev 6636) _at_@ -67,6 +67,13 @@ # add -S option for SQL debugging # 2009-02-21, poeml - V0.23, timestamp_scanner is a UNIX epoch now, instead of # a SQL timestamp. +# 2009-02-27, poeml - V0.30, new database scheme, which is based on arrays, 5x +# faster and 1/3 the size. Also a lot more +# versatile. Timestamps are no longer stored along +# the found files. Instead, the scanner saves known +# files to a temporary table at start, and uses it +# after the scan to delete the remaining (unseen) +# files. Various fixes for scanning. 
# # # _at_@ -95,7 +102,7 @@ use Config::IniFiles; use Time::HiRes qw(gettimeofday); -my $version = '0.23'; +my $version = '0.30'; my $scanner_email = 'poeml_at_suse.de'; my $verbose = 1; my $sqlverbose = 0; _at_@ -135,16 +142,16 @@ my $brain_instance = ''; # FIXME: use DBI functions transaction handling -my $do_transaction = 0; +my $do_transaction = 1; # experimental other database scheme -my $use_file_array = 0; +my $use_file_array = 1; # save prepared statements my $sth_update; my $sth_insert_rel; my $sth_select_file; my $sth_insert_file; -my $sth_mirr_addbyname; +my $sth_mirr_addbypath; my $gig2 = 1<<31; # 2*1024*1024*1024 == 2^1 * 2^10 * 2^10 * 2^10 = 2^31 _at_@ -171,6 +178,9 @@ # because itentical to directories in the directory listing HTML push _at_norecurse_list, '/openSUSE-current/'; push _at_norecurse_list, '/openSUSE-stable/'; +push _at_norecurse_list, '/SL-OSS-factory/'; +push _at_norecurse_list, '/SL-OSS-factory-debug/'; +push _at_norecurse_list, '/SL-10.1/'; exit usage() unless _at_ARGV; while (defined (my $arg = shift)) { _at_@ -295,6 +305,9 @@ push _at_cmd, '-b', $brain_instance; push _at_cmd, '-q' unless $verbose; push _at_cmd, ('-v') x ($verbose - 1) if $verbose > 1; + foreach my $item(_at_top_include_list) { + push _at_cmd, '-I', $item; + } push _at_cmd, '-x' if $extra_schedule_run; push _at_cmd, '-k' if $keep_dead_files; push _at_cmd, '-d', $start_dir if length $start_dir; _at_@ -319,11 +332,34 @@ print localtime(time) . 
" $row->{identifier}: starting\n" if $verbose; if($do_transaction) { - $sql = "BEGIN;"; - print "$sql\n" if $sqlverbose; - $dbh->do($sql) or die "$sql: ".$DBI::errstr; + $dbh->{AutoCommit} = 0; + #$dbh->{RaiseError} = 1; } + if ($use_file_array) { + if (!$keep_dead_files) { + $sql = "CREATE TEMPORARY TABLE temp1 AS SELECT id FROM filearr WHERE $row->{id} = ANY(mirrors); CREATE INDEX temp1_key ON temp1 (id)"; + print "$sql\n" if $sqlverbose; + $dbh->do($sql) or die "$sql: ".$DBI::errstr; + $sql = "SELECT COUNT(*) FROM temp1"; + print "$sql\n" if $sqlverbose; + my $ary_ref = $dbh->selectall_arrayref($sql) or die $dbh->errstr(); + my $file_count = defined($ary_ref->[0]) ? $ary_ref->[0][0] : 0; + print "$row->{identifier}: files before scan: $file_count\n"; + } else { + $sql = "SELECT COUNT(*) FROM filearr WHERE $row->{id} = ANY(mirrors)"; + print "$sql\n" if $sqlverbose; + my $ary_ref = $dbh->selectall_arrayref($sql) or die $dbh->errstr(); + my $file_count = defined($ary_ref->[0]) ? $ary_ref->[0][0] : 0; + print "$row->{identifier}: files before scan: $file_count\n"; + } + } + + if($do_transaction) { + $dbh->commit or die "$DBI::errstr"; + } + + my $start = int(gettimeofday * 1000); my $file_count = rsync_readdir($row->{identifier}, $row->{id}, $row->{baseurl_rsync}, $start_dir); if(!$file_count and $row->{baseurl_ftp}) { _at_@ -335,6 +371,9 @@ $file_count = scalar http_readdir($row->{identifier}, $row->{id}, $row->{baseurl}, $start_dir); } + if($do_transaction) { + $dbh->commit or die "$DBI::errstr"; + } my $duration = (int(gettimeofday * 1000) - $start) / 1000; if (!$duration) { $duration = 1; } if (!$file_count) { $file_count = 0; } _at_@ -351,8 +390,21 @@ if ($use_file_array) { - print "$row->{identifier}: FIXME cleanup \n"; + #$sql = "SELECT COUNT(*) FROM temp1"; + $sql = "SELECT COUNT(mirr_del_byid($row->{id}, id)) FROM temp1"; + print "$sql\n" if $sqlverbose; + $ary_ref = $dbh->selectall_arrayref($sql) or die $dbh->errstr(); + $file_count = 
defined($ary_ref->[0]) ? $ary_ref->[0][0] : 0; + print localtime(time) . " $row->{identifier}: files to be purged: $file_count\n"; + + $sql = "SELECT COUNT(*) FROM filearr WHERE $row->{id} = ANY(mirrors);"; + print "$sql\n" if $sqlverbose; + my $ary_ref = $dbh->selectall_arrayref($sql) or die $dbh->errstr(); + my $file_count = defined($ary_ref->[0]) ? $ary_ref->[0][0] : 0; + print localtime(time) . " $row->{identifier}: number of files: $file_count\n"; + + } else { my $sql = "DELETE FROM file_server WHERE serverid = $row->{id} AND timestamp_scanner <= (SELECT extract(epoch from last_scan) FROM server _at_@ -389,9 +441,7 @@ } if($do_transaction) { - $sql = "COMMIT;"; - print "$sql\n" if $sqlverbose; - $dbh->do($sql) or die "$sql: ".$DBI::errstr; + $dbh->commit or die "$DBI::errstr"; } print localtime(time) . " $row->{identifier}: done.\n" if $verbose > 0; _at_@ -433,7 +483,7 @@ -i regexp Define regexp-pattern for path names to ignore. - Use '-i 0' to disable any ignore patterns. Default: _at_norecurse_list + Use '-i 0' to disable any ignore patterns. Default: _at_norecurse_list -T dir Directory to be scanned at the top level; option can be repeated. Both, names(identifier) and numbers(id) are accepted as mirror_ids. _at_@ -457,7 +507,7 @@ print "\t$row->{baseurl_ftp}$nl" if length($row->{baseurl_ftp}||'') > 0; print "\t$row->{baseurl}$nl" if length($row->{baseurl}||'') > 0; printf "\tscore=%d country=%s region=%s enabled=%d$nl", - $row->{score}||0, $row->{country}||'', $row->{region}||'', $row->{enabled}||0; + $row->{score}||0, $row->{country}||'', $row->{region}||'', $row->{enabled}||0; print "\n"; } } _at_@ -479,9 +529,9 @@ return $i unless $a->[$i]; my $p = $a->[$i]{pid}; unless (kill(0, $p)) { # already dead? okay take him home. - print "kill(0, $p) returned 0. reusing $i!\n" if $verbose; - undef $a->[$i]; - return $i; + print "kill(0, $p) returned 0. reusing $i!\n" if $verbose; + undef $a->[$i]; + return $i; } $pids{$p} = $i; # not? okay wait. 
} _at_@ -564,8 +614,8 @@ $line =~ s/<\/*t[rd].*?>/ /g; print "$identifier: line: $line\n" if $verbose > 2; if($line =~ m{^(.*)[Hh][Rr][Ee][Ff]="([^"]+)">([^<]+)</[Aa]>\s+([\w\s:-]+)\s+(-|[\d\.]+[KMG]?)}) { - my ($pre, $name1, $name2, $date, $size) = ($1, $2, $3, $4, $5); - next if $name1 =~ m{^/} or $name1 =~ m{^\.\.}; + my ($pre, $name1, $name2, $date, $size) = ($1, $2, $3, $4, $5); + next if $name1 =~ m{^/} or $name1 =~ m{^\.\.}; if($verbose > 2) { print "$identifier: pre $pre\n"; print "$identifier: name1 $name1\n"; _at_@ -576,34 +626,38 @@ $name1 =~ s{%([\da-fA-F]{2})}{pack 'c', hex $1}ge; $name1 =~ s{^\./}{}; my $dir = 1 if $pre =~ m{"\[DIR\]"}; - #print "$identifier: $pre^$name1^$date^$size\n" if $verbose > 1; + #print "$identifier: $pre^$name1^$date^$size\n" if $verbose > 1; my $t = length($name) ? "$name/$name1" : $name1; if($size eq '-' and ($dir or $name1 =~ m{/$})) { - ## we must be really sure it is a directory, when we come here. - ## otherwise, we'll retrieve the contents of a file! - sleep($recursion_delay) if $recursion_delay; - push _at_r, http_readdir($identifier, $id, $urlraw, $t); - } - else { - ## it is a file. - my $time = str2time($date); - my $len = byte_size($size); + ## we must be really sure it is a directory, when we come here. + ## otherwise, we'll retrieve the contents of a file! + sleep($recursion_delay) if $recursion_delay; + push _at_r, http_readdir($identifier, $id, $urlraw, $t); + } + else { + ## it is a file. + my $time = str2time($date); + my $len = byte_size($size); - # str2time returns undef in some rare cases causing KILL! FIXME - # workaround: don't store files with broken times - if(not defined($time)) { - print "$identifier: Error: str2time returns undef on parsing \"$date\". 
Skipping file $name1\n"; - print "$identifier: current line was:\n$line\nat url $url\nname= $name1\n"; - } - elsif(largefile_check($identifier, $id, $t, $len)) { - #save timestamp and file in database - if(save_file($t, $identifier, $id, $time, $re)) { - push _at_r, [ $t , $time ]; - } - } - } + # str2time returns undef in some rare cases causing KILL! FIXME + # workaround: don't store files with broken times + if(not defined($time)) { + print "$identifier: Error: str2time returns undef on parsing \"$date\". Skipping file $name1\n"; + print "$identifier: current line was:\n$line\nat url $url\nname= $name1\n"; + } + elsif(largefile_check($identifier, $id, $t, $len)) { + #save timestamp and file in database + if(save_file($t, $identifier, $id, $time, $re)) { + push _at_r, [ $t , $time ]; + } + } + } } } + print "$identifier: committing http dir $name\n" if $verbose > 1; + if($do_transaction) { + $dbh->commit or die "$DBI::errstr"; + } } else { ## we come here, whenever we stumble into an automatic index.html _at_@ -708,30 +762,35 @@ my $t = length($name) ? 
"$name/$fname" : $fname; if($type eq "d") { - if($mode !~ m{r.[xs]r.[xs]r.[xs]}) { - print "$identifier: bad mode $mode, skipping directory $fname\n" if $verbose; - next; - } - sleep($recursion_delay) if $recursion_delay; - push _at_r, ftp_readdir($identifier, $id, $urlraw, $ftp_timer, $t, $ftp); + if($mode !~ m{r.[xs]r.[xs]r.[xs]}) { + print "$identifier: bad mode $mode, skipping directory $fname\n" if $verbose; + next; + } + sleep($recursion_delay) if $recursion_delay; + push _at_r, ftp_readdir($identifier, $id, $urlraw, $ftp_timer, $t, $ftp); } + if($type eq 'l') { - warn "symlink($t) not impl."; + warn "symlink($t) not impl."; + } else { + if ($mode !~ m{r..r..r..}) { + print "$identifier: bad mode $mode, skipping file $fname\n" if $verbose; + next; + } + #save timestamp and file in database + if(largefile_check($identifier, $id, $t, $size)) { + if(save_file($t, $identifier, $id, $time, $re)) { + push _at_r, [ $t , $time ]; + } + } } - else { - if ($mode !~ m{r..r..r..}) { - print "$identifier: bad mode $mode, skipping file $fname\n" if $verbose; - next; - } - #save timestamp and file in database - if(largefile_check($identifier, $id, $t, $size)) { - if(save_file($t, $identifier, $id, $time, $re)) { - push _at_r, [ $t , $time ]; - } - } - } } } + + print "$identifier: committing ftp dir $name\n" if $verbose > 1; + if($do_transaction) { + $dbh->commit or die "$DBI::errstr"; + } ftp_close($ftp) if $toplevel; return _at_r; _at_@ -761,16 +820,28 @@ if ($use_file_array) { - my $sql = "SELECT mirr_add_byname(?, ?);"; - if (!defined $sth_mirr_addbyname) { + my $sql = "SELECT mirr_add_bypath(?, ?);"; + if (!defined $sth_mirr_addbypath) { printf "\nPreparing add statement\n\n" if $sqlverbose; - $sth_mirr_addbyname = $dbh->prepare( $sql ) or die "$identifier: $DBI::errstr"; + $sth_mirr_addbypath = $dbh->prepare( $sql ) or die "$identifier: $DBI::errstr"; + } printf "$sql <-- $serverid, $path \n" if $sqlverbose; - $sth_mirr_addbyname->execute( $serverid, $path ) or die 
"$identifier: $DBI::errstr"; - $sth_mirr_addbyname->finish; + $sth_mirr_addbypath->execute( $serverid, $path ) or die "$identifier: $DBI::errstr"; + my _at_data = $sth_mirr_addbypath->fetchrow_array(); + #if ($sth_mirr_addbypath->rows > 0) { + my $fileid = $data[0]; + #print "fileid: $fileid\n"; + #} + $sth_mirr_addbypath->finish; + if (!$keep_dead_files) { + $sql = "DELETE FROM temp1 WHERE id = $fileid"; + print "$sql\n" if $sqlverbose; + $dbh->do($sql) or die "$sql: ".$DBI::errstr; + } + } else { my $fileid = getfileid($path); _at_@ -826,7 +897,7 @@ } else { return ($res->status_line); - } + } } _at_@ -907,19 +978,28 @@ if($priv->{subdir}) { # subdir is expected not to start or end in slashes. $name = $priv->{subdir} . '/' . $name; + } - if($mode & 0x1000) { # directories have 0 here. + + if($mode & 0x1000) { # directories have 0 here. if($mode & 004) { # readable for the world is good. # params for largefile check: url=$ary_ref->{$priv->{serverid}}/$name, size=$len if(largefile_check($priv->{identifier}, $priv->{serverid}, $name, $len) == 0) { - printf "$priv->{identifier}: warning: $name cannot be delivererd via HTTP! Skipping\n" if $verbose > 0; + printf "$priv->{identifier}: warning: $name cannot be delivererd via HTTP! 
Skipping\n" if $verbose > 0; } else { - $name = save_file($name, $priv->{identifier}, $priv->{serverid}, $mtime, $priv->{re}); - $priv->{counter}++; - $r = [$name, $len, $mode, $mtime, _at_info]; - printf "%s: rsync ADD: %03o %10d %-25s %-50s\n", $priv->{identifier}, ($mode & 0777), $len, scalar(localtime $mtime), $name if $verbose > 2; + $name = save_file($name, $priv->{identifier}, $priv->{serverid}, $mtime, $priv->{re}); + $priv->{counter}++; + if (($priv->{counter} % 500) == 0) { + print "$priv->{identifier}: commit after 500 files\n" if $verbose > 1; + if($do_transaction) { + $dbh->commit or die "$DBI::errstr"; + } + } + + $r = [$name, $len, $mode, $mtime, _at_info]; + printf "%s: rsync ADD: %03o %10d %-25s %-50s\n", $priv->{identifier}, ($mode & 0777), $len, scalar(localtime $mtime), $name if $verbose > 2; } } else { _at_@ -1286,7 +1366,7 @@ return largefile_check($id, $result->header('location'), $size, $recurse+1); } } - + if($result->code() == 416) { print "$identifier: Error: range error: filesize broken for file $url\n" if $verbose >= 2; } _at_@ -1301,3 +1381,4 @@ return 1; } +# vim: ai ts=2 sw=2 smarttab expandtab _______________________________________________ Opensuse-svn mailing list Opensuse-svn_at_forge.novell.com http://forge.novell.com/mailman/listinfo/opensuse-svn _______________________________________________ mirrorbrain-commits mailing list Note: To remove yourself from this list, send a mail with the content unsubscribe to the address mirrorbrain-commits-request_at_mirrorbrain.org Received on Sun Mar 01 2009 - 21:21:59 GMT
This archive was generated by hypermail 2.3.0 : Mon Feb 20 2012 - 23:47:04 GMT