Author: poeml Date: Fri Sep 17 16:50:35 2010 New Revision: 8132 URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain?rev=8132&view=rev Log: mb scan: - Scanning lighttpd web servers is now supported. Thanks to patch contributed by Phillip Smith. This fixes issue #60. Modified: trunk/tools/scanner.pl Modified: trunk/tools/scanner.pl URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/tools/scanner.pl?rev=8132&r1=8131&r2=8132&view=diff ============================================================================== --- trunk/tools/scanner.pl (original) +++ trunk/tools/scanner.pl Fri Sep 17 16:50:35 2010 @@ -607,7 +607,57 @@ if($do_transaction) { $dbh->commit or die "$DBI::errstr"; } - } + } elsif($contents =~ s{^.*<thead>.*>Name<.*<tbody>}{}s) { + ## Oh look, it's a lighttpd directory index! + $contents =~ s{</tbody>.*$}{}s; + for my $line (split "\n", $contents) { + $line =~ s/<\/*t[rd].*?>/ /g; + print "$identifier: line: $line\n" if $verbose > 2; + if($line =~ m{^(.*)[Hh][Rr][Ee][Ff]="([^"]+)">([^<]+)</[Aa]>.+([\w\s:-]+)\s+(-|[\d\.]+[KMG]?)}) { + my ($pre, $name1, $name2, $date, $size) = ($1, $2, $3, $4, $5); + next if $name1 =~ m{^/} or $name1 =~ m{^\.\.}; + if($verbose > 2) { + print "$identifier: pre $pre\n"; + print "$identifier: name1 $name1\n"; + print "$identifier: name2 $name2\n"; + print "$identifier: date $date\n"; + print "$identifier: size $size\n"; + } + $name1 =~ s{%([\da-fA-F]{2})}{pack 'c', hex $1}ge; + $name1 =~ s{^\./}{}; + my $dir = 1 if $pre =~ m{>Directory<}; + my $t = length($name) ? "$name/$name1" : $name1; + if($size eq '-' and ($dir or $name1 =~ m{/$})) { + ## we must be really sure it is a directory, when we come here. + ## otherwise, we'll retrieve the contents of a file! + sleep($recursion_delay) if $recursion_delay; + push @r, http_readdir($identifier, $id, $urlraw, $t, 0); + } + else { + ## it is a file. + my $time = $date; + my $len = byte_size($size); + + # str2time returns undef in some rare cases causing KILL! 
FIXME + # workaround: don't store files with broken times + if(not defined($time)) { + print "$identifier: Error: str2time returns undef on parsing \"$date\". Skipping file $name1\n"; + print "$identifier: current line was:\n$line\nat url $url/$name\nname= $name1\n" if $verbose > 1; + } + elsif(largefile_check($identifier, $id, $t, $len)) { + #save timestamp and file in database + if(save_file($t, $identifier, $id, $time, $re)) { + push @r, [ $t , $time ]; + } + } + } + } + } + print "$identifier: committing http dir $name\n" if $verbose > 2; + if($do_transaction) { + $dbh->commit or die "$DBI::errstr"; + } + } else { ## we come here, whenever we stumble into an automatic index.html $contents = substr($contents, 0, 500); _______________________________________________ mirrorbrain-commits mailing list Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/ Note: To remove yourself from this list, send a mail with the content unsubscribe to the address mirrorbrain-commits-request_at_mirrorbrain.org Received on Fri Sep 17 2010 - 14:50:42 GMT
This archive was generated by hypermail 2.3.0 : Mon Feb 20 2012 - 23:47:04 GMT