[mirrorbrain-commits] r8132 - /trunk/tools/scanner.pl

From: <poeml_at_mirrorbrain.org>
Date: Fri, 17 Sep 2010 14:50:37 -0000
Author: poeml
Date: Fri Sep 17 16:50:35 2010
New Revision: 8132

URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain?rev=8132&view=rev
Log:
mb scan:
- Scanning lighttpd web servers is now supported. Thanks to patch contributed
  by Phillip Smith. This fixes issue #60.

Modified:
    trunk/tools/scanner.pl

Modified: trunk/tools/scanner.pl
URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/tools/scanner.pl?rev=8132&r1=8131&r2=8132&view=diff
==============================================================================
--- trunk/tools/scanner.pl (original)
+++ trunk/tools/scanner.pl Fri Sep 17 16:50:35 2010
@@ -607,7 +607,57 @@
     if($do_transaction) {
       $dbh->commit or die "$DBI::errstr";
     }
-  }
+  } elsif($contents =~ s{^.*<thead>.*>Name<.*<tbody>}{}s) {
+    ## Oh look, it's a lighttpd directory index!
+    $contents =~ s{</tbody>.*$}{}s;
+    for my $line (split "\n", $contents) {
+      $line =~ s/<\/*t[rd].*?>/ /g;
+      print "$identifier: line: $line\n" if $verbose > 2;
+      if($line =~ m{^(.*)[Hh][Rr][Ee][Ff]="([^"]+)">([^<]+)</[Aa]>.+([\w\s:-]+)\s+(-|[\d\.]+[KMG]?)}) {
+        my ($pre, $name1, $name2, $date, $size) = ($1, $2, $3, $4, $5);
+        next if $name1 =~ m{^/} or $name1 =~ m{^\.\.};
+        if($verbose > 2) {
+          print "$identifier: pre $pre\n";
+          print "$identifier: name1 $name1\n";
+          print "$identifier: name2 $name2\n";
+          print "$identifier: date $date\n";
+          print "$identifier: size $size\n";
+        }
+        $name1 =~ s{%([\da-fA-F]{2})}{pack 'c', hex $1}ge;
+        $name1 =~ s{^\./}{};
+        my $dir = 1 if $pre =~ m{>Directory<};
+        my $t = length($name) ? "$name/$name1" : $name1;
+        if($size eq '-' and ($dir or $name1 =~ m{/$})) {
+          ## we must be really sure it is a directory, when we come here.
+          ## otherwise, we'll retrieve the contents of a file!
+          sleep($recursion_delay) if $recursion_delay;
+          push @r, http_readdir($identifier, $id, $urlraw, $t, 0);
+        }
+        else {
+          ## it is a file.
+          my $time = $date;
+          my $len = byte_size($size);
+
+          # str2time returns undef in some rare cases causing KILL! FIXME
+          # workaround: don't store files with broken times
+          if(not defined($time)) {
+            print "$identifier: Error: str2time returns undef on parsing \"$date\". Skipping file $name1\n";
+            print "$identifier: current line was:\n$line\nat url $url/$name\nname= $name1\n" if $verbose > 1;
+          }
+          elsif(largefile_check($identifier, $id, $t, $len)) {
+            #save timestamp and file in database
+            if(save_file($t, $identifier, $id, $time, $re)) {
+              push @r, [ $t , $time ];
+            }
+          }
+        }
+      }
+    }
+    print "$identifier: committing http dir $name\n" if $verbose > 2;
+    if($do_transaction) {
+      $dbh->commit or die "$DBI::errstr";
+    }
+ }
   else {
     ## we come here, whenever we stumble into an automatic index.html 
     $contents = substr($contents, 0, 500);




_______________________________________________
mirrorbrain-commits mailing list
Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/

Note: To remove yourself from this list, send a mail with the content
 	unsubscribe
to the address mirrorbrain-commits-request_at_mirrorbrain.org
Received on Fri Sep 17 2010 - 14:50:42 GMT

This archive was generated by hypermail 2.3.0 : Mon Feb 20 2012 - 23:47:04 GMT