[mirrorbrain-commits] r8224 - /trunk/tools/scanner.pl

From: <poeml_at_mirrorbrain.org>
Date: Sun, 14 Nov 2010 12:36:12 -0000
Author: poeml
Date: Sun Nov 14 13:36:08 2010
New Revision: 8224

URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain?rev=8224&view=rev
Log:
mb scan:
- implement support for scanning Nginx directory indexes.

Modified:
    trunk/tools/scanner.pl

Modified: trunk/tools/scanner.pl
URL: http://svn.mirrorbrain.org/viewvc/mirrorbrain/trunk/tools/scanner.pl?rev=8224&r1=8223&r2=8224&view=diff
==============================================================================
--- trunk/tools/scanner.pl (original)
+++ trunk/tools/scanner.pl Sun Nov 14 13:36:08 2010
@@ -566,7 +566,8 @@
     ## good, we know that one. It is a standard apache dir-listing.
     ## 
     ## bad, apache shows symlinks as a copy of the file or dir they point to.
-    ## no way to avoid duplicate crawls.
+    ## no way to avoid duplicate crawls except by defining top_include_dirs,
+    ## scan_exclude or scan_exclude_rsync in /etc/mirrorbrain.conf.
     ##
     $contents =~ s{</(PRE|pre|table)>.*$}{}s;
     for my $line (split "\n", $contents) {
@@ -638,6 +639,57 @@
         my $dir = 1 if $pre =~ m{>Directory<};
         my $t = length($name) ? "$name/$name1" : $name1;
         if($size eq '-' and ($dir or $name1 =~ m{/$})) {
+          ## we must be really sure it is a directory, when we come here.
+          ## otherwise, we'll retrieve the contents of a file!
+          sleep($recursion_delay) if $recursion_delay;
+          push @r, http_readdir($identifier, $id, $urlraw, $t, 0);
+        }
+        else {
+          ## it is a file.
+          my $time = $date;
+          my $len = byte_size($size);
+
+          # str2time returns undef in some rare cases causing KILL! FIXME
+          # workaround: don't store files with broken times
+          if(not defined($time)) {
+            print "$identifier: Error: str2time returns undef on parsing \"$date\". Skipping file $name1\n";
+            print "$identifier: current line was:\n$line\nat url $url/$name\nname= $name1\n" if $verbose > 1;
+          }
+          elsif(largefile_check($identifier, $id, $t, $len)) {
+            #save timestamp and file in database
+            if(save_file($t, $identifier, $id, $time, $re)) {
+              push @r, [ $t , $time ];
+            }
+          }
+        }
+      }
+    }
+    print "$identifier: committing http dir $name\n" if $verbose > 2;
+    if($do_transaction) {
+      $dbh->commit or die "$DBI::errstr";
+    }
+ } elsif($contents =~ s{^<html>.*<head><title>Index of .*<h1>Index of .*</h1><hr><pre><a href="../">../</a>}{}s) {
+    ## Oh look, it's a nginx directory index!
+    $contents =~ s{<pre><a href="../">../</a>.*</pre><hr></body>$}{}s;
+    for my $line (split "\n", $contents) {
+      #$line =~ s/<\/*t[rd].*?>/ /g;
+      print "$identifier: line: $line\n" if $verbose > 2;
+
+      # <a href="addons/">addons/</a>                                            14-May-2010 15:38                   -
+      if($line =~ m{^<a href="([^"]+)">([^<]+)</a>\s*([\w\s:-]+)\s+(-|[\d\.]+)}) {
+
+        my ($name1, $name2, $date, $size) = ($1, $2, $3, $4, $5);
+        next if $name1 =~ m{^/} or $name1 =~ m{^\.\.};
+        if($verbose > 2) {
+          print "$identifier: name1 $name1\n";
+          print "$identifier: name2 $name2\n";
+          print "$identifier: date $date\n";
+          print "$identifier: size $size\n";
+        }
+        #$name1 =~ s{%([\da-fA-F]{2})}{pack 'c', hex $1}ge;
+        #$name1 =~ s{^\./}{};
+        my $t = length($name) ? "$name/$name1" : $name1;
+        if($size eq '-' and ($name1 =~ m{/$})) {
           ## we must be really sure it is a directory, when we come here.
           ## otherwise, we'll retrieve the contents of a file!
           sleep($recursion_delay) if $recursion_delay;




_______________________________________________
mirrorbrain-commits mailing list
Archive: http://mirrorbrain.org/archive/mirrorbrain-commits/

Note: To remove yourself from this list, send a mail with the content
 	unsubscribe
to the address mirrorbrain-commits-request_at_mirrorbrain.org
Received on Sun Nov 14 2010 - 12:36:18 GMT

This archive was generated by hypermail 2.3.0 : Sun Nov 14 2010 - 12:47:22 GMT