Perl爬虫代码 | 易学教程

目前在做Perl页面爬虫的模块，发现一些代码，做个详细的分析，把好的引用一下给自己用用。
  1 #!/usr/bin/perl -w   2    3 use strict;    4     5 use HTTP::Request;    6 use HTTP::Status;    7 use HTML::LinkExtor;    8 use URI::URL;    9 use LWP::UserAgent;   10 #use Digest::MD5  qw(md5_hex);   11    12    13 use Compress::Zlib;   14    15 ####################################################################   16 # Parameters Setting   17 our $StartUrl = "http://xxx";   18 our $bRestrict = 1;   19 our @restrictSite = ('cxxx','context:');   20 our $bContinueBefore = 1;   21    22    23 ####################################################################   24    25    26 print __FILE__,"\n";   27    28 our %img_seen = ();   29 our %url_seen = ();   30 our @url_queue = ();   31 our %url_processed = ();   32    33 our %RobotDisallow = ();   34 our %RobotAllow = ();   35 our %site_seen = ();   36    37    38 if($bContinueBefore){   39     &LoadBefore();   40 }else{   41     $url_seen{$StartUrl} = 1;   42     push @url_queue, $StartUrl;   43 }   44    45 our $pageNum = 0;   46 our $BucketNum = 0;   47    48 &OpenOutFile();   49    50 open(URLHASH,">>urlhash.txt") or die;   51 open(URLPROCESSED,">>urlprocessed.txt") or die;   52 open(URLREDIRECT,">>urlredirect.txt") or die;   53 open(PAGELIST,">>pagelist.txt") or die;   54 open(IMGLIST,">>imglist.txt") or die;   55    56    57 $| = 1, select $_ for select URLHASH;   58 $| = 1, select $_ for select URLPROCESSED;   59 $| = 1, select $_ for select URLREDIRECT;   60 $| = 1, select $_ for select PAGELIST;   61 $| = 1, select $_ for select IMGLIST;   62    63 our $urlhash_log = *URLHASH;   64 our $urlprocessed_log = *URLPROCESSED;   65 our $urlredirect_log = *URLREDIRECT;   66 our $pagelist_log = *PAGELIST;   67 our $imglist_log = *IMGLIST;   68    69    70 our $UA =  new LWP::UserAgent(keep_alive =>  1,   71                               timeout    =>  60,   72                               );   73 $UA->agent('Mozilla/5.0');   74 $UA->proxy(['ftp', 'http', 'wais', 'gopher'],'http://jpproxy:80/');   75    76 our $linkExtor = new HTML::LinkExtor(\&linkCallback);   77 our @tmpLinks = ();   78 our @tmpImgs = ();   79    80 my $url;   81 while ( $url = &next_url() )   82 {   83     print $urlprocessed_log $url,"\n";   84        85     #sleep(1000);   86        87     my $response = &get_url( $url );   88        89     if(!defined $response){   90         next;   91     }   92        93     my $base = $response->base;   94     $base = $base->as_string;   95     #$base =~ tr/A-Z/a-z/;   96        97     if ( $base ne $url )   98     {   99         if(!&ValidUrl($base)){  100             next;  101         }  102           103         print $urlredirect_log $url,"\t",$base,"\n";  104           105         $url_seen{$base} ++;  106         print $urlhash_log $base,"\n";  107           108         if(exists($url_processed{$base})){  109                next;  110         }  111     }  112       113     my $contents = $response->content;  114   115     #my $digest = md5_hex($base);      116       117     &SavePage(\$base,\$contents);  118     print $pagelist_log $base,"\n";  119     $url_processed{$base} ++;  120           121       122     @tmpLinks = ();  123     @tmpImgs = ();  124     $linkExtor->parse($contents);  125       126     foreach (@tmpLinks){  127         $_ = URI::URL->new($_,$base)->abs->as_string;  128         #$_ =~ tr/A-Z/a-z/;  129     }  130       131     foreach (@tmpImgs){  132         $_ = URI::URL->new($_,$base)->abs->as_string;  133         #$_ =~ tr/A-Z/a-z/;  134     }  135       136     #@tmpLinks = map {$_ = URI::URL->new($_,$base)->abs->as_string;} @tmpLinks;  137     #@tmpImgs = map {$_ = URI::URL->new($_,$base)->abs->as_string;} @tmpImgs;  138       139     &RecordLinks();  140     &RecordImgs();  141       142 }  143   144   145   146 sub next_url  147 {  148   149     # We return 'undef' to signify no URLs on the list  150     if (@url_queue == 0 )  151     {  152         return undef;  153     }  154       155     return shift @url_queue;  156 }  157   158 sub get_url  159 {  160     my $url   = shift;  161   162     my $request = new HTTP::Request( 'HEAD', $url );  163     return undef unless $request;  164   165     my $response = $UA->request( $request );  166     return undef unless defined $response;  167     return undef unless $response->is_success;  168   169     my $content_type = $response->content_type();  170     return undef unless defined $content_type;  171   172     return undef if 'text/html' ne $content_type;  173       174     $request = new HTTP::Request( 'GET', $url );  175     return undef unless $request;  176   177     $response = $UA->request( $request );  178     return undef unless defined $response;  179     return undef unless $response->is_success;  180        181     return $response;  182 }  183   184 sub linkCallback  185 {  186     my($tag, %attr) = @_;  187     if($tag eq 'a' || $tag eq 'frame' || $tag eq 'area'){  188         push(@tmpLinks,values %attr);  189         return;  190     }  191     if($tag eq 'img'){  192         push(@tmpImgs,values %attr);  193         return;  194     }  195     return;  196 }  197   198 sub RecordLinks  199 {  200     foreach (@tmpLinks){  201         if(/\/.+\.(\w{1,4})$/){  202             if($1 =~ /(html|htm|asp|php|jsp)/i){  203   204             }elsif($1 =~ /(jpg|jpeg|bmp|png|gif)/i){  205                 if(/^http/i){  206                       207                     if(exists($img_seen{$_})){  208                         next;  209                     }  210                       211                     $img_seen{$_} = 1;  212                     print $imglist_log $_,"\n";  213                       214                 }  215                 next;  216   217             }else{  218                 next;  219             }  220         }  221               222
标签
select
perl
base