目前在做Perl页面爬虫的模块,发现一些代码,做个详细的分析,把好的引用一下给自己用用。


1 #!/usr/bin/perl -w 2 3 use strict; 4 5 use HTTP::Request; 6 use HTTP::Status; 7 use HTML::LinkExtor; 8 use URI::URL; 9 use LWP::UserAgent; 10 #use Digest::MD5 qw(md5_hex); 11 12 13 use Compress::Zlib; 14 15 #################################################################### 16 # Parameters Setting 17 our $StartUrl = "http://xxx"; 18 our $bRestrict = 1; 19 our @restrictSite = ('cxxx','context:'); 20 our $bContinueBefore = 1; 21 22 23 #################################################################### 24 25 26 print __FILE__,"\n"; 27 28 our %img_seen = (); 29 our %url_seen = (); 30 our @url_queue = (); 31 our %url_processed = (); 32 33 our %RobotDisallow = (); 34 our %RobotAllow = (); 35 our %site_seen = (); 36 37 38 if($bContinueBefore){ 39 &LoadBefore(); 40 }else{ 41 $url_seen{$StartUrl} = 1; 42 push @url_queue, $StartUrl; 43 } 44 45 our $pageNum = 0; 46 our $BucketNum = 0; 47 48 &OpenOutFile(); 49 50 open(URLHASH,">>urlhash.txt") or die; 51 open(URLPROCESSED,">>urlprocessed.txt") or die; 52 open(URLREDIRECT,">>urlredirect.txt") or die; 53 open(PAGELIST,">>pagelist.txt") or die; 54 open(IMGLIST,">>imglist.txt") or die; 55 56 57 $| = 1, select $_ for select URLHASH; 58 $| = 1, select $_ for select URLPROCESSED; 59 $| = 1, select $_ for select URLREDIRECT; 60 $| = 1, select $_ for select PAGELIST; 61 $| = 1, select $_ for select IMGLIST; 62 63 our $urlhash_log = *URLHASH; 64 our $urlprocessed_log = *URLPROCESSED; 65 our $urlredirect_log = *URLREDIRECT; 66 our $pagelist_log = *PAGELIST; 67 our $imglist_log = *IMGLIST; 68 69 70 our $UA = new LWP::UserAgent(keep_alive => 1, 71 timeout => 60, 72 ); 73 $UA->agent('Mozilla/5.0'); 74 $UA->proxy(['ftp', 'http', 'wais', 'gopher'],'http://jpproxy:80/'); 75 76 our $linkExtor = new HTML::LinkExtor(\&linkCallback); 77 our @tmpLinks = (); 78 our @tmpImgs = (); 79 80 my $url; 81 while ( $url = &next_url() ) 82 { 83 print $urlprocessed_log $url,"\n"; 84 85 #sleep(1000); 86 87 my $response = &get_url( $url ); 88 89 if(!defined $response){ 90 next; 91 } 92 93 my $base = $response->base; 94 $base = $base->as_string; 95 #$base =~ tr/A-Z/a-z/; 96 97 if ( $base ne $url ) 98 { 99 if(!&ValidUrl($base)){ 100 next; 101 } 102 103 print $urlredirect_log $url,"\t",$base,"\n"; 104 105 $url_seen{$base} ++; 106 print $urlhash_log $base,"\n"; 107 108 if(exists($url_processed{$base})){ 109 next; 110 } 111 } 112 113 my $contents = $response->content; 114 115 #my $digest = md5_hex($base); 116 117 &SavePage(\$base,\$contents); 118 print $pagelist_log $base,"\n"; 119 $url_processed{$base} ++; 120 121 122 @tmpLinks = (); 123 @tmpImgs = (); 124 $linkExtor->parse($contents); 125 126 foreach (@tmpLinks){ 127 $_ = URI::URL->new($_,$base)->abs->as_string; 128 #$_ =~ tr/A-Z/a-z/; 129 } 130 131 foreach (@tmpImgs){ 132 $_ = URI::URL->new($_,$base)->abs->as_string; 133 #$_ =~ tr/A-Z/a-z/; 134 } 135 136 #@tmpLinks = map {$_ = URI::URL->new($_,$base)->abs->as_string;} @tmpLinks; 137 #@tmpImgs = map {$_ = URI::URL->new($_,$base)->abs->as_string;} @tmpImgs; 138 139 &RecordLinks(); 140 &RecordImgs(); 141 142 } 143 144 145 146 sub next_url 147 { 148 149 # We return 'undef' to signify no URLs on the list 150 if (@url_queue == 0 ) 151 { 152 return undef; 153 } 154 155 return shift @url_queue; 156 } 157 158 sub get_url 159 { 160 my $url = shift; 161 162 my $request = new HTTP::Request( 'HEAD', $url ); 163 return undef unless $request; 164 165 my $response = $UA->request( $request ); 166 return undef unless defined $response; 167 return undef unless $response->is_success; 168 169 my $content_type = $response->content_type(); 170 return undef unless defined $content_type; 171 172 return undef if 'text/html' ne $content_type; 173 174 $request = new HTTP::Request( 'GET', $url ); 175 return undef unless $request; 176 177 $response = $UA->request( $request ); 178 return undef unless defined $response; 179 return undef unless $response->is_success; 180 181 return $response; 182 } 183 184 sub linkCallback 185 { 186 my($tag, %attr) = @_; 187 if($tag eq 'a' || $tag eq 'frame' || $tag eq 'area'){ 188 push(@tmpLinks,values %attr); 189 return; 190 } 191 if($tag eq 'img'){ 192 push(@tmpImgs,values %attr); 193 return; 194 } 195 return; 196 } 197 198 sub RecordLinks 199 { 200 foreach (@tmpLinks){ 201 if(/\/.+\.(\w{1,4})$/){ 202 if($1 =~ /(html|htm|asp|php|jsp)/i){ 203 204 }elsif($1 =~ /(jpg|jpeg|bmp|png|gif)/i){ 205 if(/^http/i){ 206 207 if(exists($img_seen{$_})){ 208 next; 209 } 210 211 $img_seen{$_} = 1; 212 print $imglist_log $_,"\n"; 213 214 } 215 next; 216 217 }else{ 218 next; 219 } 220 } 221 222