├── LICENSE ├── README.md └── src ├── csscrawl ├── mixcrawl └── rexcrawl /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # crawlpl 2 | compact crawling tools written in perl 3 | 4 | --- 5 | ### NAME 6 | rexcrawl -- crawler with regex patterns for content and crawl-urls 7 | 8 | csscrawl -- crawler with css selector for content and crwal-urls (broken) 9 | 10 | mixcrawl -- crawler with css selector for content and regex for crawl-urls 11 | 12 | ### SYNOPSIS 13 | 14 | `rexcrawl url extract-pattern [crawl-pattern]` 15 | 16 | `csscrawl url extract-selector [crawl-selector]` 17 | 18 | `mixcrawl url extract-selector [crawl-pattern]` 19 | 20 | ### DESCRIPTION 21 | 22 | to be continued 23 | 24 | ### EXAMPLES 25 | 26 | ``` 27 | * crawl searchresult headers 28 | rexcrawl http://www.google.com/search?q=test '' 29 | 30 | * crawl with a http proxy 31 | HTTP_PROXY=127.0.0.1:8080 rexcrawl http://www.google.com/search?q=test '' 32 | 33 | * crawl searchresult headers and descend to subpages 34 | rexcrawl http://www.google.com/search?q=test '' 'http:\/\/.*start[^"]+' 35 | 36 | csscrawl --- 37 | 38 | * crawl searchresult headers 39 | mixcrawl http://www.google.com/search?q=test 'h3.r' 40 | 41 | * crawl with a http proxy 42 | HTTP_PROXY=127.0.0.1:8080 mixcrawl http://www.google.com/search?q=test 'h3.r' 43 | 44 | * crawl searchresult headers and descend to subpages 45 | mixcrawl http://www.google.com/search?q=test 'h3.r' 'http:\/\/.*start[^"]+' 46 | 47 | 48 | 49 | ``` 50 | -------------------------------------------------------------------------------- /src/csscrawl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # 3 | # cpan install LWP::Simple 4 | # cpan install Mojo::DOM 5 | 6 | use strict; 7 | use warnings; 8 | 9 | use LWP::Simple; 10 | use LWP::UserAgent; 11 | use Mojo::DOM; 12 | use HTTP::Request; 13 | use HTTP::Response; 14 | 15 | my @urls; 16 | my %visited; 17 | my $browser; 18 | 19 | my $SLEEP = $ENV{REQUEST_PAUSE} // 1; 20 | my $TIMEOUT = $ENV{REQUEST_TIMEOUT} // 15; 21 | 22 | my $UAMOZ = 'Mozilla/5.0 (Windows NT 6.1; WOW64)'; 23 | my $UAENG = 'AppleWebKit/537.36 (KHTML, like Gecko)'; 24 | my $UATAG = 'Chrome/46.0.2490.86 Safari/537.36'; 25 | 26 | $browser = LWP::UserAgent->new(join(' ', $UAMOZ, $UAENG, $UATAG)); 27 | $browser->proxy( ['http'], $ENV{HTTP_PROXY} ) if exists $ENV{HTTP_PROXY}; 28 | $browser->timeout($TIMEOUT); 29 | 30 | die("usage: crawler url extract-selector [crawl-selector]") if(scalar @ARGV < 2); 31 | my ($url, $crawl, $extract) = @ARGV; 32 | push(@urls, $url); 33 | 34 | foreach my $url (@urls) { 35 | 36 | next if exists $visited{$url}; 37 | 38 | $visited{$url} = 1; 39 | 40 | my $request = HTTP::Request->new(GET => $url); 41 | my $response = $browser->request($request); 42 | 43 | if ($response->is_error()) { 44 | printf STDERR "crawler: %s for %s\n", $response->status_line, $url; 45 | next; 46 | } 47 | 48 | 49 | my $dom = Mojo::DOM->new($response->content()); 50 | 51 | for my $node( $dom->find($crawl)->each) { 52 | if($crawl =~ /\]$/) { 53 | push @urls, $node->text; 54 | } else { 55 | push @urls, $node->to_string; 56 | } 57 | 58 | } 59 | 60 | for my $node( $dom->find($extract)->each) { 61 | print $node->text. "\n"; next; 62 | if($crawl =~ /\]$/) { 63 | #print "matches\n"; 64 | print $node->text. "\n"; 65 | } else { 66 | #print "does not match\n"; 67 | print $node->to_string. "\n"; 68 | } 69 | 70 | } 71 | 72 | sleep $SLEEP; 73 | } 74 | 75 | -------------------------------------------------------------------------------- /src/mixcrawl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # 3 | # cpan install LWP::Simple 4 | # cpan install Mojo::DOM 5 | 6 | use strict; 7 | use warnings; 8 | 9 | use LWP::Simple; 10 | use LWP::UserAgent; 11 | use Mojo::DOM; 12 | use HTTP::Request; 13 | use HTTP::Response; 14 | 15 | my @urls; 16 | my %visited; 17 | my $browser; 18 | 19 | my $SLEEP = $ENV{REQUEST_PAUSE} // 1; 20 | my $TIMEOUT = $ENV{REQUEST_TIMEOUT} // 15; 21 | 22 | die("usage: mixcrawler url extract-selector [crawl-regex]") if(scalar @ARGV < 2); 23 | $sleep = $ENV{REQUEST_INTERVAL} ) if exists $ENV{REQUEST_INTERVAL}; 24 | 25 | 26 | my $UAMOZ = 'Mozilla/5.0 (Windows NT 6.1; WOW64)'; 27 | my $UAENG = 'AppleWebKit/537.36 (KHTML, like Gecko)'; 28 | my $UATAG = 'Chrome/46.0.2490.86 Safari/537.36'; 29 | 30 | 31 | $browser = LWP::UserAgent->new(join(' ', $UAMOZ, $UAENG, $UATAG)); 32 | $browser->proxy( ['http'], $ENV{HTTP_PROXY} ) if exists $ENV{HTTP_PROXY}; 33 | $browser->timeout($TIMEOUT); 34 | 35 | 36 | my ($url, $extract, $crawl) = @ARGV; 37 | push(@urls, $url); 38 | 39 | foreach my $url (@urls) { 40 | next if $url eq ""; 41 | next if exists $visited{$url}; 42 | 43 | $visited{$url} = 1; 44 | 45 | my $request = HTTP::Request->new(GET => $url); 46 | my $response = $browser->request($request); 47 | 48 | if ($response->is_error()) { 49 | printf STDERR "crawler: %s for %s\n", $response->status_line,$url; 50 | next; 51 | } 52 | 53 | my $content = $response->content(); 54 | my $dom = Mojo::DOM->new($content); 55 | 56 | if($crawl) { 57 | push @urls, $& while($content =~ m/$crawl/g); 58 | } 59 | 60 | for my $node( $dom->find($extract)->each) { 61 | print $node->to_string. "\n\n"; 62 | 63 | } 64 | 65 | sleep $SLEEP; 66 | } 67 | -------------------------------------------------------------------------------- /src/rexcrawl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | # 3 | # cpan install LWP::Simple 4 | # 5 | 6 | use strict; 7 | use warnings; 8 | 9 | use LWP::Simple; 10 | use LWP::UserAgent; 11 | use HTTP::Request; 12 | use HTTP::Response; 13 | 14 | my @urls; 15 | my %visited; 16 | my $browser; 17 | 18 | my $SLEEP = $ENV{REQUEST_PAUSE} // 1; 19 | my $TIMEOUT = $ENV{REQUEST_TIMEOUT} // 15; 20 | 21 | my $UAMOZ = 'Mozilla/5.0 (Windows NT 6.1; WOW64)'; 22 | my $UAENG = 'AppleWebKit/537.36 (KHTML, like Gecko)'; 23 | my $UATAG = 'Chrome/46.0.2490.86 Safari/537.36'; 24 | 25 | $browser = LWP::UserAgent->new(join(' ', $UAMOZ, $UAENG, $UATAG)); 26 | $browser->proxy( ['http'], $ENV{HTTP_PROXY} ) if exists $ENV{HTTP_PROXY}; 27 | $browser->timeout($TIMEOUT); 28 | 29 | die("usage: rexcrawl url extract-pattern [crawl-pattern]") if(scalar @ARGV < 2); 30 | my ($url, $extract, $crawl) = @ARGV; 31 | push(@urls, $url); 32 | 33 | foreach my $url (@urls) { 34 | 35 | next if exists $visited{$url}; 36 | 37 | $visited{$url} = 1; 38 | 39 | my $request = HTTP::Request->new(GET => $url); 40 | my $response = $browser->request($request); 41 | 42 | if ($response->is_error()) { 43 | printf STDERR "rexcrawl: %s for %s\n", $response->status_line, $url; 44 | next; 45 | } 46 | 47 | my $content = $response->content(); 48 | 49 | while($content =~ m/$extract/g) { 50 | my @matches = $& =~ m/$extract/; 51 | print join("\t",@matches)."\n"; 52 | } 53 | 54 | if($crawl) { 55 | push @urls, $& while($content =~ m/$crawl/g); 56 | } 57 | 58 | sleep $SLEEP; 59 | } 60 | --------------------------------------------------------------------------------