├── LICENSE
├── README.md
└── src
    ├── csscrawl
    ├── mixcrawl
    └── rexcrawl
/LICENSE:
--------------------------------------------------------------------------------
The MIT License (MIT)

Copyright (c) 2016

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# crawlpl
compact crawling tools written in perl

---
### NAME
rexcrawl -- crawler with regex patterns for content and crawl-urls

csscrawl -- crawler with css selectors for content and crawl-urls (broken)

mixcrawl -- crawler with css selectors for content and regex for crawl-urls

### SYNOPSIS

`rexcrawl url extract-pattern [crawl-pattern]`

`csscrawl url extract-selector [crawl-selector]`

`mixcrawl url extract-selector [crawl-pattern]`

### DESCRIPTION

Each tool starts at the given url and prints whatever the extract pattern or
selector matches on every page it fetches. If the optional crawl pattern or
selector is given, every url it matches is queued and crawled the same way;
no url is fetched twice. Requests are paced by the REQUEST_PAUSE environment
variable (seconds between requests, default 1) and time out after
REQUEST_TIMEOUT seconds (default 15). Set HTTP_PROXY to send all requests
through an HTTP proxy.

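The fetch-and-extract loop shared by all three tools looks roughly like the
sketch below (simplified from src/rexcrawl; the custom user-agent string,
proxy setup and error reporting are left out):

```
#!/usr/bin/perl
use strict;
use warnings;
use LWP::UserAgent;

die "usage: $0 url extract-pattern [crawl-pattern]\n" if @ARGV < 2;
my ($start, $extract, $crawl) = @ARGV;

my $ua = LWP::UserAgent->new(timeout => 15);

my @urls = ($start);
my %visited;

# @urls doubles as the work queue: urls pushed below are crawled later
for my $url (@urls) {
    next if $visited{$url}++;

    my $res = $ua->get($url);
    next unless $res->is_success;
    my $content = $res->decoded_content;

    # print every match of the extract pattern, capture groups tab-separated
    while ($content =~ m/$extract/g) {
        my @fields = $& =~ m/$extract/;
        print join("\t", @fields), "\n";
    }

    # queue every url matched by the optional crawl pattern
    if ($crawl) {
        push @urls, $& while $content =~ m/$crawl/g;
    }

    sleep 1;
}
```
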
### EXAMPLES

```
rexcrawl ---

* crawl search result headers
rexcrawl http://www.google.com/search?q=test ''

* crawl with an HTTP proxy
HTTP_PROXY=http://127.0.0.1:8080 rexcrawl http://www.google.com/search?q=test ''

* crawl search result headers and descend to subpages
rexcrawl http://www.google.com/search?q=test '' 'http:\/\/.*start[^"]+'

csscrawl --- (broken, no examples yet)

mixcrawl ---

* crawl search result headers
mixcrawl http://www.google.com/search?q=test 'h3.r'

* crawl with an HTTP proxy
HTTP_PROXY=http://127.0.0.1:8080 mixcrawl http://www.google.com/search?q=test 'h3.r'

* crawl search result headers and descend to subpages
mixcrawl http://www.google.com/search?q=test 'h3.r' 'http:\/\/.*start[^"]+'
```

--------------------------------------------------------------------------------
/src/csscrawl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
#
# cpan install LWP::Simple
# cpan install Mojo::DOM

use strict;
use warnings;

use LWP::Simple;
use LWP::UserAgent;
use Mojo::DOM;
use HTTP::Request;
use HTTP::Response;

my @urls;
my %visited;
my $browser;

my $SLEEP = $ENV{REQUEST_PAUSE} // 1;
my $TIMEOUT = $ENV{REQUEST_TIMEOUT} // 15;

my $UAMOZ = 'Mozilla/5.0 (Windows NT 6.1; WOW64)';
my $UAENG = 'AppleWebKit/537.36 (KHTML, like Gecko)';
my $UATAG = 'Chrome/46.0.2490.86 Safari/537.36';

$browser = LWP::UserAgent->new(agent => join(' ', $UAMOZ, $UAENG, $UATAG));
$browser->proxy( ['http'], $ENV{HTTP_PROXY} ) if exists $ENV{HTTP_PROXY};
$browser->timeout($TIMEOUT);

die("usage: csscrawl url extract-selector [crawl-selector]") if(scalar @ARGV < 2);
my ($url, $extract, $crawl) = @ARGV;
push(@urls, $url);

foreach my $url (@urls) {

    next if exists $visited{$url};

    $visited{$url} = 1;

    my $request = HTTP::Request->new(GET => $url);
    my $response = $browser->request($request);

    if ($response->is_error()) {
        printf STDERR "csscrawl: %s for %s\n", $response->status_line, $url;
        next;
    }

    my $dom = Mojo::DOM->new($response->content());

    # queue crawl urls: for an attribute selector (ending in "]") the node text
    # is expected to be a url, otherwise the node markup is pushed as-is
    if ($crawl) {
        for my $node ($dom->find($crawl)->each) {
            if ($crawl =~ /\]$/) {
                push @urls, $node->text;
            } else {
                push @urls, $node->to_string;
            }
        }
    }

    # print extracted content: node text for attribute selectors, markup otherwise
    for my $node ($dom->find($extract)->each) {
        if ($extract =~ /\]$/) {
            print $node->text . "\n";
        } else {
            print $node->to_string . "\n";
        }
    }

    sleep $SLEEP;
}

--------------------------------------------------------------------------------
/src/mixcrawl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
#
# cpan install LWP::Simple
# cpan install Mojo::DOM

use strict;
use warnings;

use LWP::Simple;
use LWP::UserAgent;
use Mojo::DOM;
use HTTP::Request;
use HTTP::Response;

my @urls;
my %visited;
my $browser;

my $SLEEP = $ENV{REQUEST_PAUSE} // 1;
my $TIMEOUT = $ENV{REQUEST_TIMEOUT} // 15;

die("usage: mixcrawl url extract-selector [crawl-regex]") if(scalar @ARGV < 2);
# REQUEST_INTERVAL, if set, overrides the default request pause
$SLEEP = $ENV{REQUEST_INTERVAL} if exists $ENV{REQUEST_INTERVAL};

my $UAMOZ = 'Mozilla/5.0 (Windows NT 6.1; WOW64)';
my $UAENG = 'AppleWebKit/537.36 (KHTML, like Gecko)';
my $UATAG = 'Chrome/46.0.2490.86 Safari/537.36';

$browser = LWP::UserAgent->new(agent => join(' ', $UAMOZ, $UAENG, $UATAG));
$browser->proxy( ['http'], $ENV{HTTP_PROXY} ) if exists $ENV{HTTP_PROXY};
$browser->timeout($TIMEOUT);

my ($url, $extract, $crawl) = @ARGV;
push(@urls, $url);

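# crawl urls come from regex matches against the raw html, while extraction
# runs a css selector over the parsed dom (hence the "mix")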
foreach my $url (@urls) {
    next if $url eq "";
    next if exists $visited{$url};

    $visited{$url} = 1;

    my $request = HTTP::Request->new(GET => $url);
    my $response = $browser->request($request);

    if ($response->is_error()) {
        printf STDERR "mixcrawl: %s for %s\n", $response->status_line, $url;
        next;
    }

    my $content = $response->content();
    my $dom = Mojo::DOM->new($content);

    if ($crawl) {
        push @urls, $& while ($content =~ m/$crawl/g);
    }

    for my $node ($dom->find($extract)->each) {
        print $node->to_string . "\n\n";
    }

    sleep $SLEEP;
}
--------------------------------------------------------------------------------
/src/rexcrawl:
--------------------------------------------------------------------------------
#!/usr/bin/perl
#
# cpan install LWP::Simple
#

use strict;
use warnings;

use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;

my @urls;
my %visited;
my $browser;

my $SLEEP = $ENV{REQUEST_PAUSE} // 1;
my $TIMEOUT = $ENV{REQUEST_TIMEOUT} // 15;

my $UAMOZ = 'Mozilla/5.0 (Windows NT 6.1; WOW64)';
my $UAENG = 'AppleWebKit/537.36 (KHTML, like Gecko)';
my $UATAG = 'Chrome/46.0.2490.86 Safari/537.36';

$browser = LWP::UserAgent->new(agent => join(' ', $UAMOZ, $UAENG, $UATAG));
$browser->proxy( ['http'], $ENV{HTTP_PROXY} ) if exists $ENV{HTTP_PROXY};
$browser->timeout($TIMEOUT);

die("usage: rexcrawl url extract-pattern [crawl-pattern]") if(scalar @ARGV < 2);
my ($url, $extract, $crawl) = @ARGV;
push(@urls, $url);

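# @urls doubles as the crawl queue: urls pushed onto it further down are
# fetched on a later iteration, %visited keeps pages from being fetched twice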
foreach my $url (@urls) {

    next if exists $visited{$url};

    $visited{$url} = 1;

    my $request = HTTP::Request->new(GET => $url);
    my $response = $browser->request($request);

    if ($response->is_error()) {
        printf STDERR "rexcrawl: %s for %s\n", $response->status_line, $url;
        next;
    }

    my $content = $response->content();

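    # print every match of the extract pattern; capture groups are printed
    # tab-separated, a pattern without groups prints a bare 1 per match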
    while ($content =~ m/$extract/g) {
        my @matches = $& =~ m/$extract/;
        print join("\t", @matches) . "\n";
    }

    # queue every url matched by the optional crawl pattern
    if ($crawl) {
        push @urls, $& while ($content =~ m/$crawl/g);
    }

    sleep $SLEEP;
}

--------------------------------------------------------------------------------