├── .gitignore
├── .perltidyrc
├── Dockerfile
├── LICENSE
├── README.pod
├── bin
    ├── docker-build
    ├── get-links-from-s3
    ├── perlybot
    ├── publish-links-to-s3
    └── run-perlybot-nonstop
├── config
    ├── default.yml
    └── prod.yml
├── cpanfile
└── lib
    └── Perly
        ├── Bot.pm
        └── Bot
            ├── Config.pm
            ├── Feed.pm
            ├── Media
                └── JSON.pm
            ├── Post.pm
            └── UserAgent.pm


/.gitignore:
--------------------------------------------------------------------------------
 1 | MYMETA*
 2 | Makefile
 3 | blib
 4 | pm_to_blib
 5 | Makefile.old
 6 | *.swp
 7 | *.old
 8 | *.log
 9 | cache
10 | t/cache
11 | credentials
12 | links.json
13 | bin/image-to-ecr
14 | feeds*yml
15 | 


--------------------------------------------------------------------------------
/.perltidyrc:
--------------------------------------------------------------------------------
1 | -b
2 | -bext='/'
3 | -i=2
4 | -nbl
5 | -sot
6 | -sct
7 | -nolq
8 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM alpine:3.11
 2 | 
 3 | LABEL version=20200623
 4 | 
 5 | WORKDIR /perly-bot
 6 | 
 7 | COPY perly-bot.tar.gz .
 8 | 
 9 | RUN tar xvzf perly-bot.tar.gz && \
10 |   apk update && apk add --no-cache \
11 |     gcc \
12 |     g++ \
13 |     make \
14 |     libressl-dev \
15 |     zlib-dev \
16 |     expat-dev \
17 |     curl \
18 |     perl \
19 |     perl-io-socket-ssl \
20 |     perl-dev \
21 |     shared-mime-info \
22 |     wget && \
23 |   curl -L https://cpanmin.us | perl - App::cpanminus && \
24 |     cpanm --notest --installdeps . -M https://cpan.metacpan.org && \
25 |   apk del \
26 |     curl \
27 |     gcc \
28 |     g++ \
29 |     expat-dev \
30 |     make \
31 |     perl-dev \
32 |     wget && \
33 |   rm -rf /root/.cpanm/* /usr/local/share/man/*
34 | 
35 | ENV PERLYBOT_PROD=1 AWS_CONFIG_FILE=./credentials AWS_DEFAULT_PROFILE=perly-bot
36 | 
37 | CMD ["bin/run-perlybot-nonstop"]
38 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | Copyright (c) 2015, David Farrell
 2 | All rights reserved.
 3 | 
 4 | Redistribution and use in source and binary forms, with or without
 5 | modification, are permitted provided that the following conditions are met:
 6 | 
 7 | * Redistributions of source code must retain the above copyright notice, this
 8 |   list of conditions and the following disclaimer.
 9 | 
10 | * Redistributions in binary form must reproduce the above copyright notice,
11 |   this list of conditions and the following disclaimer in the documentation
12 |   and/or other materials provided with the distribution.
13 | 
14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 | 
25 | 


--------------------------------------------------------------------------------
/README.pod:
--------------------------------------------------------------------------------
 1 | =encoding utf8
 2 | 
 3 | =head1 Perly-Bot
 4 | 
 5 | A social media broadcaster written in Perl. See C<bin/perlybot> for the command
 6 | line program.
 7 | 
 8 | =head2 Authors
 9 | 
10 | © David Farrell, brian d foy 2015
11 | 
12 | =head2 License
13 | 
14 | FreeBSD, see LICENSE
15 | 
16 | =cut
17 | 
18 | 


--------------------------------------------------------------------------------
/bin/docker-build:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -e
3 | set -x
4 | tar cvzf perly-bot.tar.gz bin lib config cpanfile credentials
5 | docker build -t perly-bot/app .
6 | rm perly-bot.tar.gz
7 | 


--------------------------------------------------------------------------------
/bin/get-links-from-s3:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | use strict;
 3 | use warnings;
 4 | use Getopt::Long 'GetOptions';
 5 | use File::MimeInfo::Magic 'mimetype';
 6 | use Path::Tiny 'path';
 7 | use Paws;
 8 | 
 9 | GetOptions(
10 |   'out-dir=s' =>\(my $out_dir = '.'),
11 |   'bucket=s'  => \ my $bucket,
12 |   'region=s'  => \(my $region = 'us-east-1'),
13 | ) or die 'unrecognized options: ' . join ' ', @ARGV;
14 | 
15 | my $s3 = Paws->service('S3', region => $region);
16 | 
17 | for my $file (@ARGV) {
18 |   my $res = $s3->GetObject(
19 |     Bucket => $bucket,
20 |     Key    => $file,
21 |   );
22 |   my $pt = path(join '/', $out_dir, $file);
23 |   $pt->spew($res->Body);
24 | }
25 | 


--------------------------------------------------------------------------------
/bin/perlybot:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | use lib 'lib';
 3 | use Perly::Bot;
 4 | use Perly::Bot::Config;
 5 | 
 6 | my $prod = $ENV{PERLYBOT_PROD};
 7 | 
 8 | Perly::Bot::Config->instance(tier => $prod ? 'prod' : 'default');
 9 | Perly::Bot->new->run;
10 | 


--------------------------------------------------------------------------------
/bin/publish-links-to-s3:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env perl
 2 | use strict;
 3 | use warnings;
 4 | use Getopt::Long 'GetOptions';
 5 | use File::MimeInfo::Magic 'mimetype';
 6 | use Path::Tiny 'path';
 7 | use Paws;
 8 | 
 9 | GetOptions(
10 |   'bucket=s' => \ my $bucket,
11 |   'prefix=s' => \ my $prefix,
12 |   'region=s' => \(my $region = 'us-east-1'),
13 |   'acl=s'    => \(my $acl    = 'public-read'),
14 | ) or die 'unrecognized options: ' . join ' ', @ARGV;
15 | 
16 | my $s3 = Paws->service('S3', region => $region);
17 | 
18 | for my $file (@ARGV) {
19 |   my $pt  = path($file);
20 |   my $key = $prefix ? join('/',$prefix,$pt->basename) : $pt->basename;
21 |   $s3->PutObject(
22 |     ContentType => mimetype($file),
23 |     Bucket      => $bucket,
24 |     Body        => $pt->slurp,
25 |     Key         => $key,
26 |     ACL         => $acl,
27 |   );
28 |   print "uploaded $file to $key\n";
29 | }
30 | 
31 | 


--------------------------------------------------------------------------------
/bin/run-perlybot-nonstop:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | while :
3 | do
4 |   bin/get-links-from-s3 --bucket perly-bot.org --out-dir /tmp links.json feeds.yml 2>&1 && \
5 |   bin/perlybot 2>&1 && \
6 |   bin/publish-links-to-s3 --bucket perly-bot.org /tmp/links.json 2>&1
7 |   sleep 3600
8 | done
9 | 


--------------------------------------------------------------------------------
/config/default.yml:
--------------------------------------------------------------------------------
 1 | agent_string: 'Perly_Bot/v2'
 2 | should_emit: {
 3 |   age_threshold_secs: 100000000
 4 | }
 5 | media: {
 6 |   'Perly::Bot::Media::JSON': {
 7 |     link_limit: 100,
 8 |     filepath: /tmp/links.json
 9 |   }
10 | }
11 | feeds: {
12 |   filepath: feeds-dev.yml
13 | }
14 | 


--------------------------------------------------------------------------------
/config/prod.yml:
--------------------------------------------------------------------------------
 1 | agent_string: 'Perly_Bot/v2'
 2 | should_emit: {
 3 |   age_threshold_secs: 86400 # one day
 4 | }
 5 | media: {
 6 |   'Perly::Bot::Media::JSON': {
 7 |     link_limit: 100,
 8 |     filepath: /tmp/links.json
 9 |   }
10 | }
11 | feeds: {
12 |   filepath: /tmp/feeds.yml
13 | }
14 | 


--------------------------------------------------------------------------------
/cpanfile:
--------------------------------------------------------------------------------
 1 | requires "Class::Accessor", "0";
 2 | requires "File::MimeInfo::Magic", "0";
 3 | requires "Getopt::Long", "0";
 4 | requires "HTML::Entities", "0";
 5 | requires "List::Util", "0";
 6 | requires "namespace::autoclean", "0";
 7 | requires "Mojolicious", "0";
 8 | requires "Path::Tiny", "0";
 9 | requires "Paws", "0";
10 | requires "Scalar::Util", "0";
11 | requires "Time::Piece", "0";
12 | requires "Time::Seconds", "0";
13 | requires "XML::FeedPP", "0";
14 | requires "YAML::XS", "0";
15 | 


--------------------------------------------------------------------------------
/lib/Perly/Bot.pm:
--------------------------------------------------------------------------------
 1 | package Perly::Bot;
 2 | use strict;
 3 | use warnings;
 4 | use Path::Tiny;
 5 | use Perly::Bot::Config;
 6 | use Perly::Bot::Feed;
 7 | use Perly::Bot::Media::JSON;
 8 | 
 9 | sub new {
10 |   my ($class) = @_;
11 |   return bless {}, $class;
12 | }
13 | 
14 | sub run {
15 |   my $self = shift;
16 |   my $total_emitted = 0;
17 |   my $feeds         = 0;
18 |   my $config        = Perly::Bot::Config->instance;
19 | 
20 |   binmode STDOUT, ':utf8';
21 |   binmode STDERR, ':utf8';
22 | 
23 |   for my $feed_data ( $config->feed_data->@* ) {
24 |     $feeds++;
25 |     my $feed = Perly::Bot::Feed->new($feed_data);
26 |     printf STDERR "Processing feed [%s]\n", $feed->url;
27 |     my $posts = $feed->trawl_blog;
28 |     printf STDERR "Found %d posts in [%s]\n", scalar @$posts, $feed->url;
29 | 
30 |     my $emitted = 0;
31 |     for my $post ( $posts->@* ) {
32 |       my $emit = eval { $post->should_emit };
33 |       if ($@) {
34 |         warn $@;
35 |       }
36 |       next unless !$@ && $emit;
37 |       $emitted += $self->emit($feed, $post);
38 |     }
39 | 
40 |     $total_emitted += $emitted;
41 |     printf STDERR "Emitted [%d] posts for [%s]\n", $emitted, $feed->url;
42 |   }
43 |   printf STDERR "Emitted [%d] posts in [%d] feeds\n", $total_emitted, $feeds;
44 | }
45 | 
46 | sub emit {
47 |   my ($self, $feed, $post) = @_;
48 |   printf "Emitting [%s]\n", $post->title;
49 | 
50 |   my $config  = Perly::Bot::Config->instance;
51 |   my @errors  = ();
52 |   my $emitted = 0;
53 | 
54 |   for my $media_target ( $feed->media_targets->@* ) {
55 |     printf "Media target is [%s]\n", $media_target;
56 |     my $media = $media_target->new($config->media->{$media_target});
57 |     my $res = eval { $media->emit($post) };
58 | 
59 |     if ( $@ || !$res ) {
60 |       my $error = sprintf 'Could not send post! [%s] %s', $post->title, $@;
61 |       warn "$error\n";
62 |       push @errors, $error;
63 |     }
64 |     else {
65 |       $emitted = 1;
66 |     }
67 |   }
68 |   printf "[%d] errors for [%s]\n", scalar @errors, $post->title;
69 |   return $emitted;
70 | }
71 | 
72 | 1;
73 | 


--------------------------------------------------------------------------------
/lib/Perly/Bot/Config.pm:
--------------------------------------------------------------------------------
 1 | package Perly::Bot::Config;
 2 | use strict;
 3 | use warnings;
 4 | use Path::Tiny;
 5 | use YAML::XS;
 6 | 
 7 | my $self;
 8 | sub instance {
 9 |   my ($class, %args) = @_;
10 |   unless ($self) {
11 |     $self = bless { %args }, $class;
12 |     $self->{config} = $self->load_config;
13 |   }
14 |   return $self;
15 | }
16 | 
17 | sub config_filepath {
18 |   my $self = shift;
19 |   return "config/$self->{tier}.yml";
20 | }
21 | 
22 | sub load_config {
23 |   my $file = $self->config_filepath;
24 |   # use canonpath for cross platform support
25 |   return YAML::XS::LoadFile( Path::Tiny->new($file)->canonpath );
26 | }
27 | 
28 | sub feeds {
29 |   my $self = shift;
30 |   return $self->{config}{feeds};
31 | }
32 | 
33 | sub media {
34 |   my $self = shift;
35 |   return $self->{config}{media};
36 | }
37 | 
38 | sub agent_string {
39 |   my $self = shift;
40 |   return $self->{config}{agent_string};
41 | }
42 | 
43 | sub age_threshold_secs {
44 |   my $self = shift;
45 |   return $self->{config}{should_emit}{age_threshold_secs};
46 | }
47 | 
48 | sub feed_data {
49 |   my ($self) = @_;
50 |   my $filepath = $self->feeds->{filepath};
51 |   return YAML::XS::LoadFile( Path::Tiny->new($filepath)->canonpath );
52 | }
53 | 
54 | 1;
55 | 


--------------------------------------------------------------------------------
/lib/Perly/Bot/Feed.pm:
--------------------------------------------------------------------------------
  1 | use v5.22;
  2 | use feature qw(signatures postderef);
  3 | no warnings qw(experimental::signatures experimental::postderef);
  4 | 
  5 | package Perly::Bot::Feed;
  6 | use utf8;
  7 | 
  8 | use Perly::Bot::UserAgent;
  9 | use namespace::autoclean;
 10 | use Scalar::Util qw(weaken);
 11 | use Time::Piece;
 12 | use Time::Seconds;
 13 | use XML::FeedPP;
 14 | 
 15 | use base 'Class::Accessor';
 16 | __PACKAGE__->mk_accessors(
 17 |   qw/url type date_name date_format active
 18 |     proxy media_targets post_class/
 19 | );
 20 | 
 21 | =encoding utf8
 22 | 
 23 | =head1 NAME
 24 | 
 25 | Perly::Bot::Feed - represent a feed
 26 | 
 27 | =head1 SYNOPSIS
 28 | 
 29 | =head1 DESCRIPTION
 30 | 
 31 | =head1 FUNCTIONS
 32 | 
 33 | =head2 get_posts ($xml)
 34 | 
 35 | This method requires an xml string of the blog feed and returns an
 36 | arrayref of L<Perl::Bot::Feed::Blog> objects.
 37 | 
 38 | =cut
 39 | 
 40 | sub type_defaults ($self) {
 41 |   state $type_defaults = {
 42 |     rdf => {
 43 |       date_name   => 'dc:date',
 44 |       date_format => '%Y-%m-%dT%T%z',
 45 |       parser      => 'XML::FeedPP::RDF',
 46 |     },
 47 |     rss => {
 48 |       date_name   => 'pubDate',
 49 |       date_format => '%a, %d %b %Y %T %z',
 50 |       parser      => 'XML::FeedPP::RSS',
 51 |     },
 52 |     atom => {
 53 |       date_name   => 'published',
 54 |       date_format => '%Y-%m-%dT%TZ',
 55 |       parser      => 'XML::FeedPP::Atom',
 56 |     },
 57 |   };
 58 | 
 59 |   return $type_defaults;
 60 | }
 61 | 
 62 | sub defaults_for_type ($self, $type = 'rss') {
 63 |   state $type_defaults = $self->type_defaults;
 64 | 
 65 |   unless (exists $type_defaults->{$type}) {
 66 |     warn "No defaults for media type [$type]!\n";
 67 |     return;
 68 |   }
 69 | 
 70 |   return $self->type_defaults->{$type};
 71 | }
 72 | 
 73 | sub defaults ($class) {
 74 |   state $defaults = {
 75 |     active        => 1,
 76 |     proxy         => 0,
 77 |     post_class    => 'Perly::Bot::Post',
 78 |     media_targets => [
 79 |       'Perly::Bot::Media::JSON',
 80 |     ],
 81 |   };
 82 | 
 83 |   $defaults;
 84 | }
 85 | 
 86 | sub new ($class, $args) {
 87 |   my %feed = ($class->defaults->%*, $args->%*);
 88 |   my $self = bless \%feed, $class;
 89 | 
 90 |   unless (defined $self->{type}) {
 91 |     $self->{type} = 'rss';
 92 |   }
 93 | 
 94 |   while (my ($k, $v) = each $self->defaults_for_type($self->{type})->%*) {
 95 |     next if defined $self->{$k};
 96 |     $self->{$k} = $v;
 97 |   }
 98 | 
 99 |   state $required = [
100 |     qw(url type date_name date_format active media_targets proxy parser)
101 |   ];
102 |   my @missing = grep { !exists $self->{$_} } $required->@*;
103 |   die "Missing fields (@missing) for feed $self->{url}" if @missing;
104 | 
105 |   die "Unallowed content parser $self->{parser}"
106 |     unless $self->parser_allowed($self->{parser});
107 | 
108 |   unless ($self->post_class =~ m/ \A [A-Z0-9_]+ (?: :: [A-Z0-9_]+)+ \z /xi) {
109 |     die "Invalid post class " . $self->post_class . " for " . $self->url;
110 |   }
111 |   else {
112 |     unless (eval "require " . $self->post_class . "; 1") {
113 |       die "Could not load post class " . $self->post_class . ": $@";
114 |     }
115 |   }
116 | 
117 |   $self;
118 | }
119 | 
120 | sub parser_allowed ($self, $parser) {
121 |   return exists $self->_allowed_parsers->{$parser};
122 | }
123 | 
124 | sub _allowed_parsers {
125 |   state $allowed = {
126 |     map { $_ => 1 }
127 |       qw(
128 |       XML::FeedPP::RSS
129 |       XML::FeedPP::RDF
130 |       XML::FeedPP::Atom
131 |      ) };
132 |   $allowed;
133 | }
134 | 
135 | sub is_active ($self) {
136 |   return 0 if (defined $self->{active}   and !$self->{active});
137 |   return 0 if (defined $self->{inactive} and $self->{inactive});
138 |   return 1;
139 | }
140 | 
141 | sub trawl_blog ($self) {
142 |   warn "Trawling " . $self->url . "\n";
143 | 
144 |   my $ua = Perly::Bot::UserAgent->instance;
145 | 
146 |   if (my $response = $ua->get($self->url)) {
147 |     my $content    = $response->text;
148 |     my $blog_posts = $self->extract_posts($content);
149 |     return $blog_posts;
150 |   }
151 |   else {
152 |     warn "Received nothing for feed " . $self->url . "\n";
153 |     return [];
154 |   }
155 | }
156 | 
157 | sub fetch_feed ($self) {
158 |   my $ua       = Perly::Bot::UserAgent->instance;
159 |   my $response = $ua->get($self->url);
160 | 
161 |   if (my $response = $ua->get($self->url)) {
162 |     my $content = $response->text;    # decode
163 |     printf STDERR "Received content length: %s\n", length $content;
164 |     $self->{content} = $content;
165 |     return $content;
166 |   }
167 | 
168 |   return;
169 | }
170 | 
171 | sub extract_posts ($self, $xml) {
172 |   my @posts = ();
173 | 
174 |   my @items =
175 |     eval { $self->{parser}->new($xml, -type => 'string')->get_item() };
176 | 
177 |   if ($@) {
178 |     warn  "Bad XML for " . $self->url . ": $@\n";
179 |     return [];
180 |   }
181 | 
182 |   foreach my $i (@items) {
183 | 
184 |     # extract the post date
185 |     my $datetime_raw   = $i->get($self->date_name);
186 |     my $date_format    = $self->date_format;
187 |     my $datetime_clean = $datetime_raw;
188 | 
189 |     # time::piece does not recognise UTC as a time zone
190 |     $datetime_clean =~ s/UTC/GMT/ if $date_format =~ /\%Z/;
191 | 
192 |     # time::piece requires timezone modifiers to not have a semicolon
193 |     $datetime_clean =~ s/([+\-][0-9][0-9]):([0-9][0-9]$)/$1$2/
194 |       if $date_format =~ /\%z/;
195 | 
196 |     # trim whitespace
197 |     $datetime_clean =~ s/\A\s+|\s+\Z//gm;
198 | 
199 |     # time::piece struggles with milliseconds
200 |     if ($self->date_format =~ /%ms/) {
201 |       $datetime_clean =~ s/\.[0-9][0-9][0-9]//;
202 |       $date_format =~
203 |         s/\%ms//;    # %ms is a Perly bot convention not used by strptime
204 |     }
205 | 
206 |     my $weak_self = $self;
207 |     weaken($weak_self);
208 | 
209 |     my $post = eval {
210 |       my $datetime = Time::Piece->strptime($datetime_clean, $date_format);
211 |       $self->post_class->new({
212 |         description   => $i->description,
213 |         datetime      => $datetime,
214 |         proxy         => $self->proxy,
215 |         title         => $i->title,
216 |         epoch         => $datetime->epoch,
217 |         url           => $i->link,
218 |       });
219 |     };
220 | 
221 |     if ($@) {
222 |       warn "Error creating post object: $@";
223 |     }
224 |     else {
225 |       push @posts, $post;
226 |     }
227 |   }
228 |   return \@posts;
229 | }
230 | 
231 | 1;
232 | 


--------------------------------------------------------------------------------
/lib/Perly/Bot/Media/JSON.pm:
--------------------------------------------------------------------------------
 1 | package Perly::Bot::Media::JSON;
 2 | use autodie;
 3 | use strict;
 4 | use warnings;
 5 | use Mojo::JSON qw/decode_json encode_json/;
 6 | 
 7 | sub new {
 8 |   my ($class, $args) = @_;
 9 |   my @missing = grep { !(exists $args->{$_} && defined $args->{$_}) }
10 |     qw(filepath link_limit);
11 | 
12 |   die "Missing required parameters (@missing) for $class" if @missing;
13 | 
14 |   return bless $args, $class;
15 | }
16 | 
17 | sub emit {
18 |   my ($self, $blog_post) = @_;
19 | 
20 |   my $json = '[]';
21 |   if (-e $self->{filepath}) {
22 |     open my $fh_r, '<', $self->{filepath};
23 |     $json = do { local $/; <$fh_r>; };
24 |     close $fh_r;
25 |   }
26 | 
27 |   my @links = @{ decode_json($json) };
28 | 
29 |   if (grep { $blog_post->root_url eq $_->{url} } @links) {
30 |     printf STDERR "already emitted %s, skipping\n", $blog_post->root_url;
31 |     return 1;
32 |   }
33 | 
34 |   unshift @links, {
35 |     posted => $blog_post->datetime->datetime, #YYYY-MM-DDT00:00:00
36 |     title  => $blog_post->decoded_title,
37 |     url    => $blog_post->root_url,
38 |   };
39 | 
40 |   # don't slice larger than our data or link limit
41 |   my $limit = $self->{link_limit} > @links
42 |     ? @links - 1
43 |     : $self->{link_limit} - 1;
44 |   @links = @links[ 0 .. $limit ];
45 | 
46 |   open my $fh_w, '>', $self->{filepath};
47 |   print $fh_w encode_json(\@links), "\n";
48 | 
49 |   return 1;
50 | }
51 | 
52 | 1;
53 | 


--------------------------------------------------------------------------------
/lib/Perly/Bot/Post.pm:
--------------------------------------------------------------------------------
  1 | use v5.22;
  2 | use feature qw(signatures postderef);
  3 | no warnings qw(experimental::signatures experimental::postderef);
  4 | 
  5 | package Perly::Bot::Post;
  6 | 
  7 | use Perly::Bot::UserAgent;
  8 | use Perly::Bot::Config;
  9 | use HTML::Entities;
 10 | use List::Util qw(sum0 any);
 11 | use Time::Piece;
 12 | 
 13 | use base 'Class::Accessor';
 14 | __PACKAGE__->mk_accessors(
 15 |   qw/url title datetime description domain epoch content_regex/);
 16 | 
 17 | =encoding utf8
 18 | 
 19 | =head1 NAME
 20 | 
 21 | Perly::Bot::Post - process a social media post
 22 | 
 23 | =head1 FUNCTIONS
 24 | 
 25 | =head2 clean_url
 26 | 
 27 | Removes the query component of the url. This is to reduce the risk of posting duplicate urls with different query parameters.
 28 | 
 29 | =cut
 30 | 
 31 | sub clean_url ($self, $url = undef) {
 32 |   my $uri       = Mojo::URL->new(($url || $self->url));
 33 |   my $clean_url = $uri->scheme . '://' . $uri->host . $uri->path;
 34 |   printf STDERR "Cleaned URL to [%s]\n", $clean_url;
 35 |   return $clean_url;
 36 | }
 37 | 
 38 | sub domain ($self) { $self->{domain} //= Mojo::URL->new($self->root_url)->host }
 39 | 
 40 | 
 41 | =head2 root_url
 42 | 
 43 | Returns the clean url, it will follow the url and return the ultimate location the URL redirects to. Sets the post's raw content.
 44 | 
 45 | =cut
 46 | 
 47 | sub root_url ($self) {
 48 |   # if we've already retrieved the root url, don't pull it again
 49 |   return $self->{_root_url} if $self->{_root_url};
 50 | 
 51 |   my ($request, $response) = Perly::Bot::UserAgent->instance->get($self->url);
 52 |   if ($response)
 53 |   {
 54 |     my $url = $request->url->to_abs();
 55 |     printf STDERR "URL is [%s]\n", $url;
 56 | 
 57 |     # set the post content
 58 |     $self->{raw_content} = $response->body;
 59 | 
 60 |     $self->{_root_url} = $self->clean_url($url);
 61 |     return $self->{_root_url};
 62 |   }
 63 | }
 64 | 
 65 | =head2 raw_content
 66 | 
 67 | Returns the raw HTML of the post
 68 | 
 69 | =cut
 70 | 
 71 | sub raw_content ($self) {
 72 |   return $self->{raw_content} if exists $self->{raw_content};
 73 |   $self->root_url(); # fetch the article and set the content
 74 |   return $self->{raw_content} or die 'root_url() did not set the post content';
 75 | }
 76 | 
 77 | =head2 extract_body_text
 78 | 
 79 | Extracts text from the raw body HTML
 80 | 
 81 | =cut
 82 | 
 83 | sub extract_body_text ($self, $content = $self->raw_content){
 84 |   my $regex = $self->get_extraction_regex;
 85 |   my $paragraphs = join "\n", $content =~ /$regex/g;
 86 |   die sprintf 'failed to extract text from [%s]', $self->domain unless $paragraphs;
 87 |   $paragraphs =~ s/<\/?.+?>//g;
 88 |   return decode_entities($paragraphs);
 89 | }
 90 | 
 91 | sub get_extraction_regex ($self, $domain = $self->domain) {
 92 |   state $domain_regexes = {
 93 |     'blogs.perl.org'                  => qr/<div class="entry-body">(.+?)<!-- .entry-body -->/si,
 94 |     'blog.plover.com'                 => qr/class="mainsection"(.+?)<\/table>/si,
 95 |     'rjbs.manxome.org'                => qr/<div class='body markup:md'>(.+?)<div id='footer'>/si,
 96 |     'perltricks.com'                  => qr/<article>(.+?)<\/article>/si,
 97 |     'blog.geekuni.com'                => qr/<div class='post-header'>(.+?)<div class='post-footer'>/si,
 98 |     'www.learning-perl.com'           => qr/<div class="entry">(.+?)<!-- END entry -->/si,
 99 |     'www.masteringperl.org'           => qr/<div class="entry">(.+?)<!-- END entry -->/si,
100 |     'www.intermediateperl.com'        => qr/<div class="entry">(.+?)<!-- END entry -->/si,
101 |     'www.effectiveperlprogramming.com'=> qr/<div class="entry">(.+?)<!-- END entry -->/si,
102 |     'techblog.babyl.ca'               => qr/<div class="blog_entry">(.+?)<div id="disqus_thread">/si,
103 |     'www.dagolden.com'                => qr/<div class="entry-content">(.+?)<!-- .entry-content -->/si,
104 |     'p6weekly.wordpress.com'          => qr/<div class="entry-content">(.+?)<!-- .entry-content -->/si,
105 |     '6guts.wordpress.com'             => qr/<div class="entry-content">(.+?)<!-- .entry-content -->/si,
106 |     'perlhacks.com'                   => qr/<div class="entry-content">(.+?)<!-- .entry-content -->/si,
107 |     'blog.urth.com'                   => qr/<div class="entry-content(.+?)<!-- .entry-content -->/si,
108 |     'perlmaven.com'                   => qr/<div role="main" id="main">(.+?)<div class="prev-next">/si,
109 |     'leonerds-code.blogspot.com'      => qr/articleBody(.+?)<div style='clear: both;'>/si,
110 |     'default'                         => qr/<p>(.+?)<\/p>/si,
111 |   };
112 |   my $key = exists $domain_regexes->{$domain} ? $domain : 'default';
113 |   $domain_regexes->{$key};
114 | }
115 | 
116 | 
117 | sub body ($self, $content = undef) {
118 |   $self->{body} //= $self->extract_body_text($content)
119 | }
120 | 
121 | =head2 decoded_title
122 | 
123 | Returns the blog post title decoded from html using L<HTML::Entities>. This is required because some titles have HTML encoded characters in them.
124 | 
125 | =cut
126 | 
127 | sub decoded_title ($self) { decode_entities($self->title) }
128 | 
129 | =head2 should_emit
130 | 
131 | The logic to decide if a blog post should be emitted or not. This is:
132 | 
133 | - if the post is recent
134 | - not too new to exceed the delay (to allow authors to post their own links)
135 | - it looks Perl-related and is not already posted
136 | 
137 | Feel free to subclass and override this logic with your own needs for a particular
138 | post type!
139 | 
140 | =cut
141 | 
142 | sub _content_exclusion_methods ($self) {
143 |   qw(
144 |     has_short_title
145 |     has_no_perlybot_tag
146 |     no_byterock
147 |  );
148 | }
149 | 
150 | sub _content_metric_methods ($self) {
151 |   qw(body_looks_perly
152 |      body_word_count
153 |  );
154 | }
155 | 
156 | sub too_old ($post) {
157 |   my $elapsed = time - $post->epoch;
158 |   return $elapsed > $post->age_threshold_secs;
159 | }
160 | 
161 | sub threshold ($self) { 2 }
162 | 
163 | sub should_emit ($post) {
164 |   my $too_old = $post->too_old || 0;
165 |   my $exclusion = sum0(map { $post->$_() || 0 } $post->_content_exclusion_methods);
166 |   my $metric = sum0(map { !$post->$_($post->raw_content) || 0 }
167 |     $post->_content_metric_methods);
168 | 
169 |   my $emit = !($too_old + $exclusion + $metric);
170 |   printf STDERR "%s failed emit check: too old:%s, exclude:%s, metric:%s\n",
171 |     $post->title, $too_old, $exclusion, $metric unless $emit;
172 | 
173 |   return $emit;
174 | }
175 | 
176 | =head2 age_threshold_secs
177 | 
178 | Returns the configured age_threshold_secs value. You can override this
179 | to decide a value based on anything you like.
180 | 
181 | =cut
182 | 
183 | sub age_threshold_secs { Perly::Bot::Config->instance->age_threshold_secs }
184 | 
185 | =head2 has_short_title
186 | 
187 | Returns true if the post has a title shorter than 6 characters.
188 | 
189 | =cut
190 | 
191 | sub has_short_title ($post) { length ($post->title || '') < 6 }
192 | 
193 | =head2 looks_perly(POST)
194 | 
195 | Returns true if the post looks like it's about Perl. Since it's a method
196 | you can override this in specialized post types.
197 | 
198 | =cut
199 | 
200 | sub looks_perly ($post, $text) {
201 |   state $looks_perly = qr/\b(?:perl|cpan)\b/i;
202 | 
203 |   $text =~ $looks_perly;
204 | }
205 | 
206 | sub body_looks_perly ($self, $content = undef) { $self->looks_perly($self->body($content)) }
207 | 
208 | sub has_no_perlybot_tag ($self) { $self->raw_content =~ /no-perly-bot/i }
209 | 
210 | # this user writes a lot of posts we don't want to share
211 | sub no_byterock ($self) { $self->raw_content =~ /byterock/i }
212 | 
213 | sub word_count ($self, $content) {
214 |   my @words = split /\s+/, $content;
215 |   return scalar @words;
216 | }
217 | sub body_word_count ($self, $content = undef) { $self->word_count($self->body($content)) > 150 }
218 | 
219 | sub clone ($self) {
220 |   state $storable = require Storable;
221 |   my $clone = Storable::dclone($self);
222 | }
223 | 
224 | 1;
225 | 


--------------------------------------------------------------------------------
/lib/Perly/Bot/UserAgent.pm:
--------------------------------------------------------------------------------
 1 | package Perly::Bot::UserAgent;
 2 | use strict;
 3 | use warnings;
 4 | 
 5 | use parent qw(Mojo::UserAgent);
 6 | 
 7 | my $self;
 8 | sub instance {
 9 |   my ($class, $args) = @_;
10 |   $args //= {};
11 |   return $self if defined $self;
12 |   $self = $class->new($args);
13 | }
14 | 
15 | sub new {
16 |   my ($class, $args) = @_;
17 |   $args //= {};
18 | 
19 |   my $config = Perly::Bot::Config->instance;
20 |   $args->{name} //= $config->agent_string;
21 | 
22 |   my $ua = Mojo::UserAgent->new;
23 |   $ua->request_timeout(30)->connect_timeout(30)->max_redirects(3)
24 |     ->transactor->name('Perly_Bot');
25 | 
26 |   my $obj = bless $ua, $class;
27 | 
28 |   # tolerate invalid certs and slow websites
29 |   $obj->insecure(1);
30 |   $obj->connect_timeout(5);
31 |   $obj->request_timeout(5);
32 | }
33 | 
34 | sub get {
35 |   my ($self, $url) = @_;
36 |   my $tx = $self->SUPER::get($url);
37 |   my $res = $tx->result;
38 |   unless ($res->is_success) {
39 |     warn sprintf
40 |         "Could not fetch [%s] Got a [%d]\n------\n%s\n------\n%s\n------\n",
41 |       $url, $res->code, $tx->req->to_string, $res->to_string;
42 |     return;
43 |   }
44 | 
45 |   if (wantarray) {
46 |     ($tx->req, $res, $tx);
47 |   }
48 |   else {
49 |     $tx->res;
50 |   }
51 | }
52 | 
53 | 1;
54 | 


--------------------------------------------------------------------------------