├── .gitignore ├── .perltidyrc ├── Dockerfile ├── LICENSE ├── README.pod ├── bin ├── docker-build ├── get-links-from-s3 ├── perlybot ├── publish-links-to-s3 └── run-perlybot-nonstop ├── config ├── default.yml └── prod.yml ├── cpanfile └── lib └── Perly ├── Bot.pm └── Bot ├── Config.pm ├── Feed.pm ├── Media └── JSON.pm ├── Post.pm └── UserAgent.pm /.gitignore: -------------------------------------------------------------------------------- 1 | MYMETA* 2 | Makefile 3 | blib 4 | pm_to_blib 5 | Makefile.old 6 | *.swp 7 | *.old 8 | *.log 9 | cache 10 | t/cache 11 | credentials 12 | links.json 13 | bin/image-to-ecr 14 | feeds*yml 15 | -------------------------------------------------------------------------------- /.perltidyrc: -------------------------------------------------------------------------------- 1 | -b 2 | -bext='/' 3 | -i=2 4 | -nbl 5 | -sot 6 | -sct 7 | -nolq 8 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM alpine:3.11 2 | 3 | LABEL version=20200623 4 | 5 | WORKDIR /perly-bot 6 | 7 | COPY perly-bot.tar.gz . 8 | 9 | RUN tar xvzf perly-bot.tar.gz && \ 10 | apk update && apk add --no-cache \ 11 | gcc \ 12 | g++ \ 13 | make \ 14 | libressl-dev \ 15 | zlib-dev \ 16 | expat-dev \ 17 | curl \ 18 | perl \ 19 | perl-io-socket-ssl \ 20 | perl-dev \ 21 | shared-mime-info \ 22 | wget && \ 23 | curl -L https://cpanmin.us | perl - App::cpanminus && \ 24 | cpanm --notest --installdeps . -M https://cpan.metacpan.org && \ 25 | apk del \ 26 | curl \ 27 | gcc \ 28 | g++ \ 29 | expat-dev \ 30 | make \ 31 | perl-dev \ 32 | wget && \ 33 | rm -rf /root/.cpanm/* /usr/local/share/man/* 34 | 35 | ENV PERLYBOT_PROD=1 AWS_CONFIG_FILE=./credentials AWS_DEFAULT_PROFILE=perly-bot 36 | 37 | CMD ["bin/run-perlybot-nonstop"] 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2015, David Farrell 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, this 8 | list of conditions and the following disclaimer. 9 | 10 | * Redistributions in binary form must reproduce the above copyright notice, 11 | this list of conditions and the following disclaimer in the documentation 12 | and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 15 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 18 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 20 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 21 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 22 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 23 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | 25 | -------------------------------------------------------------------------------- /README.pod: -------------------------------------------------------------------------------- 1 | =encoding utf8 2 | 3 | =head1 Perly-Bot 4 | 5 | A social media broadcaster written in Perl. See C for the command 6 | line program. 7 | 8 | =head2 Authors 9 | 10 | © David Farrell, brian d foy 2015 11 | 12 | =head2 License 13 | 14 | FreeBSD, see LICENSE 15 | 16 | =cut 17 | 18 | -------------------------------------------------------------------------------- /bin/docker-build: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | set -x 4 | tar cvzf perly-bot.tar.gz bin lib config cpanfile credentials 5 | docker build -t perly-bot/app . 6 | rm perly-bot.tar.gz 7 | -------------------------------------------------------------------------------- /bin/get-links-from-s3: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use strict; 3 | use warnings; 4 | use Getopt::Long 'GetOptions'; 5 | use File::MimeInfo::Magic 'mimetype'; 6 | use Path::Tiny 'path'; 7 | use Paws; 8 | 9 | GetOptions( 10 | 'out-dir=s' =>\(my $out_dir = '.'), 11 | 'bucket=s' => \ my $bucket, 12 | 'region=s' => \(my $region = 'us-east-1'), 13 | ) or die 'unrecognized options: ' . join ' ', @ARGV; 14 | 15 | my $s3 = Paws->service('S3', region => $region); 16 | 17 | for my $file (@ARGV) { 18 | my $res = $s3->GetObject( 19 | Bucket => $bucket, 20 | Key => $file, 21 | ); 22 | my $pt = path(join '/', $out_dir, $file); 23 | $pt->spew($res->Body); 24 | } 25 | -------------------------------------------------------------------------------- /bin/perlybot: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use lib 'lib'; 3 | use Perly::Bot; 4 | use Perly::Bot::Config; 5 | 6 | my $prod = $ENV{PERLYBOT_PROD}; 7 | 8 | Perly::Bot::Config->instance(tier => $prod ? 'prod' : 'default'); 9 | Perly::Bot->new->run; 10 | -------------------------------------------------------------------------------- /bin/publish-links-to-s3: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env perl 2 | use strict; 3 | use warnings; 4 | use Getopt::Long 'GetOptions'; 5 | use File::MimeInfo::Magic 'mimetype'; 6 | use Path::Tiny 'path'; 7 | use Paws; 8 | 9 | GetOptions( 10 | 'bucket=s' => \ my $bucket, 11 | 'prefix=s' => \ my $prefix, 12 | 'region=s' => \(my $region = 'us-east-1'), 13 | 'acl=s' => \(my $acl = 'public-read'), 14 | ) or die 'unrecognized options: ' . join ' ', @ARGV; 15 | 16 | my $s3 = Paws->service('S3', region => $region); 17 | 18 | for my $file (@ARGV) { 19 | my $pt = path($file); 20 | my $key = $prefix ? join('/',$prefix,$pt->basename) : $pt->basename; 21 | $s3->PutObject( 22 | ContentType => mimetype($file), 23 | Bucket => $bucket, 24 | Body => $pt->slurp, 25 | Key => $key, 26 | ACL => $acl, 27 | ); 28 | print "uploaded $file to $key\n"; 29 | } 30 | 31 | -------------------------------------------------------------------------------- /bin/run-perlybot-nonstop: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | while : 3 | do 4 | bin/get-links-from-s3 --bucket perly-bot.org --out-dir /tmp links.json feeds.yml 2>&1 && \ 5 | bin/perlybot 2>&1 && \ 6 | bin/publish-links-to-s3 --bucket perly-bot.org /tmp/links.json 2>&1 7 | sleep 3600 8 | done 9 | -------------------------------------------------------------------------------- /config/default.yml: -------------------------------------------------------------------------------- 1 | agent_string: 'Perly_Bot/v2' 2 | should_emit: { 3 | age_threshold_secs: 100000000 4 | } 5 | media: { 6 | 'Perly::Bot::Media::JSON': { 7 | link_limit: 100, 8 | filepath: /tmp/links.json 9 | } 10 | } 11 | feeds: { 12 | filepath: feeds-dev.yml 13 | } 14 | -------------------------------------------------------------------------------- /config/prod.yml: -------------------------------------------------------------------------------- 1 | agent_string: 'Perly_Bot/v2' 2 | should_emit: { 3 | age_threshold_secs: 86400 # one day 4 | } 5 | media: { 6 | 'Perly::Bot::Media::JSON': { 7 | link_limit: 100, 8 | filepath: /tmp/links.json 9 | } 10 | } 11 | feeds: { 12 | filepath: /tmp/feeds.yml 13 | } 14 | -------------------------------------------------------------------------------- /cpanfile: -------------------------------------------------------------------------------- 1 | requires "Class::Accessor", "0"; 2 | requires "File::MimeInfo::Magic", "0"; 3 | requires "Getopt::Long", "0"; 4 | requires "HTML::Entities", "0"; 5 | requires "List::Util", "0"; 6 | requires "namespace::autoclean", "0"; 7 | requires "Mojolicious", "0"; 8 | requires "Path::Tiny", "0"; 9 | requires "Paws", "0"; 10 | requires "Scalar::Util", "0"; 11 | requires "Time::Piece", "0"; 12 | requires "Time::Seconds", "0"; 13 | requires "XML::FeedPP", "0"; 14 | requires "YAML::XS", "0"; 15 | -------------------------------------------------------------------------------- /lib/Perly/Bot.pm: -------------------------------------------------------------------------------- 1 | package Perly::Bot; 2 | use strict; 3 | use warnings; 4 | use Path::Tiny; 5 | use Perly::Bot::Config; 6 | use Perly::Bot::Feed; 7 | use Perly::Bot::Media::JSON; 8 | 9 | sub new { 10 | my ($class) = @_; 11 | return bless {}, $class; 12 | } 13 | 14 | sub run { 15 | my $self = shift; 16 | my $total_emitted = 0; 17 | my $feeds = 0; 18 | my $config = Perly::Bot::Config->instance; 19 | 20 | binmode STDOUT, ':utf8'; 21 | binmode STDERR, ':utf8'; 22 | 23 | for my $feed_data ( $config->feed_data->@* ) { 24 | $feeds++; 25 | my $feed = Perly::Bot::Feed->new($feed_data); 26 | printf STDERR "Processing feed [%s]\n", $feed->url; 27 | my $posts = $feed->trawl_blog; 28 | printf STDERR "Found %d posts in [%s]\n", scalar @$posts, $feed->url; 29 | 30 | my $emitted = 0; 31 | for my $post ( $posts->@* ) { 32 | my $emit = eval { $post->should_emit }; 33 | if ($@) { 34 | warn $@; 35 | } 36 | next unless !$@ && $emit; 37 | $emitted += $self->emit($feed, $post); 38 | } 39 | 40 | $total_emitted += $emitted; 41 | printf STDERR "Emitted [%d] posts for [%s]\n", $emitted, $feed->url; 42 | } 43 | printf STDERR "Emitted [%d] posts in [%d] feeds\n", $total_emitted, $feeds; 44 | } 45 | 46 | sub emit { 47 | my ($self, $feed, $post) = @_; 48 | printf "Emitting [%s]\n", $post->title; 49 | 50 | my $config = Perly::Bot::Config->instance; 51 | my @errors = (); 52 | my $emitted = 0; 53 | 54 | for my $media_target ( $feed->media_targets->@* ) { 55 | printf "Media target is [%s]\n", $media_target; 56 | my $media = $media_target->new($config->media->{$media_target}); 57 | my $res = eval { $media->emit($post) }; 58 | 59 | if ( $@ || !$res ) { 60 | my $error = sprintf 'Could not send post! [%s] %s', $post->title, $@; 61 | warn "$error\n"; 62 | push @errors, $error; 63 | } 64 | else { 65 | $emitted = 1; 66 | } 67 | } 68 | printf "[%d] errors for [%s]\n", scalar @errors, $post->title; 69 | return $emitted; 70 | } 71 | 72 | 1; 73 | -------------------------------------------------------------------------------- /lib/Perly/Bot/Config.pm: -------------------------------------------------------------------------------- 1 | package Perly::Bot::Config; 2 | use strict; 3 | use warnings; 4 | use Path::Tiny; 5 | use YAML::XS; 6 | 7 | my $self; 8 | sub instance { 9 | my ($class, %args) = @_; 10 | unless ($self) { 11 | $self = bless { %args }, $class; 12 | $self->{config} = $self->load_config; 13 | } 14 | return $self; 15 | } 16 | 17 | sub config_filepath { 18 | my $self = shift; 19 | return "config/$self->{tier}.yml"; 20 | } 21 | 22 | sub load_config { 23 | my $file = $self->config_filepath; 24 | # use canonpath for cross platform support 25 | return YAML::XS::LoadFile( Path::Tiny->new($file)->canonpath ); 26 | } 27 | 28 | sub feeds { 29 | my $self = shift; 30 | return $self->{config}{feeds}; 31 | } 32 | 33 | sub media { 34 | my $self = shift; 35 | return $self->{config}{media}; 36 | } 37 | 38 | sub agent_string { 39 | my $self = shift; 40 | return $self->{config}{agent_string}; 41 | } 42 | 43 | sub age_threshold_secs { 44 | my $self = shift; 45 | return $self->{config}{should_emit}{age_threshold_secs}; 46 | } 47 | 48 | sub feed_data { 49 | my ($self) = @_; 50 | my $filepath = $self->feeds->{filepath}; 51 | return YAML::XS::LoadFile( Path::Tiny->new($filepath)->canonpath ); 52 | } 53 | 54 | 1; 55 | -------------------------------------------------------------------------------- /lib/Perly/Bot/Feed.pm: -------------------------------------------------------------------------------- 1 | use v5.22; 2 | use feature qw(signatures postderef); 3 | no warnings qw(experimental::signatures experimental::postderef); 4 | 5 | package Perly::Bot::Feed; 6 | use utf8; 7 | 8 | use Perly::Bot::UserAgent; 9 | use namespace::autoclean; 10 | use Scalar::Util qw(weaken); 11 | use Time::Piece; 12 | use Time::Seconds; 13 | use XML::FeedPP; 14 | 15 | use base 'Class::Accessor'; 16 | __PACKAGE__->mk_accessors( 17 | qw/url type date_name date_format active 18 | proxy media_targets post_class/ 19 | ); 20 | 21 | =encoding utf8 22 | 23 | =head1 NAME 24 | 25 | Perly::Bot::Feed - represent a feed 26 | 27 | =head1 SYNOPSIS 28 | 29 | =head1 DESCRIPTION 30 | 31 | =head1 FUNCTIONS 32 | 33 | =head2 get_posts ($xml) 34 | 35 | This method requires an xml string of the blog feed and returns an 36 | arrayref of L objects. 37 | 38 | =cut 39 | 40 | sub type_defaults ($self) { 41 | state $type_defaults = { 42 | rdf => { 43 | date_name => 'dc:date', 44 | date_format => '%Y-%m-%dT%T%z', 45 | parser => 'XML::FeedPP::RDF', 46 | }, 47 | rss => { 48 | date_name => 'pubDate', 49 | date_format => '%a, %d %b %Y %T %z', 50 | parser => 'XML::FeedPP::RSS', 51 | }, 52 | atom => { 53 | date_name => 'published', 54 | date_format => '%Y-%m-%dT%TZ', 55 | parser => 'XML::FeedPP::Atom', 56 | }, 57 | }; 58 | 59 | return $type_defaults; 60 | } 61 | 62 | sub defaults_for_type ($self, $type = 'rss') { 63 | state $type_defaults = $self->type_defaults; 64 | 65 | unless (exists $type_defaults->{$type}) { 66 | warn "No defaults for media type [$type]!\n"; 67 | return; 68 | } 69 | 70 | return $self->type_defaults->{$type}; 71 | } 72 | 73 | sub defaults ($class) { 74 | state $defaults = { 75 | active => 1, 76 | proxy => 0, 77 | post_class => 'Perly::Bot::Post', 78 | media_targets => [ 79 | 'Perly::Bot::Media::JSON', 80 | ], 81 | }; 82 | 83 | $defaults; 84 | } 85 | 86 | sub new ($class, $args) { 87 | my %feed = ($class->defaults->%*, $args->%*); 88 | my $self = bless \%feed, $class; 89 | 90 | unless (defined $self->{type}) { 91 | $self->{type} = 'rss'; 92 | } 93 | 94 | while (my ($k, $v) = each $self->defaults_for_type($self->{type})->%*) { 95 | next if defined $self->{$k}; 96 | $self->{$k} = $v; 97 | } 98 | 99 | state $required = [ 100 | qw(url type date_name date_format active media_targets proxy parser) 101 | ]; 102 | my @missing = grep { !exists $self->{$_} } $required->@*; 103 | die "Missing fields (@missing) for feed $self->{url}" if @missing; 104 | 105 | die "Unallowed content parser $self->{parser}" 106 | unless $self->parser_allowed($self->{parser}); 107 | 108 | unless ($self->post_class =~ m/ \A [A-Z0-9_]+ (?: :: [A-Z0-9_]+)+ \z /xi) { 109 | die "Invalid post class " . $self->post_class . " for " . $self->url; 110 | } 111 | else { 112 | unless (eval "require " . $self->post_class . "; 1") { 113 | die "Could not load post class " . $self->post_class . ": $@"; 114 | } 115 | } 116 | 117 | $self; 118 | } 119 | 120 | sub parser_allowed ($self, $parser) { 121 | return exists $self->_allowed_parsers->{$parser}; 122 | } 123 | 124 | sub _allowed_parsers { 125 | state $allowed = { 126 | map { $_ => 1 } 127 | qw( 128 | XML::FeedPP::RSS 129 | XML::FeedPP::RDF 130 | XML::FeedPP::Atom 131 | ) }; 132 | $allowed; 133 | } 134 | 135 | sub is_active ($self) { 136 | return 0 if (defined $self->{active} and !$self->{active}); 137 | return 0 if (defined $self->{inactive} and $self->{inactive}); 138 | return 1; 139 | } 140 | 141 | sub trawl_blog ($self) { 142 | warn "Trawling " . $self->url . "\n"; 143 | 144 | my $ua = Perly::Bot::UserAgent->instance; 145 | 146 | if (my $response = $ua->get($self->url)) { 147 | my $content = $response->text; 148 | my $blog_posts = $self->extract_posts($content); 149 | return $blog_posts; 150 | } 151 | else { 152 | warn "Received nothing for feed " . $self->url . "\n"; 153 | return []; 154 | } 155 | } 156 | 157 | sub fetch_feed ($self) { 158 | my $ua = Perly::Bot::UserAgent->instance; 159 | my $response = $ua->get($self->url); 160 | 161 | if (my $response = $ua->get($self->url)) { 162 | my $content = $response->text; # decode 163 | printf STDERR "Received content length: %s\n", length $content; 164 | $self->{content} = $content; 165 | return $content; 166 | } 167 | 168 | return; 169 | } 170 | 171 | sub extract_posts ($self, $xml) { 172 | my @posts = (); 173 | 174 | my @items = 175 | eval { $self->{parser}->new($xml, -type => 'string')->get_item() }; 176 | 177 | if ($@) { 178 | warn "Bad XML for " . $self->url . ": $@\n"; 179 | return []; 180 | } 181 | 182 | foreach my $i (@items) { 183 | 184 | # extract the post date 185 | my $datetime_raw = $i->get($self->date_name); 186 | my $date_format = $self->date_format; 187 | my $datetime_clean = $datetime_raw; 188 | 189 | # time::piece does not recognise UTC as a time zone 190 | $datetime_clean =~ s/UTC/GMT/ if $date_format =~ /\%Z/; 191 | 192 | # time::piece requires timezone modifiers to not have a semicolon 193 | $datetime_clean =~ s/([+\-][0-9][0-9]):([0-9][0-9]$)/$1$2/ 194 | if $date_format =~ /\%z/; 195 | 196 | # trim whitespace 197 | $datetime_clean =~ s/\A\s+|\s+\Z//gm; 198 | 199 | # time::piece struggles with milliseconds 200 | if ($self->date_format =~ /%ms/) { 201 | $datetime_clean =~ s/\.[0-9][0-9][0-9]//; 202 | $date_format =~ 203 | s/\%ms//; # %ms is a Perly bot convention not used by strptime 204 | } 205 | 206 | my $weak_self = $self; 207 | weaken($weak_self); 208 | 209 | my $post = eval { 210 | my $datetime = Time::Piece->strptime($datetime_clean, $date_format); 211 | $self->post_class->new({ 212 | description => $i->description, 213 | datetime => $datetime, 214 | proxy => $self->proxy, 215 | title => $i->title, 216 | epoch => $datetime->epoch, 217 | url => $i->link, 218 | }); 219 | }; 220 | 221 | if ($@) { 222 | warn "Error creating post object: $@"; 223 | } 224 | else { 225 | push @posts, $post; 226 | } 227 | } 228 | return \@posts; 229 | } 230 | 231 | 1; 232 | -------------------------------------------------------------------------------- /lib/Perly/Bot/Media/JSON.pm: -------------------------------------------------------------------------------- 1 | package Perly::Bot::Media::JSON; 2 | use autodie; 3 | use strict; 4 | use warnings; 5 | use Mojo::JSON qw/decode_json encode_json/; 6 | 7 | sub new { 8 | my ($class, $args) = @_; 9 | my @missing = grep { !(exists $args->{$_} && defined $args->{$_}) } 10 | qw(filepath link_limit); 11 | 12 | die "Missing required parameters (@missing) for $class" if @missing; 13 | 14 | return bless $args, $class; 15 | } 16 | 17 | sub emit { 18 | my ($self, $blog_post) = @_; 19 | 20 | my $json = '[]'; 21 | if (-e $self->{filepath}) { 22 | open my $fh_r, '<', $self->{filepath}; 23 | $json = do { local $/; <$fh_r>; }; 24 | close $fh_r; 25 | } 26 | 27 | my @links = @{ decode_json($json) }; 28 | 29 | if (grep { $blog_post->root_url eq $_->{url} } @links) { 30 | printf STDERR "already emitted %s, skipping\n", $blog_post->root_url; 31 | return 1; 32 | } 33 | 34 | unshift @links, { 35 | posted => $blog_post->datetime->datetime, #YYYY-MM-DDT00:00:00 36 | title => $blog_post->decoded_title, 37 | url => $blog_post->root_url, 38 | }; 39 | 40 | # don't slice larger than our data or link limit 41 | my $limit = $self->{link_limit} > @links 42 | ? @links - 1 43 | : $self->{link_limit} - 1; 44 | @links = @links[ 0 .. $limit ]; 45 | 46 | open my $fh_w, '>', $self->{filepath}; 47 | print $fh_w encode_json(\@links), "\n"; 48 | 49 | return 1; 50 | } 51 | 52 | 1; 53 | -------------------------------------------------------------------------------- /lib/Perly/Bot/Post.pm: -------------------------------------------------------------------------------- 1 | use v5.22; 2 | use feature qw(signatures postderef); 3 | no warnings qw(experimental::signatures experimental::postderef); 4 | 5 | package Perly::Bot::Post; 6 | 7 | use Perly::Bot::UserAgent; 8 | use Perly::Bot::Config; 9 | use HTML::Entities; 10 | use List::Util qw(sum0 any); 11 | use Time::Piece; 12 | 13 | use base 'Class::Accessor'; 14 | __PACKAGE__->mk_accessors( 15 | qw/url title datetime description domain epoch content_regex/); 16 | 17 | =encoding utf8 18 | 19 | =head1 NAME 20 | 21 | Perly::Bot::Post - process a social media post 22 | 23 | =head1 FUNCTIONS 24 | 25 | =head2 clean_url 26 | 27 | Removes the query component of the url. This is to reduce the risk of posting duplicate urls with different query parameters. 28 | 29 | =cut 30 | 31 | sub clean_url ($self, $url = undef) { 32 | my $uri = Mojo::URL->new(($url || $self->url)); 33 | my $clean_url = $uri->scheme . '://' . $uri->host . $uri->path; 34 | printf STDERR "Cleaned URL to [%s]\n", $clean_url; 35 | return $clean_url; 36 | } 37 | 38 | sub domain ($self) { $self->{domain} //= Mojo::URL->new($self->root_url)->host } 39 | 40 | 41 | =head2 root_url 42 | 43 | Returns the clean url, it will follow the url and return the ultimate location the URL redirects to. Sets the post's raw content. 44 | 45 | =cut 46 | 47 | sub root_url ($self) { 48 | # if we've already retrieved the root url, don't pull it again 49 | return $self->{_root_url} if $self->{_root_url}; 50 | 51 | my ($request, $response) = Perly::Bot::UserAgent->instance->get($self->url); 52 | if ($response) 53 | { 54 | my $url = $request->url->to_abs(); 55 | printf STDERR "URL is [%s]\n", $url; 56 | 57 | # set the post content 58 | $self->{raw_content} = $response->body; 59 | 60 | $self->{_root_url} = $self->clean_url($url); 61 | return $self->{_root_url}; 62 | } 63 | } 64 | 65 | =head2 raw_content 66 | 67 | Returns the raw HTML of the post 68 | 69 | =cut 70 | 71 | sub raw_content ($self) { 72 | return $self->{raw_content} if exists $self->{raw_content}; 73 | $self->root_url(); # fetch the article and set the content 74 | return $self->{raw_content} or die 'root_url() did not set the post content'; 75 | } 76 | 77 | =head2 extract_body_text 78 | 79 | Extracts text from the raw body HTML 80 | 81 | =cut 82 | 83 | sub extract_body_text ($self, $content = $self->raw_content){ 84 | my $regex = $self->get_extraction_regex; 85 | my $paragraphs = join "\n", $content =~ /$regex/g; 86 | die sprintf 'failed to extract text from [%s]', $self->domain unless $paragraphs; 87 | $paragraphs =~ s/<\/?.+?>//g; 88 | return decode_entities($paragraphs); 89 | } 90 | 91 | sub get_extraction_regex ($self, $domain = $self->domain) { 92 | state $domain_regexes = { 93 | 'blogs.perl.org' => qr/
(.+?)/si, 94 | 'blog.plover.com' => qr/class="mainsection"(.+?)<\/table>/si, 95 | 'rjbs.manxome.org' => qr/
(.+?)