├── delete-gwene ├── encode-content ├── feed-gwene-hashed ├── fetch-rss ├── new-gwene ├── sh-feed-gwene-comments-hashed ├── sh-feed-gwene-hashed └── validate-rss /delete-gwene: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | open(GR, "/var/lib/news/newsgroups") || die; 4 | while () { 5 | chop; 6 | ($group, $desc) = split(/\t/); 7 | $groups{$group} = 1; 8 | } 9 | close GROUPS; 10 | 11 | open(DELETE, "/home/tmp/gwene-deletions") || die; 12 | while () { 13 | chop; 14 | ($group) = split(/\t/); 15 | if ($groups{$group}) { 16 | print "Deleting $group\n"; 17 | system("/gmane/rmgroup", $group); 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /encode-content: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # Some feeds have "raw" HTML in the content/description part. This 4 | # script quotes the HTML so that the feed can be parsed as XML. 5 | 6 | $file = $ARGV[0]; 7 | $out = $ARGV[1]; 8 | 9 | open(FILE, $file) || die; 10 | open(OUT, ">$out") || die; 11 | undef $/; 12 | $content = ; 13 | close FILE; 14 | 15 | $content =~ s///g; 16 | 17 | 18 | $orig_content = $content; 19 | 20 | @elems = ("content", "description"); 21 | 22 | foreach $tag (@elems) { 23 | $new = ""; 24 | $content =~ s/<$tag \/>//g; 25 | 26 | $regexp = "(^.*?)(<$tag [^>]*>|<$tag>)(.*?)(<\/$tag)"; 27 | 28 | while ($content =~ /$regexp/s && 29 | $content !~ /CDATA/) { 30 | $all = $&; 31 | $ending = $4; 32 | $new .= $1; 33 | $new .= $2; 34 | $bit = $3; 35 | if ($bit =~ //) { 36 | $bit =~ s/&/&/g; 37 | $bit =~ s//>/g; 39 | } 40 | $new .= $bit; 41 | $new .= $ending; 42 | $content = substr($content, length($all)); 43 | $did = 1; 44 | } 45 | 46 | $new .= $content; 47 | $content = $new; 48 | } 49 | 50 | if ($did) { 51 | print OUT $new; 52 | } else { 53 | print OUT $orig_content; 54 | } 55 | -------------------------------------------------------------------------------- /feed-gwene-hashed: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | use Digest::SHA1 qw(sha1 sha1_hex sha1_base64); 4 | 5 | $instance = $ARGV[0] % 30; 6 | $suffix = $ARGV[1]; 7 | $cutoff = $ARGV[2]; 8 | 9 | open(GROUPS, "/var/lib/news/newsgroups") || die; 10 | 11 | while () { 12 | chop; 13 | ($group, $desc) = split(/\t/); 14 | $groups{$group} = 1; 15 | } 16 | 17 | open(GWENE, "/home/tmp/gwene$suffix-requests") || die; 18 | 19 | while () { 20 | chop; 21 | ($group, $url, $urlid, $date, $parent_id) = split(/\t/); 22 | $sha = sha1($url); 23 | $result = 0; 24 | foreach $s ($sha) { 25 | $result += ord($s); 26 | } 27 | if ($groups{$group} && 28 | ($instance == -1 || (($result % 30) == $instance)) && 29 | (! $cutoff || $date > $cutoff)) { 30 | print "Feeding $group $url\n"; 31 | system("/home/larsi/gwene/fetch-rss", $url, $group, $parent_id); 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /fetch-rss: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | require "/usr/local/src/XML-TreePP-0.41/blib/lib/XML/TreePP.pm"; 4 | require "/usr/local/src/XML-FeedPP-0.43/blib/lib/XML/FeedPP.pm"; 5 | #use XML::FeedPP; 6 | use Encode qw/encode decode/; 7 | use Date::Parse; 8 | use POSIX qw(strftime); 9 | use Digest::SHA1 qw(sha1 sha1_hex sha1_base64); 10 | use Net::NNTP; 11 | use HTML::Entities; 12 | use BerkeleyDB; 13 | use CGI; 14 | use Scalar::Util qw/reftype/; 15 | 16 | if ($ARGV[0] =~ /http/) { 17 | $source = $ARGV[0]; 18 | $newsgroup = $ARGV[1]; 19 | } else { 20 | $source = $ARGV[1]; 21 | $newsgroup = $ARGV[0]; 22 | } 23 | 24 | $parent_id = $ARGV[2]; 25 | 26 | my $postedenv = new BerkeleyDB::Env 27 | -Home => "/var/tmp/gwene" , 28 | -Flags => DB_CREATE| DB_INIT_CDB | DB_INIT_MPOOL 29 | or die "cannot open environment: $BerkeleyDB::Error\n"; 30 | 31 | tie %posted, "BerkeleyDB::Hash", 32 | -Filename => '/var/tmp/gwene/posted.db', 33 | -Flags => DB_CREATE, 34 | -Env => $postedenv 35 | or die "cannot open database: $BerkeleyDB::Error\n"; 36 | 37 | sub rfc2047_encode { 38 | my $string = shift; 39 | my $word = ""; 40 | my $encodingp = 0; 41 | my $result = ""; 42 | @strings = split(/[ \t\n]+/, $string); 43 | foreach $sub (@strings) { 44 | $encodingp = 0; 45 | $word = ""; 46 | @chars = split(//, $sub); 47 | foreach $char (@chars) { 48 | $c = ord($char); 49 | if (($c >= 8 && $c <= 10) || 50 | ($c == 12) || 51 | ($c >= 33 && $c <= 60) || 52 | ($c == 62) || 53 | ($c >= 64 && $c <= 127)) { 54 | $word .= $char; 55 | } elsif ($c == 32) { 56 | $word .= "_"; 57 | } else { 58 | $encodingp = 1; 59 | $word .= sprintf("=%02x", $c); 60 | } 61 | } 62 | if ($encodingp) { 63 | $word = "=?utf-8?q?" . $word . "?="; 64 | } 65 | if ($result eq "") { 66 | $result = $word; 67 | } else { 68 | $result .= " " . $word; 69 | } 70 | } 71 | $result =~ s/\?= =\?utf-8\?q\?/_/g; 72 | return $result; 73 | } 74 | 75 | sub my_sort_item { 76 | my $self = shift; 77 | my $list = $self->{rss}->{channel}->{item} or return; 78 | my $epoch = [ map { $_->get_pubDate_epoch() || 0 } @$list ]; 79 | my $sorted = [ map { $list->[$_] } sort { 80 | $epoch->[$a] <=> $epoch->[$b] 81 | } 0 .. $#$list ]; 82 | return @sorted; 83 | } 84 | 85 | sub relative_urls_p { 86 | my $contents = shift; 87 | my $relative = 0; 88 | foreach (split(/(href=['\"])|(src=['\"])/, $contents)) { 89 | if (/^\//) { 90 | $relative = 1; 91 | } 92 | } 93 | return $relative; 94 | } 95 | 96 | sub my_description { 97 | my $self = shift; 98 | my $content = $self->get_value("content:encoded") || 99 | $self->description(); 100 | 101 | # This is a bug. On XHTML, FeedPP will return a tree instead of 102 | # the text. In that case, transform the data into text explicity. 103 | if (reftype $content eq "HASH") { 104 | my $tpp = XML::TreePP->new(); 105 | bless($content); 106 | $content = $tpp->write($content, "utf-8"); 107 | } 108 | return $content; 109 | } 110 | 111 | 112 | $tmp = "/tmp/feed.$$.rss"; 113 | system("curl", "-s", "-o", $tmp, "-m", "15", "-L", 114 | "-A", "Gwene/1.0 (The gwene.org rss-to-news gateway like Googlebot)", 115 | "--compressed", $source); 116 | if ($? != 0) { 117 | if (-f $tmp) { 118 | unlink $tmp; 119 | } 120 | print "Unable to fetch feed\n"; 121 | exit; 122 | } 123 | 124 | system("/home/larsi/gwene/encode-content", $tmp, "$tmp.code"); 125 | rename "$tmp.code", $tmp; 126 | 127 | eval { 128 | $feed = XML::FeedPP->new($tmp, utf8_flag => 0); 129 | }; 130 | if (-f $tmp) { 131 | unlink $tmp; 132 | } 133 | 134 | if ($feed) { 135 | print "Ok\n"; 136 | } else { 137 | print "Error in feed\n"; 138 | exit; 139 | } 140 | 141 | eval { 142 | $feed->sort_item(); 143 | }; 144 | 145 | foreach my $item ($feed->get_item()) { 146 | $items[$a++] = $item; 147 | } 148 | 149 | #my $ENCODING = 'utf8'; 150 | 151 | foreach my $item (@items) { 152 | $link = $item->link(); 153 | if (! $links{$link}) { 154 | $difflinks++; 155 | } 156 | $numlinks++; 157 | $links{$link}++; 158 | } 159 | 160 | if ($numlinks > 3 && $difflinks == 1) { 161 | $date_id = 1; 162 | } 163 | 164 | foreach my $item (reverse @items) { 165 | $count = 1; 166 | $old_newsgroups = ""; 167 | $subject = $item->title(); 168 | if ($parent_id) { 169 | $subject = $feed->title(); 170 | $subject =~ s/^comments on[^:]*: *//i; 171 | } 172 | if ($subject =~ /&.*;/) { 173 | $new = decode_entities(decode("utf8", $subject)); 174 | if ($new ne $subject) { 175 | $subject = encode("utf8", $new); 176 | } 177 | } 178 | $subject =~ s/\n/ /g; 179 | $subject =~ s/^ +//; 180 | $subject =~ s/ +$//; 181 | if ($subject eq "") { 182 | $subject = "(untitled)"; 183 | } 184 | $subject =~ s/<[^>]+>//g; 185 | 186 | $from = $item->author(); 187 | if ($from =~ /&.*;/) { 188 | $new = decode_entities(decode("utf8", $from)); 189 | if ($new ne $from) { 190 | $from = encode("utf8", $new); 191 | } 192 | } 193 | $from =~ s/\n/ /g; 194 | $from =~ s/<[^>]+>//g; 195 | if ($from eq "") { 196 | $from = "unknown"; 197 | } 198 | $address = $item->get_value("email"); 199 | $date = $item->pubDate(); 200 | $guid = $item->guid(); 201 | $link = $item->link(); 202 | 203 | $id = $guid; 204 | $skip = 0; 205 | $a = 0; 206 | undef $art; 207 | if ($id eq "") { 208 | $id = $link; 209 | } 210 | if ($date_id && ! $guid) { 211 | $id .= "-$date"; 212 | } 213 | 214 | $enclosure = ""; 215 | # This may fail, which is OK. 216 | eval { 217 | %enclosure = %{$item->{"enclosure"}}; 218 | $enclosure = $enclosure{"-url"}; 219 | }; 220 | 221 | $contents = my_description($item); 222 | if (relative_urls_p($contents) || 223 | relative_urls_p($link)) { 224 | my $url = $source; 225 | # Remove the bits after the domain name. 226 | $url =~ s/^([^:]+:\/\/[^\/]+).*/\1/; 227 | $contents = "$contents"; 228 | } 229 | $content_hash = sha1_base64(encode("utf8", $contents)); 230 | 231 | $message_id = ""; 234 | 235 | if ($posted{$message_id} eq $content_hash) { 236 | #print "Local DB already seen this content; ignoring\n"; 237 | $skip = 1; 238 | } else { 239 | if (! $nntp) { 240 | @nntp = ("news.gmane.org"); 241 | $nntp = Net::NNTP->new(@nntp) || die "cannot connect to NNTP server"; 242 | } 243 | for ($i = 1; $i < 10; $i++) { 244 | $message_id = ""; 247 | if ($headers = $nntp->head($message_id)) { 248 | #print "Found $message_id\n"; 249 | $skip = 1; 250 | foreach (@{$headers}) { 251 | if (/^X-Content-Hash: (.*)/) { 252 | if ($1 eq $content_hash) { 253 | #print "inn already seen this content; skipping\n"; 254 | $posted{$message_id} = $content_hash; 255 | $skip = 1; 256 | break; 257 | } 258 | } 259 | if (/^Newsgroups: (.*)/) { 260 | $old_newsgroups = $1; 261 | } 262 | } 263 | $count = $i + 1; 264 | $message_id = "Message-ID: \n"; 266 | #$posted{$message_id} = $content_hash; 267 | break; 268 | } 269 | } 270 | } 271 | 272 | undef $comment_feed; 273 | undef $comment_group; 274 | if ($item->get_value("wfw:commentRss")) { 275 | $comment_feed = $item->get_value("wfw:commentRss"); 276 | } else { 277 | eval { 278 | my @links= map {$_} (@{$item->{'link'}}); 279 | foreach $l (@links) { 280 | %alink = %$l; 281 | if ($alink{'-rel'} eq "replies" && 282 | $alink{'-type'} =~ /atom|xml|rss/) { 283 | $comment_feed = $alink{'-href'}; 284 | } 285 | } 286 | }; 287 | } 288 | if ($comment_feed && $parent_id eq "" && 289 | $newsgroup =~ /languagelog|photoshop.dis|yegge|com.tcj|badscience|comics.comics.mag|github.larsmagne|hooded.ut/) { 290 | $comment_group = $newsgroup; 291 | $comment_group =~ s/^gwene./gwene.full./; 292 | open(FEEDS, "/home/tmp/gwene-comment-requests") || die; 293 | $found = 0; 294 | while () { 295 | chop; 296 | ($cgroup, $cfeed, $cid) = split(/\t/); 297 | if ($cfeed eq $comment_feed) { 298 | $found = 1; 299 | } 300 | } 301 | close FEEDS; 302 | $comment_feed =~ s/&/%26/g; 303 | if (! $found) { 304 | $mid = "x" . $count . "-" . 305 | sha1_base64($newsgroup . encode("utf8", $id) . "geheimNis") . 306 | "\@gwene.org"; 307 | $mid = CGI::escape($mid); 308 | print "http://gwene.org/index.php?group=$comment_group&url=$comment_feed&confirm=t&ignore_feed_id=t&parent=$mid\n"; 309 | system("curl", "-s", 310 | "http://gwene.org/index.php?group=$comment_group&url=$comment_feed&confirm=t&ignore_feed_id=t&parent=$mid"); 311 | } 312 | } 313 | 314 | if (! $skip) { 315 | $try_again = 2; 316 | $efrom = rfc2047_encode($from); 317 | if ($efrom =~ /.+\@.+\..+/) { 318 | $hfrom = "From: $efrom\n"; 319 | } elsif ($address =~ /.+\@.+\..+/) { 320 | $hfrom = "From: $efrom <$address>\n"; 321 | } else { 322 | $hfrom = "From: $efrom \n"; 323 | } 324 | while ($try_again) { 325 | undef @art; 326 | undef $art; 327 | $a = 0; 328 | $nntp->quit; 329 | $nntp = Net::NNTP->new(@nntp) || die "cannot connect to NNTP server"; 330 | 331 | if ($old_newsgroups) { 332 | @groups = split(/,/, $old_newsgroups); 333 | $includeg = 1; 334 | foreach $g (@groups) { 335 | if ($g eq $newsgroup) { 336 | $includeg = 0; 337 | } 338 | } 339 | if ($includeg) { 340 | $n = "$newsgroup,$old_newsgroups"; 341 | } else { 342 | $n = $old_newsgroups; 343 | } 344 | } else { 345 | $n = $newsgroup; 346 | } 347 | if ($comment_group) { 348 | $n .= ",$comment_group"; 349 | } 350 | $art[$a++] = "Newsgroups: $n\n"; 351 | $art[$a++] = $hfrom; 352 | if ($parent_id) { 353 | $subject = "Re: $subject"; 354 | } 355 | $art[$a++] = "Subject: " . rfc2047_encode($subject) . "\n"; 356 | eval { 357 | $time = str2time($date); 358 | }; 359 | if ($time < 10000) { 360 | $time = time(); 361 | } 362 | $art[$a++] = "Date: " . strftime("%a, %d %b %Y %H:%M:%S %z", 363 | localtime($time)) . 364 | "\n"; 365 | $art[$a++] = "Content-type: text/html; charset=utf-8\n"; 366 | $art[$a++] = "Content-Transfer-Encoding: 8bit\n"; 367 | $message_id = ""; 370 | $art[$a++] = "Message-ID: $message_id\n"; 371 | if ($parent_id) { 372 | $art[$a++] = "References: <$parent_id>\n"; 373 | } 374 | 375 | if ($count > 1) { 376 | $prev = $count - 1; 377 | $art[$a++] = "Supersedes: \n"; 380 | } 381 | $art[$a++] = "X-Content-Hash: $content_hash\n"; 382 | $art[$a++] = "Approved: news\@gmane.org\n"; 383 | $art[$a++] = "X-Feed: $source\n"; 384 | if ($link) { 385 | $art[$a++] = "Archived-at: <$link>\n"; 386 | } 387 | 388 | 389 | $art[$a++] = "\n"; 390 | $art[$a++] = $contents; 391 | $art[$a++] = "\n"; 392 | 393 | if ($enclosure) { 394 | $art[$a++] = "

Data"; 395 | } 396 | 397 | $art[$a++] = "

Link"; 398 | $art[$a++] = "\n"; 399 | 400 | #print @art; 401 | $result = $nntp->post(\@art); 402 | $msg = (reverse $nntp->message())[0]; 403 | if ($msg =~ /From: address not in Internet syntax/) { 404 | $hfrom = "From: unknown ) { 11 | chop; 12 | $created{$_} = 1; 13 | } 14 | close CREATED; 15 | unlink "/home/tmp/gwene-created"; 16 | 17 | sleep(2); 18 | 19 | open(GROUPS, "/var/lib/news/newsgroups") || die; 20 | while () { 21 | chop; 22 | ($group, $desc) = split(/\t/); 23 | $groups{$group} = 1; 24 | } 25 | close GROUPS; 26 | 27 | open(GWENE, "/home/tmp/gwene-requests") || die; 28 | while () { 29 | chop; 30 | ($group, $url, $urlid, $date, $parent_id) = split(/\t/); 31 | if ($groups{$group} && $created{$group}) { 32 | print "Feeding $group $url\n"; 33 | system("/home/larsi/gwene/fetch-rss", $url, $group, 34 | $parent_id); 35 | } 36 | } 37 | } 38 | sleep(1); 39 | } 40 | 41 | -------------------------------------------------------------------------------- /sh-feed-gwene-comments-hashed: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/larsi/gwene/feed-gwene-hashed `date +%M` "-comment"\ 4 | `date --date="-7days" +%Y%m%d` >> /tmp/feed.log -------------------------------------------------------------------------------- /sh-feed-gwene-hashed: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | /home/larsi/gwene/feed-gwene-hashed `date +%M` "" >> /tmp/feed.log -------------------------------------------------------------------------------- /validate-rss: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | require "/usr/local/src/XML-TreePP-0.41/blib/lib/XML/TreePP.pm"; 4 | require "/usr/local/src/XML-FeedPP-0.42/blib/lib/XML/FeedPP.pm"; 5 | #use XML::FeedPP; 6 | 7 | my $source = $ENV{'URL'}; 8 | 9 | $tmp = "/tmp/feed.$$.rss"; 10 | system("curl", "-s", "-o", $tmp, "-m", "10", "-L", 11 | "-A", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1.10) Gecko/20100623 Iceweasel/3.5.10 (like Firefox/3.5.10 like Googlebot)", 12 | $source); 13 | 14 | my $feed = XML::FeedPP->new($tmp); 15 | 16 | if (-f $tmp) { 17 | unlink $tmp; 18 | } 19 | 20 | if ($feed) { 21 | print "Ok " . $feed->link() . "\n"; 22 | } 23 | --------------------------------------------------------------------------------