├── templates ├── statuses-bottom.html ├── posts-top.html ├── posts-bottom.html ├── group_posts-top.html ├── statuses-top.html ├── group_posts-bottom.html ├── footer.html └── header.html ├── .gitmodules ├── htaccess-default ├── fetlife-export.command ├── README.markdown ├── index.php └── fetlife-export.pl /templates/statuses-bottom.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "lib"] 2 | path = lib 3 | url = git://github.com/meitar/libFetLife.git 4 | -------------------------------------------------------------------------------- /templates/posts-top.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | -------------------------------------------------------------------------------- /templates/posts-bottom.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | -------------------------------------------------------------------------------- /templates/group_posts-top.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | -------------------------------------------------------------------------------- /templates/statuses-top.html: -------------------------------------------------------------------------------- 1 |
2 |
3 | -------------------------------------------------------------------------------- /htaccess-default: -------------------------------------------------------------------------------- 1 | 2 | # Let file names be as long as they need to be. 3 | IndexOptions NameWidth=* 4 | 5 | -------------------------------------------------------------------------------- /templates/group_posts-bottom.html: -------------------------------------------------------------------------------- 1 |
2 |
3 |
4 | -------------------------------------------------------------------------------- /fetlife-export.command: -------------------------------------------------------------------------------- 1 | #!/bin/bash - 2 | cat << "EOF" 3 | @@@@@@@@ @@@@@@@@ @@@@@@@ @@@ @@@ @@@@@@@@ @@@@@@@@ 4 | @@@@@@@@ @@@@@@@@ @@@@@@@ @@@ @@@ @@@@@@@@ @@@@@@@@ 5 | @@! @@! @@! @@! @@! @@! @@! 6 | !@! !@! !@! !@! !@! !@! !@! 7 | @!!!:! @!!!:! @!! @!! !!@ @!!!:! @!!!:! 8 | !!!!!: !!!!!: !!! !!! !!! !!!!!: !!!!!: 9 | !!: !!: !!: !!: !!: !!: !!: 10 | :!: :!: :!: :!: :!: :!: :!: 11 | :: :: :::: :: :: :::: :: :: :: :::: 12 | : : :: :: : : :: : : : : : :: :: 13 | 14 | 15 | @@@@@@@@ @@@ @@@ @@@@@@@ @@@@@@ @@@@@@@ @@@@@@@ 16 | @@@@@@@@ @@@ @@@ @@@@@@@@ @@@@@@@@ @@@@@@@@ @@@@@@@ 17 | @@! @@! !@@ @@! @@@ @@! @@@ @@! @@@ @@! 18 | !@! !@! @!! !@! @!@ !@! @!@ !@! @!@ !@! 19 | @!!!:! !@@!@! @!@@!@! @!@ !@! @!@!!@! @!! 20 | !!!!!: @!!! !!@!!! !@! !!! !!@!@! !!! 21 | !!: !: :!! !!: !!: !!! !!: :!! !!: 22 | :!: :!: !:! :!: :!: !:! :!: !:! :!: 23 | :: :::: :: ::: :: ::::: :: :: ::: :: 24 | : :: :: : :: : : : : : : : : 25 | 26 | 27 | @@@ @@@ @@@ @@@ @@@@@@@@ @@@@@@ @@@@@@@ @@@@@@@ 28 | @@@ @@@ @@@ @@@ @@@@@@@@ @@@@@@@@ @@@@@@@@ @@@@@@@@ 29 | @@! @@! @@! @@! @@! @@! @@@ @@! @@@ @@! @@@ 30 | !@! !@! !@! !@! !@! !@! @!@ !@! @!@ !@! @!@ 31 | @!! !!@ @!@ !!@ @!! @!@!@!@! @!@!!@! @!@ !@! 32 | !@! !!! !@! !!! !!! !!!@!!!! !!@!@! !@! !!! 33 | !!: !!: !!: !!: !!: !!: !!! !!: :!! !!: !!! 34 | :!: :!: :!: :!: :!: :!: !:! :!: !:! :!: !:! 35 | :::: :: ::: :: :: :::: :: ::: :: ::: :::: :: 36 | :: : : : : : :: : : : : : : : : :: : : 37 | EOF 38 | echo 39 | echo "FETLIFE EXPORT WIZARD" 40 | echo "This software is released to the public domain. Fuck copyright." 41 | echo 42 | echo "Make a copy of your own or any other user's FetLife account." 43 | echo 44 | 45 | readonly FL_EXPORT="fetlife-export.pl" 46 | readonly DIR=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) 47 | 48 | echo "You will need a FetLife account to use this tool. If you don't" 49 | echo "have a FetLife account, you can easily create one at this page:" 50 | echo " https://FetLife.com/signup" 51 | echo "Creating a FetLife account is free and does not require a valid" 52 | echo "email address or other personally identifying information. For" 53 | echo "maximum security, create an account while using the Tor Browser." 54 | echo " https://torproject.org/" 55 | echo 56 | echo -n "Type your FetLife username, then press return: " 57 | read USERNAME 58 | echo 59 | 60 | echo "Type the name of a folder to save to, or leave blank to use the default shown." 61 | echo "If this folder does not exist, it will be created." 62 | echo -n "Save to folder [$DIR]: " 63 | read SAVE_TO_FOLDER 64 | echo 65 | 66 | echo "Type the ID number of the export target. For example, if you want to" 67 | echo "create a copy of JohnBaku's entire FetLife history, type: 1" 68 | echo "Leave blank to automatically detect and use $USERNAME's ID number." 69 | echo -n "Export target's user ID: " 70 | read TARGET_USER_ID 71 | echo 72 | 73 | echo "Use a proxy? (Leave blank to make a direct connection.)" 74 | echo "If you want to use a proxy, enter the proxy's URL here. For example," 75 | echo "to make use of a default Tor Browser, type: socks://localhost:9150" 76 | echo -n "Proxy URL: " 77 | read PROXYURL 78 | echo 79 | 80 | if [ ! 
-z "$PROXYURL" ]; then 81 | PROXYOPT="--proxy=$PROXYURL" 82 | fi 83 | if [ -z "$SAVE_TO_FOLDER" ]; then 84 | SAVE_TO_FOLDER="$DIR" 85 | fi 86 | 87 | echo "$FL_EXPORT will now run with these parameters:" 88 | echo $FL_EXPORT $PROXYOPT $USERNAME $SAVE_TO_FOLDER $TARGET_USER_ID 89 | echo 90 | echo "When prompted next, enter the password for $USERNAME." 91 | "$DIR"/$FL_EXPORT $PROXYOPT $USERNAME $SAVE_TO_FOLDER $TARGET_USER_ID 92 | -------------------------------------------------------------------------------- /templates/footer.html: -------------------------------------------------------------------------------- 1 |
2 | 3 |
4 | ^ going up? ^ 5 |
6 | 7 | 115 | 116 | 117 | 
--------------------------------------------------------------------------------
/README.markdown:
--------------------------------------------------------------------------------
# FetLife Export - Technical Documentation

The FetLife Export suite requires Perl (5.10.1 or later). If you want to install it on a website so others can use your copy, it also requires PHP (version 5.2 or later) and the Apache web server running on a UNIX-like operating system. For all features to work out-of-the-box, you’ll need to install the suite into, and run it from, its own domain or subdomain.

If you’re on a shared host, you may wish to compile your own Perl to do this, although in the vast majority of cases you can probably [skip directly to installing required CPAN modules](#installing-required-cpan-modules). Following are step-by-step instructions for installing from source.

## Installing from Source

Installing from source is the most reliable way to ensure everything is functional. It may also be required if your system lacks the necessary prerequisites for running `fetlife-export.pl`.

### Installing Perl

    wget http://www.cpan.org/src/5.0/perl-5.14.2.tar.gz
    tar -xvzf perl-5.14.2.tar.gz
    cd perl-5.14.2
    sh Configure -de
    make
    make test     # This is optional, but a good check. :)
    make install

### Installing required CPAN Modules

Most systems will have the modules you need. However, if you experience errors running `fetlife-export.pl`, you may also want to install all the required components yourself. To do this, run the following commands after you’ve installed your Perl:

    cpan App::cpanminus
    cpanm WWW::Mechanize
    cpanm HTML::TreeBuilder
    cpanm String::Escape
    cpanm Unicode::Escape
    cpanm LWP::Protocol::socks    # Optional. Only needed if you'll use a SOCKS proxy.

### Configure the tool

* Edit the shebang line in `fetlife-export.pl` to point to the Perl you want to use.
* Make sure the `fetlife-export.pl` script is executable by running `chmod u+x fetlife-export.pl`.

## Using

Once installed, you can use the tool from a command line by invoking it as follows:

    ./fetlife-export.pl username

Obviously, replace `username` with your own username. If your username begins with a hyphen (`-`), preface it with two dashes and a space (`-- `). For instance, if your username is `-username`, invoke the tool as follows:

    ./fetlife-export.pl -- -username

You’ll be prompted for a password. If you supply a valid password, running the program produces output that looks something like the following example:

    $ ./fetlife-export.pl fetfails
    Password:
    userID: 959391
    Loading profile: .
    Loading conversations: . 2 conversations found.
    Loading wall: . 0 wall-to-walls found.
    Loading activity feed: ... 9 statuses found.
    2 pictures found.
    1 writing found.
    0 group threads found.
    Downloading 9 statuses...
    Downloading 2 pictures...
    Downloading 1 posts...

You can optionally direct `fetlife-export.pl` to make requests through a proxy, such as [Privoxy](http://privoxy.org/) or [Tor](https://torproject.org/):

    ./fetlife-export.pl --proxy=http://example.proxy.com:8080 fetfails   # Use an open HTTP proxy.
    ./fetlife-export.pl --proxy=socks://localhost:9050 fetfails          # Use a local SOCKS proxy, like Tor.

The Web portion of the suite is mostly a simple wrapper around this command-line tool that provides an HTML interface to its options.

Additional command line arguments let you set an output directory and declare an alternate export target (a user account to export other than your own). For example, to log in to FetLife as `fetfails` but archive the activity of the FetLife user whose ID is `1` (`JohnBaku`) in a directory named `my-archive-folder`, invoke `fetlife-export.pl` as follows:

    ./fetlife-export.pl fetfails my-archive-folder 1

(This works because `JohnBaku`'s FetLife user ID number is `1`. To find a FetLife user's ID, look at their profile URL; the trailing numeric part is their user ID number.)
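
The `--proxy` option described earlier combines freely with these positional arguments. For example, an invocation along the following lines should run the same archive job over a local Tor SOCKS proxy (substitute port `9150` if you are routing through the Tor Browser's built-in proxy):

    ./fetlife-export.pl --proxy=socks://localhost:9050 fetfails my-archive-folder 1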

## Configuring the optional Web utility

Default options are provided in several files. To use them, simply run the following command on your web server:

    cp htaccess-default .htaccess

### Troubleshooting

If you’re running PHP as a CGI or FastCGI, such as by using `mod_fcgid`, you may frequently see an “Internal Server Error” when a user with a large FetLife history attempts an export. This may appear in your Apache error log as “Premature end of script headers.” To resolve this, try setting the various timeouts, such as the [FcgidIOTimeout directive](https://httpd.apache.org/mod_fcgid/mod/mod_fcgid.html#fcgidiotimeout "Apache manual page for FcgidIOTimeout directive."), to high values in your server config:

    <IfModule mod_fcgid.c>
        # Set a high timeout for large FetLife export calls.
        # This may help avoid 500 Internal Server Errors.
        # 3600 seconds = 1 hour
        FcgidIdleTimeout 3600
        FcgidIOTimeout 3600
    </IfModule>

After you edit your server config file, remember to restart the web server, or at least reload the config file, so the changes take effect:

    /etc/init.d/httpd2 reload

See also [Apache mod_fcgid read data timeout error](http://rickchristie.com/blog/2011/note/apache-mod_fcgid-read-data-timeout-error/).

--------------------------------------------------------------------------------
/index.php:
--------------------------------------------------------------------------------
1 | logIn()) {
29 |         // If a user wants to delete their archive from this server, delete ALL archives.
30 |         exec(escapeshellcmd('rm -rf ' . escapeshellarg(substr($export_dir, 0, -11))) . '*', $output);
31 |         exec(escapeshellcmd('rm -f ' . escapeshellarg("$zip_dir.zip")), $output);
32 |     }
33 |     header("Location: {$_SERVER['PHP_SELF']}");
34 |     exit(0);
35 | }
36 | 
37 | // TODO: Make this work regardless of the current position of this script.
38 | // Right now, it only functions correctly if this file is placed in
39 | // the DOCUMENT_ROOT.
40 | $robotstxt = realpath(dirname(basename($_SERVER['SCRIPT_NAME']))) . '/robots.txt';
41 | define('FLEXPORT_ROBOTS_TXT', $robotstxt);
42 | 
43 | if (!file_exists(FLEXPORT_ROBOTS_TXT)) {
44 |     if ($fh = fopen(FLEXPORT_ROBOTS_TXT, 'w')) {
45 |         fwrite($fh, "User-Agent: *\n");
46 |         fclose($fh);
47 |     } else {
48 |         die("Couldn't create " . FLEXPORT_ROBOTS_TXT . ". Make sure directory permissions are set appropriately?");
49 |     }
50 | }
51 | ?>
52 | 
53 | 
54 | 
55 | FetLife Exporter
56 | 
57 | 
58 | 

FetLife Exporter

59 |

This tool lets you export your FetLife history.

60 |
61 |
62 | FetLife connection details 63 | 64 | 65 | 66 | 67 |
68 | > 69 | Export options 70 | 71 | 72 | 73 | 74 | 75 | 78 |

Exported directories on this server:

79 | '; 83 | foreach ($globbed_dirs as $globbed_dir_name) { 84 | ?> 85 |
  • 86 | '; 89 | } 90 | } 91 | 92 | if (empty($username) || empty($password)) { 93 | die(""); 94 | } 95 | 96 | $cmd = 'fetlife-export.pl'; 97 | $cmd_safe = escapeshellcmd("./$cmd " . escapeshellarg($username) . ' ' . escapeshellarg($export_dir)); 98 | 99 | $descriptorspec = array( 100 | 0 => array("pipe", "r"), // stdin is a pipe that the child will read from 101 | 1 => array("pipe", "w"), // stdout is a pipe that the child will write to 102 | 2 => array("pipe", "w") // stderr is a pipe that the child will write to 103 | ); 104 | $pipes = array(); 105 | $ph = proc_open($cmd_safe, $descriptorspec, $pipes, './'); 106 | if (!is_resource($ph)) { 107 | die("Error executing $cmd_safe"); 108 | } 109 | 110 | if ('Password: ' === stream_get_contents($pipes[1], 10)) { 111 | fwrite($pipes[0], "$password\n"); 112 | } 113 | 114 | while ($line = stream_get_line($pipes[1], 1024)) { 115 | // var_dump(str_replace("\n", '\n', str_replace("\r", '\r', $line))); 116 | 117 | if (empty($line)) { continue; } 118 | 119 | // Extract info from output. 120 | $matches = array(); 121 | if (preg_match('/userID: ([0-9]+)/', $line, $matches)) { 122 | $id = $matches[1]; 123 | } 124 | if (preg_match('/([0-9]+) conversations? found./', $line, $matches)) { 125 | $num_conversations = $matches[1]; 126 | } 127 | if (preg_match('/([0-9]+) wall-to-walls? found./', $line, $matches)) { 128 | $num_wall_to_walls = $matches[1]; 129 | } 130 | if (preg_match('/([0-9]+) status(?:es)? found./', $line, $matches)) { 131 | $num_statuses = $matches[1]; 132 | } 133 | if (preg_match('/([0-9]+) pictures? found./', $line, $matches)) { 134 | $num_pics = $matches[1]; 135 | } 136 | if (preg_match('/([0-9]+) writings? found./', $line, $matches)) { 137 | $num_writings = $matches[1]; 138 | } 139 | if (preg_match('/([0-9]+) group threads? found./', $line, $matches)) { 140 | $num_group_threads = $matches[1]; 141 | } 142 | } 143 | 144 | foreach ($pipes as $pipe) { 145 | fclose($pipe); 146 | } 147 | proc_close($ph); 148 | 149 | if ($disallow_robots && is_dir($export_dir)) { 150 | if (disallowRobots($export_dir)) { 151 | ?> 152 |

    We've requested that search engines not index your FetLife export. (This is not a guarantee they'll behave!)

    153 | 156 |

    You requested that search engines not index your FetLife export, but there was an error handling this request. Please contact the site administrator for assistance.

    157 | 161 |

    Done exporting user ID . Found:

    162 |
      163 |
    • conversations,
    • 164 |
    • wall-to-walls,
    • 165 |
    • statuses,
    • 166 |
    • pictures,
    • 167 |
    • writings,
    • 168 |
    • group threads.
    • 169 |
    170 |

    Browse . Or:

    171 |
    172 | 173 | 174 | 175 | 176 |
    177 | Archive options 178 | 179 | 180 |
    181 |
    182 | 183 | 184 | 213 | -------------------------------------------------------------------------------- /fetlife-export.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl -w 2 | 3 | use strict; 4 | use WWW::Mechanize; 5 | use Term::ReadKey; 6 | use LWP::Simple qw/getstore/; 7 | use File::Basename; 8 | use File::Path; 9 | use HTML::TreeBuilder; 10 | use String::Escape; 11 | use Unicode::Escape; 12 | use Getopt::Long; 13 | 14 | $|++; 15 | 16 | my $mech = new WWW::Mechanize( stack_depth => 0 ); # No need for history, save memory! 17 | 18 | die "Failed parsing options." unless GetOptions( 19 | 'proxy=s' => sub { 20 | $mech->proxy(['http', 'https'], $_[1]) # Pass the option value as the proxy value. 21 | } ); 22 | 23 | my $username = shift or &usage; 24 | my $dir = shift || "."; 25 | my $target = shift; 26 | print "Password: "; 27 | ReadMode('noecho'); 28 | my $password = ReadLine 0; 29 | ReadMode('normal'); 30 | chomp $password; 31 | print "\n"; 32 | 33 | &login($username, $password); 34 | my $id = &getId(); 35 | if (defined $target) { 36 | $id = $target; 37 | } 38 | print "userID: $id\n"; 39 | 40 | mkpath("$dir/fetlife"); 41 | 42 | &downloadProfile(); 43 | # Only download conversations when the export target is the logged in user. 44 | if (not defined $target) { 45 | &downloadConversations(); 46 | } 47 | &downloadWall(); 48 | &collectLinksInActivityFeed(); 49 | 50 | sub downloadProfile { 51 | print "Loading profile: .", "\n"; 52 | $mech->get("https://fetlife.com/users/$id"); 53 | my $tree; 54 | $tree = HTML::TreeBuilder->new(); 55 | $tree->ignore_unknown(0); 56 | $tree->parse($mech->content()); 57 | 58 | open(DATA, "> $dir/fetlife/$id.html") or die "Can't write $id.html: $!"; 59 | if (open(FILE, "< templates/header.html")) { 60 | while () { 61 | print DATA $_; 62 | } 63 | close FILE; 64 | } 65 | print DATA $tree->look_down( id => 'profile' )->as_HTML(undef, "\t", {}), "\n\n"; 66 | if (open(FILE, "< templates/footer.html")) { 67 | while () { 68 | print DATA $_; 69 | } 70 | close FILE; 71 | } 72 | 73 | close DATA; 74 | $tree->delete(); 75 | } 76 | 77 | sub downloadConversations { 78 | mkdir "$dir/fetlife/conversations"; 79 | 80 | print "Loading conversations: ."; 81 | $mech->get("https://fetlife.com/conversations/all"); 82 | my @links = $mech->find_all_links( url_regex => qr{/conversations/\d+} ); 83 | while (my $next = $mech->find_link( url_regex => qr{/conversations/all\?page=(\d)}, text_regex => qr/Next/ )) { 84 | print "."; 85 | $mech->get($next); 86 | push @links, $mech->find_all_links( url_regex => qr{/conversations/\d+} ); 87 | } 88 | 89 | my $num = @links; 90 | my $s = &s($num); 91 | my $i = 1; 92 | print " $num conversation$s found.\n"; 93 | return unless $num; 94 | foreach my $page (@links) { 95 | print "$i/$num\r"; 96 | 97 | &getMessages($page); 98 | 99 | $i++; 100 | } 101 | } 102 | 103 | sub getMessages { 104 | my $page = shift; 105 | my $tree; 106 | $mech->get($page); 107 | if (!$mech->success()) { 108 | print "$0: Error GETing $page"; 109 | return; 110 | } 111 | $tree = HTML::TreeBuilder->new(); 112 | $tree->ignore_unknown(0); 113 | $tree->parse($mech->content()); 114 | my $x = basename($page->url()); 115 | my @y = split(/#/, $x); 116 | my $name = $y[0]; 117 | 118 | open(DATA, "> $dir/fetlife/conversations/$name.html") or die "Can't write $name.html"; 119 | print DATA "", "\n\n"; 120 | print DATA $tree->look_down( id => 'messages' )->as_HTML(undef, "\t", {}), "\n\n"; 121 | 122 | close DATA; 123 | $tree->delete(); 124 | } 
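
# NOTE: downloadWall() below, and the other download routines that follow it,
# use the same basic pattern as downloadConversations() above: fetch the first
# page, harvest matching links with find_all_links(), then keep following the
# pagination link ("Next" here, "view more" in the activity feed) until it
# runs out, accumulating links along the way. filterLinksList() then removes
# duplicates by keying each link on its absolute URL.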
125 | 126 | sub downloadWall { 127 | print "Loading wall: ."; 128 | 129 | # Grab the first page of my wall. 130 | $mech->get("https://fetlife.com/users/$id/wall_posts"); 131 | 132 | my @links = $mech->find_all_links( url_regex => qr/wall_to_wall/ ); 133 | 134 | while (my $next = $mech->find_link( url_regex => qr{users/$id/wall_posts\?page}, text_regex => qr/^Next/ )) { 135 | print "."; 136 | $mech->get($next); 137 | push @links, $mech->find_all_links( url_regex => qr/wall_to_wall/ ); 138 | } 139 | @links = &filterLinksList(@links); 140 | 141 | my $num = @links; 142 | my $s = &s($num); 143 | print " $num wall-to-wall$s found.\n"; 144 | 145 | if ($num) { 146 | downloadWallToWall($num, @links); 147 | } 148 | } 149 | 150 | sub downloadWallToWall ($$) { 151 | mkdir "$dir/fetlife/wall_to_wall"; 152 | 153 | my $num = shift; 154 | my @links = @_; 155 | 156 | print "Downloading $num wall-to-walls...\n"; 157 | 158 | my $i = 1; 159 | foreach my $page (@links) { 160 | print "$i/$num\r"; 161 | 162 | &getWallToWall($page); 163 | 164 | $i++; 165 | } 166 | } 167 | 168 | sub getWallToWall { 169 | my $page = shift; 170 | my $tree; 171 | 172 | $mech->get($page); 173 | if (!$mech->success()) { 174 | print "$0: Error GETing $page"; 175 | return; 176 | } 177 | 178 | my $name = $mech->title(); 179 | 180 | $tree = HTML::TreeBuilder->new(); 181 | $tree->ignore_unknown(0); 182 | $tree->parse($mech->content()); 183 | 184 | open(DATA, "> $dir/fetlife/wall_to_wall/$name.html") or die "Can't write wall.html"; 185 | print DATA "", "\n\n"; 186 | print DATA $tree->look_down( id => 'wall_posts' )->as_HTML(undef, "\t", {}), "\n\n"; 187 | 188 | close DATA; 189 | $tree->delete(); 190 | } 191 | 192 | # Traverses a user's activity feed, collecting links to download. 193 | # TODO: Refactor this so the `downloadStatuses()` and `downloadGroupPosts()` functions 194 | # aren't actually nested here. 195 | sub collectLinksInActivityFeed { 196 | print "Loading activity feed: ."; 197 | 198 | $mech->get("https://fetlife.com/users/$id/activity"); 199 | 200 | # Only links to one's own statuses are FQURIs, so use absolute (server-relative) URI. 201 | my @statuses = $mech->find_all_links( url_regex => qr{/users/\d+/statuses/\d+$} ); 202 | my @pictures = $mech->find_all_links( url_regex => qr{https?://fetlife.com/users/\d+/pictures/\d+$} ); 203 | my @writings = $mech->find_all_links( url_regex => qr{https?://fetlife.com/users/\d+/posts/\d+$} ); 204 | my @group_posts = $mech->find_all_links( url_regex => qr{https?://fetlife.com/groups/\d+/group_posts/\d+$} ); 205 | 206 | # Catch errors, but crudely. 207 | eval { 208 | while (my $next = $mech->find_link( url_regex => qr{/users/$id/activity/more\?page}, text_regex => qr/view more/ )) { 209 | print "."; 210 | $mech->get($next); 211 | 212 | #### FetLife returns straight-up jQuery, so clean this out before parsing. 213 | #### TODO: Can we refactor this? It feels kludgy. 214 | # Split into lines. 215 | my @x = split("\n", $mech->content); 216 | 217 | # If this is the end of the feed, we'll only get 2 lines back with which we can do nothing. 218 | # Otherwise, we'll get three lines. 219 | if (3 == scalar(grep $_, @x)) { 220 | # Ignore the first line. 221 | 222 | # Clean the second line. 223 | ## Extract the JavaScript and Unicode-encoded text from the jQuery commands. 
224 | ### Cut out the first 24 characters, which are always: `$("#mini_feed").append("` 225 | my $x1 = substr($x[1], 24); 226 | ### Cut out the last 3 characters, which are always: `");` 227 | $x1 = substr($x1, 0, -3); 228 | $x1 = Unicode::Escape::unescape($x1, 'UTF-8'); 229 | $x1 = String::Escape::unbackslash($x1); 230 | 231 | my $x2 = substr($x[2], 23); 232 | $x2 = substr($x2, 0, -3); 233 | $x2 = String::Escape::unbackslash($x2); 234 | 235 | # Concatenate the cleaned-up lines together. 236 | my $html = Encode::decode_utf8($x1 . $x2); 237 | $mech->update_html($html); 238 | } 239 | 240 | push @statuses, $mech->find_all_links( url_regex => qr{/users/\d+/statuses/\d+$} ); 241 | push @pictures, $mech->find_all_links( url_regex => qr{https?://fetlife.com/users/\d+/pictures/\d+$} ); 242 | push @writings, $mech->find_all_links( url_regex => qr{https?://fetlife.com/users/\d+/posts/\d+$} ); 243 | push @group_posts, $mech->find_all_links( url_regex => qr{https?://fetlife.com/groups/\d+/group_posts/\d+$} ); 244 | } 245 | }; 246 | # Did we hit an error while trying to download the activity feed? 247 | # TODO: This error handling should be a bit more robust, methinks. 248 | if ($@) { 249 | print "$0 encountered an error loading activity feed for $username (ID $id): $@"; 250 | } 251 | 252 | @statuses = &filterLinksList(@statuses); 253 | @pictures = &filterLinksList(@pictures); 254 | @writings = &filterLinksList(@writings); 255 | @group_posts = &filterLinksList(@group_posts); 256 | 257 | # Count how many statuses were found. 258 | my $snum = @statuses; 259 | my $s = &s($snum, 1); 260 | print " $snum status$s found.\n"; 261 | 262 | # Count how many pictures were found. 263 | my $pnum = @pictures; 264 | $s = &s($pnum); 265 | print " $pnum picture$s found.\n"; 266 | 267 | # Count how many writings were found. 268 | my $wnum = @writings; 269 | $s = &s($wnum); 270 | print " $wnum writing$s found.\n"; 271 | 272 | # Count how many group threads were found. 273 | my $gnum = @group_posts; 274 | $s = &s($gnum); 275 | print " $gnum group thread$s found.\n"; 276 | 277 | # If we found things to download, go download them. 278 | if ($snum) { 279 | downloadStatuses($snum, @statuses); 280 | } 281 | 282 | if ($pnum) { 283 | downloadPics($pnum, @pictures); 284 | } 285 | 286 | if ($wnum) { 287 | downloadWritings($wnum, @writings); 288 | } 289 | 290 | if ($gnum) { 291 | downloadGroupPosts($gnum, @group_posts); 292 | } 293 | } 294 | 295 | sub downloadStatuses ($$) { 296 | mkdir "$dir/fetlife/statuses"; 297 | 298 | my $num = shift; 299 | my @links = @_; 300 | 301 | print "Downloading $num statuses...\n"; 302 | 303 | my $i = 1; 304 | foreach my $page (@links) { 305 | print "$i/$num\r"; 306 | 307 | my $name = basename($page->url()); 308 | unless ( -f "$dir/fetlife/statuses/$name.html" ) { 309 | &getStatus($page); 310 | } 311 | 312 | $i++; 313 | } 314 | } 315 | 316 | sub getStatus { 317 | my $page = shift; 318 | my $tree; 319 | my $name = basename($page->url()); 320 | 321 | $mech->get($page); 322 | if (!$mech->success()) { 323 | print "$0: Error GETing $page"; 324 | return; 325 | } 326 | 327 | $tree = HTML::TreeBuilder->new(); 328 | $tree->ignore_unknown(0); 329 | $tree->parse($mech->content()); 330 | 331 | # Strip out problematic HTML. 
332 | my @comments = $tree->look_down( class => qr/status_comment/ ); 333 | foreach my $comment (@comments) { 334 | $comment->attr( 'style', undef ); 335 | } 336 | eval { 337 | $tree->look_down( class => qr/new_comment/ )->delete(); 338 | }; 339 | # If we've hit an error, we don't write a file, so we may catch it on next round. 340 | if ($@) { 341 | print "$0: Oh no, Molly! Error on " . $page->url() . " $@\n"; 342 | } else { 343 | open(DATA, "> $dir/fetlife/statuses/$name.html") or die "Can't write $name.html"; 344 | if (open(FILE, "< templates/header.html")) { 345 | while () { 346 | print DATA $_; 347 | } 348 | close FILE; 349 | } 350 | if (open(FILE, "< templates/statuses-top.html")) { 351 | while () { 352 | print DATA $_; 353 | } 354 | close FILE; 355 | } 356 | print DATA "", "\n\n"; 357 | print DATA $tree->look_down( id => "status_$name" )->as_HTML(undef, "\t", {}), "\n\n"; 358 | if (open(FILE, "< templates/statuses-bottom.html")) { 359 | while () { 360 | print DATA $_; 361 | } 362 | close FILE; 363 | } 364 | if (open(FILE, "< templates/footer.html")) { 365 | while () { 366 | print DATA $_; 367 | } 368 | close FILE; 369 | } 370 | close DATA; 371 | } 372 | 373 | $tree->delete(); 374 | } 375 | 376 | sub downloadGroupPosts ($$) { 377 | mkdir "$dir/fetlife/group_posts"; 378 | 379 | my $num = shift; 380 | my @links = @_; 381 | 382 | print "Downloading $num group posts...\n"; 383 | 384 | my $i = 1; 385 | foreach my $page (@links) { 386 | print "$i/$num\r"; 387 | 388 | my $name = basename($page->url()); 389 | unless ( -f "$dir/fetlife/group_posts/$name.html" ) { 390 | &getGroupThread($page); 391 | } 392 | 393 | $i++; 394 | } 395 | } 396 | 397 | sub getGroupThread { 398 | my $page = shift; 399 | my $tree; 400 | my $name = basename($page->url()); 401 | 402 | # Grab the first page of the group thread. 403 | $mech->get($page); 404 | if (!$mech->success()) { 405 | print "$0: Error GETing $page"; 406 | return; 407 | } 408 | 409 | # Download the first page. 410 | $tree = HTML::TreeBuilder->new(); 411 | $tree->ignore_unknown(0); 412 | $tree->parse($mech->content()); 413 | 414 | # TODO: Edit HTML so `#comments` ID isn't repeated and pagination links are intra-page. 415 | 416 | open(DATA, "> $dir/fetlife/group_posts/$name.html") or die "Can't write $name.html"; 417 | if (open(FILE, "< templates/header.html")) { 418 | while () { 419 | print DATA $_; 420 | } 421 | close FILE; 422 | } 423 | if (open(FILE, "< templates/group_posts-top.html")) { 424 | while () { 425 | print DATA $_; 426 | } 427 | close FILE; 428 | } 429 | print DATA "", "\n\n"; 430 | print DATA $tree->look_down( class => qr{group_post} )->as_HTML(undef, "\t", {}), "\n\n"; 431 | my $comments = $tree->look_down( id => 'comments' ); 432 | if ($comments) { 433 | print DATA '
    '; # FetLife's HTML. 434 | print DATA $tree->look_down( id => 'comments' )->as_HTML(undef, "\t", {}), "\n\n"; 435 | } 436 | $tree->delete(); 437 | 438 | # Also download comments on next pages. 439 | while (my $next = $mech->find_link( url_regex => qr{groups/\d+/group_posts/\d+\?page}, text_regex => qr/^Next/ )) { 440 | $mech->get($next); 441 | 442 | $tree = HTML::TreeBuilder->new(); 443 | $tree->ignore_unknown(0); 444 | $tree->parse($mech->content()); 445 | 446 | print DATA $tree->look_down( id => 'comments' )->as_HTML(undef, "\t", {}), "\n\n"; 447 | print DATA '
    '; # FetLife's HTML. 448 | 449 | $tree->delete(); 450 | } 451 | if (open(FILE, "< templates/group_posts-bottom.html")) { 452 | while () { 453 | print DATA $_; 454 | } 455 | close FILE; 456 | } 457 | if (open(FILE, "< templates/footer.html")) { 458 | while () { 459 | print DATA $_; 460 | } 461 | close FILE; 462 | } 463 | 464 | close DATA; 465 | } 466 | 467 | sub downloadWritings ($$) { 468 | mkdir "$dir/fetlife/posts"; 469 | 470 | my $num = shift; 471 | my @links = @_; 472 | 473 | print "Downloading $num posts...\n"; 474 | 475 | my $i = 1; 476 | foreach my $page (@links) { 477 | print "$i/$num\r"; 478 | 479 | my $name = basename($page->url()); 480 | unless ( -f "$dir/fetlife/posts/$name.html" ) { 481 | &getPost($page); 482 | } 483 | 484 | $i++; 485 | } 486 | } 487 | 488 | sub getPost { 489 | my $page = shift; 490 | my $tree; 491 | $mech->get($page); 492 | if (!$mech->success()) { 493 | print "$0: Error GETing $page"; 494 | return; 495 | } 496 | $tree = HTML::TreeBuilder->new(); 497 | $tree->ignore_unknown(0); 498 | $tree->parse($mech->content()); 499 | my $name = basename($page->url()); 500 | if (!$tree->look_down( id => 'post_content' )) { 501 | print "$0: Oh no, Molly! Error on " . $page->url() . "\n"; 502 | return; 503 | } 504 | open(DATA, "> $dir/fetlife/posts/$name.html") or die "Can't write $name.html: $!"; 505 | if (open(FILE, "< templates/header.html")) { 506 | while () { 507 | print DATA $_; 508 | } 509 | close FILE; 510 | } 511 | if (open(FILE, "< templates/posts-top.html")) { 512 | while () { 513 | print DATA $_; 514 | } 515 | close FILE; 516 | } 517 | print DATA "", "\n\n"; 518 | print DATA $tree->look_down( id => 'post_content' )->as_HTML(undef, "\t", {}), "\n\n"; 519 | print DATA $tree->look_down( id => 'comments' )->as_HTML(undef, "\t", {}), "\n\n"; 520 | if (open(FILE, "< templates/posts-bottom.html")) { 521 | while () { 522 | print DATA $_; 523 | } 524 | close FILE; 525 | } 526 | if (open(FILE, "< templates/footer.html")) { 527 | while () { 528 | print DATA $_; 529 | } 530 | close FILE; 531 | } 532 | close DATA; 533 | $tree->delete(); 534 | } 535 | 536 | sub downloadPics ($$) { 537 | mkdir "$dir/fetlife/pictures"; 538 | 539 | my $num = shift; 540 | my @links = @_; 541 | 542 | print "Downloading $num pictures...\n"; 543 | 544 | my $i = 1; 545 | foreach my $page (@links) { 546 | print "$i/$num\r"; 547 | 548 | &getImage($page); 549 | 550 | $i++; 551 | } 552 | } 553 | 554 | sub getImage { 555 | my $page = shift; 556 | my $tree; 557 | $mech->get($page); 558 | if (!$mech->success()) { 559 | print "$0: Error GETing $page"; 560 | return; 561 | } 562 | $tree = HTML::TreeBuilder->new(); 563 | $tree->ignore_unknown(0); 564 | $tree->parse($mech->content()); 565 | 566 | my $x = $tree->find_by_tag_name('style'); 567 | if (!$x) { 568 | print "$0: Oh no, Molly! Error finding CSS on " . $page->url() . "\n"; 569 | return; 570 | } 571 | my @pic_css = $x->content_list(); 572 | my @pic_src = ($pic_css[0] =~ /(https:\/\/flpics.*_720.jpg)/); 573 | if (!@pic_src) { 574 | print "$0: Oh no, Molly! Error on " . $page->url() . "\n"; 575 | return; 576 | } 577 | my $name = basename(@pic_src); 578 | 579 | # Don't download images we've already grabbed. 580 | # TODO: Extend this so we don't download pages/threads we've already grabbed, either. 
581 | unless ( -f "$dir/fetlife/pictures/$name" ) { 582 | getstore($pic_src[0], "$dir/fetlife/pictures/$name"); 583 | } 584 | 585 | my $picture = $tree->look_down( class => "main_pic" ); 586 | my $pic_img = HTML::Element->new( 'img', 'src' => $name, 'alt' => "" ); 587 | $picture->insert_element($pic_img); 588 | 589 | open(DATA, "> $dir/fetlife/pictures/$name.html") or die "Can't write $name.html: $!"; 590 | print DATA "", "\n\n"; 591 | print DATA $picture->as_HTML(undef, "\t", {}), "\n\n"; 592 | print DATA $tree->look_down( id => "comments" )->as_HTML(undef, "\t", {}), "\n\n"; 593 | close DATA; 594 | 595 | $tree->delete(); 596 | } 597 | 598 | sub filterLinksList { 599 | my %uniq = map { $_->url_abs(), $_ } @_; 600 | return values %uniq; 601 | } 602 | 603 | sub getId { 604 | my $link = $mech->find_link( text_regex => qr/View Your Profile/i ); 605 | die "Failed to find profile link!" unless $link; 606 | if ($link->url() =~ m{/(\d+)$}) { 607 | return $1; 608 | } else { 609 | die "Failed to get user ID out of profile link: " . $link->url(); 610 | } 611 | } 612 | 613 | sub login { 614 | my ($username, $password) = @_; 615 | 616 | $mech->get( "https://fetlife.com/login" ); 617 | $mech->form_with_fields( qw/nickname_or_email password/ ); 618 | $mech->field( 'nickname_or_email' => $username ); 619 | $mech->field( 'password' => $password ); 620 | my $res = $mech->submit(); 621 | die "Login failed!" unless $res->is_success; 622 | } 623 | 624 | sub usage { 625 | print "$0 [--proxy=URL] []\n"; 626 | exit 1; 627 | } 628 | 629 | sub s { 630 | my $num = shift; 631 | my $alt = shift; 632 | unless ($alt) { return $num == 1 ? "" : "s"; } 633 | else { return $num == 1 ? "" : "es"; } 634 | } 635 | -------------------------------------------------------------------------------- /templates/header.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | TODO: Write a title? - FetLife 15 | 16 | 2279 | 2297 | 2298 | 2299 | 2300 | 2301 | 2302 | 2303 |
    2304 | 2305 |
    2306 |
    2307 | 2344 |
    2345 |
    2346 | 2347 |
    --------------------------------------------------------------------------------