├── README.md ├── collect_conversations.py └── get_friends.py /README.md: -------------------------------------------------------------------------------- 1 | # facebook 2 | Be the master of your own private Facebook data. 3 | 4 | ### get_friends.py 5 | 6 | Get all of your friends' Facebook user IDs from your messaging history. 7 | 8 | ``` 9 | Usage: python get_friends.py username password output_file 10 | ``` 11 | 12 | The output has the following format: id \t name \t username 13 | 14 | This script uses a browser automation library and the PhantomJS browser. 15 | + brew install phantomjs 16 | + pip install splinter 17 | 18 | ### collect_conversations.py 19 | Downloads your entire Facebook messaging history. 20 | 21 | ``` 22 | Usage: python collect_conversations.py username password your_facebook_id friend_file output_dir 23 | ``` 24 | You can find your Facebook ID by digging in the HTML. This script will store each conversation in a .txt file named after your friend's username, in the provided output directory. This script also takes as input a tab-delimited file containing your friends' user ids, names, and usernames. You can manually compile this or generate it using the get_friends.py script. 25 | -------------------------------------------------------------------------------- /collect_conversations.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Collect all of your Facebook conversations, ever. 3 | 4 | Usage: python collect_conversations.py username password your_facebook_id friend_file output_dir 5 | 6 | You can find your Facebook ID by digging in the HTML. 7 | This script will store each conversation in a .txt file 8 | named after your friend's username, in the provided 9 | output directory. This script also takes as input a tab- 10 | delimited file containing your friends' user ids, names, 11 | and usernames. You can manually compile this or generate it 12 | using the get_friends.py script. 
13 | ''' 14 | 15 | import re 16 | import os 17 | import sys 18 | import json 19 | import urllib 20 | import requests 21 | import operator 22 | from datetime import datetime 23 | from collections import Counter 24 | from bs4 import BeautifulSoup as bs 25 | 26 | try: 27 | username = sys.argv[1] 28 | password = sys.argv[2] 29 | my_id = sys.argv[3] 30 | friend_file_path = sys.argv[4] 31 | output_dir = sys.argv[5] 32 | except IndexError: 33 | print "Usage: python collect_conversations.py " 34 | sys.exit(1) 35 | 36 | # Specifies the maximum number of messages that are stored in a single request's 37 | # response JSON, 5000 is a safe value (breaks at higher values) 38 | json_limit = 5000 39 | 40 | # Facebook urls 41 | base_url = "https://www.facebook.com/" 42 | login_url = "https://www.facebook.com/login.php?login_attempt=1" 43 | 44 | # Regex for extracting the fb_dtsg token from your facebook home page (after login) 45 | dtsg_str = '' 46 | 47 | # POST headers sent with both the login and the message history requests 48 | headers = {'Host': 'www.facebook.com', 49 | 'Origin':'http://www.facebook.com', 50 | 'Referer':'http://www.facebook.com/', 51 | 'User-Agent': '"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.122 Safari/537.36' 52 | } 53 | 54 | # The basic skeleton of the ajax request facebook uses to get more messages between you and a friend 55 | def friend_url(my_id, friend_id, json_limit, offset, fb_dtsg): 56 | return 'https://www.facebook.com/ajax/mercury/thread_info.php?&messages[user_ids][%d][limit]=%d&messages[user_ids][%d][offset]=%d&client=web_messenger&__user=%d&__a=1&fb_dtsg=%s' % (friend_id, json_limit, friend_id, offset, my_id, fb_dtsg) 57 | 58 | # The basic skeleton of the ajax request facebook uses to get more messages from a group thread 59 | def thread_url(my_id, friend_id, json_limit, offset, fb_dtsg): 60 | return 
'https://www.facebook.com/ajax/mercury/thread_info.php?&messages[thread_fbids][%d][limit]=%d&messages[thread_fbids][%d][offset]=%d&client=web_messenger&__user=%d&__a=1&fb_dtsg=%s' % (friend_id, json_limit, friend_id, offset, my_id, fb_dtsg) 61 | 62 | 63 | def login(session, username, password): 64 | # Open the Facebook login page in the session 65 | home_request = session.get(base_url).text 66 | # Load the login page into a BeautifulSoup soup object 67 | login_soup = bs(home_request, "html5lib") 68 | # Extract the LSD token from Facebook login page, required for login post request 69 | lsd = str(login_soup.find_all('input', attrs={"name": "lsd"})[0]['value']) 70 | # Login data for the Login POST request 71 | login_data = { 72 | 'locale': 'en_US', 73 | 'non_com_login': '', 74 | 'email': username, 75 | 'pass': password, 76 | 'lsd': lsd 77 | } 78 | # Log in and store the response page (your Facebook home feed) 79 | content = session.post(login_url, data=login_data, verify=False) 80 | return content 81 | 82 | def get_message_history(session, fb_dtsg, my_id, friend_id, username, thread=False): 83 | 84 | offset = 0 85 | num_messages = 0 86 | 87 | # Will store history for this friend in its own file 88 | output = open(os.join(output_dir, username + '.txt'), 'w') 89 | 90 | while True: 91 | 92 | # Create the ajax url 93 | if thread: 94 | url = thread_url(my_id, friend_id, json_limit, offset, fb_dtsg) 95 | else: 96 | url = friend_url(my_id, friend_id, json_limit, offset, fb_dtsg) 97 | 98 | # Run the GET request and store the response data (JSON of message history) 99 | data = session.get(url, headers=headers, verify=False).text 100 | # Remove some leading characters 101 | data = data[9:] 102 | # Convert JSON response to Python dict 103 | message_dict = json.loads(data) 104 | 105 | try: 106 | message_list = message_dict['payload']['actions'] 107 | except KeyError: 108 | print 'Error:::No payload or actions in message dict, make sure you have correct log in information/user 
id' 109 | break 110 | 111 | num_messages += len(message_list) 112 | 113 | for message in message_list: 114 | 115 | # Message text 116 | try: 117 | message_content = message['body'].replace('\t', ' ').replace('\n', ' ') 118 | except KeyError: 119 | # This occurs when people name threads 120 | message_content = message['log_message_body'].replace('\t', ' ').replace('\n', ' ') 121 | 122 | # When the message was sent 123 | sent = str(message['timestamp']) 124 | 125 | # Type of device from which message was sent 126 | source = ' '.join(message['source_tags']) 127 | if 'mobile' in source: 128 | source = 'mobile' 129 | else: 130 | source = 'chat' 131 | 132 | # Who sent the message 133 | author = int(message['author'].split(':')[1]) 134 | if author == my_id: 135 | sender = 'me' 136 | else: 137 | sender = str(friend_id) 138 | 139 | #attachments = message['attachments'] 140 | 141 | # Location data, if possible 142 | if message['coordinates'] is not None: 143 | lat = message['coordinates']['latitude'] 144 | lng = message['coordinates']['longitude'] 145 | coords = str(lat) + ',' + str(lng) 146 | else: 147 | coords = 'None,None' 148 | 149 | out = [sender, message_content.encode('utf8'), source, sent, coords] 150 | 151 | # Additional group thread stuff 152 | if thread: 153 | try: 154 | chat_name = message['log_message_data']['name'] 155 | except KeyError: 156 | chat_name = '' 157 | out.append(chat_name) 158 | 159 | output.write('\t'.join(out) + '\n') 160 | 161 | 162 | if len(message_list) < json_limit: 163 | print '\t', num_messages, 'with', username 164 | break 165 | else: 166 | offset += json_limit 167 | 168 | output.close() 169 | return num_messages 170 | 171 | # To log in to facebook, start a session 172 | session = requests.Session() 173 | content = login(session, username, password) 174 | try: 175 | # If the dtsg token is in the html of the page, you've successfully logged in 176 | fb_dtsg = re.search(dtsg_str, content.content).group(1).split('\"')[0] 177 | print "Logged 
in!" 178 | except: 179 | # Otherwise, something went wrong and you're most likely back at the log in 180 | # page - make sure you have the correct login credentials and user id 181 | print "Unable to log in... :(" 182 | sys.exit(1) 183 | 184 | # Collect messages for each id in friend file 185 | friend_file = open(friend_file_path).read().splitlines() 186 | for line in friend_file: 187 | uid, name, username = line.split('\t') 188 | try: 189 | uid = int(uid) 190 | except ValueError: 191 | print 'Found group thread:' 192 | if 'conversation' in line: 193 | convo_id = line.split('conversation-')[1] 194 | if '.' in convo_id: 195 | convo_id = convo_id.split('.')[1] 196 | print 'Getting message history for group thread:', convo_id 197 | convo_name = 'conversation-' + convo_id 198 | num_messages = get_message_history(session, fb_dtsg, my_id, int(convo_id), convo_name, thread=True) 199 | print num_messages, 'with', convo_name 200 | else: 201 | print 'Word conversation not in name...' 202 | continue 203 | 204 | 205 | print 'Getting message history for:', line 206 | num_messages = get_message_history(session, fb_dtsg, my_id, uid, username) 207 | print num_messages, 'with', name 208 | 209 | # Now have fun analyzing your results! 210 | -------------------------------------------------------------------------------- /get_friends.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Get all of your friend's facebook user ids from your messaging history. 3 | Usage: python get_friends.py 4 | 5 | The output has the following format: id \t name \t username 6 | 7 | Uses a browser automation library and the Phantom JS browser. 
8 | brew install phantomjs 9 | pip install splinter 10 | ''' 11 | 12 | import sys 13 | import json 14 | import time 15 | import selenium 16 | import requests 17 | from splinter import Browser 18 | from urllib2 import URLError 19 | 20 | _base_url = "https://www.facebook.com" 21 | _login_url = "https://login.facebook.com/login.php?login_attempt=1" 22 | _base_msg_url = _base_url + "/messages/" 23 | 24 | try: 25 | username = sys.argv[1] 26 | password = sys.argv[2] 27 | out = sys.argv[3] 28 | except IndexError: 29 | print "Usage: python get_friends.py " 30 | sys.exit() 31 | 32 | # Create the phantom js browser instance 33 | browser = Browser("phantomjs") 34 | 35 | # Log in by finding the log in form and filling it 36 | # with your username and password. Then automatically 37 | # "click" on the enter button, and navigate to the 38 | # messages page. 39 | def login(): 40 | browser.visit(_base_url) 41 | inputs = browser.find_by_id("login_form")[0].find_by_tag("input") 42 | inputs[1].fill(username) 43 | inputs[2].fill(password) 44 | 45 | enter = browser.find_by_id("u_0_n").first 46 | enter.click() 47 | print "Logged in!" 48 | 49 | browser.visit(_base_msg_url) 50 | 51 | # This is the automatic equivalent of scrolling up. 52 | def load_more(): 53 | # Messages are contained within these divs 54 | pagers = browser.find_by_css("#contentArea")[0].find_by_css("div.uiMorePager") 55 | for div in pagers: 56 | # The 'load more' area is really an a href tag, which can be 'clicked' on 57 | link = div.find_by_css("div")[0].find_by_css("a")[0] 58 | if link.value == 'Load Older Threads': 59 | print 'Loading more...' 60 | link.click() 61 | return True 62 | 63 | # Once all threads have been loaded, call this function to 64 | # get all the user ids and save them to a file. 
65 | def get_threads(): 66 | f = open(out, 'w') 67 | # Each thread is stored in a list element with class _k- 68 | users = browser.find_by_css("li._k-") 69 | for user in users: 70 | fid = user['id'].split(':')[2] # Your friend's id 71 | name = user.find_by_css("span.accessible_elem").text.lower() # Your friend's name 72 | username = user.find_by_css("a._k_")['href'].split("/")[-1] # Your friend's username (what appears in their url) 73 | 74 | f.write(str(fid) + '\t' + name.encode("utf8") + '\t' + username.encode("utf8") + '\n') 75 | 76 | f.close() 77 | 78 | # Log in to Facebook 79 | login() 80 | 81 | # Keep loading new threads 82 | while True: 83 | try: 84 | loaded_more = load_more() 85 | if not loaded_more: 86 | break 87 | time.sleep(1) 88 | except (URLError, selenium.common.exceptions.StaleElementReferenceException): 89 | print "reconnecting......" 90 | # Try to reconnect 91 | time.sleep(5) 92 | try: 93 | login() 94 | except: 95 | continue 96 | 97 | # Now that all threads have been loaded in the HTML, 98 | # get all your friends' information 99 | get_threads() 100 | --------------------------------------------------------------------------------