├── .gitignore ├── samples ├── sample_date_graph.png ├── sample_pie_chart.png ├── README.md └── facebook_and_sms.py ├── facebook.py ├── LICENSE.md ├── README.md ├── fb_parser.py ├── fb_chat.py └── fb_analysis.py /.gitignore: -------------------------------------------------------------------------------- 1 | *.htm 2 | *.pickle 3 | *.json 4 | *.csv 5 | *.zip 6 | *.pyc 7 | __pycache__/ 8 | -------------------------------------------------------------------------------- /samples/sample_date_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsharkey13/facebook_message_parser/HEAD/samples/sample_date_graph.png -------------------------------------------------------------------------------- /samples/sample_pie_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jsharkey13/facebook_message_parser/HEAD/samples/sample_pie_chart.png -------------------------------------------------------------------------------- /samples/README.md: -------------------------------------------------------------------------------- 1 | __`facebook_and_sms.py`__ 2 | 3 | The file contains code to merge a parsed Facebook Message history with an iOS6 iPhone Message history 4 | (using the code from [iphone_message_parser](https://github.com/jsharkey13/iphone_message_parser)). The 5 | resulting object can be used with the `fb_analysis.py` code, or browsed as if it was an `fb_chat.Chat` object. 6 | 7 | ----- 8 | 9 | __`sample_date_graph.png`__ 10 | 11 | The image is the graph produced from a parsed Facebook Message history, using the code from `fb_parser.py` 12 | to produce the `Facebook.Chat` object. 
# Nasty hack to force utf-8 encoding by default. This is Python 2 only:
# reload() and sys.setdefaultencoding() are not available in Python 3.
reload(sys)
sys.setdefaultencoding('utf8')

# Change stdout to allow printing of unicode characters:
streamWriter = codecs.lookup('utf-8')[-1]
sys.stdout = streamWriter(sys.stdout)


if __name__ == "__main__":
    # Allow the parser to be run from the command line.
    #
    # Optionally, the filename to read in from can be given as the first
    # command line argument; otherwise the default export name is tried.
    if len(sys.argv) >= 2:
        # Take the filename unconditionally so that it is still defined when
        # the user chooses to override the format check below (previously
        # that path left 'fname' unassigned and crashed with a NameError):
        fname = sys.argv[1]
        if not any(ext in fname for ext in (".zip", ".htm", ".pickle")):
            # If not a recognised format, stop but allow override:
            print("File is not a .zip file, a .htm file or a pickle file.")
            cont = raw_input("Continue anyway? (y/n)")
            if cont == "n":
                sys.exit(-1)
    else:
        # If no argument, attempt to open the default .zip export file:
        fname = "facebook-" + fb_parser.FBMessageParse._MYUSERNAME + ".zip"
        if not os.path.isfile(fname):
            print("File " + fname + " does not exist or could not be found! Abort.")
            sys.exit(-1)

    # Now use the Facebook.Chat object to do stuff!
    # Some example code to add functionality immediately.

    # Create the parser, and parse the messages file. A pickle already holds
    # a parsed Chat object, so only zip/htm input needs parse_messages():
    if ".pickle" in fname:
        Facebook = fb_parser.FBMessageParse(fname, load_pickle=True)
    else:
        Facebook = fb_parser.FBMessageParse(fname)
        Facebook.parse_messages()
    # Now find and print the Top 10 Friends:
    print("Top 10 Most Messaged Friends: Total Thread Length")
    top10 = fb_analysis.top_n_people(Facebook.Chat, N=10)
    print(top10)
    # Output to a csv file:
    Facebook.write_to_csv()
    # Show a graph of the most messaged friend's messages. Skip the graph
    # when there are no threads at all, rather than failing on top10[0]:
    if len(top10) > 0:
        fb_analysis.messages_date_graph(Facebook.Chat, top10[0][0])
2 | __The files `fb_parser.py` and `fb_chat.py` contain code from another author, covered by the original license at the end of this file.__ 3 | 4 | The MIT License (MIT) 5 | 6 | Copyright (c) 2015 James Sharkey 7 | 8 | Permission is hereby granted, free of charge, to any person obtaining a copy 9 | of this software and associated documentation files (the "Software"), to deal 10 | in the Software without restriction, including without limitation the rights 11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | copies of the Software, and to permit persons to whom the Software is 13 | furnished to do so, subject to the following conditions: 14 | 15 | The above copyright notice and this permission notice shall be included in all 16 | copies or substantial portions of the Software. 17 | 18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | SOFTWARE. 
25 | 26 | __The code in `fb_chat.py` and `fb_parser.py` is covered by the original license also:__ 27 | 28 | The MIT License (MIT) 29 | 30 | Copyright (c) 2015 Chris Copley 31 | 32 | Permission is hereby granted, free of charge, to any person obtaining a copy 33 | of this software and associated documentation files (the "Software"), to deal 34 | in the Software without restriction, including without limitation the rights 35 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 36 | copies of the Software, and to permit persons to whom the Software is 37 | furnished to do so, subject to the following conditions: 38 | 39 | The above copyright notice and this permission notice shall be included in all 40 | copies or substantial portions of the Software. 41 | 42 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 43 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 44 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 45 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 46 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 47 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 48 | SOFTWARE. 49 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Facebook Message Export Parser 2 | 3 | --- 4 | ## Update September 2018 5 | 6 | __Deprecation Notice__ 7 | 8 | Facebook now support exports in JSON format, which somewhat reduces the need for this code. The format of these exports has changed a great deal, and is still changing. This code will not work with exports newer than April 2018. 
9 | 10 | Some (unmaintained) code that may work with newer JSON exports can be found [in this Gist](https://gist.github.com/jsharkey13/d60b7b421e08c98d426d03c39f8b4a12), but be aware that code is Python 3 and the format of Facebook's export may have changed since it was written. 11 | 12 | --- 13 | 14 | Facebook has a feature that allows users to download a copy of their data as a zip archive containing htm files with their data. The aim of this parser is to take this archive and to extract a user's Facebook Messages from it; to transfer them into a more useful format, as well as performing some analysis to produce interesting data. 15 | 16 | This code is adapted from [CopOnTheRun/FB-Message-Parser](https://github.com/CopOnTheRun/FB-Message-Parser). 17 | 18 | #### Running the Code 19 | The Facebook Export can be downloaded from the [Facebook Settings](https://www.facebook.com/settings) menu. 20 | 21 | __*Before any code can be run:*__ [Lines 27 and 28](https://github.com/jsharkey13/facebook_message_parser/blob/master/fb_parser.py#L27-L28) in `fb_parser.py` will need to be updated to the name and username of the account being parsed. If this is done, the code will attempt to open the zip file `facebook-[myusername].zip` by default if no argument is given to `facebook.py`. 22 | 23 | Run "`python facebook.py [optional_filename]`" with the `facebook-[myusername].zip` or `messages.htm` files in the same directory to export to CSV, display top 10 most messaged friends and output a graph showing messages with the most messaged friend. This sample code can easily be adapted. 24 | 25 | The `fb_chat.Chat` object returned by the parser (the object called `Facebook.Chat` in `facebook.py`) could be pickled and loaded in another program to form a base API to interact with the messages there. (Note that this, like the export, contains private messages in plain text format, and that the `fb_chat` code may need to be imported too). 
class Merge_Chat_Logs(object):
    """Present a Facebook Chat and an iOS Chat as one combined chat log.

    - Both original Chat objects are kept, unmodified, as the attributes
      'Chat' (Facebook) and 'Texts' (iOS); every query method simply asks
      both of them and merges the answers.
    - Exposes the attributes and methods defined below (threads, _myname,
      _all_people, _total_messages, item access, len() and the query
      methods) so that the object can stand in where a Chat is expected.
    - Initialise with the Facebook Chat object first and the iOS Chat
      object second.
    - The combined attributes are computed once, at construction time."""

    def __init__(self, facebook_Chat, ios_Chat):
        facebook_log, sms_log = facebook_Chat, ios_Chat
        self.Chat = facebook_log
        self.Texts = sms_log
        # The owner's name is taken from the Facebook log.
        self._myname = facebook_log._myname
        self.threads = facebook_log.threads + sms_log.threads
        self._all_people = facebook_log._all_people.union(sms_log._all_people)
        self._total_messages = facebook_log._total_messages + sms_log._total_messages

    def __repr__(self):
        """Describe the merged log: owner, thread count and message count."""
        # The message total is re-read from both logs on every call.
        message_count = self.Chat._total_messages + self.Texts._total_messages
        template = "<{}'s GROUPED CHAT LOGs: TOTAL_THREADS={} TOTAL_MESSAGES={}>"
        return template.format(self._myname, len(self), message_count)

    def __len__(self):
        """Return the number of threads in the two Chat objects added together.

        A thread name present in both logs is therefore counted twice."""
        facebook_threads = len(self.Chat)
        sms_threads = len(self.Texts)
        return facebook_threads + sms_threads

    def __getitem__(self, key):
        """Look up a thread by name: Merge_Chat_Logs["Thread Name"].

        - A name known to only one log returns that log's Thread.
        - A name known to both logs returns a new ios_chat.Thread built from
          the Facebook messages followed by the iOS messages.
        - An unknown name returns None instead of raising KeyError."""
        in_facebook = key in self.Chat._thread_dict
        in_sms = key in self.Texts._thread_dict
        if in_facebook and in_sms:
            combined = self.Chat[key].messages + self.Texts[key].messages
            return ios_chat.Thread(key, combined)
        if in_facebook:
            return self.Chat[key]
        if in_sms:
            return self.Texts[key]
        return None

    def all_messages(self):
        """Return every message from both logs as one sorted list."""
        merged = self.Chat.all_messages() + self.Texts.all_messages()
        return sorted(merged)

    def all_from(self, name):
        """Return the sorted messages sent by 'name' in either log.

        'name' is handed unchanged to all_from() on both Chat objects."""
        merged = self.Chat.all_from(name) + self.Texts.all_from(name)
        return sorted(merged)

    def sent_before(self, date):
        """Return the sorted messages both logs report as sent before 'date'.

        'date' is handed unchanged to sent_before() on both Chat objects."""
        merged = self.Chat.sent_before(date) + self.Texts.sent_before(date)
        return sorted(merged)

    def sent_after(self, date):
        """Return the sorted messages both logs report as sent after 'date'.

        'date' is handed unchanged to sent_after() on both Chat objects."""
        merged = self.Chat.sent_after(date) + self.Texts.sent_after(date)
        return sorted(merged)

    def sent_between(self, start, end=None):
        """Return the sorted messages both logs report between 'start' and 'end'.

        Both arguments, including the default end=None, are handed unchanged
        to sent_between() on both Chat objects."""
        merged = self.Chat.sent_between(start, end) + self.Texts.sent_between(start, end)
        return sorted(merged)

    def search(self, string, ignore_case=False):
        """Return the sorted messages from both logs that match 'string'.

        'string' and 'ignore_case' are handed unchanged to search() on both
        Chat objects."""
        merged = self.Chat.search(string, ignore_case) + self.Texts.search(string, ignore_case)
        return sorted(merged)


if __name__ == "__main__":
    # Build a Merge_Chat_Logs object, assuming both ios_parser and fb_parser
    # have already been run and have pickled their Chat objects under the
    # default file names.

    # Nasty hack to force utf-8 encoding by default:
    reload(sys)
    sys.setdefaultencoding('utf8')

    # Change stdout to allow printing of unicode characters:
    streamWriter = codecs.lookup('utf-8').streamwriter
    sys.stdout = streamWriter(sys.stdout)

    # To avoid work, assume can load from pickle:
    fb_fname = 'messages.pickle'
    ios_fname = 'sms.pickle'

    Facebook = fb_parser.FBMessageParse(fb_fname)
    SMS = ios_parser.iOSMessageParse(ios_fname)

    All = Merge_Chat_Logs(Facebook.Chat, SMS.Texts)
class FBMessageParse(object):
    """An object to encapsulate all the methods required to parse messages.htm.

    These include methods to initialise, save and load a fb_chat.Chat object,
    which contains a Pythonic representation of Facebook Message history.
    - Can read in messages from the .zip archive exported from Facebook, or
      the .htm file contained in the archive.
    - Can dump the Chat object to a pickle file and load it again in another
      session: use dump_to_pickle() and load_from_pickle().
    - Can export messages to csv format: use write_to_csv()
    - Using a 'uid_people' file, can turn unrecognised nnnnnnn@facebook.com
      identifiers into names. Lines should be '[uid]:[name]'. See the
      print_unknowns() function.
    - Allows customised renaming of contacts using a 'duplicates' file. In a
      similar way to the 'uid_people' file; add lines containing
      '[old name]:[new name]' to a file called 'duplicates' for the process
      to occur on the next read in from zip or htm.

    The code targets Python 2.7 (it relies on raw_input and on byte-string
    names); print calls use the single-argument parenthesised form, which
    behaves identically there."""

    # Name and username of the account being parsed; edit before use.
    _MYNAME = "My Name"
    _MYUSERNAME = "myusername"

    def __init__(self, fname, load_pickle=False):
        """Open 'fname' and load the helper name files.

        - A name containing '.zip' is opened as an archive and its
          'html/messages.htm' member is opened for parsing.
        - Otherwise, if 'load_pickle' is True or the name contains '.pickle',
          the Chat object is restored via load_from_pickle().
        - Anything else is opened as the htm file itself.
        The 'uid_people' and 'duplicates' files are read afterwards."""
        self._UIDPEOPLE = {}
        self._PEOPLEUID = {}
        self._PEOPLEDUPLICATES = {}
        self._UNKNOWNS = []
        #
        self.Chat = None
        #
        self._archive = None
        self._messages_htm = None
        # Open either the .zip and contained htm, the pickle file, or another file:
        if ".zip" in fname:
            self._archive = zipfile.ZipFile(fname, 'r')
            self._messages_htm = self._archive.open('html/messages.htm')
        elif load_pickle or ".pickle" in fname:
            self.load_from_pickle(fname)
        else:
            self._messages_htm = open(fname, "r")
        #
        self._read_uid_people()
        self._read_duplicate_list()

    def _close(self):
        """Close any open files before deletion. Do not call manually.

        The htm handle is closed before the archive it may belong to, and
        both attributes are reset to None so a repeated call is harmless.
        getattr() is used because __del__ can run on an instance whose
        __init__ never got as far as creating the attributes."""
        messages_htm = getattr(self, "_messages_htm", None)
        archive = getattr(self, "_archive", None)
        self._messages_htm = None
        self._archive = None
        try:
            if messages_htm is not None:
                messages_htm.close()
        finally:
            # Still release the archive if closing the member failed.
            if archive is not None:
                archive.close()

    def __del__(self):
        """Ensure _close() is called on deletion."""
        self._close()

    @staticmethod
    def _read_pairs(filename):
        """Return the (key, value) pairs found in a 'key:value' text file.

        Lines that do not split into exactly two parts on ':' are ignored,
        and an unreadable or missing file yields an empty list."""
        try:
            with open(filename) as f:
                lines = [line.rstrip('\n') for line in f]
        except IOError:
            return []
        pairs = []
        for line in lines:
            parts = line.split(":")
            if len(parts) == 2:
                pairs.append((parts[0], parts[1]))
        return pairs

    def _read_uid_people(self):
        """Read in the 'uid_people' file and add line entries to dictionaries.

        Called automatically; do not call manually. Fills the dictionaries
        used to translate between UID and Name, and vice versa.
        - Lines should be formatted '[uid]:[name]'.
        - Ill-formatted lines are ignored, and the file does not have to be
          present for the code to function: unrecognised UIDs are left
          unchanged."""
        for uid, person in self._read_pairs('uid_people'):
            self._UIDPEOPLE[uid] = person
            self._PEOPLEUID[person] = uid

    def _read_duplicate_list(self):
        """Read in the 'duplicates' file and add line entries to the dictionary.

        Called automatically; do not call manually. Fills the dictionary
        used to replace names. Useful for people who have changed their
        Facebook name to a nickname, or appear in the Chat logs with two
        versions of their name.
        - Lines should be formatted '[old name]:[new name]'.
        - Ill-formatted lines are ignored, and the file does not have to be
          present for the code to function: unrecognised names are left
          unchanged."""
        for old_name, new_name in self._read_pairs('duplicates'):
            self._PEOPLEDUPLICATES[old_name] = new_name

    def _thread_name_cleanup(self, namestr):
        """Parse the thread's name.

        The comma separated participant list is sorted, every name is tidied
        with _message_author_parse(), and the name of the Chat owner
        (_MYNAME) is removed unless it is the only participant."""
        namestr = namestr.encode('ascii', 'replace')  # BeautifulSoup works in Unicode, do we want ASCII names?
        namelist = [self._message_author_parse(name) for name in sorted(namestr.split(", "))]
        # You can send yourself messages, so don't delete name if it's the only one.
        if self._MYNAME in namelist and len(namelist) > 1:
            namelist.remove(self._MYNAME)
        return ", ".join(namelist)

    def _message_author_parse(self, name):
        """Tidy up the name of the sender of a message.

        If the name is a UID email address, use the UID dictionary to replace
        their name if possible. If the name is a duplicate (or to be renamed)
        then rename; a 'duplicates' entry takes precedence over a
        'uid_people' entry. A UID that neither file resolves is added to a
        list to facilitate populating a 'uid_people' file: see
        print_unknowns(). Returns "UNKNOWN_AUTHOR" when 'name' is None."""
        if name is None:
            return "UNKNOWN_AUTHOR"  # Facebook has been providing messages with no recorded author!
        name = name.encode('ascii', 'replace')  # BeautifulSoup works in Unicode, do we want ASCII names?
        n = name.replace("@facebook.com", "")
        resolved = False
        if n in self._UIDPEOPLE:
            name = self._UIDPEOPLE[n]
            resolved = True
        if n in self._PEOPLEDUPLICATES:
            name = self._PEOPLEDUPLICATES[n]
            resolved = True
        # Only an unresolved name that really carried the UID suffix counts
        # as unknown. Checking 'resolved' stops a rename such as
        # 'Bob' -> 'Bob Smith' from being reported as an unknown UID.
        if not resolved and n in name and n != name:
            self._UNKNOWNS.append(n)
        return name

    def _message_date_parse(self, datestr):
        """Turn the datestamp on the message into a datetime object.

        The string is parsed with dateutil and any timezone information is
        then dropped with replace(tzinfo=None): the clock time is kept as
        written and is NOT converted to another timezone. This may not be
        the behaviour desired, and can be changed; but most other functions
        assume naive datetimes."""
        # 'import dateutil' alone does not load the parser submodule, so it
        # is imported explicitly here rather than relying on another module
        # having imported it first.
        import dateutil.parser
        return dateutil.parser.parse(datestr).replace(tzinfo=None)

    def _date_unix(self, datetime_date):
        """Turn a naive datetime.datetime object into a UNIX time int.

        The value is the whole number of seconds between 1970-01-01 00:00
        and 'datetime_date'; no timezone adjustment is applied."""
        return int((datetime_date - datetime.datetime(1970, 1, 1)).total_seconds())

    def _message_body_parse(self, message_body):
        """Tidy up the message body itself.

        This turns newline characters into a unique custom string which can
        be replaced after export if necessary. Quote marks are also escaped,
        to allow the use of quotes and commas in messages whilst allowing
        export to csv. Those two lines can be removed if desired. A body of
        None is treated as an empty string."""
        if message_body is None:
            message_body = ""
        message_body = '<|NEWLINE|>'.join(message_body.splitlines())  # We can't have newline characters in a csv file
        message_body = message_body.replace('"', '""')  # Attempt to escape " characters in messages, for csv output
        return message_body

    def print_unknowns(self):
        """Print out any UIDs of people not recognised by the code.

        Prints lines containing unrecognised UIDs, along with instructions on
        how to find names for these people and how to add the names to the
        'uid_people' file. Prints a hint and returns if nothing has been
        parsed yet; prints nothing if every UID was recognised."""
        if self.Chat is None:
            print("The message export file has not been parsed. Run parse_messages().")
            return
        if not self._UNKNOWNS:
            return
        # Remove duplicates; sorting makes the output order repeatable.
        self._UNKNOWNS = sorted(set(self._UNKNOWNS))
        print("To identify these accounts, try visiting www.facebook.com/[uid] and adding '[uid]:[name]' to a file in the current directory named 'uid_people'")
        for uid in self._UNKNOWNS:
            print(uid)

    def _verify_export_title(self, soup):
        """Check the htm title is '<_MYNAME> - Messages'; offer to abort if not.

        On a mismatch (or a missing title) the user is asked whether to
        continue; answering 'n' exits the program with status -1."""
        check_header = self._MYNAME + " - Messages"
        try:
            actual_header = soup.html.head.title.string
        except AttributeError:
            actual_header = None
        if check_header != actual_header:
            print("The title of the htm document does not match that expected:")
            print('"' + check_header + '"')
            print("Is the file a message export? Is the user's name correct?")
            cont = raw_input("Continue anyway? (y/n)")
            if cont == "n":
                sys.exit(-1)

    def _parse_thread_messages(self, thread_tag, thread_name):
        """Build the list of fb_chat.Message objects for one thread element.

        Messages are numbered downwards from the number of messages in the
        element, in the order the elements appear in the document."""
        message_tags = thread_tag.find_all(class_='message')
        message_num = len(message_tags)
        messages = []
        for m in message_tags:
            message_author = self._message_author_parse(m.find(class_='user').string)
            message_date = self._message_date_parse(m.find(class_='meta').string)
            # The body is read from the element following the message
            # header; tolerate a header with nothing after it.
            body_tag = m.next_sibling
            body_text = body_tag.string if body_tag is not None else None
            message_body = self._message_body_parse(body_text)
            messages.append(fb_chat.Message(thread_name, message_author, message_date, message_body, message_num))
            message_num -= 1
        return messages

    def parse_messages(self, group_duplicates=True):
        """Take the loaded zip file or htm file and create a Chat object.

        Takes the messages.htm file and reads in the messages using
        BeautifulSoup to parse the html data. Creates the Chat object, which
        can be used independently and accessed as the FBMessageParse.Chat object.
        - Optional argument 'group_duplicates' groups together Threads
          containing the same participants. Message Threads over 10,000
          messages long are split by Facebook for export: this can help
          group them. True by default.
        - Contains code to verify that the file being examined is in fact a
          Facebook Messages export, though it allows manual override.
        Returns the Chat object, or None (after printing a hint) when no htm
        file is open, e.g. because the data came from a pickle file."""
        # Check we have a htm file open to import from:
        if self._messages_htm is None:
            print("No archive/message file open. Was data loaded from a pickle file?")
            return
        #
        soup = bs(self._messages_htm, "lxml")
        # Verify that we're parsing a Facebook Message export and _MYNAME is right:
        self._verify_export_title(soup)
        threads = []
        threads_by_name = {}     # First Thread created for each thread name.
        regrouped_names = set()  # Names whose Thread received extra messages.
        for thread_tag in soup.find_all(class_='thread'):
            thread_name = self._thread_name_cleanup(thread_tag.contents[0])
            messages = self._parse_thread_messages(thread_tag, thread_name)
            existing = threads_by_name.get(thread_name)
            if group_duplicates and existing is not None:
                # Same participants as an earlier thread: merge into it.
                existing._add_messages(messages)
                regrouped_names.add(thread_name)
            else:
                thread = fb_chat.Thread(thread_name.split(", "), messages)
                threads.append(thread)
                threads_by_name.setdefault(thread_name, thread)
        # Create the Chat object, set and return it:
        self.Chat = fb_chat.Chat(self._MYNAME, threads)
        for name in regrouped_names:
            self.Chat[name]._renumber_messages()  # If we've grouped them, the messages need renumbering.
        return self.Chat

    def write_to_csv(self, filename='messages.csv', chronological=False):
        """Export all messages to csv format.

        The filename can be specified as an optional argument. If
        'chronological' is True, messages are written in the order returned
        by Chat.all_messages(), otherwise they are written thread by thread
        in the order of Chat.threads. Each row is str(message). Prints a
        hint and writes nothing if no Chat object has been created yet."""
        if self.Chat is None:
            # Check before opening, so an existing file is not truncated.
            print("The message export file has not been parsed. Run parse_messages().")
            return
        if chronological:
            messages = self.Chat.all_messages()
        else:
            messages = (message for thread in self.Chat.threads for message in thread.messages)
        with open(filename, "w") as f:
            header_line = '"Thread","Message Number","Message Author","Message Timestamp","Message Body"\n'
            f.write(header_line.encode('utf8'))
            for message in messages:
                f.write(str(message).encode('utf8'))

    def dump_to_pickle(self, filename='messages.pickle'):
        """Serialise the Chat object to a pickle file.

        The pickle file can be used to restore the Chat object in another
        session without re-importing the zip or htm file. Load either using
        load_from_pickle(), or in another program using Pickle's standard
        load() command."""
        # NOTE(review): text mode is kept so that files stay readable by
        # load_from_pickle(), which also uses text mode; this only suits the
        # default ASCII pickle protocol of Python 2.
        with open(filename, "w") as f:
            pickle.dump(self.Chat, f)

    def load_from_pickle(self, filename='messages.pickle'):
        """Read in the pickle file, optionally from a specified filename.

        The function sets the internal Chat object and returns the Chat
        object. Provided mainly as an example, since the parser's main aim
        is to read in from zip or htm, and to output csv or the Chat object."""
        # NOTE(review): unpickling can execute arbitrary code, so only load
        # pickle files that this program produced itself.
        with open(filename, "r") as f:
            self.Chat = pickle.load(f)
        return self.Chat
Load either using 283 | load_from_pickle(), or in another program using Pickle's standard load() 284 | command.""" 285 | with open(filename, "w") as f: 286 | pickle.dump(self.Chat, f) 287 | 288 | def load_from_pickle(self, filename='messages.pickle'): 289 | """Read in the pickle file, optionally from a specified filename. 290 | 291 | The function sets the internal Chat object and returns the Chat object. 292 | Provided mainly as an example, since the parser's main aim to to read 293 | in from zip or htm, and to output csv or the Chat object.""" 294 | with open(filename, "r") as f: 295 | self.Chat = pickle.load(f) 296 | return self.Chat 297 | -------------------------------------------------------------------------------- /fb_chat.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | 4 | class Chat(object): 5 | """An object to encapsulate the entire Facebook Message history. 6 | 7 | - Contains a list of Thread objects, which can be accessed using item 8 | accessing Chat["Thread Name"] style. 9 | - When initialising, 'myname' should be the name of the user, and 'threads' 10 | should be a list of Thread objects. 11 | - Provides useful functions for accessing messages.""" 12 | 13 | def __init__(self, myname, threads): 14 | self.threads = sorted(threads, key=len, reverse=True) 15 | self._thread_dict = {", ".join(thread.people): thread for thread in self.threads} 16 | self._total_messages = len(self.all_messages()) 17 | self._myname = myname 18 | self._all_people = {myname} 19 | for thread in self.threads: 20 | self._all_people.update(thread.people) 21 | 22 | def __getitem__(self, key): 23 | """Allow accessing Thread objects in the list using Chat["Thread Name"]. 
class Chat(object):
    """An object to encapsulate the entire Facebook Message history.

    - Contains a list of Thread objects, which can be accessed using item
      accessing Chat["Thread Name"] style.
    - When initialising, 'myname' should be the name of the user, and 'threads'
      should be a list of Thread objects.
    - Provides useful functions for accessing messages."""

    def __init__(self, myname, threads):
        # Largest threads first, so Chat[0] is the longest conversation:
        self.threads = sorted(threads, key=len, reverse=True)
        # Threads sharing an identical participant list collide on the same
        # key here; the one latest in the size-sorted list wins.
        self._thread_dict = {", ".join(thread.people): thread for thread in self.threads}
        self._total_messages = len(self.all_messages())
        self._myname = myname
        self._all_people = {myname}
        for thread in self.threads:
            self._all_people.update(thread.people)

    def __getitem__(self, key):
        """Allow accessing Thread objects in the list using Chat["Thread Name"].

        This method allows the threads list to be accessed using Chat["Thread Name"]
        or Chat[n] notation. Slices (Chat[a:b]) return a list of Thread objects.
        Any other key is treated as a thread name, so an unknown name raises
        KeyError instead of silently returning None."""
        if isinstance(key, (int, slice)):
            return self.threads[key]
        return self._thread_dict[key]

    def __repr__(self):
        """Set Python's representation of the Chat object."""
        return "<{}'s CHAT LOG: TOTAL_THREADS={} TOTAL_MESSAGES={}>".format(self._myname, len(self.threads), self._total_messages)

    def __len__(self):
        """Return the total number of threads.

        Allows the len() method to be called on a Chat object. This could be
        changed to be the total number of messages, currently stored in the
        Chat._total_messages attribute."""
        return len(self.threads)

    def _date_parse(self, date):
        """Allow dates to be entered as integer tuples (YYYY, MM, DD[, HH, MM]).

        Removes the need to supply datetime objects, but still allows dates
        to be entered as datetime.datetime objects (or subclasses of it). The
        Year, Month and Day are compulsory, the Hours and Minutes optional.
        May cause exceptions if poorly formatted tuples are used."""
        if isinstance(date, datetime.datetime):
            return date
        return datetime.datetime(*date)

    def _recount_messages(self):
        """Update the count of total messages.

        Since Thread objects can be extended dynamically, this may prove
        necessary."""
        self._total_messages = len(self.all_messages())

    def all_messages(self):
        """Return a date ordered list of all messages.

        The list is all messages contained in the Chat object, as a list of
        Message objects."""
        return sorted([message for thread in self.threads for message in thread.messages])

    def all_from(self, name):
        """Return a date ordered list of all messages sent by 'name'.

        The list returned is a list of Message objects. This is distinct from
        Thread.by(name) since all threads are searched by this method. For all
        messages in one thread from 'name', use Thread.by(name) on the correct Thread."""
        return sorted([message for thread in self.threads for message in thread.by(name)])

    def sent_before(self, date):
        """Return a date ordered list of all messages sent before specified date.

        The function returns a list of Message objects. The 'date' can be a
        datetime.datetime object, or a three or five tuple (YYYY, MM, DD[, HH, MM])."""
        return sorted([message for thread in self.threads for message in thread.sent_before(date)])

    def sent_after(self, date):
        """Return a date ordered list of all messages sent after specified date.

        The list returned is a list of Message objects. The 'date' can be a
        datetime.datetime object, or a three or five tuple (YYYY, MM, DD[, HH, MM])."""
        return sorted([message for thread in self.threads for message in thread.sent_after(date)])

    def sent_between(self, start, end=None):
        """Return a date ordered list of all messages sent between specified dates.

        - The list returned is a list of Message objects. The 'start' and 'end'
          can be datetime.datetime objects, or a three or five tuple
          (YYYY, MM, DD[, HH, MM]).
        - Not entering an 'end' date is interpreted as all messages sent on
          the day 'start'. Where a time is specified also, a 24 hour period
          beginning at 'start' is used."""
        return sorted([message for thread in self.threads for message in thread.sent_between(start, end)])

    def search(self, string, ignore_case=False):
        """Return a date ordered list of all messages containing 'string'.

        This function searches in all threads, and returns a list of Message
        objects.
        - The function can be made case-insensitive by setting 'ignore_case'
          to True."""
        return sorted([message for thread in self.threads for message in thread.search(string, ignore_case)])

    def on(self, date):
        """Return the Chat object as it would have been on 'date'.

        The Chat object returned is a new object containing the subset of the
        Threads which contain messages sent before 'date', where each of these
        Threads is a new Thread with only these messages in.
        - 'date' can be a datetime.datetime object, or a three or five tuple
          (YYYY, MM, DD[, HH, MM])."""
        # Build each truncated thread once, then drop the ones left empty:
        snapshots = (thread.on(date) for thread in self.threads)
        threads_on = [snapshot for snapshot in snapshots if len(snapshot) > 0]
        return Chat(self._myname, threads_on)
class Thread(object):
    """An object to encapsulate a Facebook Message thread.

    - Contains a list of participants, a string form of the list and a list
      of messages in the thread as Message objects.
    - When initialising, 'people' should be a list of strings containing the
      names of the participants and 'messages' should be a list of Message
      objects."""

    def __init__(self, people, messages):
        self.people = people
        self.people_str = ", ".join(self.people)
        self.messages = sorted(messages)

    def __getitem__(self, key):
        """Allow accessing Message objects in the messages list using Thread[n].

        Beware out by one errors! The message numbers start counting at 1,
        but the list they are stored in is indexed from 0.
        - This behaviour could be corrected by either subtracting one from
          the key (which causes issues when slicing), or by counting messages
          from 0."""
        return self.messages[key]

    def __repr__(self):
        """Set Python's representation of the Thread object."""
        # The format string must contain the placeholders, otherwise repr()
        # is just an empty string.
        return "<THREAD: PEOPLE={}, MESSAGE_COUNT={}>".format(self.people_str, len(self.messages))

    def __len__(self):
        """Return the total number of messages in the thread."""
        return len(self.messages)

    def _add_messages(self, new_messages):
        """Allow adding messages to an already created Thread object.

        This function is useful for merging duplicate threads together. The
        message list is re-sorted afterwards, but message numbers are not
        updated; call _renumber_messages() for that."""
        self.messages.extend(new_messages)
        self.messages = sorted(self.messages)

    def _renumber_messages(self):
        """Renumber all messages in the 'messages' list.

        Message objects are sorted after being added; but if messages are
        added using _add_messages() then the numbering may be incorrect. This
        function fixes that by numbering from 1 in list order."""
        for number, message in enumerate(self.messages, 1):
            message._num = number

    def by(self, name):
        """Return a date ordered list of all messages sent by 'name'.

        Returns a list of Message objects."""
        return [message for message in self.messages if message.sent_by(name)]

    def sent_before(self, date):
        """Return a date ordered list of all messages sent before specified date.

        The function returns a list of Message objects. The 'date' can be a
        datetime.datetime object, or a three or five tuple (YYYY, MM, DD[, HH, MM])."""
        return [message for message in self.messages if message.sent_before(date)]

    def sent_after(self, date):
        """Return a date ordered list of all messages sent after specified date.

        The list returned is a list of Message objects. The 'date' can be a
        datetime.datetime object, or a three or five tuple (YYYY, MM, DD[, HH, MM])."""
        return [message for message in self.messages if message.sent_after(date)]

    def sent_between(self, start, end=None):
        """Return a date ordered list of all messages sent between specified dates.

        - The list returned is a list of Message objects. The 'start' and 'end'
          can be datetime.datetime objects, or a three or five tuple
          (YYYY, MM, DD[, HH, MM]).
        - Not entering an 'end' date is interpreted as all messages sent on
          the day 'start'. Where a time is specified also, a 24 hour period
          beginning at 'start' is used."""
        return [message for message in self.messages if message.sent_between(start, end)]

    def search(self, string, ignore_case=False):
        """Return a date ordered list of messages in Thread containing 'string'.

        This function searches the current thread, and returns a list of Message
        objects.
        - The function can be made case-insensitive by setting 'ignore_case'
          to True."""
        return sorted([message for message in self.messages if message.contains(string, ignore_case)])

    def on(self, date):
        """Return the Thread object as it would have been on 'date'.

        The Thread object returned is a new object containing the subset of the
        messages sent before 'date'.
        - 'date' can be a datetime.datetime object, or a three or five tuple
          (YYYY, MM, DD[, HH, MM])."""
        return Thread(self.people, self.sent_before(date))
class Message(object):
    """An object to encapsulate a Facebook Message.

    - Contains a string of the author's name, the timestamp, number in the thread
      and the body of the message.
    - When initialising, 'thread' should be the containing Thread.people_str,
      'author' should be string containing the message sender's name, 'date_time'
      should be a datetime.datetime object, 'text' should be the content of
      the message and 'num' should be the number of the message in the thread."""

    def __init__(self, thread, author, date_time, text, num):
        self.thread_name = thread
        self.author = author
        self.date_time = date_time
        self.text = text
        self._num = num

    def __repr__(self):
        """Set Python's representation of the Message object."""
        # The format string must contain the placeholders, otherwise repr()
        # is just an empty string.
        return '<MESSAGE: THREAD={} NUMBER={} TIMESTAMP={} AUTHOR={} MESSAGE="{}">'.\
            format(self.thread_name, self._num, self.date_time, self.author, self.text)

    def __str__(self):
        """Return a string form of a Message in format required for csv output."""
        out = '"' + self.thread_name + '","' + str(self._num) + '","' + self.author + '","' + str(self.date_time) + '","' + self.text + '"\n'
        return out

    def __lt__(self, message):
        """Allow sorting of messages by implementing the less than operator.

        Sorting is by date, unless two messages were sent at the same time,
        in which case message number is used to resolve conflicts. This number
        ordering holds fine for messages in single threads, but offers no real
        objective order outside a thread."""
        if self.date_time == message.date_time:
            # NOTE(review): when the numbers are far apart this returns False
            # in both directions, so sorting treats the pair as equal and
            # keeps their existing order -- confirm that is intended.
            if abs(self._num - message._num) > 9000:  # If dates equal, but numbers miles apart
                return False  # MUST be where two 10000 groups join: larger number actually smaller here!
            else:
                return self._num < message._num
        return self.sent_before(message.date_time)

    def __gt__(self, message):
        """Allow sorting of messages by implementing the greater than operator.

        Sorting is by date, unless two messages were sent at the same time,
        in which case message number is used to resolve conflicts. This number
        ordering holds fine for messages in single threads, but offers no real
        objective order outside a thread."""
        if self.date_time == message.date_time:
            # NOTE(review): mirrors __lt__; returns True in both directions
            # when the numbers are far apart -- confirm that is intended.
            if abs(self._num - message._num) > 9000:  # If dates equal, but numbers miles apart
                return True  # MUST be where two 10000 groups join: smaller number actually larger here!
            else:
                return self._num > message._num
        return self.sent_after(message.date_time)

    def __eq__(self, message):
        """Messages are equal if their number, date, author and text are the same.

        Comparison is by attribute, so any object exposing the same four
        attributes can be compared. Objects without them are simply not equal."""
        try:
            equal = (self._num == message._num) and (self.author == message.author)
            equal = equal and (self.date_time == message.date_time) and (self.text == message.text)
        except AttributeError:
            return NotImplemented
        return equal

    def __ne__(self, message):
        """Return the inverse of __eq__.

        Python 2 does not derive != from ==, so without this method two equal
        messages would still compare as 'not equal'."""
        result = self.__eq__(message)
        if result is NotImplemented:
            return result
        return not result

    def __len__(self):
        """Return the number of characters in the message body."""
        text = self.text.replace("<|NEWLINE|>", "")  # Undo adding extra characters
        text = text.replace('""', '"')  # And escaping quote marks
        return len(text)

    def _date_parse(self, date):
        """Allow dates to be entered as integer tuples (YYYY, MM, DD[, HH, MM]).

        Removes the need to supply datetime objects, but still allows dates
        to be entered as datetime.datetime objects (or subclasses of it). The
        Year, Month and Day are compulsory, the Hours and Minutes optional.
        May cause exceptions if poorly formatted tuples are used."""
        if isinstance(date, datetime.datetime):
            return date
        return datetime.datetime(*date)

    def sent_by(self, name):
        """Return True if the message was sent by 'name'."""
        return self.author == name

    def sent_before(self, date):
        """Return True if the message was sent before the date specified.

        The 'date' can be a datetime.datetime object, or a three or five tuple
        (YYYY, MM, DD[, HH, MM])."""
        date = self._date_parse(date)
        return self.date_time < date

    def sent_after(self, date):
        """Return True if the message was sent after the date specified.

        The 'date' can be a datetime.datetime object, or a three or five tuple
        (YYYY, MM, DD[, HH, MM])."""
        date = self._date_parse(date)
        return self.date_time > date

    def sent_between(self, start, end=None):
        """Return True if the message was sent between the dates specified.

        - The 'start' and 'end' can be datetime.datetime objects, or
          a three or five tuple (YYYY, MM, DD[, HH, MM]). The start and end times
          are inclusive since this is simplest.
        - Not entering an 'end' date is interpreted as all messages sent on
          the day 'start'. Where a time is specified also, a 24 hour period
          beginning at 'start' is used."""
        start = self._date_parse(start)
        if end is not None:
            end = self._date_parse(end)
        else:
            end = start + datetime.timedelta(1)  # 1 day (24 hours) later than 'start'
        return start <= self.date_time <= end

    def contains(self, search_string, ignore_case=False):
        """Return True if 'search_string' is contained in the message text."""
        if ignore_case:
            return search_string.lower() in self.text.lower()
        else:
            return search_string in self.text
_COUNT_TYPES = ["total", "to", "from", "allfrom", "words", "wordsfrom", "wordsto",
                "chars", "charsfrom", "charsto"]


def _update_thread_dict(thread_dict, thread_name, num):
    """Add new entries to count dictionary, dealing with duplicates carefully.

    Counts for a name already present are summed rather than overwritten."""
    thread_dict[thread_name] = thread_dict.get(thread_name, 0) + num


def _word_count(message):
    """Return the number of whitespace separated words in a message's text."""
    return len(re.findall(r'\S+', message.text))  # Matches any non-whitespace sub-string


def _thread_tally(thread, myname, count_type):
    """Return the count for a single thread under the given 'count_type'."""
    if count_type == "to":
        return len(thread.by(myname))
    if count_type == "from":
        return len(thread) - len(thread.by(myname))
    if count_type in ("words", "wordsfrom", "wordsto", "chars", "charsfrom", "charsto"):
        # The prefix picks what is measured, the suffix picks whose messages:
        measure = _word_count if count_type.startswith("words") else len
        if count_type.endswith("from"):
            messages = [m for m in thread.messages if not m.sent_by(myname)]
        elif count_type.endswith("to"):
            messages = [m for m in thread.messages if m.sent_by(myname)]
        else:
            messages = thread.messages
        return sum(measure(m) for m in messages)
    # Anything else, including "total", counts every message in the thread.
    return len(thread)


def top_n_people(Chat, N=-1, count_type="total", groups=False):
    """Return a list of the top N most messaged people.

    The list contains tuples of (name, count), largest count first. A
    negative or zero value for N returns the full list, this is the default.
    The optional argument 'groups' allows group conversations (thread names
    containing more than one person) to be included where this makes sense.
    Threads with identical names have their counts added together. The
    'count_type' argument can be one of:
    - "total" - the default. This counts the total number of messages in
      message threads, and sorts by this. Also used for any unrecognised value.
    - "to" - the number of messages sent in each thread by the current
      user: '_myname'.
    - "from" - the number of messages in each thread not sent by '_myname'.
    - "allfrom" - the total number of messages from each individual person
      across all threads. Groups cannot be enabled and will be ignored.
    - "words", "wordsfrom", "wordsto" - as "total", "from" and "to", but
      counting whitespace separated words instead of messages.
    - "chars", "charsfrom", "charsto" - as above, but counting characters,
      using len() of each message."""
    thread_dict = {}
    # Strings must be compared with ==; identity ('is') only appears to work
    # when the interpreter happens to share the string objects.
    if count_type == "allfrom":
        # Work on a copy so the Chat's own set of people is left untouched:
        all_people = Chat._all_people.copy()
        all_people.discard(Chat._myname)
        for person in all_people:
            thread_dict[person] = len(Chat.all_from(person))
    else:
        for thread in Chat.threads:
            num = _thread_tally(thread, Chat._myname, count_type)
            _update_thread_dict(thread_dict, thread.people_str, num)
    sorted_list = sorted(thread_dict.items(), key=lambda tup: tup[1], reverse=True)
    top_n = []
    for name, count in sorted_list:
        if N > 0 and len(top_n) >= N:
            break
        # Names holding ", " are group threads; keep them only when asked to:
        if groups or len(name.split(", ")) == 1:
            top_n.append((name, count))
    return top_n
# Some useful colours:
_FB_BLUE = (0.2314, 0.3490, 0.5961)
_FB_GREY = (0.9294, 0.9294, 0.9294)
_IOS_GREEN = (0.5451, 0.8235, 0.2824)
_IOS_GREY = (0.8980, 0.8980, 0.9176)

# The colours used by the code:
_BG_COLOUR = (1.0, 1.0, 1.0)
_TEXT_COLOUR = (0.0, 0.0, 0.0)
_MY_COLOUR = None
_OTHER_COLOUR = None


def _change_matplotlib_colours(text_color=_TEXT_COLOUR, bg_colour=_BG_COLOUR):
    """Change matplotlib default colors for ALL graphs produced in current session.

    - 'text_color' sets the colour of all text, as well as axes colours,
      grid colour and axis tick mark colours.
    - 'bg_colour' changes the background and outside fill colour of the plot.
    Both default to the module's '_TEXT_COLOUR' and '_BG_COLOUR'."""
    # Use the arguments that were passed in, not the module-level defaults,
    # so that callers can actually choose the colours:
    matplotlib.rc('figure', facecolor=bg_colour)
    matplotlib.rc('savefig', facecolor=bg_colour, edgecolor=text_color)
    matplotlib.rc('axes', edgecolor=text_color, facecolor=bg_colour, labelcolor=text_color)
    matplotlib.rc('text', color=text_color)
    matplotlib.rc('grid', color=text_color)
    matplotlib.rc('xtick', color=text_color)
    matplotlib.rc('ytick', color=text_color)


def _change_graph_colours(my_colour, other_colour):
    """Change the colours used in histograms, both self colour and the other person colour."""
    global _MY_COLOUR, _OTHER_COLOUR
    _MY_COLOUR = my_colour
    _OTHER_COLOUR = other_colour


def use_facebook_colours():
    """Use Facebook's colours for graphs; blue for self, grey for others."""
    _change_graph_colours(my_colour=_FB_BLUE, other_colour=_FB_GREY)


def use_ios_colours():
    """Use iOS's colours for graphs; green for self, grey for others."""
    _change_graph_colours(my_colour=_IOS_GREEN, other_colour=_IOS_GREY)


# Run the colour change code on import of the module:
use_facebook_colours()
_change_matplotlib_colours()
def _hour_list():
    """Generate a list containing hours in day converted to floats.

    The 25 values run from 0.0 to 1.0 in steps of 1/24, giving the edges of
    24 hourly histogram bins."""
    hours_bins = [n / 24.0 for n in range(0, 25)]
    return hours_bins


def _dt_to_decimal_time(datetime):
    """Convert a datetime.datetime object into a fraction of a day float.

    Take the decimal part of the date converted to a number of days by
    matplotlib's date2num and return it. It gives fraction of way through
    day: the time."""
    datetime_decimal = date2num(datetime)
    time_decimal = datetime_decimal - int(datetime_decimal)
    return time_decimal


def messages_time_graph(Chat, name=None, filename=None, no_gui=False):
    """Create a graph of the time of day of messages sent between users.

    Produces a histogram of the times of messages sent to and received from
    another user. The method only works for individuals, not for threads between
    multiple friends.

    - 'Chat' should be the Chat object to analyse.
    - 'name' should be the name of the user, and so the Thread, to be graphed.
      A special case is when 'name' is the name of the current user (or is
      not given), in which case the graph of ALL messages is produced, split
      into those sent by the current user and those sent by others.
    - If a 'filename' is specified, the figure is saved there. Anything
      accepted by matplotlib's savefig() can be used.
    - Set 'no_gui' to True to turn matplotlib's interactive mode off before
      plotting. If no filename is specified with this, the function will run
      but produce no output anywhere."""
    # Implement a default case:
    if name is None:
        name = Chat._myname
    # Divide up into hourly bins, changing datetime objects to times in range [0,1):
    bins = _hour_list()
    # If looking at graph with other users, get messages to and from:
    if name != Chat._myname:
        thread = Chat[name]
        times_from = [_dt_to_decimal_time(message.date_time) for message in thread.by(name)]
        times_to = [_dt_to_decimal_time(message.date_time) for message in thread.by(Chat._myname)]
        label = [Chat._myname, name]
    else:  # If looking at all messages sent; do things differently:
        # Collect (and sort) the full message list once, not once per series:
        message_list = Chat.all_messages()
        times_from = [_dt_to_decimal_time(message.date_time) for message in message_list if message.author != Chat._myname]
        times_to = [_dt_to_decimal_time(message.date_time) for message in message_list if message.author == Chat._myname]
        label = [Chat._myname, "Others"]
    # Create the figure, hiding the display if no_gui set:
    if no_gui:
        plt.ioff()
    plt.figure(figsize=(18, 9), dpi=80)
    plt.hist([times_to, times_from], bins, histtype='bar', color=[_MY_COLOUR, _OTHER_COLOUR], label=label, stacked=True)
    # Title the graph correctly, and label axes:
    if name != Chat._myname:
        plt.suptitle("Messages with " + name, size=18)
    else:
        plt.suptitle("All Messages Sent", size=18)
    plt.xlabel("Time of Day", labelpad=20, size=15)
    plt.ylabel("Number of Messages", labelpad=20, size=15)
    # Move tick marks to centre of hourly bins by adding ~ half an hour (in days)
    axes = plt.gca()
    axes.set_xticks([b + 0.02 for b in bins])
    # Place tickmarks
    plt.xticks(rotation=0, ha='center')
    # Change the tick marks from useless fraction through day, to recognisable times:
    # To do this use strftime to convert times to string (which needs dates >= 1900),
    # so shift to 1900 (add 693596 days) and take off added half hour (minus 0.02)
    axes.xaxis.set_major_formatter(ticker.FuncFormatter(lambda numdate, _: num2date(numdate + 693596 - 0.02).strftime('%H:%M')))
    # Add some space at either end of the graph (axis in number of days, so +- 15 mins):
    plt.xlim([bins[0] - 0.01, bins[-1] + 0.01])
    # Place y gridlines beneath the plot:
    axes.yaxis.grid(True)
    axes.set_axisbelow(True)
    # Hide unnecessary borders and tickmarks:
    axes.spines['right'].set_visible(False)
    axes.spines['top'].set_visible(False)
    axes.yaxis.set_ticks_position('left')
    # tick_params takes booleans; the string 'off' is truthy and does not hide ticks:
    plt.tick_params(axis='x', which='both', bottom=False, top=False)
    # Add the legend at the top, underneath the title but outside the figure:
    plt.legend(frameon=False, bbox_to_anchor=(0.5, 1.05), loc=9, ncol=2, borderaxespad=0)
    # If given a filename, output to file:
    if filename is not None:
        plt.savefig(filename, bbox_inches='tight')
def _month_list(d1, d2):
    """Generate a list of months between d1 and d2 inclusive.

    The list includes the months containing d1 and d2, with an extra month
    on the end for the upper limit of a histogram. Each entry is a
    datetime.datetime for midnight on the first day of a month. An empty
    list is returned if d1 falls after that upper limit."""
    current = datetime.datetime(d1.year, d1.month, 1)
    # Upper limit is the first day of the month after the one containing d2;
    # after December that is January of the following year:
    if d2.month == 12:
        upper = datetime.datetime(d2.year + 1, 1, 1)
    else:
        upper = datetime.datetime(d2.year, d2.month + 1, 1)
    months = []
    year, month = current.year, current.month
    while current <= upper:
        months.append(current)
        month += 1
        if month > 12:
            year, month = year + 1, 1
        current = datetime.datetime(year, month, 1)
    return months


def messages_date_graph(Chat, name=None, filename=None, start_date=None, end_date=None, no_gui=False):
    """Create a graph of the number of messages sent between users.

    Produces a graph of messages sent to and received from another user. The
    method only works for individuals, not for threads between multiple friends.

    - 'Chat' should be the Chat object to analyse.
    - 'name' should be the name of the user, and so the Thread, to be graphed.
      A special case is when 'name' is the name of the current user (or is
      not given), in which case the graph of ALL messages is produced, split
      into those sent by the current user and those sent by others.
    - If a 'filename' is specified, the figure is saved there. Anything
      accepted by matplotlib's savefig() can be used.
    - 'start_date' and 'end_date' can be used to narrow the range of dates
      covered; the default is the first message to the last, but specifying dates
      inside this range can be used to narrow down the region considered. Each
      can be a datetime.datetime object or a tuple (YYYY, MM, DD[, HH, MM]);
      if given in the wrong order they are swapped.
    - Set 'no_gui' to True to turn matplotlib's interactive mode off before
      plotting. If no filename is specified with this, the function will run
      but produce no output anywhere."""
    # Implement a default case:
    if name is None:
        name = Chat._myname
    # Convert the dates to datetime objects BEFORE comparing them, so that a
    # tuple and a datetime can be mixed freely:
    if start_date is not None:
        start_date = Chat._date_parse(start_date)
    if end_date is not None:
        end_date = Chat._date_parse(end_date)
    # Sanity check input dates, and fix if necessary:
    if (start_date is not None) and (end_date is not None) and (start_date > end_date):
        start_date, end_date = end_date, start_date
    # If looking at graph with other users, get messages to and from:
    if name != Chat._myname:
        thread = Chat[name]
        first, last = thread[0].date_time, thread[-1].date_time
        dates_from = [date2num(message.date_time) for message in thread.by(name)]
        dates_to = [date2num(message.date_time) for message in thread.by(Chat._myname)]
        label = [Chat._myname, name]
    # If looking at all messages sent; do things differently:
    else:
        message_list = Chat.all_messages()
        first, last = message_list[0].date_time, message_list[-1].date_time
        dates_from = [date2num(message.date_time) for message in message_list if message.author != Chat._myname]
        dates_to = [date2num(message.date_time) for message in message_list if message.author == Chat._myname]
        label = [Chat._myname, "Others"]
    # If a start date given (which is after the first message), use it:
    d_min = first if start_date is None else max(start_date, first)
    # If an end date given (which is before the last message), use it:
    d_max = last if end_date is None else min(end_date, last)
    # Divide up into month bins, changing datetime objects to number of days for plotting:
    bins = [date2num(b) for b in _month_list(d_min, d_max)]
    # Create the figure, hiding the display if no_gui set:
    if no_gui:
        plt.ioff()
    plt.figure(figsize=(18, 9), dpi=80)
    plt.hist([dates_to, dates_from], bins, histtype='bar', color=[_MY_COLOUR, _OTHER_COLOUR], label=label, stacked=True)
    # Title the graph correctly, and label axes:
    if name != Chat._myname:
        plt.suptitle("Messages with " + name, size=18)
    else:
        plt.suptitle("All Messages Sent", size=18)
    plt.ylabel("Number of Messages", labelpad=20, size=15)
    # Put the tick marks at the rough centre of months by adding 15 days (~ 1/2 a month):
    axes = plt.gca()
    axes.set_xticks([b + 15 for b in bins])
    # The x labels are unreadable at an angle if more than ~50 of them, put them vertical if so:
    if len(bins) > 45:
        plt.xticks(rotation='vertical')
    else:
        plt.xticks(rotation=30, ha='right')
    # Change the tick marks from useless number of days, to recognisable dates:
    axes.xaxis.set_major_formatter(ticker.FuncFormatter(lambda numdate, _: num2date(numdate).strftime('%b %Y')))
    # Add some space at either end of the graph (axis in number of days, so -10 days and +5 days):
    plt.xlim([bins[0] - 10, bins[-1] + 5])
    # Place y gridlines beneath the plot:
    axes.yaxis.grid(True)
    axes.set_axisbelow(True)
    # Hide unnecessary borders and tickmarks:
    axes.spines['right'].set_visible(False)
    axes.spines['top'].set_visible(False)
    axes.yaxis.set_ticks_position('left')
    # tick_params takes booleans; the string 'off' is truthy and does not hide ticks:
    plt.tick_params(axis='x', which='both', bottom=False, top=False)
    # Add the legend at the top, underneath the title but outside the figure:
    plt.legend(frameon=False, bbox_to_anchor=(0.5, 1.05), loc=9, ncol=2, borderaxespad=0)
    # If given a filename, output to file:
    if filename is not None:
        plt.savefig(filename, bbox_inches='tight')
def _make_labels_wrap(labels):
    """Put each name of a long, multi-name label on its own line.

    Labels longer than 25 characters have every ", " separator replaced by a
    newline; shorter labels are left alone. The list is modified in place
    and also returned."""
    labels[:] = [
        label.replace(", ", "\n") if len(label) > 25 else label
        for label in labels
    ]
    return labels
def messages_pie_chart(Chat, N=10, filename=None, count_type="total", groups=False,
                       no_gui=False, percentages=True):
    """Create a pie chart of the number of messages exchanged with friends.

    The chart shows the first 'N' entries returned by top_n_people() as
    individual wedges and lumps every remaining entry into a final "Others"
    wedge.
      - 'Chat' should be the Chat object to analyse.
      - 'N' should be how many people to show explicitly; all others are
        grouped together in a final chunk.
      - If a 'filename' string is specified, the figure is also saved to
        that file.
      - The 'count_type' argument is passed to top_n_people() and is also
        used to pick the title, so it must be one of the keys of the title
        table below; any other value raises KeyError before a figure is
        created.
      - The 'groups' argument is passed straight through to top_n_people().
      - To run without displaying a graph onscreen, set 'no_gui' to True. If
        no filename is specified with this, the function will run but
        produce no output anywhere.
      - The percentages on the graph can be removed by setting 'percentages'
        to False."""
    # The title of the graph depends on the count_type:
    _title_dict = {"total": "Total Lengths of Message Threads",
                   "allfrom": "Total Number of Messages Received",
                   "from": "Number of Messages Received from People in Personal Threads",
                   "to": "Number of Messages Sent to People in Personal Threads",
                   "words": "Total Word Counts of Message Threads",
                   "wordsfrom": "Word Count of All Messages Received from People in Personal Threads",
                   "wordsto": "Word Count of All Messages Sent to People in Personal Threads",
                   "chars": "Total Character Lengths of Message Threads",
                   "charsfrom": "Character Length of All Messages Received from People in Personal Threads",
                   "charsto": "Character Length of All Messages Sent to People in Personal Threads"}
    # The data to plot; each entry is indexed as (name, count) below:
    thread_counts = top_n_people(Chat, count_type=count_type, groups=groups)
    # Resolve the title now, so an unsupported count_type fails before any
    # figure has been created rather than leaving a half-drawn one behind:
    title = _title_dict[count_type]
    # Set up useful lists and counts:
    names = []
    counts = []
    colours = []
    other_count = 0
    # The first N entries get their own wedge; the rest are summed into Others.
    # The palette is reused cyclically if N is larger than the palette.
    for n, t in enumerate(thread_counts):
        if n < N:
            names.append(t[0])
            counts.append(t[1])
            colours.append(_COLOURS[n % len(_COLOURS)])
        else:
            other_count += t[1]
    # Add an "Others" section in dark grey using the other_count:
    names.append("Others")
    counts.append(other_count)
    colours.append('#4D4D4D')
    # If long names, wrap them (modifies 'names' in place):
    _make_labels_wrap(names)
    # Plot percentage counts on the figure only if asked to:
    pct = '%1.1f%%' if percentages else None
    # Create the figure, hiding the display if no_gui set:
    if no_gui:
        plt.ioff()
    plt.figure(figsize=(18, 9), dpi=80)
    # We want the edges of the wedges in the chart to be white for aesthetics.
    # This is a global matplotlib setting, so it is put back in the 'finally'
    # below even if drawing or saving raises:
    plt.rcParams['patch.edgecolor'] = 'white'
    try:
        # Make the plot, starting at the top (90 degrees from horizontal) and
        # with the percentages outside the wedges (pctdistance > 1):
        plt.pie(counts, colors=colours, autopct=pct, pctdistance=1.1, startangle=90, counterclock=False)
        # Put the right title on the graph:
        plt.suptitle(title, size=18)
        # And make it circular:
        plt.axis('equal')
        # Add the legend:
        plt.legend(labels=names, frameon=False, labelspacing=1, loc="center", bbox_to_anchor=[0, 0.5])
        # If given a filename, output to file (None fails the isinstance check):
        if isinstance(filename, str):
            plt.savefig(filename, bbox_inches='tight')
    finally:
        # To get white outlines, we changed the default. Fix this:
        plt.rcParams['patch.edgecolor'] = _TEXT_COLOUR
Fix this: 486 | plt.rcParams['patch.edgecolor'] = _TEXT_COLOUR 487 | 488 | 489 | # ============================================================================= 490 | # Word Frequency Analysis # 491 | # # 492 | # Public Functions: # 493 | # - top_word_use(Chat, name, from_me, ignore_single_words) # 494 | # # 495 | # ============================================================================= 496 | 497 | 498 | def _str_to_word_list(text): 499 | """Turn a string into a list of words, removing URLs and punctuation. 500 | 501 | - The function takes in a string and returns a list of strings.""" 502 | # Some characters and strings need deleting from messages to separate them into proper words: 503 | _EXCLUDE = ["'s", "'ll", ".", ",", ":", ";", "!", "?", "*", '"', "-", "+", "^", "_", "~", "(", ")", "[", "]", "/", "\\", "@", "="] 504 | # Some things need removing, but not deleting as with _EXCLUDE: 505 | _CHANGE = {"'": "", ":p": "tongueoutsmiley", ":-p": "tongueoutsmiley", 506 | ":)": "happyfacesmiley", ":-)": "happyfacesmiley", ":/": "awkwardfacesmiley", 507 | ":-/": "awkwardfacesmiley", "<3": "loveheartsmiley", ":(": "sadfacesmiley", 508 | ":-(": "sadfacesmiley", ":'(": "cryingfacesmiley", ":d": "grinningfacesmiley", 509 | ":-d": "grinningfacesmiley", ";)": "winkfacesmiley", ";-)": "winkfacesmiley", 510 | ":o": "shockedfacesmiley"} 511 | # Remove URLs with a regular expression, else they mess up when removing punctuation: 512 | text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text) 513 | # Remove the NEWLINE denoting string, and replace with a space before anything else: 514 | text = text.replace("<|NEWLINE|>", " ") 515 | text = text.lower() 516 | # Change and exclude things: 517 | for old, new in _CHANGE.items(): 518 | text = text.replace(old, new) 519 | for ex in _EXCLUDE: 520 | text = text.replace(ex, " ") 521 | # A hack to replace all whitespace with one space: 522 | text = " ".join(text.split()) 523 | # Get rid of non-ASCII characters for 
simplicity 524 | text = text.encode('ascii', 'replace') 525 | # Return a list of words: 526 | return text.split() 527 | 528 | 529 | def _message_list_word_list(messages): 530 | """Take a list of Message objects and return a list of strings. 531 | 532 | The returned list of strings contains all of the words in the messages.""" 533 | word_list = [] 534 | for m in messages: 535 | word_list.extend(_str_to_word_list(m.text)) 536 | return word_list 537 | 538 | 539 | def _word_list_to_freq(words, ignore_single_words=False): 540 | """Take a list of strings, and return a list of (word, word_use_count). 541 | 542 | - The returned list of pairs is sorted in descending order. 543 | - Passing 'ignore_single_words' will remove any words only used once in 544 | a message thread.""" 545 | # The order of items in the CHANGE dictionary means changing back isn't quite so simple; just use a second dictionary: 546 | _CHANGE_BACK = {"tongueoutsmiley": ":P", "happyfacesmiley": ":)", "awkwardfacesmiley": ":/", 547 | "loveheartsmiley": "<3", "sadfacesmiley": ":(", "cryingfacesmiley": ":'(", 548 | "grinningfacesmiley": ":D", "winkfacesmiley": ";)", "shockedfacesmiley": ":o"} 549 | # Make a dictionary of words and their total count: 550 | freq = {x: words.count(x) for x in words} 551 | # Change the emoticons back to emoticons: 552 | for new, old in _CHANGE_BACK.items(): 553 | if new in freq: 554 | freq[old] = freq.pop(new) 555 | # Convert to a list and sort: 556 | freq = sorted(freq.items(), key=lambda tup: tup[1], reverse=True) 557 | # If only want words used more than once, remove those with count <= 1 558 | if ignore_single_words: 559 | freq = [f for f in freq if f[1] > 1] 560 | return freq 561 | 562 | 563 | def top_word_use(Chat, name, from_me=False, ignore_single_words=False): 564 | """Work out the most commonly used words by a friend. 565 | 566 | The function returns a list of (word, word_use_count) tuples. 
def top_word_use(Chat, name, from_me=False, ignore_single_words=False):
    """Work out the most commonly used words in a set of messages.

    The function returns a list of (word, word_use_count) tuples. The
    counting is done in plain Python, so long threads can take a while.

      - 'name' is a string of the name of the Thread to consider. If it is
        your own name (Chat._myname), the messages from Chat.all_from() for
        your name are used instead and 'from_me' has no effect.
      - 'from_me' is a boolean flag: if True the messages in the thread by
        you are used, otherwise (the default) the messages by 'name'.
      - Setting 'ignore_single_words' to True removes words which are only
        used once, which reduces the length of the list returned."""
    me = Chat._myname
    if name == me:
        selected = Chat.all_from(me)
    elif from_me:
        selected = Chat[name].by(me)
    else:
        selected = Chat[name].by(name)
    return _word_list_to_freq(_message_list_word_list(selected), ignore_single_words)