├── .gitignore
├── samples
    ├── sample_date_graph.png
    ├── sample_pie_chart.png
    ├── README.md
    └── facebook_and_sms.py
├── facebook.py
├── LICENSE.md
├── README.md
├── fb_parser.py
├── fb_chat.py
└── fb_analysis.py


/.gitignore:
--------------------------------------------------------------------------------
1 | *.htm
2 | *.pickle
3 | *.json
4 | *.csv
5 | *.zip
6 | *.pyc
7 | __pycache__/
8 | 


--------------------------------------------------------------------------------
/samples/sample_date_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jsharkey13/facebook_message_parser/HEAD/samples/sample_date_graph.png


--------------------------------------------------------------------------------
/samples/sample_pie_chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jsharkey13/facebook_message_parser/HEAD/samples/sample_pie_chart.png


--------------------------------------------------------------------------------
/samples/README.md:
--------------------------------------------------------------------------------
 1 | __`facebook_and_sms.py`__
 2 | 
 3 | The file contains code to merge a parsed Facebook Message history with an iOS6 iPhone Message history
 4 | (using the code from [iphone_message_parser](https://github.com/jsharkey13/iphone_message_parser)). The
 5 | resulting object can be used with the `fb_analysis.py` code, or browsed as if it was an `fb_chat.Chat` object.
 6 | 
 7 | -----
 8 | 
 9 | __`sample_date_graph.png`__
10 | 
11 | The image is the graph produced from a parsed Facebook Message history, using the code from `fb_parser.py`
12 | to produce the `Facebook.Chat` object.
13 | The code to produce the graph is in `fb_analysis.py` and requires a single command:
14 | ```
15 | fb_analysis.messages_date_graph(Facebook.Chat, name="Their Name", filename="sample_date_graph.png",
16 |                                 start_date=(2014, 9, 1), end_date=(2015, 5, 1), no_gui=True)
17 | ```
18 | 
19 | -----
20 | 
21 | __`sample_pie_chart.png`__
22 | 
23 | The image is again a graph produced using `fb_analysis.py`:
24 | ```
25 | fb_analysis.messages_pie_chart(Facebook.Chat, N=8, filename="sample_pie_chart.png", count_type="total", groups=False,
26 |                                no_gui=True, percentages=False)
 27 | ```


--------------------------------------------------------------------------------
/facebook.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | import codecs
 4 | 
 5 | import fb_parser
 6 | import fb_analysis
 7 | 
 8 | 
 9 | # Nasty hack to force utf-8 encoding by default:
10 | reload(sys)
11 | sys.setdefaultencoding('utf8')
12 | 
13 | # Change stdout to allow printing of unicode characters:
14 | streamWriter = codecs.lookup('utf-8')[-1]
15 | sys.stdout = streamWriter(sys.stdout)
16 | 
17 | 
18 | if __name__ == "__main__":
19 |     """Allow the parser to be run from the command line.
20 | 
21 |        Optionally, the function allows specifying the filename to read in from
22 |        as the first argument."""
23 |     if len(sys.argv) >= 2:
24 |         # If filname passed in and a recognised format, continue:
25 |         if ((".zip" in sys.argv[1]) or (".htm" in sys.argv[1]) or (".pickle" in sys.argv[1])):
26 |             fname = sys.argv[1]
27 |         else:
28 |             # If not a recognised format, stop but allow override:
29 |             print "File is not a .zip file, a .htm file or a pickle file."
30 |             cont = raw_input("Continue anyway? (y/n)")
31 |             if cont == "n":
32 |                 sys.exit(-1)
33 |     else:
34 |         # If no argument, attempt to open the default .zip export file:
35 |         fname = "facebook-" + fb_parser.FBMessageParse._MYUSERNAME + ".zip"
36 |     if not os.path.isfile(fname):
37 |         print "File " + fname + " does not exist or could not be found! Abort."
38 |         sys.exit(-1)
39 | 
40 |     # Now use the Facebook.Chat object to do stuff!
41 |     # Some example code to add functionality immediately.
42 | 
43 |     # Create the parser, and parse the messages file:
44 |     if ".pickle" in fname:
45 |         Facebook = fb_parser.FBMessageParse(fname, load_pickle=True)
46 |     else:
47 |         Facebook = fb_parser.FBMessageParse(fname)
48 |         Facebook.parse_messages()
49 |     # Now find and print the Top 10 Friends:
50 |     print "Top 10 Most Messaged Friends: Total Thread Length"
51 |     top10 = fb_analysis.top_n_people(Facebook.Chat, N=10)
52 |     print top10
53 |     # Output to a csv file:
54 |     Facebook.write_to_csv()
55 |     # Show a graph of the most messaged friend's messages:
56 |     fb_analysis.messages_date_graph(Facebook.Chat, top10[0][0])
57 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
 1 | The code in the files `fb_analysis.py` and `facebook.py` is released under the MIT License below.
 2 | __The files `fb_parser.py` and `fb_chat.py` contain code from another author, covered by the original license at the end of this file.__
 3 | 
 4 | The MIT License (MIT)
 5 | 
 6 | Copyright (c) 2015 James Sharkey
 7 | 
 8 | Permission is hereby granted, free of charge, to any person obtaining a copy
 9 | of this software and associated documentation files (the "Software"), to deal
10 | in the Software without restriction, including without limitation the rights
11 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | copies of the Software, and to permit persons to whom the Software is
13 | furnished to do so, subject to the following conditions:
14 | 
15 | The above copyright notice and this permission notice shall be included in all
16 | copies or substantial portions of the Software.
17 | 
18 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | SOFTWARE.
25 | 
26 | __The code in `fb_chat.py` and `fb_parser.py` is covered by the original license also:__
27 | 
28 | The MIT License (MIT)
29 | 
30 | Copyright (c) 2015 Chris Copley
31 | 
32 | Permission is hereby granted, free of charge, to any person obtaining a copy
33 | of this software and associated documentation files (the "Software"), to deal
34 | in the Software without restriction, including without limitation the rights
35 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
36 | copies of the Software, and to permit persons to whom the Software is
37 | furnished to do so, subject to the following conditions:
38 | 
39 | The above copyright notice and this permission notice shall be included in all
40 | copies or substantial portions of the Software.
41 | 
42 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
43 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
44 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
45 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
46 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
47 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
48 | SOFTWARE.
49 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Facebook Message Export Parser
 2 | 
 3 | ---
 4 | ## Update September 2018
 5 | 
 6 | __Deprecation Notice__
 7 | 
  8 | Facebook now supports exports in JSON format, which somewhat reduces the need for this code. The format of these exports has changed a great deal, and is still changing. This code will not work with exports newer than April 2018.
 9 | 
10 | Some (unmaintained) code that may work with newer JSON exports can be found [in this Gist](https://gist.github.com/jsharkey13/d60b7b421e08c98d426d03c39f8b4a12), but be aware that code is Python 3 and the format of Facebook's export may have changed since it was written.
11 | 
12 | ---
13 | 
14 | Facebook has a feature that allows users to download a copy of their data as a zip archive containing htm files with their data. The aim of this parser is to take this archive and to extract a user's Facebook Messages from it; to transfer them into a more useful format, as well as performing some analysis to produce interesting data.
15 | 
16 | This code is adapted from [CopOnTheRun/FB-Message-Parser](https://github.com/CopOnTheRun/FB-Message-Parser).
17 | 
18 | #### Running the Code
19 | The Facebook Export can be downloaded from  the [Facebook Settings](https://www.facebook.com/settings) menu. 
20 | 
 21 | __*Before any code can be run:*__ [Lines 27 and 28](https://github.com/jsharkey13/facebook_message_parser/blob/master/fb_parser.py#L27-L28) in `fb_parser.py` will need to be updated to the name and username of the account being parsed. If this is done, the code will attempt to open the zip file `facebook-[myusername].zip` by default if no argument is given to `facebook.py`.
22 | 
23 | Run "`python facebook.py [optional_filename]`" with the `facebook-[myusername].zip` or `messages.htm` files in the same directory to export to CSV, display top 10 most messaged friends and output a graph showing messages with the most messaged friend. This sample code can easily be adapted.
24 | 
25 | The `fb_chat.Chat` object returned by the parser (the object called `Facebook.Chat` in `facebook.py`) could be pickled and loaded in another program to form a base API to interact with the messages there. (Note that this, like the export, contains private messages in plain text format, and that the `fb_chat` code may need to be imported too).
26 | 
27 | __Producing Graphs__
28 | 
 29 | The `fb_analysis.py` file contains code to produce a stacked histogram showing the number of messages sent and received with a contact each month:
30 | 
31 | ![Sample Graph](/samples/sample_date_graph.png?raw=true)
32 | 
33 | __A browser-based interface__
34 | 
 35 | If you want to view the export in a browser (and don't want to use the perfectly serviceable way of viewing Facebook Messages in a browser that is `www.facebook.com`) then [Flask Facebook Messages](https://github.com/jsharkey13/flask_facebook_messages) may be of use. Add `Facebook.dump_to_pickle()` on a new line after [Line 52](https://github.com/jsharkey13/facebook_message_parser/blob/master/facebook.py#L52) of `facebook.py` to produce a pickle export, then use the code in that repository to view it!
36 | 
37 | #### Dependencies
38 | The code is written in Python 2.7.
39 | 
40 | The parser uses [Beautiful Soup](http://www.crummy.com/software/BeautifulSoup/) to do the bulk of the capture from the htm file.
41 | 
42 | The analysis code uses [matplotlib](https://matplotlib.org/) to produce graphs of message counts. An example graph can be found in the `samples` directory.
43 | 
44 | [Anaconda Python](https://store.continuum.io/cshop/anaconda/) for scientific computing is a simple and easy way to install all the dependencies for the code, alongside many other useful libraries. It can be downloaded [here](https://www.continuum.io/downloads).
45 | 


--------------------------------------------------------------------------------
/samples/facebook_and_sms.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import codecs
  3 | import fb_parser
  4 | import ios_parser
  5 | import ios_chat
  6 | 
  7 | 
  8 | class Merge_Chat_Logs(object):
  9 |     """An object to merge the iOS and Facebook Chat objects.
 10 | 
 11 |         - The Merge_Chat_Logs object can be treated like a Chat object, and contains
 12 |           the original iOS and Facebook Chat objects unchanged, whilst allowing
 13 |           the functionality of having combined the two.
 14 |         - Contains all the methods and attributes required to allow the fb_analysis
 15 |           code to use a Merge_Chat_Logs object as a Chat object. Can consider ios_chat.Chat,
 16 |           fb_chat.Chat and facebook_and_sms.Merge_Chat_Logs to be concrete implementations
 17 |           of an abstract Chat_Log class; functionality not easily obtainable in Python.
 18 |         - When initialising, the Facebook Chat object and the iOS Chat object should
 19 |           be passed in as the two arguments."""
 20 | 
 21 |     def __init__(self, facebook_Chat, ios_Chat):
 22 |         self.Chat = facebook_Chat
 23 |         self.Texts = ios_Chat
 24 |         self.threads = self.Chat.threads + self.Texts.threads
 25 |         self._myname = self.Chat._myname
 26 |         self._all_people = self.Chat._all_people.union(self.Texts._all_people)
 27 |         self._total_messages = self.Chat._total_messages + self.Texts._total_messages
 28 | 
 29 |     def __repr__(self):
 30 |         """Set Python's representation of the Chat object."""
 31 |         return "<{}'s GROUPED CHAT LOGs: TOTAL_THREADS={} TOTAL_MESSAGES={}>".format(self._myname, len(self), self.Chat._total_messages + self.Texts._total_messages)
 32 | 
 33 |     def __len__(self):
 34 |         """Return the total number of threads in both Chat objects.
 35 | 
 36 |            Allows the len() method to be called on a Merge_Chat_Logs object. This
 37 |            could be changed to be the total number of messages, currently stored as
 38 |            Merge_Chat_Logs._total_messages()"""
 39 |         return len(self.Chat) + len(self.Texts)
 40 | 
 41 |     def __getitem__(self, key):
 42 |         """Allow accessing Thread objects using Merge_Chat_Logs["Thread Name"].
 43 | 
 44 |             - If the thread exists only in one Chat object; that Thread is returned;
 45 |               but if a Thread of the same name appears in both, a new Thread object
 46 |               combining the two threads is created. Message numbering in this new
 47 |               Thread object may be confusing, but functionality remains unchanged.
 48 |             - The method will fail silently; None is returned if a key is not present.
 49 |               This is different to the more standard rasing of 'KeyError'."""
 50 |         if ((key in self.Chat._thread_dict) and (key in self.Texts._thread_dict)):
 51 |             return ios_chat.Thread(key, self.Chat[key].messages + self.Texts[key].messages)
 52 |         elif key in self.Chat._thread_dict:
 53 |             return self.Chat[key]
 54 |         elif key in self.Texts._thread_dict:
 55 |             return self.Texts[key]
 56 |         else:
 57 |             return None
 58 | 
 59 |     def all_messages(self):
 60 |         """Return a date ordered list of all messages.
 61 | 
 62 |            The list is all messages contained in both Chat objects, as a list of
 63 |            Message objects."""
 64 |         return sorted([m for m in self.Chat.all_messages() + self.Texts.all_messages()])
 65 | 
 66 |     def all_from(self, name):
 67 |         """Return a date ordered list of all messages sent by 'name', from both Chat objects.
 68 | 
 69 |            The list returned is a list of Message objects. This is distinct from
 70 |            Thread.by(name) since all threads are searched by this method. For all
 71 |            messages in one thread from 'name', use Thread.by(name) on the correct Thread."""
 72 |         return sorted([m for m in self.Chat.all_from(name) + self.Texts.all_from(name)])
 73 | 
 74 |     def sent_before(self, date):
 75 |         """Return a date ordered list of all messages sent before specified date, from both Chat objects.
 76 | 
 77 |            The function returns a list of Message objects. The 'date' can be a
 78 |            datetime.datetime object, or a three to six tuple (YYYY, MM, DD[, HH, MM, SS])."""
 79 |         return sorted([m for m in self.Chat.sent_before(date) + self.Texts.sent_before(date)])
 80 | 
 81 |     def sent_after(self, date):
 82 |         """Return a date ordered list of all messages sent after specified date, from both Chat objects.
 83 | 
 84 |            The list returned is a list of Message objects. The 'date' can be a
 85 |            datetime.datetime object, or a three to six tuple (YYYY, MM, DD[, HH, MM, SS])."""
 86 |         return sorted([m for m in self.Chat.sent_after(date) + self.Texts.sent_after(date)])
 87 | 
 88 |     def sent_between(self, start, end=None):
 89 |         """Return a date ordered list of all messages sent between specified dates, from both Chat objects.
 90 | 
 91 |             - The list returned is a list of Message objects. The 'start' and 'end'
 92 |               can be datetime.datetime objects, or a three to six tuple
 93 |               (YYYY, MM, DD[, HH, MM, SS]).
 94 |             - Not entering an 'end' date is interpreted as all messages sent on
 95 |               the day 'start'. Where a time is specified also, a 24 hour period
 96 |               beginning at 'start' is used."""
 97 |         return sorted([m for m in self.Chat.sent_between(start, end) + self.Texts.sent_between(start, end)])
 98 | 
 99 |     def search(self, string, ignore_case=False):
100 |         """Return a date ordered list of all messages containing 'string', from both Chat objects.
101 | 
102 |            This function searches in all threads, and returns a list of Message
103 |            objects.
104 |             - The function can be made case-insensitive by setting 'ignore_case'
105 |               to True."""
106 |         return sorted([m for m in self.Chat.search(string, ignore_case) + self.Texts.search(string, ignore_case)])
107 | 
108 | 
if __name__ == "__main__":
    # Build a Merge_Chat_Logs object, assuming both ios_parser and fb_parser
    # have already been run and used to pickle their Chat objects under the
    # default names.

    # Nasty hack to force utf-8 encoding by default:
    reload(sys)
    sys.setdefaultencoding('utf8')

    # Wrap stdout in the utf-8 stream writer so unicode characters can be printed:
    streamWriter = codecs.getwriter('utf-8')
    sys.stdout = streamWriter(sys.stdout)

    # To avoid work, assume both histories can be loaded from pickle files:
    fb_fname = 'messages.pickle'
    ios_fname = 'sms.pickle'

    Facebook = fb_parser.FBMessageParse(fb_fname)
    SMS = ios_parser.iOSMessageParse(ios_fname)

    # The merged log, usable wherever a single Chat object is expected:
    All = Merge_Chat_Logs(Facebook.Chat, SMS.Texts)
130 | 


--------------------------------------------------------------------------------
/fb_parser.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import dateutil
  3 | import sys
  4 | from bs4 import BeautifulSoup as bs
  5 | import zipfile
  6 | import pickle
  7 | import fb_chat
  8 | 
  9 | 
 10 | class FBMessageParse(object):
 11 |     """An object to encapsulate all the methods required to parse messages.htm.
 12 | 
 13 |        These include methods to initialise, save and load a fb_chat.Chat object,
 14 |        which contains a Pythonic representation of Facebook Message history.
 15 |         - Can read in messages from the .zip archive exported from Facebook, or
 16 |           the .htm file contained in the archive.
 17 |         - Can dump the Chat object to a pickle file and load it again in another
 18 |           session: use dump_to_pickle() and load_from_pickle().
 19 |         - Can export messages to csv format: use write_to_csv()
 20 |         - Using a 'uid_people' file, can turn unrecognised nnnnnnn@facebook.com identifiers
 21 |           into names. Lines should be '[uid]:[name]'. See the print_unknowns() function.
 22 |         - Allows customised renaming of contacts using a 'duplicates' file. In a similar
 23 |           way to the 'uid_people' file; add lines containing '[old name]:[new name]' to
 24 |           a file called 'duplicates' for the process to occur on the next read in from
 25 |           zip or htm."""
 26 | 
 27 |     _MYNAME = "My Name"
 28 |     _MYUSERNAME = "myusername"
 29 | 
 30 |     def __init__(self, fname, load_pickle=False):
 31 |         self._UIDPEOPLE = {}
 32 |         self._PEOPLEUID = {}
 33 |         self._PEOPLEDUPLICATES = {}
 34 |         self._UNKNOWNS = []
 35 |         #
 36 |         self.Chat = None
 37 |         #
 38 |         self._archive = None
 39 |         self._messages_htm = None
 40 |         # Open either the .zip and contained htm, the pickle file, or another file:
 41 |         if ".zip" in fname:
 42 |             self._archive = zipfile.ZipFile(fname, 'r')
 43 |         if self._archive is not None:
 44 |             self._messages_htm = self._archive.open('html/messages.htm')
 45 |         elif load_pickle or ".pickle" in fname:
 46 |             self.load_from_pickle(fname)
 47 |         else:
 48 |             self._messages_htm = open(fname, "r")
 49 |         #
 50 |         self._read_uid_people()
 51 |         self._read_duplicate_list()
 52 | 
 53 |     def _close(self):
 54 |         """Close any open files before deletion. Do not call manually."""
 55 |         if self._archive is not None:
 56 |             self._messages_htm = None
 57 |             self._archive.close()
 58 |         if self._messages_htm is not None:
 59 |             self._messages_htm.close()
 60 | 
    def __del__(self):
        """Ensure _close() is called on deletion.

           This releases the archive or htm file opened in __init__ when the
           parser object is finalised."""
        self._close()
 64 | 
 65 |     def _read_uid_people(self):
 66 |         """Read in the 'uid_people' file and add line entries to dictionaries.
 67 | 
 68 |            Called automatically; do not call manually. Read in the 'uid_people'
 69 |            file and add line entries to the dictionaries used to translate between
 70 |            UID and Name, and vice versa.
 71 |             - Lines should be formatted '[uid]:[name]'.
 72 |             - Ill-formatted lines are ignored, and the file does not have to be present
 73 |               for the code to function: unrecognised UIDs are left unchanged."""
 74 |         try:
 75 |             with open('uid_people') as f:
 76 |                 lines = [line.rstrip('\n') for line in f]
 77 |                 for line in lines:
 78 |                     try:
 79 |                         key, value = line.split(":")
 80 |                         self._UIDPEOPLE.update({key: value})
 81 |                         self._PEOPLEUID.update({value: key})
 82 |                     except ValueError:
 83 |                         pass
 84 |         except IOError:
 85 |             pass
 86 | 
 87 |     def _read_duplicate_list(self):
 88 |         """Read in the 'duplicates' file and add line entries to the dictionary.
 89 | 
 90 |            Called automatically; do not call manually. Read in the 'duplicates'
 91 |            file and add line entries to the dictionary used to replace names.
 92 |            Useful for people who have changed their Facebook name to a nickname,
 93 |            or appear in the Chat logs with two versions of their name.
 94 |             - Lines should be formatted '[old name]:[new name]'.
 95 |             - Ill-formatted lines are ignored, and the file does not have to be
 96 |               present for the code to function: unrecognised names are left unchanged."""
 97 |         try:
 98 |             with open('duplicates') as f:
 99 |                 lines = [line.rstrip('\n') for line in f]
100 |                 for line in lines:
101 |                     try:
102 |                         key, value = line.split(":")
103 |                         self._PEOPLEDUPLICATES.update({key: value})
104 |                     except ValueError:
105 |                         pass
106 |         except IOError:
107 |             pass
108 | 
109 |     def _thread_name_cleanup(self, namestr):
110 |         """Parse the thread's name.
111 | 
112 |            Change any message author names and remove the name of the Chat owner
113 |            (_MYNAME) from the name unless messages are sent to oneself."""
114 |         namestr = namestr.encode('ascii', 'replace')  # BeutifulSoup works in Unicode, do we want ASCII names?
115 |         namelist = sorted(namestr.split(", "))
116 |         for i, name in enumerate(namelist):
117 |             namelist[i] = self._message_author_parse(name)
118 |         if ((self._MYNAME in namelist) and (len(namelist) > 1)):  # You can send yourself messages, so don't delete name if it's the only one.
119 |             namelist.remove(self._MYNAME)                         # Otherwise remove your name from the list.
120 |         return ", ".join(namelist)
121 | 
122 |     def _message_author_parse(self, name):
123 |         """Tidy up the name of the sender of a message.
124 | 
125 |            If the name is a UID email address, use the UID dictionary to replace
126 |            their name if possible. If the name is a duplicate (or to be renamed)
127 |            then rename. Any UIDs which remain are added to a list to facilitate
128 |            populating a 'uid_people' file: see print_unknowns()."""
129 |         if name is None:
130 |             return "UNKNOWN_AUTHOR" # Facebook has been providing messages with no recorded author!
131 |         name = name.encode('ascii', 'replace')  # BeutifulSoup works in Unicode, do we want ASCII names?
132 |         n = name.replace("@facebook.com", "")
133 |         if n in self._UIDPEOPLE:
134 |             name = self._UIDPEOPLE[n]
135 |         if n in self._PEOPLEDUPLICATES:
136 |             name = self._PEOPLEDUPLICATES[n]
137 |         if ((n in name) and (n != name)):  # If n is still the UID, and we still don't have a name:
138 |             self._UNKNOWNS.append(n)      # Add the UID to the UNKNOWN list
139 |         return name
140 | 
141 |     def _message_date_parse(self, datestr):
142 |         """Turn the datestamp on the message into a datetime object.
143 | 
144 |            This will parse to a timezone aware Python timestamp and then
145 |            remove the timezone info; converting it to local time.
146 |            This may not be the behaviour desired, and can be changed;
147 |            but most other functions assume naive datetimes."""
148 |         return dateutil.parser.parse(datestr).replace(tzinfo=None)
149 | 
150 |     def _date_unix(self, datetime_date):
151 |         """Turn a datetime.datetime object into a UNIX time int."""
152 |         return int((datetime_date - datetime.datetime(1970, 1, 1)).total_seconds())
153 | 
154 |     def _message_body_parse(self, message_body):
155 |         """Tidy up the message body itself.
156 | 
157 |            This turns newline characters into a unique custom string which can
158 |            be replaced after export if necessary. Quote marks are also escaped,
159 |            to allow the use of quotes and commas in messages whilst allowing
160 |            export to csv. Those two lines can be removed if desired."""
161 |         if message_body is None:
162 |             message_body = ""
163 |         message_body = '<|NEWLINE|>'.join(message_body.splitlines())  # We can't have newline characters in a csv file
164 |         message_body = message_body.replace('"', '""')  # Attempt to escape " characters in messages, for csv output
165 |         return message_body
166 | 
167 |     def print_unknowns(self):
168 |         """Print out any UIDs of people not recognised by the code.
169 | 
170 |            Prints lines containing unrecognised UIDs, along with instructions on
171 |            how to find names for these people and how to add the names to the
172 |            'uid_people' file."""
173 |         if self.Chat is None:
174 |             print "The message export file has not been parsed. Run parse_messages()."
175 |             return
176 |         if len(self._UNKNOWNS) == 0:
177 |             return
178 |         self._UNKNOWNS = list(set(self._UNKNOWNS))  # An unordered duplicate removal method
179 |         print "To identify these accounts, try visiting www.facebook.com/[uid] and adding '[uid]:[name]' to a file in the current directory named 'uid_people'"
180 |         for uid in self._UNKNOWNS:
181 |             print uid
182 | 
183 |     def parse_messages(self, group_duplicates=True):
184 |         """Take the loaded zip file or htm file and create a Chat object.
185 | 
186 |            Takes the messages.htm file and reads in the messages using
187 |            BeautifulSoup to parse the html data. Creates the Chat object, which
188 |            can be used independently and accessed as the FBMessageParse.Chat object.
189 |             - Optional argument 'group_duplicates' groups together Threads containing
190 |               the same participants. Message Threads over 10,000 messages long are
191 |               split by Facebook for export: this can help group them. True by default.
192 |             - Contains code to verify that the file being examined is in fact a
193 |               Facebook Messages export, though it allows manual override."""
194 |         # Check we have a htm file open to import from:
195 |         if self._messages_htm is None:
196 |             print "No archive/message file open. Was data loaded from a pickle file?"
197 |             return
198 |         #
199 |         soup = bs(self._messages_htm, "lxml")
200 |         # Verify that we're parsing a Facebook Message export and _MYNAME is right:
201 |         check_header = self._MYNAME + " - Messages"
202 |         try:
203 |             actual_header = soup.html.head.title.string
204 |         except AttributeError:
205 |             actual_header = None
206 |         if ((actual_header is None) or (check_header != actual_header)):
207 |             print "The title of the htm document does not match that expected:"
208 |             print '"' + check_header + '"'
209 |             print "Is the file a message export? Is the user's name correct?"
210 |             cont = raw_input("Continue anyway? (y/n)")
211 |             if cont == "n":
212 |                 sys.exit(-1)
213 |         # Set up some important lists:
214 |         thread_list = soup.find_all(class_='thread')
215 |         thread_num = 0
216 |         _chat_list = []
217 |         _thread_names = []
218 |         _duplicates_list = []
219 |         # Start going through the threads:
220 |         for t in thread_list:
221 |             message_list = t.find_all(class_='message')
222 |             _thread_list = []
223 |             total_message_num = len(message_list)
224 |             #
225 |             message_num = total_message_num
226 |             thread_name = self._thread_name_cleanup(t.contents[0])
227 |             # Work out if the thread is a duplicate:
228 |             duplicate_thread = False
229 |             if thread_name in _thread_names:
230 |                 duplicate_thread = True
231 |             else:
232 |                 _thread_names.append(thread_name)
233 |             # For each message, sort Author, Date and Body then create Message object:
234 |             for m in message_list:
235 |                 message_author = self._message_author_parse(m.find(class_='user').string)
236 |                 message_date = self._message_date_parse(m.find(class_='meta').string)
237 |                 message_body = self._message_body_parse(m.next_sibling.string)
238 |                 #
239 |                 _thread_list.append(fb_chat.Message(thread_name, message_author, message_date, message_body, message_num))
240 |                 #
241 |                 message_num -= 1
242 |             #
243 |             thread_num += 1
244 |             # If we're grouping duplicated threads, deal with them now:
245 |             if ((not duplicate_thread) or (not group_duplicates)):
246 |                 _chat_list.append(fb_chat.Thread(thread_name.split(", "), _thread_list))
247 |             else:
248 |                 for t in _chat_list:
249 |                     if t.people_str == thread_name:
250 |                         _duplicates_list.append(thread_name)
251 |                         t._add_messages(_thread_list)
252 |                         break
253 |         # Create the Chat object, set and return it:
254 |         self.Chat = fb_chat.Chat(self._MYNAME, _chat_list)
255 |         for t in _duplicates_list:
256 |             self.Chat[t]._renumber_messages()  # If we've grouped them, the messages need renumbering.
257 |         return self.Chat
258 | 
259 |     def write_to_csv(self, filename='messages.csv', chronological=False):
260 |         """Export all messages to csv format.
261 | 
262 |            The filename can be specified as an optional argument. If 'chronological'
263 |            is True, messages are printed in date order, otherwise they are printed
264 |            grouped in Threads sorted by total thread length."""
265 |         with open(filename, "w") as f:
266 |             header_line = '"Thread","Message Number","Message Author","Message Timestamp","Message Body"\n'
267 |             f.write(header_line.encode('utf8'))
268 |             if chronological:
269 |                 for message in self.Chat.all_messages():
270 |                     text = str(message)
271 |                     f.write(text.encode('utf8'))
272 |             else:
273 |                 for thread in self.Chat.threads:
274 |                     for message in thread.messages:
275 |                         text = str(message)
276 |                         f.write(text.encode('utf8'))
277 | 
278 |     def dump_to_pickle(self, filename='messages.pickle'):
279 |         """Serialise the Chat object to a pickle file.
280 | 
281 |            The pickle file can be used to restore the Chat object in another
282 |            session without re-importing the zip or htm file. Load either using
283 |            load_from_pickle(), or in another program using Pickle's standard load()
284 |            command."""
285 |         with open(filename, "w") as f:
286 |             pickle.dump(self.Chat, f)
287 | 
288 |     def load_from_pickle(self, filename='messages.pickle'):
289 |         """Read in the pickle file, optionally from a specified filename.
290 | 
291 |            The function sets the internal Chat object and returns the Chat object.
292 |            Provided mainly as an example, since the parser's main aim to to read
293 |            in from zip or htm, and to output csv or the Chat object."""
294 |         with open(filename, "r") as f:
295 |             self.Chat = pickle.load(f)
296 |         return self.Chat
297 | 


--------------------------------------------------------------------------------
/fb_chat.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | 
  3 | 
  4 | class Chat(object):
  5 |     """An object to encapsulate the entire Facebook Message history.
  6 | 
  7 |         - Contains a list of Thread objects, which can be accessed using item
  8 |           accessing Chat["Thread Name"] style.
  9 |         - When initialising, 'myname' should be the name of the user, and 'threads'
 10 |           should be a list of Thread objects.
 11 |         - Provides useful functions for accessing messages."""
 12 | 
 13 |     def __init__(self, myname, threads):
 14 |         self.threads = sorted(threads, key=len, reverse=True)
 15 |         self._thread_dict = {", ".join(thread.people): thread for thread in self.threads}
 16 |         self._total_messages = len(self.all_messages())
 17 |         self._myname = myname
 18 |         self._all_people = {myname}
 19 |         for thread in self.threads:
 20 |             self._all_people.update(thread.people)
 21 | 
 22 |     def __getitem__(self, key):
 23 |         """Allow accessing Thread objects in the list using Chat["Thread Name"].
 24 | 
 25 |            This method allows the threads list to be accessed using Chat["Thread Name"]
 26 |            or Chat[n] notation."""
 27 |         if type(key) is int:
 28 |             return self.threads[key]
 29 |         elif type(key) is str:
 30 |             return self._thread_dict[key]
 31 | 
 32 |     def __repr__(self):
 33 |         """Set Python's representation of the Chat object."""
 34 |         return "<{}'s CHAT LOG: TOTAL_THREADS={} TOTAL_MESSAGES={}>".format(self._myname, len(self.threads), self._total_messages)
 35 | 
 36 |     def __len__(self):
 37 |         """Return the total number of threads.
 38 | 
 39 |            Allows the len() method to be called on a Chat object. This could be
 40 |            changed to be the total number of messages, currently stored as
 41 |            Chat._total_messages()"""
 42 |         return len(self.threads)
 43 | 
 44 |     def _date_parse(self, date):
 45 |         """Allow dates to be entered as integer tuples (YYYY, MM, DD[, HH, MM]).
 46 | 
 47 |            Removes the need to supply datetime objects, but still allows dates
 48 |            to be entered as datetime.datetime objects. The Year, Month and
 49 |            Day are compulsory, the Hours and Minutes optional. May cause exceptions
 50 |            if poorly formatted tuples are used."""
 51 |         if type(date) is datetime.datetime:
 52 |             return date
 53 |         else:
 54 |             return datetime.datetime(*date)
 55 | 
 56 |     def _recount_messages(self):
 57 |         """Update the count of total messages.
 58 | 
 59 |            Since Thread objects can be extended dynamically, this may prove
 60 |            necessary."""
 61 |         self._total_messages = len(self.all_messages())
 62 | 
 63 |     def all_messages(self):
 64 |         """Return a date ordered list of all messages.
 65 | 
 66 |            The list is all messages contained in the Chat object, as a list of
 67 |            Message objects."""
 68 |         return sorted([message for thread in self.threads for message in thread.messages])
 69 | 
 70 |     def all_from(self, name):
 71 |         """Return a date ordered list of all messages sent by 'name'.
 72 | 
 73 |            The list returned is a list of Message objects. This is distinct from
 74 |            Thread.by(name) since all threads are searched by this method. For all
 75 |            messages in one thread from 'name', use Thread.by(name) on the correct Thread."""
 76 |         return sorted([message for thread in self.threads for message in thread.by(name)])
 77 | 
 78 |     def sent_before(self, date):
 79 |         """Return a date ordered list of all messages sent before specified date.
 80 | 
 81 |            The function returns a list of Message objects. The 'date' can be a
 82 |            datetime.datetime object, or a three or five tuple (YYYY, MM, DD[, HH, MM])."""
 83 |         return sorted([message for thread in self.threads for message in thread.sent_before(date)])
 84 | 
 85 |     def sent_after(self, date):
 86 |         """Return a date ordered list of all messages sent after specified date.
 87 | 
 88 |            The list returned is a list of Message objects. The 'date' can be a
 89 |            datetime.datetime object, or a three or five tuple (YYYY, MM, DD[, HH, MM])."""
 90 |         return sorted([message for thread in self.threads for message in thread.sent_after(date)])
 91 | 
 92 |     def sent_between(self, start, end=None):
 93 |         """Return a date ordered list of all messages sent between specified dates.
 94 | 
 95 |             - The list returned is a list of Message objects. The 'start' and 'end'
 96 |               can be datetime.datetime objects, or a three or five tuple
 97 |               (YYYY, MM, DD[, HH, MM]).
 98 |             - Not entering an 'end' date is interpreted as all messages sent on
 99 |               the day 'start'. Where a time is specified also, a 24 hour period
100 |               beginning at 'start' is used."""
101 |         return sorted([message for thread in self.threads for message in thread.sent_between(start, end)])
102 | 
103 |     def search(self, string, ignore_case=False):
104 |         """Return a date ordered list of all messages containing 'string'.
105 | 
106 |            This function searches in all threads, and returns a list of Message
107 |            objects.
108 |             - The function can be made case-insensitive by setting 'ignore_case'
109 |               to True."""
110 |         return sorted([message for thread in self.threads for message in thread.search(string, ignore_case)])
111 | 
112 |     def on(self, date):
113 |         """Return the Chat object as it would have been on 'date'.
114 | 
115 |            The Chat object returned is a new object containing the subset of the
116 |            Threads which contain messages sent before 'date', where each of these
117 |            Threads is a new Thread with only these messages in.
118 |            - 'date' can be a datetime.datetime object, or a three or five tuple
119 |               (YYYY, MM, DD[, HH, MM])."""
120 |         threads_on = [t.on(date) for t in self.threads if len(t.on(date)) > 0]
121 |         return Chat(self._myname, threads_on)
122 | 
123 | 
class Thread(object):
    """A single Facebook Message thread: its participants and their messages.

        - 'people' holds the list of participant names and 'people_str' the
          same names joined with ", ".
        - 'messages' holds the thread's Message objects in sorted order.
        - When initialising, 'people' should be a list of name strings and
          'messages' a list of Message objects."""

    def __init__(self, people, messages):
        self.people = people
        self.people_str = ", ".join(people)
        self.messages = sorted(messages)

    def __getitem__(self, key):
        """Give Thread[n] (and slice) access to the underlying messages list.

           Watch for off by one errors: message numbers count from 1 while the
           list holding them is indexed from 0.
            - Subtracting one from the key would fix that but complicates
              slicing; numbering messages from 0 would be the alternative."""
        return self.messages[key]

    def __repr__(self):
        """Set Python's representation of the Thread object."""
        template = '<THREAD: PEOPLE={}, MESSAGE_COUNT={}>'
        return template.format(self.people_str, len(self.messages))

    def __len__(self):
        """Return how many messages the thread holds."""
        return len(self.messages)

    def _add_messages(self, new_messages):
        """Append 'new_messages' to the thread and restore sorted order.

           Useful when merging duplicate threads into one."""
        self.messages += new_messages
        self.messages = sorted(self.messages)

    def _renumber_messages(self):
        """Number the messages 1, 2, 3... following their current list order.

           Messages are sorted when added, but after _add_messages() the
           stored numbers may no longer match the order; this resets them."""
        for position, message in enumerate(self.messages, 1):
            message._num = position

    def by(self, name):
        """Return the messages in this thread sent by 'name', in thread order.

           Returns a list of Message objects."""
        return [msg for msg in self.messages if msg.sent_by(name)]

    def sent_before(self, date):
        """Return the messages in this thread sent before 'date', in thread order.

           Returns a list of Message objects. 'date' can be a datetime.datetime
           object, or a three or five tuple (YYYY, MM, DD[, HH, MM])."""
        return [msg for msg in self.messages if msg.sent_before(date)]

    def sent_after(self, date):
        """Return the messages in this thread sent after 'date', in thread order.

           Returns a list of Message objects. 'date' can be a datetime.datetime
           object, or a three or five tuple (YYYY, MM, DD[, HH, MM])."""
        return [msg for msg in self.messages if msg.sent_after(date)]

    def sent_between(self, start, end=None):
        """Return the messages in this thread sent between 'start' and 'end'.

            - Returns a list of Message objects. 'start' and 'end' can be
              datetime.datetime objects, or a three or five tuple
              (YYYY, MM, DD[, HH, MM]).
            - Leaving out 'end' selects the messages sent on the day 'start';
              if 'start' includes a time, the 24 hours beginning at 'start'."""
        return [msg for msg in self.messages if msg.sent_between(start, end)]

    def search(self, string, ignore_case=False):
        """Return a sorted list of this thread's messages containing 'string'.

           Only the current thread is searched; the result is a list of
           Message objects.
            - Set 'ignore_case' to True for a case-insensitive search."""
        matches = [msg for msg in self.messages if msg.contains(string, ignore_case)]
        return sorted(matches)

    def on(self, date):
        """Return the Thread as it would have been on 'date'.

           The result is a new Thread with the same people, holding only the
           messages sent before 'date'.
           - 'date' can be a datetime.datetime object, or a three or five tuple
              (YYYY, MM, DD[, HH, MM])."""
        earlier = self.sent_before(date)
        return Thread(self.people, earlier)
222 | 
223 | 
class Message(object):
    """An object to encapsulate a Facebook Message.

        - Contains a string of the author's name, the timestamp, number in the thread
          and the body of the message.
        - When initialising, 'thread_name' should be the containing Thread.people_str,
          'author' should be string containing the message sender's name, 'date_time'
          should be a datetime.datetime object, 'text' should be the content of
          the message and 'num' should be the number of the message in the thread."""

    # When two messages share a timestamp and their numbers differ by more than
    # this, they are taken to sit either side of a join between two exported
    # groups of messages, where the numbering restarts.
    _GROUP_JOIN_GAP = 9000

    def __init__(self, thread, author, date_time, text, num):
        self.thread_name = thread
        self.author = author
        self.date_time = date_time
        self.text = text
        self._num = num

    def __repr__(self):
        """Set Python's representation of the Message object."""
        return '<MESSAGE: THREAD={} NUMBER={} TIMESTAMP={} AUTHOR={} MESSAGE="{}">'.\
            format(self.thread_name, self._num, self.date_time, self.author, self.text)

    def __str__(self):
        """Return a string form of a Message in format required for csv output."""
        out = '"' + self.thread_name + '","' + str(self._num) + '","' + self.author + '","' + str(self.date_time) + '","' + self.text + '"\n'
        return out

    def __lt__(self, message):
        """Allow sorting of messages by implementing the less than operator.

           Sorting is by date, unless two messages were sent at the same time,
           in which case message number is used to resolve conflicts. This number
           ordering holds fine for messages in single threads, but offers no real
           objective order outside a thread.
            - Where the numbers are very far apart the two messages are assumed
              to straddle a join between two groups of messages, and the one
              with the LARGER number is treated as the earlier one."""
        if self.date_time == message.date_time:
            if abs(self._num - message._num) > self._GROUP_JOIN_GAP:
                # Either side of a group join: larger number actually smaller here!
                return self._num > message._num
            return self._num < message._num
        return self.sent_before(message.date_time)

    def __gt__(self, message):
        """Allow sorting of messages by implementing the greater than operator.

           The exact mirror of the less than operator: by date, then by message
           number, with the number comparison reversed where the two numbers
           are far enough apart to indicate a join between two groups."""
        if self.date_time == message.date_time:
            if abs(self._num - message._num) > self._GROUP_JOIN_GAP:
                # Either side of a group join: smaller number actually larger here!
                return self._num < message._num
            return self._num > message._num
        return self.sent_after(message.date_time)

    def __eq__(self, message):
        """Messages are equal if their number, date, author and text are the same.

           Objects lacking any of those attributes are simply not equal,
           rather than causing an AttributeError."""
        try:
            return ((self._num == message._num) and (self.author == message.author) and
                    (self.date_time == message.date_time) and (self.text == message.text))
        except AttributeError:
            return NotImplemented

    def __ne__(self, message):
        """Keep '!=' the exact opposite of '==' (not automatic on Python 2)."""
        equal = self.__eq__(message)
        if equal is NotImplemented:
            return NotImplemented
        return not equal

    def __len__(self):
        """Return the number of characters in the message body."""
        text = self.text.replace("<|NEWLINE|>", "")  # Undo adding extra characters
        text = text.replace('""', '"')  # And escaping quote marks
        return len(text)

    def _date_parse(self, date):
        """Allow dates to be entered as integer tuples (YYYY, MM, DD[, HH, MM]).

           Removes the need to supply datetime objects, but still allows dates
           to be entered as datetime.datetime objects (or subclasses of it). The
           Year, Month and Day are compulsory, the Hours and Minutes optional.
           May cause exceptions if poorly formatted tuples are used."""
        if isinstance(date, datetime.datetime):
            return date
        return datetime.datetime(*date)

    def sent_by(self, name):
        """Return True if the message was sent by 'name'."""
        return self.author == name

    def sent_before(self, date):
        """Return True if the message was sent before the date specified.

           The 'date' can be a datetime.datetime object, or a three or five tuple
           (YYYY, MM, DD[, HH, MM])."""
        date = self._date_parse(date)
        return self.date_time < date

    def sent_after(self, date):
        """Return True if the message was sent after the date specified.

           The 'date' can be a datetime.datetime object, or a three or five tuple
           (YYYY, MM, DD[, HH, MM])."""
        date = self._date_parse(date)
        return self.date_time > date

    def sent_between(self, start, end=None):
        """Return True if the message was sent between the dates specified.

            - The 'start' and 'end' can be datetime.datetime objects, or
              a three or five tuple (YYYY, MM, DD[, HH, MM]). The start and end times
              are inclusive since this is simplest.
            - Not entering an 'end' date is interpreted as all messages sent on
              the day 'start'. Where a time is specified also, a 24 hour period
              beginning at 'start' is used."""
        start = self._date_parse(start)
        if end is not None:
            end = self._date_parse(end)
        else:
            end = start + datetime.timedelta(1)  # 1 day (24 hours) later than 'start'
        return start <= self.date_time <= end

    def contains(self, search_string, ignore_case=False):
        """Return True if 'search_string' is contained in the message text."""
        if ignore_case:
            return search_string.lower() in self.text.lower()
        else:
            return search_string in self.text
345 | 


--------------------------------------------------------------------------------
/fb_analysis.py:
--------------------------------------------------------------------------------
  1 | import datetime
  2 | import matplotlib.pyplot as plt
  3 | from matplotlib.dates import date2num, num2date
  4 | from matplotlib import ticker
  5 | import matplotlib
  6 | import re
  7 | 
  8 | # =============================================================================
  9 | #                          Top N Most Messaged People                         #
 10 | #                                                                             #
 11 | # Public Functions:                                                           #
 12 | #  - top_n_people(Chat, N, count_type, groups)                                #
 13 | #                                                                             #
 14 | # =============================================================================
 15 | 
# The 'count_type' strings that top_n_people() distinguishes between. Any value
# it does not match explicitly is counted the same way as "total".
_COUNT_TYPES = ["total", "to", "from", "allfrom", "words", "wordsfrom", "wordsto",
                "chars", "charsfrom", "charsto"]
 18 | 
 19 | 
 20 | def _update_thread_dict(thread_dict, thread_name, num):
 21 |     """Add new entries to count dictionary, dealing with duplicates carefully."""
 22 |     if thread_name not in thread_dict:
 23 |                 thread_dict.update({thread_name: num})
 24 |     else:  # Deal with duplicates, otherwise old entries get overwritten:
 25 |         thread_dict[thread_name] += num
 26 | 
 27 | 
 28 | def top_n_people(Chat, N=-1, count_type="total", groups=False):
 29 |     """Return a list of the top N most messaged people.
 30 | 
 31 |        The "Top N People" can be judged by one of four criteria. The list
 32 |        contains tuples of (name, message count). A negative or zero value for
 33 |        N returns the full list, this is the default. The optional argument
 34 |        'groups' allows group conversations to be included where this makes
 35 |        sense. The 'count_type' argument can be one of four values:
 36 |         - "total" - the default. This counts the total number of messages in
 37 |           message threads, and sorts by this. Groups can be enabled.
 38 |         - "to" - the total number of messages sent in a direct thread by
 39 |           the current user: '_myname'. Groups can be enabled.
 40 |         - "from" - the total number of messages sent in a direct thread by
 41 |           the other person in the thread. If 'groups' is enabled, all messages
 42 |           not from '_myname' are counted.
 43 |         - "allfrom" - the total number of messages from each individual person
 44 |           across all threads. Groups cannot be enabled and will be ignored."""
 45 |     thread_dict = {}
 46 |     if count_type is "to":
 47 |         # Count the number of messages sent directly to each person.
 48 |         for t in Chat.threads:
 49 |             num = len(t.by(Chat._myname))
 50 |             _update_thread_dict(thread_dict, t.people_str, num)
 51 |     elif count_type is "from":
 52 |         # Count the number of messages received directly from each person.
 53 |         for t in Chat.threads:
 54 |             my_num = len(t.by(Chat._myname))
 55 |             tot_num = len(t)
 56 |             num = tot_num - my_num
 57 |             _update_thread_dict(thread_dict, t.people_str, num)
 58 |     elif count_type is "allfrom":
 59 |         # Count all messages in all threads received from each person.
 60 |         all_people = Chat._all_people.copy()
 61 |         all_people.remove(Chat._myname)  # Remove _myname from all_people (but not the original!):
 62 |         for p in all_people:
 63 |             num = len(Chat.all_from(p))
 64 |             thread_dict.update({p: num})
 65 |     elif count_type is "words":
 66 |         # Count total number of words exchanged in threads.
 67 |         for t in Chat.threads:
 68 |             num = 0
 69 |             for m in t.messages:
 70 |                 num += len(re.findall(r'\S+', m.text))  # Matches any non-whitespace sub-string
 71 |                 # num += len(m.text.split(" "))  # Counts all things separated by a space
 72 |             _update_thread_dict(thread_dict, t.people_str, num)
 73 |     elif count_type is "wordsfrom":
 74 |         # Count total number of words sent by other people in threads.
 75 |         for t in Chat.threads:
 76 |             num = 0
 77 |             for m in t.messages:
 78 |                 if not m.sent_by(Chat._myname):
 79 |                     num += len(re.findall(r'\S+', m.text))
 80 |             _update_thread_dict(thread_dict, t.people_str, num)
 81 |     elif count_type is "wordsto":
 82 |         # Count total number of words sent to the other people in threads.
 83 |         for t in Chat.threads:
 84 |             num = 0
 85 |             for m in t.messages:
 86 |                 if m.sent_by(Chat._myname):
 87 |                     num += len(re.findall(r'\S+', m.text))
 88 |             _update_thread_dict(thread_dict, t.people_str, num)
 89 |     elif count_type is "chars":
 90 |         # Count total number of characters exchanged in threads.
 91 |         for t in Chat.threads:
 92 |             num = 0
 93 |             for m in t.messages:
 94 |                 num += len(m)
 95 |             _update_thread_dict(thread_dict, t.people_str, num)
 96 |     elif count_type is "charsfrom":
 97 |         # Count total number of characters sent by other people in threads.
 98 |         for t in Chat.threads:
 99 |             num = 0
100 |             for m in t.messages:
101 |                 if not m.sent_by(Chat._myname):
102 |                     num += len(m)
103 |             _update_thread_dict(thread_dict, t.people_str, num)
104 |     elif count_type is "charsto":
105 |         # Count total number of characters sent to the other people in threads.
106 |         for t in Chat.threads:
107 |             num = 0
108 |             for m in t.messages:
109 |                 if m.sent_by(Chat._myname):
110 |                     num += len(m)
111 |             _update_thread_dict(thread_dict, t.people_str, num)
112 |     else:
113 |         # Else the default: count the total messages in each thread.
114 |         for t in Chat.threads:
115 |             num = len(t)
116 |             _update_thread_dict(thread_dict, t.people_str, num)
117 |     sorted_list = sorted(thread_dict.items(), key=lambda tup: tup[1], reverse=True)
118 |     top_n = []
119 |     for i, item in enumerate(sorted_list):
120 |         if ((len(top_n) >= N) and (N > 0)):
121 |             return top_n
122 |         if ((len(item[0].split(", ")) == 1) or groups):
123 |             top_n.append((item[0], item[1]))
124 |     return top_n
125 | 
126 | 
127 | # =============================================================================
128 | #                           Graphing Message Counts                           #
129 | #                                                                             #
130 | # Public Functions:                                                           #
131 | #  - use_facebook_colours()                                                   #
132 | #  - use_ios_colours()                                                        #
133 | #  - messages_time_graph(Chat, name, filename, no_gui)                        #
134 | #  - messages_date_graph(Chat, name, filename, start_date, end_date, no_gui)  #
135 | #  - messages_pie_chart(Chat, N, filename, count_type, groups,                #
136 | #                                                        no_gui, percentages) #
137 | #                                                                             #
138 | # =============================================================================
139 | 
# Some useful colours, each a tuple of three floats between 0 and 1
# (presumably red, green, blue, as matplotlib colour tuples):
_FB_BLUE = (0.2314, 0.3490, 0.5961)
_FB_GREY = (0.9294, 0.9294, 0.9294)
_IOS_GREEN = (0.5451, 0.8235, 0.2824)
_IOS_GREY = (0.8980, 0.8980, 0.9176)

# The colours used by the code:
_BG_COLOUR = (1.0, 1.0, 1.0)
_TEXT_COLOUR = (0.0, 0.0, 0.0)
# These two start as None and are assigned by _change_graph_colours():
_MY_COLOUR = None
_OTHER_COLOUR = None
151 | 
152 | 
def _change_matplotlib_colours(text_color=_TEXT_COLOUR, bg_colour=_BG_COLOUR):
    """Change matplotlib default colors for ALL graphs produced in current session.

        - 'text_color' sets the colour of all text, as well as axes colours,
          grid colour and axis tick mark colours. Defaults to _TEXT_COLOUR.
        - 'bg_colour' changes the background and outside fill colour of the plot,
          both onscreen and in saved figures. Defaults to _BG_COLOUR."""
    # Use the arguments passed in, not the module-level defaults, so that a
    # caller supplying different colours actually gets them.
    matplotlib.rc('figure', facecolor=bg_colour)
    matplotlib.rc('savefig', facecolor=bg_colour, edgecolor=text_color)
    matplotlib.rc('axes', edgecolor=text_color, facecolor=bg_colour, labelcolor=text_color)
    matplotlib.rc('text', color=text_color)
    matplotlib.rc('grid', color=text_color)
    matplotlib.rc('xtick', color=text_color)
    matplotlib.rc('ytick', color=text_color)
166 | 
167 | 
168 | def _change_graph_colours(my_colour, other_colour):
169 |     """Change the colours used in histograms, both self colour and the other person colour."""
170 |     global _MY_COLOUR, _OTHER_COLOUR
171 |     _MY_COLOUR = my_colour
172 |     _OTHER_COLOUR = other_colour
173 | 
174 | 
def use_facebook_colours():
    """Switch the graph colours to Facebook's: blue for self, grey for others."""
    _change_graph_colours(_FB_BLUE, _FB_GREY)
178 | 
179 | 
def use_ios_colours():
    """Switch the graph colours to iOS's: green for self, grey for others."""
    _change_graph_colours(_IOS_GREEN, _IOS_GREY)
183 | 
184 | 
# Run the colour change code on import of the module: select the Facebook
# graph colours, then apply the default text and background colours to matplotlib.
use_facebook_colours()
_change_matplotlib_colours()
188 | 
189 | 
190 | # ====== Histogram of Time of Day:
191 | 
192 | 
193 | def _hour_list():
194 |     """Generate a list containing hours in day converted to floats."""
195 |     hours_bins = [n / 24.0 for n in range(0, 25)]
196 |     return hours_bins
197 | 
198 | 
def _dt_to_decimal_time(datetime):
    """Convert a datetime.datetime object into a fraction of a day float.

       The date is converted to matplotlib's floating point day count with
       date2num(); discarding the whole-day part leaves how far through the
       day the time is."""
    days = date2num(datetime)
    whole_days = int(days)
    return days - whole_days
207 | 
208 | 
def messages_time_graph(Chat, name=None, filename=None, no_gui=False):
    """Create a graph of the time of day of messages sent between users.

       Produces a stacked histogram, in hourly bins, of the times of messages sent
       to and received from another user. The method only works for individuals,
       not for threads between multiple friends.

       - 'Chat' should be the Chat object to analyse.
       - 'name' should be the name of the user, and so the Thread, to be graphed.
         A special case is when 'name' is the name of the current user (which is
         also the default when 'name' is None), in which case every message is
         graphed, split into those written by the current user and by "Others".
       - If a 'filename' string is specified, the figure is also saved to that file.
       - Setting 'no_gui' to True turns off pyplot's interactive mode before the
         figure is created. If no filename is specified with this, the function
         will run but produce no output anywhere."""
    # Implement a default case:
    if name is None:
        name = Chat._myname
    # Divide up into hourly bins, changing datetime objects to times in range [0,1):
    bins = _hour_list()
    # If looking at graph with other users, get messages to and from:
    if name != Chat._myname:
        Thread = Chat[name]
        times_from = [_dt_to_decimal_time(message.date_time) for message in Thread.by(name)]
        times_to = [_dt_to_decimal_time(message.date_time) for message in Thread.by(Chat._myname)]
        label = [Chat._myname, name]
    else:  # If looking at all messages sent; do things differently:
        # Fetch the combined message list once and split it by author:
        all_messages = Chat.all_messages()
        times_from = [_dt_to_decimal_time(message.date_time) for message in all_messages
                      if message.author != Chat._myname]
        times_to = [_dt_to_decimal_time(message.date_time) for message in all_messages
                    if message.author == Chat._myname]
        label = [Chat._myname, "Others"]
    # Create the figure, hiding the display if no_gui set:
    if no_gui:
        plt.ioff()
    plt.figure(figsize=(18, 9), dpi=80)
    plt.hist([times_to, times_from], bins, histtype='bar', color=[_MY_COLOUR, _OTHER_COLOUR],
             label=label, stacked=True)
    # Title the graph correctly, and label axes:
    if name != Chat._myname:
        plt.suptitle("Messages with " + name, size=18)
    else:
        plt.suptitle("All Messages Sent", size=18)
    plt.xlabel("Time of Day", labelpad=20, size=15)
    plt.ylabel("Number of Messages", labelpad=20, size=15)
    # Move tick marks to centre of hourly bins by adding ~ half an hour (in days)
    axes = plt.gca()
    axes.set_xticks([b + 0.02 for b in bins])
    # Place tickmarks
    plt.xticks(rotation=0, ha='center')
    # Change the tick marks from useless fraction through day, to recognisable times.
    # strftime is used to turn the tick value into a string; the whole-day offset
    # (693596, the ordinal of 1 Jan 1900) only moves the date part to a year that
    # strftime accepts and leaves the time of day untouched. The half hour added
    # above to centre the ticks is taken off again (minus 0.02).
    axes.xaxis.set_major_formatter(ticker.FuncFormatter(
        lambda numdate, _: num2date(numdate + 693596 - 0.02).strftime('%H:%M')))
    # Add some space at either end of the graph (axis in number of days, so +- 15 mins):
    plt.xlim([bins[0] - 0.01, bins[-1] + 0.01])
    # Place y gridlines beneath the plot:
    axes.yaxis.grid(True)
    axes.set_axisbelow(True)
    # Hide unnecessary borders and tickmarks:
    axes.spines['right'].set_visible(False)
    axes.spines['top'].set_visible(False)
    axes.yaxis.set_ticks_position('left')
    # tick_params expects booleans; the string 'off' is truthy and so would leave
    # the tick marks switched on in current matplotlib versions.
    plt.tick_params(axis='x', which='both', bottom=False, top=False)
    # Add the legend at the top, underneath the title but outside the figure:
    plt.legend(frameon=False, bbox_to_anchor=(0.5, 1.05), loc=9, ncol=2, borderaxespad=0)
    # If given a filename, output to file:
    if isinstance(filename, str):
        plt.savefig(filename, bbox_inches='tight')
275 | 
276 | 
277 | # ====== Histogram of Date:
278 | 
279 | 
def _month_list(d1, d2):
    """Return the first day of every month from d1's month to the month after d2's.

       The month following d2 is included so the list can be used as the bin
       edges of a histogram whose final bin is d2's own month. If that closing
       edge lies before d1's month, the list is empty."""
    # Number the months consecutively (year * 12 + zero-based month) so that the
    # December to January rollover needs no special handling:
    first_index = d1.year * 12 + (d1.month - 1)
    closing_index = d2.year * 12 + d2.month
    return [datetime.datetime(index // 12, index % 12 + 1, 1)
            for index in range(first_index, closing_index + 1)]
300 | 
301 | 
def messages_date_graph(Chat, name=None, filename=None, start_date=None, end_date=None, no_gui=False):
    """Create a graph of the number of messages sent between users.

       Produces a stacked histogram, in monthly bins, of messages sent to and
       received from another user. The method only works for individuals, not
       for threads between multiple friends.

       - 'Chat' should be the Chat object to analyse.
       - 'name' should be the name of the user, and so the Thread, to be graphed.
         A special case is when 'name' is the name of the current user (which is
         also the default when 'name' is None), in which case every message is
         graphed, split into those written by the current user and by "Others".
       - If a 'filename' string is specified, the figure is also saved to that file.
       - 'start_date' and 'end_date' can be used to narrow the range of dates
         covered; the default is the first message to the last, but specifying dates
         inside this range can be used to narrow down the region considered. Both
         are passed through Chat._date_parse(); if both are given in the wrong
         order they are swapped.
       - Setting 'no_gui' to True turns off pyplot's interactive mode before the
         figure is created. If no filename is specified with this, the function
         will run but produce no output anywhere."""
    # Implement a default case:
    if name is None:
        name = Chat._myname
    # Parse the optional limits first, so that the sanity check below compares
    # real dates rather than whatever raw form the caller supplied:
    range_start = None if start_date is None else Chat._date_parse(start_date)
    range_end = None if end_date is None else Chat._date_parse(end_date)
    if range_start is not None and range_end is not None and range_start > range_end:
        range_start, range_end = range_end, range_start
    # If looking at graph with other users, get messages to and from:
    if name != Chat._myname:
        Thread = Chat[name]
        first_date, last_date = Thread[0].date_time, Thread[-1].date_time
        dates_from = [date2num(message.date_time) for message in Thread.by(name)]
        dates_to = [date2num(message.date_time) for message in Thread.by(Chat._myname)]
        label = [Chat._myname, name]
    # If looking at all messages sent; do things differently:
    else:
        message_list = Chat.all_messages()
        first_date, last_date = message_list[0].date_time, message_list[-1].date_time
        dates_from = [date2num(message.date_time) for message in message_list
                      if message.author != Chat._myname]
        dates_to = [date2num(message.date_time) for message in message_list
                    if message.author == Chat._myname]
        label = [Chat._myname, "Others"]
    # If a start date given (which is after the messages start), use it:
    d_min = first_date if range_start is None else max(range_start, first_date)
    # If an end date given (which is before the messages end), use it:
    d_max = last_date if range_end is None else min(range_end, last_date)
    # Divide up into month bins, changing datetime objects to number of days for plotting:
    bins = [date2num(b) for b in _month_list(d_min, d_max)]
    # Create the figure, hiding the display if no_gui set:
    if no_gui:
        plt.ioff()
    plt.figure(figsize=(18, 9), dpi=80)
    plt.hist([dates_to, dates_from], bins, histtype='bar', color=[_MY_COLOUR, _OTHER_COLOUR],
             label=label, stacked=True)
    # Title the graph correctly, and label axes:
    if name != Chat._myname:
        plt.suptitle("Messages with " + name, size=18)
    else:
        plt.suptitle("All Messages Sent", size=18)
    plt.ylabel("Number of Messages", labelpad=20, size=15)
    # Put the tick marks at the rough centre of months by adding 15 days (~ 1/2 a month):
    axes = plt.gca()
    axes.set_xticks([b + 15 for b in bins])
    # The x labels are unreadable at an angle if there are too many of them, put them vertical if so:
    if len(bins) > 45:
        plt.xticks(rotation='vertical')
    else:
        plt.xticks(rotation=30, ha='right')
    # Change the tick marks from useless number of days, to recognisable dates:
    axes.xaxis.set_major_formatter(ticker.FuncFormatter(
        lambda numdate, _: num2date(numdate).strftime('%b %Y')))
    # Add some space at either end of the graph (axis in number of days, so -10 days and +5 days):
    plt.xlim([bins[0] - 10, bins[-1] + 5])
    # Place y gridlines beneath the plot:
    axes.yaxis.grid(True)
    axes.set_axisbelow(True)
    # Hide unnecessary borders and tickmarks:
    axes.spines['right'].set_visible(False)
    axes.spines['top'].set_visible(False)
    axes.yaxis.set_ticks_position('left')
    # tick_params expects booleans; the string 'off' is truthy and so would leave
    # the tick marks switched on in current matplotlib versions.
    plt.tick_params(axis='x', which='both', bottom=False, top=False)
    # Add the legend at the top, underneath the title but outside the figure:
    plt.legend(frameon=False, bbox_to_anchor=(0.5, 1.05), loc=9, ncol=2, borderaxespad=0)
    # If given a filename, output to file:
    if isinstance(filename, str):
        plt.savefig(filename, bbox_inches='tight')
395 | 
396 | 
397 | # ====== Pie Chart of Totals:
398 | 
399 | 
# Colours from http://www.mulinblog.com/a-color-palette-optimized-for-data-visualization/
# The pie chart takes these in order for its wedges, cycling back to the first
# colour when there are more wedges than colours.
_COLOURS = ['#5DA5DA', '#FAA43A', '#60BD68', '#F17CB0', '#B2912F', '#B276B2', '#DECF3F', '#F15854']
402 | 
403 | 
def _make_labels_wrap(labels):
    """Put each name of a long, multi-name label on a line of its own.

       Any label longer than 25 characters has every ", " separator replaced by
       a newline. The list is modified in place and is also returned."""
    for position, label in enumerate(labels):
        if len(label) <= 25:
            continue
        labels[position] = label.replace(", ", "\n")
    return labels
411 | 
412 | 
def messages_pie_chart(Chat, N=10, filename=None, count_type="total", groups=False,
                       no_gui=False, percentages=True):
    """Create a pie chart of the number of messages exchanged with friends.

       The graph shows the most messaged friends sorted using the top_n_people()
       code. The graph also shows percentage sizes of wedges, though this can be disabled.
        - 'Chat' should be the Chat object to analyse.
        - 'N' should be how many people to show explicitly; all others are grouped
          together in a final "Others" chunk, which is left out if nothing remains
          to be grouped.
        - If a 'filename' string is specified, the figure is also saved to that file.
        - The 'count_type' argument is passed to top_n_people() and also selects
          the title of the graph, so it must be one of: "total", "allfrom", "from",
          "to", "words", "wordsfrom", "wordsto", "chars", "charsfrom", "charsto".
        - Setting 'groups' to True will include message threads with groups where
          appropriate.
        - Setting 'no_gui' to True turns off pyplot's interactive mode before the
          figure is created. If no filename is specified with this, the function
          will run but produce no output anywhere.
        - The percentages on the graph can be removed by setting 'percentages' to
          False.

       Raises KeyError if 'count_type' has no title defined for it."""
    # The title of the graph depends on the count_type:
    _title_dict = {"total": "Total Lengths of Message Threads",
                   "allfrom": "Total Number of Messages Received",
                   "from": "Number of Messages Received from People in Personal Threads",
                   "to": "Number of Messages Sent to People in Personal Threads",
                   "words": "Total Word Counts of Message Threads",
                   "wordsfrom": "Word Count of All Messages Received from People in Personal Threads",
                   "wordsto": "Word Count of All Messages Sent to People in Personal Threads",
                   "chars": "Total Character Lengths of Message Threads",
                   "charsfrom": "Character Length of All Messages Received from People in Personal Threads",
                   "charsto": "Character Length of All Messages Sent to People in Personal Threads"}
    # The data to plot:
    thread_counts = top_n_people(Chat, count_type=count_type, groups=groups)
    # Set up useful lists and counts:
    names = []
    counts = []
    other_count = 0
    colours = []
    # Run through the data, adding it to the correct list. If not in N, add to Other:
    for n, t in enumerate(thread_counts):
        if n < N:
            names.append(t[0])
            counts.append(t[1])
            colours.append(_COLOURS[n % len(_COLOURS)])
        else:
            other_count += t[1]
    # Add an "Others" section in dark grey using the other_count. Skip it when
    # nothing was left over, so no zero-sized "0.0%" wedge is drawn; keep it if
    # there would otherwise be no wedges at all.
    if other_count or not counts:
        names.append("Others")
        counts.append(other_count)
        colours.append('#4D4D4D')
    # If long names, wrap them:
    _make_labels_wrap(names)
    # Create the figure, hiding the display if no_gui set:
    if no_gui:
        plt.ioff()
    plt.figure(figsize=(18, 9), dpi=80)
    # Plot percentage counts on the figure:
    if percentages:
        pct = '%1.1f%%'
    else:
        pct = None
    # We want the edges of the wedges in the chart to be white for aesthetics.
    # This changes a global default, so remember the old value and always put
    # it back, even if plotting fails part way through:
    previous_edgecolour = plt.rcParams['patch.edgecolor']
    plt.rcParams['patch.edgecolor'] = 'white'
    try:
        # Make the plot, starting at the top (90 degrees from horizontal) and
        # percentages outside (pctdist > 1)
        plt.pie(counts, colors=colours, autopct=pct, pctdistance=1.1, startangle=90, counterclock=False)
        # Put the right title on the graph:
        plt.suptitle(_title_dict[count_type], size=18)
        # And make it circular:
        plt.axis('equal')
        # Add the legend:
        plt.legend(labels=names, frameon=False, labelspacing=1, loc="center", bbox_to_anchor=[0, 0.5])
        # If given a filename, output to file:
        if isinstance(filename, str):
            plt.savefig(filename, bbox_inches='tight')
    finally:
        plt.rcParams['patch.edgecolor'] = previous_edgecolour
487 | 
488 | 
489 | # =============================================================================
490 | #                           Word Frequency Analysis                           #
491 | #                                                                             #
492 | # Public Functions:                                                           #
493 | #  - top_word_use(Chat, name, from_me, ignore_single_words)                   #
494 | #                                                                             #
495 | # =============================================================================
496 | 
497 | 
def _str_to_word_list(text):
    """Turn a string into a list of words, removing URLs and punctuation.

       - The function takes in a string and returns a list of strings.
       - The text is lowercased, and emoticons are replaced by placeholder words
         (e.g. ":)" becomes "happyfacesmiley") so they survive the removal of
         punctuation and can be counted like any other word.
       - Non-ASCII characters are replaced by "?"."""
    # Some characters and strings need replacing with a space to separate messages into proper words:
    _EXCLUDE = ["'s", "'ll", ".", ",", ":", ";", "!", "?", "*", '"', "-", "+", "^", "_", "~", "(", ")", "[", "]", "/", "\\", "@", "="]
    # Emoticons are swapped for placeholder words before any punctuation is touched:
    _EMOTICONS = {":p": "tongueoutsmiley", ":-p": "tongueoutsmiley",
                  ":)": "happyfacesmiley", ":-)": "happyfacesmiley", ":/": "awkwardfacesmiley",
                  ":-/": "awkwardfacesmiley", "<3": "loveheartsmiley", ":(": "sadfacesmiley",
                  ":-(": "sadfacesmiley", ":'(": "cryingfacesmiley", ":d": "grinningfacesmiley",
                  ":-d": "grinningfacesmiley", ";)": "winkfacesmiley", ";-)": "winkfacesmiley",
                  ":o": "shockedfacesmiley"}
    # Replace the NEWLINE denoting string with a space before anything else, else
    # a word directly after a URL and a newline is swallowed by the URL pattern:
    text = text.replace("<|NEWLINE|>", " ")
    # Remove URLs with a regular expression, else they mess up when removing punctuation:
    text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
    text = text.lower()
    # Change and exclude things:
    for old, new in _EMOTICONS.items():
        text = text.replace(old, new)
    for ex in _EXCLUDE:
        text = text.replace(ex, " ")
    # Remaining apostrophes are deleted without leaving a space ("don't" -> "dont").
    # This must come after the emoticons and _EXCLUDE, otherwise ":'(", "'s" and
    # "'ll" can never be matched:
    text = text.replace("'", "")
    # A hack to replace all whitespace with one space:
    text = " ".join(text.split())
    # Get rid of non-ASCII characters for simplicity, decoding again so that the
    # words returned are text strings rather than bytes:
    text = text.encode('ascii', 'replace').decode('ascii')
    # Return a list of words:
    return text.split()
527 | 
528 | 
def _message_list_word_list(messages):
    """Flatten a list of Message objects into one list of their words.

       The text of every message is split up with _str_to_word_list() and the
       resulting words are returned as a single list, in message order."""
    return [word for message in messages for word in _str_to_word_list(message.text)]
537 | 
538 | 
def _word_list_to_freq(words, ignore_single_words=False):
    """Take a list of strings, and return a list of (word, word_use_count).

       - The returned list of pairs is sorted in descending order of count.
       - Emoticon placeholder words (e.g. "happyfacesmiley") are turned back into
         the emoticon they stand for.
       - Passing 'ignore_single_words' will remove any words only used once in
         a message thread."""
    from collections import Counter
    # The order of items in the CHANGE dictionary means changing back isn't quite so simple; just use a second dictionary:
    _CHANGE_BACK = {"tongueoutsmiley": ":P", "happyfacesmiley": ":)", "awkwardfacesmiley": ":/",
                    "loveheartsmiley": "<3", "sadfacesmiley": ":(", "cryingfacesmiley": ":'(",
                    "grinningfacesmiley": ":D", "winkfacesmiley": ";)", "shockedfacesmiley": ":o"}
    # Count every word in a single pass; calling words.count() for each word
    # would re-scan the whole list every time:
    freq = Counter(words)
    # Change the emoticons back to emoticons:
    for new, old in _CHANGE_BACK.items():
        if new in freq:
            freq[old] = freq.pop(new)
    # Convert to a list and sort:
    freq = sorted(freq.items(), key=lambda tup: tup[1], reverse=True)
    # If only want words used more than once, remove those with count <= 1
    if ignore_single_words:
        freq = [f for f in freq if f[1] > 1]
    return freq
561 | 
562 | 
def top_word_use(Chat, name, from_me=False, ignore_single_words=False):
    """Work out the most commonly used words in a message thread.

       The function returns a list of (word, word_use_count) tuples, most used
       first. The counting is done in pure Python, so long threads can take a
       long time to process.

       - 'name' is a string of the name of the Thread to consider. If it is the
         current user's own name, all messages sent by the current user are used.
       - 'from_me' is a boolean flag: if True the messages sent by you to 'name'
         are considered, otherwise (the default) the messages received from 'name'.
       - Setting 'ignore_single_words' to True removes words which are only used
         once, which reduces the length of the list returned."""
    me = Chat._myname
    if name == me:
        messages = Chat.all_from(me)
    else:
        author = me if from_me else name
        messages = Chat[name].by(author)
    return _word_list_to_freq(_message_list_word_list(messages), ignore_single_words)
585 | 


--------------------------------------------------------------------------------