├── .gitignore ├── LICENSE ├── booksoup ├── BookSoup.py ├── Conversation.py ├── Event.py ├── Events.py ├── FbTime.py ├── Message.py ├── Sentiment.py └── __init__.py ├── demo_interaction_frequency.py ├── demo_interaction_timeline.py ├── demo_sentiment_timeline.py ├── readme.md └── t.py /.gitignore: -------------------------------------------------------------------------------- 1 | facebook-data/ 2 | .Python 3 | include/ 4 | lib/ 5 | man/ 6 | bin/ 7 | _book/ 8 | .idea/ 9 | .DS_Store 10 | booksoup/*.pyc 11 | demos/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Jake Reid Browning 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /booksoup/BookSoup.py: -------------------------------------------------------------------------------- 1 | """BookSoup.py: stores a facebook user as a python object with ability to 2 | analyse events, conversations and participants.""" 3 | __author__ = "Jake Reid Browning" 4 | __license__ = "MIT" 5 | __email__ = "jake.reid.browning@gmail.com" 6 | 7 | # -*- coding: UTF-8 -*- 8 | 9 | from Conversation import Conversation 10 | from Events import Events 11 | from Event import Event 12 | from Event import find_between 13 | from bs4 import BeautifulSoup 14 | import os 15 | import re 16 | 17 | 18 | class BookSoup: 19 | def __init__(self, path): 20 | self.__path = path 21 | self.conversations = {} 22 | self.events = Events() 23 | with open(os.path.join(path,"index.htm"), "r") as f: 24 | self.__soup = BeautifulSoup(f.read(), "html.parser") 25 | self.name = self.__soup.find("h1").text 26 | 27 | def load_all_conversations(self, interval="month"): 28 | for filename in os.listdir(os.path.join(self.__path, "messages")): 29 | if filename.endswith(".html"): 30 | contact = Conversation(os.path.join(self.__path, "messages", filename), interval=interval) 31 | self.conversations[contact.name] = contact 32 | 33 | def load_conversation(self, search_name, interval="month"): 34 | # If search_name is an integer, simply get the HTML file whose name is that integer. 35 | if isinstance(search_name, int): 36 | contact = Conversation(os.path.join(self.__path, "messages", str(search_name)+".html"), interval=interval) 37 | self.conversations[contact.name] = contact 38 | return contact 39 | 40 | # If the search name is a string, get the list of links leading to conversations, 41 | # find the relative link and load the conversation using the link path. 
42 | with open(os.path.join(self.__path,"html", "messages.htm"), "r") as infile: 43 | soup = BeautifulSoup(infile.read(), "html.parser") 44 | convo_links = soup.find_all("a", {"href": re.compile('.*messages.*')}) 45 | for link in convo_links: 46 | if link.text != search_name: 47 | continue 48 | contact = Conversation(os.path.join(self.__path, link["href"])) 49 | self.conversations[contact.name] = contact 50 | return contact 51 | return None 52 | 53 | def load_all_events(self): 54 | with open(os.path.join(self.__path,"html","events.htm"), "r") as f: 55 | # Replace any number of chained
...
tags with a single [BREAK] for easier splitting later. 56 | f_read = re.sub('(\
)+', '[BREAK]', f.read()) 57 | attending_soup = BeautifulSoup(find_between(f_read, "

Attending

", "

Maybe

"), "html.parser") 58 | maybe_soup = BeautifulSoup(find_between(f_read, "

Maybe

", "

Declined

"), "html.parser") 59 | declined_soup = BeautifulSoup(find_between(f_read, "

Declined

", "

No reply

"), "html.parser") 60 | no_reply_soup = BeautifulSoup(f_read.split("

No reply

")[1], "html.parser") 61 | 62 | self.__events_from_li(attending_soup, "attending") 63 | self.__events_from_li(maybe_soup, "maybe") 64 | self.__events_from_li(declined_soup, "declined") 65 | self.__events_from_li(no_reply_soup, "no_reply") 66 | return self.events 67 | 68 | # Each event is stored by FB as an
  • element where different types of info are separated by line breaks. 69 | # Split the info on line breaks and append the appropriate list in the events object. 70 | def __events_from_li(self, soup, attending_type): 71 | for event_li in soup.find_all("li"): 72 | event_split = event_li.text.split("[BREAK]") 73 | if len(event_split) < 4: 74 | new_event = Event(event_split[0], event_split[1], "", event_split[2]) 75 | else: 76 | new_event = Event(event_split[0], event_split[1], event_split[2], event_split[3]) 77 | if attending_type == "attending": 78 | self.events.attending.append(new_event) 79 | elif attending_type == "maybe": 80 | self.events.maybe.append(new_event) 81 | elif attending_type == "declined": 82 | self.events.declined.append(new_event) 83 | elif attending_type == "no_reply": 84 | self.events.no_reply.append(new_event) -------------------------------------------------------------------------------- /booksoup/Conversation.py: -------------------------------------------------------------------------------- 1 | """Conversation.py: stores a facebook conversation as a python object, 2 | and performs some basic analysis on interaction frequency and sentiment.""" 3 | 4 | from bs4 import BeautifulSoup 5 | from FbTime import FbTime 6 | from Sentiment import Sentiment 7 | from Message import Message 8 | 9 | 10 | class Conversation: 11 | def __init__(self, path, interval="month"): 12 | with open(path, 'r') as f: 13 | self.__soup = BeautifulSoup(f.read(), "html.parser") 14 | self.messages = [] 15 | self.name = self.__soup.find("title").text.replace("Conversation with ", "") 16 | message_headers = self.__soup.find_all("div", class_="message_header") 17 | self.__span_meta = [m.find("span", class_="meta").text for m in message_headers] 18 | self.__fbt = FbTime(self.__span_meta) 19 | 20 | for m in self.__soup.find_all("div", class_="message"): 21 | span = m.find("span", class_="meta") 22 | self.messages.append(Message(m.find("span", class_="user").text, 
self.__fbt.span_meta_to_date(span.text, interval), span.text, m.next_sibling.text)) 23 | 24 | self.__sent = Sentiment(self.messages, self.__fbt) 25 | self.participants = self.__scrape_participants() 26 | 27 | def interaction_freq(self): 28 | return self.__fbt.interaction_freq() 29 | 30 | def interaction_timeline(self, name): 31 | return self.__fbt.interaction_timeline(name, self.messages) 32 | 33 | def sentiment_timeline(self, name): 34 | return self.__sent.sentiment_timeline(name) 35 | 36 | def avg_sentiment(self, name): 37 | return self.__sent.avg_sentiment(name) 38 | 39 | # Returns a list of participants in the conversation. 40 | def __scrape_participants(self): 41 | users = [] 42 | for user_span in self.__soup.find_all("span", "user"): 43 | user_name = user_span.text 44 | if user_name not in users: 45 | users.append(user_name) 46 | return users 47 | 48 | 49 | -------------------------------------------------------------------------------- /booksoup/Event.py: -------------------------------------------------------------------------------- 1 | """Event.py: stores a facebook event as a python object and calculates the 2 | latitude and longitude of the event if available.""" 3 | 4 | class Event: 5 | def __init__(self, title, timestamp, location, description): 6 | self.title = title 7 | self.timestamp = timestamp 8 | self.location = location 9 | self.latlon = self.__latlon(location) 10 | self.description = description 11 | 12 | def __latlon(self, location): 13 | if not("(Latitude:" in location): 14 | return None 15 | lat = float(find_between(location, "(Latitude: ", ",")) 16 | lon = float(find_between(location, "Longitude: ", ")")) 17 | return [lat, lon] 18 | 19 | 20 | 21 | def find_between(s, first, last): 22 | try: 23 | start = s.index( first ) + len( first ) 24 | end = s.index( last, start ) 25 | return s[start:end] 26 | except ValueError: 27 | return "" 28 | 29 | -------------------------------------------------------------------------------- 
# /booksoup/Events.py
"""Events.py: stores four lists of Event objects depending on whether the
user marked an event as attending, maybe, declined or didn't reply."""


class Events:
    """Container for a user's events, grouped by how the user responded.

    Each constructor argument is an optional list of events for that
    response type. Any argument that is omitted (or passed as None) gives
    the instance its own new empty list.
    """

    def __init__(self, attending=None, maybe=None, declined=None, no_reply=None):
        # Default argument values are evaluated once, when the function is
        # defined. A literal [] default would therefore be one list shared by
        # every Events instance, and appending to it on one instance would
        # show up on all the others. Build the lists per call instead.
        self.attending = [] if attending is None else attending
        self.maybe = [] if maybe is None else maybe
        self.declined = [] if declined is None else declined
        self.no_reply = [] if no_reply is None else no_reply


# /booksoup/FbTime.py
"""FbTime.py: Contains functions used for generating empty timeline/frequency
dictionaries, and building them from conversation data"""

import calendar
from calendar import monthrange


class FbTime:
    def __init__(self, span_meta):
        # Span tags with "meta" class contain a timestamp of when each message was sent.
        self.span_meta = span_meta

    def interaction_freq(self):
        """Count messages per hour of the day.

        Returns a dict where each key is a time on the hour ("00:00" ..
        "23:00", as produced by generate_time_dict) and each value is the
        number of timestamps in self.span_meta falling in that hour.
        """
        times = self.generate_time_dict()

        for date_str in self.span_meta:
            # Take the five characters following "at " (presumably the HH:MM
            # clock time) and keep only the part before the colon.
            time = date_str.split("at ")[1][:5]
            hour = time.split(":")[0]
            times[self.__pad(hour)+":00"] += 1
        return times

    def interaction_timeline(self, name, messages):
        """Count the messages sent by `name` on each date of the conversation.

        `messages` is an iterable of objects with `name` and `date`
        attributes. Returns the dict built by generate_date_dict, with each
        value increased by one for every message whose name equals `name`.
        """
        messages = list(messages)
        # span_meta_to_date gives "YYYY-MM" for monthly intervals and
        # "YYYY-MM-DD" for daily ones. Build the empty timeline at the same
        # granularity as the message dates, otherwise daily dates are missing
        # from a month-keyed dict and the increment below raises KeyError.
        daily = any(message.date.count("-") == 2 for message in messages)
        dates = self.generate_date_dict("day" if daily else "month")
        for message in messages:
            if message.name == name:
                dates[message.date] += 1
        return dates

    # Creates a dictionary of times on the hour where each value is 0.
34 | def generate_time_dict(self): 35 | times = {} 36 | for h in range(0,24): 37 | time = self.__pad(h) + ":" + "00" 38 | times[time] = 0 39 | return times 40 | 41 | # Creates a dictionary of dates where each value is 0. 42 | def generate_date_dict(self, interval="month"): 43 | dates = {} 44 | min_date_arr = [int(i) for i in self.span_meta_to_date(self.span_meta[-1], interval).split("-")] 45 | max_date_arr = [int(i) for i in self.span_meta_to_date(self.span_meta[0], interval).split("-")] 46 | for y in range(min_date_arr[0], max_date_arr[0]+1): 47 | if y == max_date_arr[0]: 48 | end_month = max_date_arr[1] 49 | else: 50 | end_month = 12 51 | if y == min_date_arr[0]: 52 | start_month = min_date_arr[1] 53 | else: 54 | start_month = 1 55 | for m in range(start_month, end_month+1): 56 | if interval == "month": 57 | dates[str(y)+"-"+self.__pad(m)] = 0 58 | continue 59 | if m == max_date_arr[1] and y == max_date_arr[0]: 60 | end_day = max_date_arr[2] 61 | else: 62 | end_day = monthrange(max_date_arr[0], max_date_arr[1])[1] 63 | if m == min_date_arr[1] and y == min_date_arr[0]: 64 | start_day = min_date_arr[2] 65 | else: 66 | start_day = 1 67 | for d in range(start_day, end_day+1): 68 | dates[str(y)+"-"+self.__pad(m)+"-"+self.__pad(d)] = 0 69 | return dates 70 | 71 | def __pad(self, val): 72 | int_val = int(val) 73 | if int_val < 10: 74 | return "0"+str(int_val) 75 | return str(val) 76 | 77 | # Converts timestamp in ... to YYYY-MM[-DD] format. 78 | def span_meta_to_date(self, span_str, interval="month"): 79 | # Remove all occurences of commas except the first one 80 | if span_str.count(",") == 2: 81 | span_str = ''.join(span_str.rsplit(',', 1)) 82 | 83 | date_arr = span_str.split(", ")[1].split(" ")[:3] 84 | 85 | # Re-arrange date_arr if format is month-day-year. 
86 | try: 87 | a = int(date_arr[0]) 88 | except ValueError: 89 | shuffled_date_arr = [date_arr[1], date_arr[0], date_arr[2]] 90 | date_arr = shuffled_date_arr 91 | 92 | date_str = date_arr[2]+"-"+self.__pad(list(calendar.month_name).index(date_arr[1])) 93 | if interval == "day": 94 | date_str += "-"+self.__pad(date_arr[0]) 95 | return date_str 96 | -------------------------------------------------------------------------------- /booksoup/Message.py: -------------------------------------------------------------------------------- 1 | """Message.py: stores a facebook message as a python object.""" 2 | 3 | 4 | class Message: 5 | def __init__(self, name, date, timestamp, content): 6 | self.name = name 7 | self.date = date 8 | self.content = content 9 | self.timestamp = timestamp 10 | -------------------------------------------------------------------------------- /booksoup/Sentiment.py: -------------------------------------------------------------------------------- 1 | """Sentiment.py: Uses textblob's sentiment analysis to build user sentiment over time 2 | or calculate average sentiment of a user.""" 3 | 4 | 5 | from textblob import TextBlob 6 | 7 | 8 | class Sentiment: 9 | 10 | def __init__(self, messages, fbt): 11 | self.fbt = fbt 12 | self.messages = messages 13 | 14 | def sentiment_timeline(self, name): 15 | timeline = self.fbt.generate_date_dict() 16 | sentiment_counts = self.fbt.generate_date_dict() 17 | for message in self.messages: 18 | if message.content is None or message.name != name: 19 | continue 20 | blob = TextBlob(message.content) 21 | timeline[message.date] += blob.sentiment.polarity 22 | sentiment_counts[message.date] += 1 23 | 24 | for k,v in timeline.iteritems(): 25 | if v == 0: 26 | continue 27 | timeline[k] = v/sentiment_counts[k] 28 | return timeline 29 | 30 | def avg_sentiment(self, name): 31 | sentiments = [] 32 | for message in self.messages: 33 | message_text = message.content 34 | if message_text is None or message.name != name: 35 | 
continue 36 | sentiments.append(TextBlob(message_text).sentiment.polarity) 37 | return sum(sentiments) / len(sentiments) 38 | 39 | -------------------------------------------------------------------------------- /booksoup/__init__.py: -------------------------------------------------------------------------------- 1 | from BookSoup import BookSoup 2 | from Conversation import Conversation 3 | from FbTime import FbTime 4 | from Message import Message 5 | from Sentiment import Sentiment 6 | from Events import Events 7 | from Event import Event 8 | -------------------------------------------------------------------------------- /demo_interaction_frequency.py: -------------------------------------------------------------------------------- 1 | import matplotlib as mpl 2 | mpl.use('TkAgg') 3 | from booksoup import BookSoup 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | # Enter the path to the top level of your facebook data folder below. 8 | me = BookSoup("facebook-data") 9 | 10 | # Enter the name of the conversation or the numerical ID below. 11 | contact = me.load_conversation(274) 12 | 13 | times = contact.interaction_freq() 14 | 15 | objects = sorted(times.keys()) 16 | y_pos = np.arange(len(objects)) 17 | vals = [times[t] for t in objects] 18 | 19 | plt.bar(y_pos, vals, align='center', alpha=0.5) 20 | plt.xticks(y_pos, objects, fontsize=8, rotation=90) 21 | plt.ylabel('Frequency') 22 | plt.title('Interaction Frequency with ' + contact.name) 23 | 24 | plt.show() 25 | -------------------------------------------------------------------------------- /demo_interaction_timeline.py: -------------------------------------------------------------------------------- 1 | import matplotlib as mpl 2 | mpl.use('TkAgg') 3 | from booksoup import BookSoup 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | times = [] 8 | objects = [] 9 | vals = [] 10 | 11 | # Enter the path to the top level of your facebook data folder below. 
12 | me = BookSoup("facebook-data") 13 | 14 | # Enter the name of the conversation or the numerical ID below. 15 | conversation = me.load_conversation(108) 16 | 17 | for participant in conversation.participants: 18 | timeline = conversation.interaction_timeline(participant) 19 | sorted_keys = sorted(timeline.keys()) 20 | times.append(timeline) 21 | objects.append(sorted_keys) 22 | vals.append([timeline[t] for t in sorted_keys]) 23 | 24 | y_pos = np.arange(len(objects[0])) 25 | 26 | for i,v in enumerate(vals): 27 | plt.plot(y_pos, v, alpha=0.5, label=conversation.participants[i]) 28 | 29 | plt.xticks(y_pos, objects[0]) 30 | plt.ylabel('Message count') 31 | plt.title('Messages over time with ' + conversation.name) 32 | plt.xticks(fontsize=8, rotation=90) 33 | plt.legend() 34 | plt.show() 35 | 36 | -------------------------------------------------------------------------------- /demo_sentiment_timeline.py: -------------------------------------------------------------------------------- 1 | import matplotlib as mpl 2 | mpl.use('TkAgg') 3 | from booksoup import BookSoup 4 | import numpy as np 5 | import matplotlib.pyplot as plt 6 | 7 | times = [] 8 | objects = [] 9 | vals = [] 10 | 11 | # Enter the path to the top level of your facebook data folder below. 12 | me = BookSoup("facebook-data") 13 | 14 | # Enter the name of the conversation or the numerical ID below. 
15 | conversation = me.load_conversation(104) 16 | 17 | for participant in conversation.participants: 18 | timeline = conversation.sentiment_timeline(participant) 19 | sorted_keys = sorted(timeline.keys()) 20 | times.append(timeline) 21 | objects.append(sorted_keys) 22 | vals.append([timeline[t] for t in sorted_keys]) 23 | 24 | y_pos = np.arange(len(objects[0])) 25 | 26 | for i,v in enumerate(vals): 27 | plt.plot(y_pos, v, alpha=0.5, label=conversation.participants[i]) 28 | 29 | plt.xticks(y_pos, objects[0]) 30 | plt.ylabel('Average Sentiment') 31 | plt.title('Sentiment over time with ' + conversation.name) 32 | plt.xticks(fontsize=8, rotation=90) 33 | plt.legend() 34 | plt.show() 35 | 36 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | ## Booksoup 2 | 3 | Booksoup allows you to analyse and traverse your [downloaded facebook data](https://www.facebook.com/help/212802592074644?in_context), 4 | including features such as sentiment analysis and message frequency analysis over time. 5 | 6 | Booksoup requires [BeautifulSoup4](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) and [TextBlob](http://textblob.readthedocs.io/en/dev/), and requires [matplotlib](https://matplotlib.org/) to run the demo graphs. 7 | 8 | ## Usage 9 | 10 | Initialise a new instance of the `BookSoup` class, passing in the top-level path of your facebook data folder as an argument. 
11 | 12 | 13 | ### Basic usage 14 | 15 | ```python 16 | from booksoup import BookSoup 17 | 18 | me = BookSoup("facebook-data") 19 | 20 | # Get a conversation by name 21 | convo = me.load_conversation("Jane Doe") 22 | 23 | # Print participants of the conversation 24 | print(convo.participants) 25 | 26 | # Print messages in the conversation 27 | for message in convo.messages: 28 | print(message.date, message.timestamp, message.name, message.content) 29 | ``` 30 | 31 | ### Interaction frequency 32 | It's possible to see how often messages are sent in a specific conversation at each hour of the day using `interaction_freq`. This returns a dict with each key being an hour in the day, and the corresponding value being the number of messages sent at that time over the history of the conversation. 33 | ```python 34 | me = BookSoup("facebook-data") 35 | convo = me.load_conversation("John Smith") 36 | 37 | times = convo.interaction_freq() 38 | ``` 39 | 40 | Using the `demo_interaction_frequency.py` code, this can be visualised: 41 | 42 | ![Interaction frequency example](https://i.imgur.com/cALmzb5.png) 43 | 44 | ### Interaction timeline 45 | 46 | It's also possible to view how many times a specific person within a conversation sent messages from the beginning to the last point 47 | of the conversation using `interaction_timeline(name)`. The following example shows how often I sent messages within a group conversation. 48 | 49 | ```python 50 | me = BookSoup("facebook-data") 51 | convo = me.load_conversation("Lewis, Andrew, Michelle and 4 others") 52 | 53 | times = convo.interaction_timeline(me.name) 54 | ``` 55 | 56 | Using the `demo_interaction_timeline.py` code, I can visualise in one graph how often everyone in the conversation spoke by building a separate 57 | timeline for each person. 
58 | 59 | ![Interaction timeline example](https://i.imgur.com/7BP4GNi.png) 60 | 61 | Another example below with one friend over a longer timeline: 62 | 63 | ![Single user example](https://i.imgur.com/q6fAgVL.png) 64 | 65 | ### Sentiment 66 | 67 | Booksoup can also perform [sentiment analysis](https://en.wikipedia.org/wiki/Sentiment_analysis). Average sentiment for a user in a specific conversation can be calculated using 68 | `Conversation.avg_sentiment(name)`, or a timeline of average sentiment can also be built using `Conversation.sentiment_timeline`. 69 | 70 | ```python 71 | convo = me.load_conversation("David Grocer") 72 | 73 | # Print the average sentiment of David Grocer in the conversation 74 | print(convo.avg_sentiment("David Grocer")) 75 | 76 | # Print the timeline dictionary of my average sentiment in the conversation 77 | print(convo.sentiment_timeline(me.name)) 78 | 79 | ``` 80 | 81 | ### Loading a conversation 82 | A conversation can be loaded using either the title of the conversation (as in all the previous examples) or the numerical 83 | ID of the conversation (the filename of the conversation's html file). 84 | 85 | ```python 86 | convo = me.load_conversation(40) 87 | ``` 88 | 89 | ### Specifying interval duration 90 | 91 | In all of the timeline examples, the interval can be specified as either `month` or `day`, with the default being `month`. To switch to daily intervals 92 | for timeline operations, set the `interval` argument, e.g. 93 | 94 | ```python 95 | convo = me.load_conversation("David Grocer", interval="day") 96 | ``` 97 | 98 | ### Events 99 | 100 | Booksoup can extract and categorise event information. This includes title, description, location, timestamp and a 2-element array 101 | containing the latitude and longitude of the event if available. 
102 | 103 | ```python 104 | me = BookSoup("facebook-data") 105 | 106 | events = me.load_all_events() 107 | 108 | # Events are organised into attending, maybe, declined and no_reply: 109 | for event in events.attending: 110 | print(event.title, event.description, event.location, event.timestamp, event.latlon) 111 | ``` 112 | -------------------------------------------------------------------------------- /t.py: -------------------------------------------------------------------------------- 1 | from booksoup import BookSoup 2 | 3 | me = BookSoup("facebook-data") 4 | 5 | events = me.load_all_events() 6 | 7 | # Events are organised into attending, maybe, declined and no_reply: 8 | for event in events.attending: 9 | print(event.title, event.description, event.location, event.timestamp, event.latlon) --------------------------------------------------------------------------------