├── .gitignore
├── LICENSE
├── booksoup
├── BookSoup.py
├── Conversation.py
├── Event.py
├── Events.py
├── FbTime.py
├── Message.py
├── Sentiment.py
└── __init__.py
├── demo_interaction_frequency.py
├── demo_interaction_timeline.py
├── demo_sentiment_timeline.py
├── readme.md
└── t.py
/.gitignore:
--------------------------------------------------------------------------------
1 | facebook-data/
2 | .Python
3 | include/
4 | lib/
5 | man/
6 | bin/
7 | _book/
8 | .idea/
9 | .DS_Store
10 | booksoup/*.pyc
11 | demos/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Jake Reid Browning
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/booksoup/BookSoup.py:
--------------------------------------------------------------------------------
1 | """BookSoup.py: stores a facebook user as a python object with ability to
2 | analyse events, conversations and participants."""
3 | __author__ = "Jake Reid Browning"
4 | __license__ = "MIT"
5 | __email__ = "jake.reid.browning@gmail.com"
6 |
7 | # -*- coding: UTF-8 -*-
8 |
9 | from Conversation import Conversation
10 | from Events import Events
11 | from Event import Event
12 | from Event import find_between
13 | from bs4 import BeautifulSoup
14 | import os
15 | import re
16 |
17 |
18 | class BookSoup:
def __init__(self, path):
    """Open a Facebook data export rooted at ``path``.

    Reads ``index.htm`` from the top of the export, parses it with the
    "html.parser" backend and takes the text of the first ``h1`` element
    as the account holder's name.

    :param path: top-level directory of the downloaded Facebook data.
    """
    self.__path = path
    # Loaded conversations keyed by conversation name.
    self.conversations = {}
    self.events = Events()
    index_path = os.path.join(path, "index.htm")
    with open(index_path, "r") as index_file:
        markup = index_file.read()
    self.__soup = BeautifulSoup(markup, "html.parser")
    self.name = self.__soup.find("h1").text
26 |
def load_all_conversations(self, interval="month"):
    """Load every ``.html`` file in the export's ``messages`` directory.

    Each file is parsed into a Conversation and stored in
    ``self.conversations`` under the conversation's name.

    :param interval: passed through to Conversation ("month" by default).
    """
    messages_dir = os.path.join(self.__path, "messages")
    html_files = [entry for entry in os.listdir(messages_dir)
                  if entry.endswith(".html")]
    for html_file in html_files:
        conversation = Conversation(os.path.join(messages_dir, html_file),
                                    interval=interval)
        self.conversations[conversation.name] = conversation
32 |
def load_conversation(self, search_name, interval="month"):
    """Load a single conversation by numeric ID or by title.

    :param search_name: an ``int`` is treated as the file name (without
        ``.html``) inside the ``messages`` directory; any other value is
        compared for equality with the link text of each conversation
        link in ``html/messages.htm``.
    :param interval: passed to Conversation for both lookup styles.
    :return: the loaded Conversation (also stored in
        ``self.conversations`` under its name), or ``None`` when no link
        text equals ``search_name``.
    """
    # If search_name is an integer, simply get the HTML file whose name is that integer.
    if isinstance(search_name, int):
        conversation_path = os.path.join(
            self.__path, "messages", str(search_name) + ".html")
        contact = Conversation(conversation_path, interval=interval)
        self.conversations[contact.name] = contact
        return contact

    # Otherwise read the index of conversation links, find the link whose
    # text matches and load the conversation from the link's relative path.
    with open(os.path.join(self.__path, "html", "messages.htm"), "r") as infile:
        soup = BeautifulSoup(infile.read(), "html.parser")
    convo_links = soup.find_all("a", {"href": re.compile('.*messages.*')})
    for link in convo_links:
        if link.text != search_name:
            continue
        # The interval must be forwarded here as well; previously it was
        # dropped, so name lookups always produced monthly dates.
        contact = Conversation(os.path.join(self.__path, link["href"]),
                               interval=interval)
        self.conversations[contact.name] = contact
        return contact
    return None
52 |
53 | def load_all_events(self):
54 | with open(os.path.join(self.__path,"html","events.htm"), "r") as f:
55 | # Replace any number of chained
...
tags with a single [BREAK] for easier splitting later.
56 | f_read = re.sub('(\
)+', '[BREAK]', f.read())
57 | attending_soup = BeautifulSoup(find_between(f_read, "
Attending
", "Maybe
"), "html.parser")
58 | maybe_soup = BeautifulSoup(find_between(f_read, "Maybe
", "Declined
"), "html.parser")
59 | declined_soup = BeautifulSoup(find_between(f_read, "Declined
", "No reply
"), "html.parser")
60 | no_reply_soup = BeautifulSoup(f_read.split("No reply
")[1], "html.parser")
61 |
62 | self.__events_from_li(attending_soup, "attending")
63 | self.__events_from_li(maybe_soup, "maybe")
64 | self.__events_from_li(declined_soup, "declined")
65 | self.__events_from_li(no_reply_soup, "no_reply")
66 | return self.events
67 |
def __events_from_li(self, soup, attending_type):
    """Turn every ``li`` element in ``soup`` into an Event and file it.

    The text of each ``li`` is split on the "[BREAK]" marker. Four or more
    pieces are read as title, timestamp, location, description; fewer
    pieces are read as title, timestamp, description with an empty
    location. Missing trailing pieces are treated as empty strings instead
    of raising IndexError.

    :param soup: parsed markup containing the ``li`` elements.
    :param attending_type: one of "attending", "maybe", "declined" or
        "no_reply"; any other value builds the events but stores none.
    """
    buckets = {
        "attending": self.events.attending,
        "maybe": self.events.maybe,
        "declined": self.events.declined,
        "no_reply": self.events.no_reply,
    }
    bucket = buckets.get(attending_type)

    for event_li in soup.find_all("li"):
        fields = event_li.text.split("[BREAK]")
        if len(fields) >= 4:
            title, timestamp, location, description = fields[:4]
        else:
            # Pad so that entries with only a title (or title + timestamp)
            # no longer crash on a missing index.
            fields = fields + [""] * (3 - len(fields))
            title, timestamp, description = fields[:3]
            location = ""
        new_event = Event(title, timestamp, location, description)
        if bucket is not None:
            bucket.append(new_event)
--------------------------------------------------------------------------------
/booksoup/Conversation.py:
--------------------------------------------------------------------------------
1 | """Conversation.py: stores a facebook conversation as a python object,
2 | and performs some basic analysis on interaction frequency and sentiment."""
3 |
4 | from bs4 import BeautifulSoup
5 | from FbTime import FbTime
6 | from Sentiment import Sentiment
7 | from Message import Message
8 |
9 |
class Conversation:
    """A single conversation parsed from one exported HTML file.

    Exposes the conversation ``name``, the list of ``messages`` and the
    list of ``participants``, and delegates frequency and sentiment
    analysis to FbTime and Sentiment.
    """

    def __init__(self, path, interval="month"):
        """Parse the conversation file at ``path``.

        :param path: path of the conversation's HTML file.
        :param interval: passed to FbTime.span_meta_to_date when computing
            each message's ``date``.
        """
        with open(path, 'r') as f:
            markup = f.read()
        self.__soup = BeautifulSoup(markup, "html.parser")
        self.messages = []
        title_text = self.__soup.find("title").text
        self.name = title_text.replace("Conversation with ", "")

        # The "meta" span of every message header holds that message's timestamp.
        self.__span_meta = []
        for header in self.__soup.find_all("div", class_="message_header"):
            self.__span_meta.append(header.find("span", class_="meta").text)
        self.__fbt = FbTime(self.__span_meta)

        for message_div in self.__soup.find_all("div", class_="message"):
            meta_span = message_div.find("span", class_="meta")
            sender = message_div.find("span", class_="user").text
            date = self.__fbt.span_meta_to_date(meta_span.text, interval)
            timestamp = meta_span.text
            # The message body is read from the element following the "message" div.
            content = message_div.next_sibling.text
            self.messages.append(Message(sender, date, timestamp, content))

        self.__sent = Sentiment(self.messages, self.__fbt)
        self.participants = self.__scrape_participants()

    def interaction_freq(self):
        """Return FbTime.interaction_freq() for this conversation."""
        return self.__fbt.interaction_freq()

    def interaction_timeline(self, name):
        """Return FbTime.interaction_timeline() for participant ``name``."""
        return self.__fbt.interaction_timeline(name, self.messages)

    def sentiment_timeline(self, name):
        """Return Sentiment.sentiment_timeline() for participant ``name``."""
        return self.__sent.sentiment_timeline(name)

    def avg_sentiment(self, name):
        """Return Sentiment.avg_sentiment() for participant ``name``."""
        return self.__sent.avg_sentiment(name)

    def __scrape_participants(self):
        """Return the distinct "user" span texts in first-seen order."""
        participants = []
        for user_span in self.__soup.find_all("span", "user"):
            if user_span.text in participants:
                continue
            participants.append(user_span.text)
        return participants
47 |
48 |
49 |
--------------------------------------------------------------------------------
/booksoup/Event.py:
--------------------------------------------------------------------------------
1 | """Event.py: stores a facebook event as a python object and calculates the
2 | latitude and longitude of the event if available."""
3 |
class Event:
    """A single Facebook event.

    ``latlon`` is ``[latitude, longitude]`` as floats when ``location``
    contains a "(Latitude: ..., Longitude: ...)" fragment, otherwise None.
    """

    def __init__(self, title, timestamp, location, description):
        self.title = title
        self.timestamp = timestamp
        self.location = location
        self.latlon = self.__latlon(location)
        self.description = description

    def __latlon(self, location):
        """Extract the coordinates embedded in ``location``, if any."""
        if "(Latitude:" not in location:
            return None
        latitude = float(find_between(location, "(Latitude: ", ","))
        longitude = float(find_between(location, "Longitude: ", ")"))
        return [latitude, longitude]
18 |
19 |
20 |
def find_between(s, first, last):
    """Return the part of string ``s`` between ``first`` and ``last``.

    The search for ``last`` starts right after the first occurrence of
    ``first``. An empty string is returned when either marker is missing.
    """
    first_at = s.find(first)
    if first_at == -1:
        return ""
    start = first_at + len(first)
    end = s.find(last, start)
    if end == -1:
        return ""
    return s[start:end]
28 |
29 |
--------------------------------------------------------------------------------
/booksoup/Events.py:
--------------------------------------------------------------------------------
1 | """Events.py: stores four lists of Event objects depending on whether the
2 | user marked the event as attending, maybe, declined or didn't reply."""
3 |
class Events:
    """Container for Event objects grouped by the user's response.

    :param attending: list of events marked as attending.
    :param maybe: list of events marked as maybe.
    :param declined: list of events marked as declined.
    :param no_reply: list of events with no reply.

    Any list that is omitted (or passed as None) starts out as a new empty
    list owned by this instance. A list that is passed in is stored as-is.
    """

    def __init__(self, attending=None, maybe=None, declined=None, no_reply=None):
        # Defaults are None rather than [] because a default list is created
        # once and would be shared by every Events instance.
        self.attending = [] if attending is None else attending
        self.maybe = [] if maybe is None else maybe
        self.declined = [] if declined is None else declined
        self.no_reply = [] if no_reply is None else no_reply
--------------------------------------------------------------------------------
/booksoup/FbTime.py:
--------------------------------------------------------------------------------
1 | """FbTime.py: Contains functions used for generating empty timeline/frequency
2 | dictionaries, and building them from conversation data"""
3 |
4 | import calendar
5 | from calendar import monthrange
6 |
7 |
class FbTime:
    """Builds hour-of-day and per-date counting dictionaries from timestamps.

    ``span_meta`` is the list of timestamp strings taken from the "meta"
    spans of a conversation. The parsing below expects strings shaped like
    "Saturday, 5 May 2018 at 14:30 UTC+01" or
    "Saturday, May 5, 2018 at 2:30pm EDT" -- NOTE(review): inferred from the
    parsing code, confirm against a real export.
    """

    def __init__(self, span_meta):
        # Span tags with "meta" class contain a timestamp of when each message was sent.
        self.span_meta = span_meta

    def interaction_freq(self):
        """Count messages per hour of the day.

        :return: dict mapping "HH:00" (all 24 hours present) to the number
            of timestamps in ``span_meta`` falling in that hour.
        """
        times = self.generate_time_dict()
        for date_str in self.span_meta:
            times[self.__pad(self.__hour_of(date_str)) + ":00"] += 1
        return times

    def interaction_timeline(self, name, messages, interval=None):
        """Count the messages sent by ``name`` on each date.

        :param name: participant whose messages are counted.
        :param messages: objects with ``name`` and ``date`` attributes.
        :param interval: "month" or "day". When omitted it is inferred from
            the format of the first message's ``date`` so that the keys of
            the result match the dates stored on the messages.
        :return: dict mapping every date in the conversation's range to a
            message count.
        """
        if interval is None:
            interval = self.__infer_interval(messages)
        dates = self.generate_date_dict(interval)
        for message in messages:
            if message.name == name:
                dates[message.date] = dates.get(message.date, 0) + 1
        return dates

    def generate_time_dict(self):
        """Return a dict with a zero entry for every hour, "00:00" to "23:00"."""
        return {self.__pad(hour) + ":00": 0 for hour in range(24)}

    def generate_date_dict(self, interval="month"):
        """Return a dict with a zero entry for every date in the covered range.

        The range runs from the earliest to the latest timestamp in
        ``span_meta``, whatever order the timestamps are stored in. Keys are
        "YYYY-MM" for interval "month" and "YYYY-MM-DD" otherwise. An empty
        ``span_meta`` yields an empty dict.
        """
        dates = {}
        if not self.span_meta:
            return dates

        parsed = [
            tuple(int(part) for part in
                  self.span_meta_to_date(span_str, interval).split("-"))
            for span_str in self.span_meta
        ]
        first, last = min(parsed), max(parsed)

        for year in range(first[0], last[0] + 1):
            start_month = first[1] if year == first[0] else 1
            end_month = last[1] if year == last[0] else 12
            for month in range(start_month, end_month + 1):
                month_key = str(year) + "-" + self.__pad(month)
                if interval == "month":
                    dates[month_key] = 0
                    continue
                in_first_month = (year, month) == (first[0], first[1])
                in_last_month = (year, month) == (last[0], last[1])
                start_day = first[2] if in_first_month else 1
                # The month length must come from the month being filled in,
                # not from the final month of the range.
                end_day = last[2] if in_last_month else monthrange(year, month)[1]
                for day in range(start_day, end_day + 1):
                    dates[month_key + "-" + self.__pad(day)] = 0
        return dates

    def __infer_interval(self, messages):
        """Return "day" if the first message date is YYYY-MM-DD, else "month"."""
        for message in messages:
            return "day" if message.date.count("-") == 2 else "month"
        return "month"

    def __hour_of(self, date_str):
        """Return the hour (0-23) of the time that follows "at " in ``date_str``."""
        time_part = date_str.split("at ")[1]
        hour_str, _, after_hour = time_part.partition(":")
        hour = int(hour_str)
        # after_hour starts with the two minute digits; an "am"/"pm" token
        # directly after them marks a 12-hour clock.
        trailing = after_hour[2:].split()
        marker = trailing[0].lower() if trailing else ""
        if marker == "pm" and hour < 12:
            hour += 12
        elif marker == "am" and hour == 12:
            hour = 0
        return hour

    def __pad(self, val):
        """Return ``val`` as a decimal string zero-padded to two digits."""
        return "%02d" % int(val)

    def span_meta_to_date(self, span_str, interval="month"):
        """Convert a timestamp string to "YYYY-MM", or "YYYY-MM-DD" for "day".

        Both day-month-year and month-day-year orders are accepted; the
        month must be a full month name known to ``calendar.month_name``.
        """
        # Remove all occurrences of commas except the first one.
        if span_str.count(",") == 2:
            span_str = ''.join(span_str.rsplit(',', 1))

        date_arr = span_str.split(", ")[1].split(" ")[:3]

        # Re-arrange date_arr if format is month-day-year.
        try:
            int(date_arr[0])
        except ValueError:
            date_arr = [date_arr[1], date_arr[0], date_arr[2]]

        month_number = list(calendar.month_name).index(date_arr[1])
        date_str = date_arr[2] + "-" + self.__pad(month_number)
        if interval == "day":
            date_str += "-" + self.__pad(date_arr[0])
        return date_str
96 |
--------------------------------------------------------------------------------
/booksoup/Message.py:
--------------------------------------------------------------------------------
1 | """Message.py: stores a facebook message as a python object."""
2 |
3 |
class Message:
    """Plain record for one message in a conversation.

    Attributes: ``name`` (sender), ``date`` (the YYYY-MM[-DD] string the
    caller computed), ``timestamp`` (the raw timestamp text) and
    ``content`` (the message body).
    """

    def __init__(self, name, date, timestamp, content):
        self.name, self.date = name, date
        self.timestamp, self.content = timestamp, content
10 |
--------------------------------------------------------------------------------
/booksoup/Sentiment.py:
--------------------------------------------------------------------------------
1 | """Sentiment.py: Uses textblob's sentiment analysis to build user sentiment over time
2 | or calculate average sentiment of a user."""
3 |
4 |
5 | from textblob import TextBlob
6 |
7 |
class Sentiment:
    """Sentiment statistics for the messages of one conversation.

    :param messages: objects with ``name``, ``date`` and ``content``.
    :param fbt: object providing ``generate_date_dict(interval)``; used to
        build the empty timeline.
    """

    def __init__(self, messages, fbt):
        self.fbt = fbt
        self.messages = messages

    def sentiment_timeline(self, name):
        """Return the average polarity of ``name``'s messages per date.

        The timeline interval ("day" or "month") is inferred from the format
        of the first message's ``date`` so the keys match the message dates.
        Dates on which ``name`` sent no message with content keep the
        value 0.
        """
        timeline = self.fbt.generate_date_dict(self.__interval())
        sentiment_counts = dict.fromkeys(timeline, 0)
        for message in self.messages:
            if message.content is None or message.name != name:
                continue
            timeline[message.date] += TextBlob(message.content).sentiment.polarity
            sentiment_counts[message.date] += 1

        # items() rather than iteritems() so this runs on Python 2 and 3.
        for date_key, count in sentiment_counts.items():
            if count:
                timeline[date_key] = timeline[date_key] / count
        return timeline

    def avg_sentiment(self, name):
        """Return the mean polarity of ``name``'s messages.

        Returns 0.0 (neutral) when ``name`` has no messages with content,
        instead of dividing by zero.
        """
        sentiments = [
            TextBlob(message.content).sentiment.polarity
            for message in self.messages
            if message.content is not None and message.name == name
        ]
        if not sentiments:
            return 0.0
        return sum(sentiments) / len(sentiments)

    def __interval(self):
        """Return "day" if the first message date is YYYY-MM-DD, else "month"."""
        for message in self.messages:
            return "day" if message.date.count("-") == 2 else "month"
        return "month"
38 |
39 |
--------------------------------------------------------------------------------
/booksoup/__init__.py:
--------------------------------------------------------------------------------
1 | from BookSoup import BookSoup
2 | from Conversation import Conversation
3 | from FbTime import FbTime
4 | from Message import Message
5 | from Sentiment import Sentiment
6 | from Events import Events
7 | from Event import Event
8 |
--------------------------------------------------------------------------------
/demo_interaction_frequency.py:
--------------------------------------------------------------------------------
1 | import matplotlib as mpl
2 | mpl.use('TkAgg')
3 | from booksoup import BookSoup
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 |
# Demo: bar chart of how many messages were sent in each hour of the day.

# Enter the path to the top level of your facebook data folder below.
me = BookSoup("facebook-data")

# Enter the name of the conversation or the numerical ID below.
contact = me.load_conversation(274)

# Dict of "HH:00" -> message count.
times = contact.interaction_freq()

# Sort the hour labels so the bars run from 00:00 to 23:00.
objects = sorted(times.keys())
vals = []
for hour_label in objects:
    vals.append(times[hour_label])
y_pos = np.arange(len(objects))

plt.bar(y_pos, vals, align='center', alpha=0.5)
plt.xticks(y_pos, objects, fontsize=8, rotation=90)
plt.ylabel('Frequency')
plt.title('Interaction Frequency with ' + contact.name)

plt.show()
25 |
--------------------------------------------------------------------------------
/demo_interaction_timeline.py:
--------------------------------------------------------------------------------
1 | import matplotlib as mpl
2 | mpl.use('TkAgg')
3 | from booksoup import BookSoup
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 |
# Demo: one line per participant showing their message count over time.

# Enter the path to the top level of your facebook data folder below.
me = BookSoup("facebook-data")

# Enter the name of the conversation or the numerical ID below.
conversation = me.load_conversation(108)

# One timeline dict per participant, in participant order.
times = [conversation.interaction_timeline(participant)
         for participant in conversation.participants]
# Sorted date keys of each timeline, and the counts in that order.
objects = [sorted(timeline.keys()) for timeline in times]
vals = [[timeline[key] for key in keys]
        for timeline, keys in zip(times, objects)]

# The first participant's dates label the shared x axis.
y_pos = np.arange(len(objects[0]))

for participant, counts in zip(conversation.participants, vals):
    plt.plot(y_pos, counts, alpha=0.5, label=participant)

plt.xticks(y_pos, objects[0])
plt.ylabel('Message count')
plt.title('Messages over time with ' + conversation.name)
plt.xticks(fontsize=8, rotation=90)
plt.legend()
plt.show()
35 |
36 |
--------------------------------------------------------------------------------
/demo_sentiment_timeline.py:
--------------------------------------------------------------------------------
1 | import matplotlib as mpl
2 | mpl.use('TkAgg')
3 | from booksoup import BookSoup
4 | import numpy as np
5 | import matplotlib.pyplot as plt
6 |
# Demo: one line per participant showing their average sentiment over time.

# Enter the path to the top level of your facebook data folder below.
me = BookSoup("facebook-data")

# Enter the name of the conversation or the numerical ID below.
conversation = me.load_conversation(104)

# One sentiment timeline dict per participant, in participant order.
times = [conversation.sentiment_timeline(participant)
         for participant in conversation.participants]
# Sorted date keys of each timeline, and the averages in that order.
objects = [sorted(timeline.keys()) for timeline in times]
vals = [[timeline[key] for key in keys]
        for timeline, keys in zip(times, objects)]

# The first participant's dates label the shared x axis.
y_pos = np.arange(len(objects[0]))

for participant, averages in zip(conversation.participants, vals):
    plt.plot(y_pos, averages, alpha=0.5, label=participant)

plt.xticks(y_pos, objects[0])
plt.ylabel('Average Sentiment')
plt.title('Sentiment over time with ' + conversation.name)
plt.xticks(fontsize=8, rotation=90)
plt.legend()
plt.show()
35 |
36 |
--------------------------------------------------------------------------------
/readme.md:
--------------------------------------------------------------------------------
1 | ## Booksoup
2 |
3 | Booksoup allows you to analyse and traverse your [downloaded facebook data](https://www.facebook.com/help/212802592074644?in_context),
4 | including features such as sentiment analysis and message frequency analysis over time.
5 |
6 | Booksoup requires [BeautifulSoup4](https://www.crummy.com/software/BeautifulSoup/bs4/doc/) and [TextBlob](http://textblob.readthedocs.io/en/dev/), and requires [matplotlib](https://matplotlib.org/) to run the demo graphs.
7 |
8 | ## Usage
9 |
10 | Initialise a new instance of the `BookSoup` class, passing in the top-level path of your facebook data folder as an argument.
11 |
12 |
13 | ### Basic usage
14 |
15 | ```python
16 | from booksoup import BookSoup
17 |
18 | me = BookSoup("facebook-data")
19 |
20 | # Get a conversation by name
21 | convo = me.load_conversation("Jane Doe")
22 |
23 | # Print participants of the conversation
24 | print(convo.participants)
25 |
26 | # Print messages in the conversation
27 | for message in convo.messages:
28 | print(message.date, message.timestamp, message.name, message.content)
29 | ```
30 |
31 | ### Interaction frequency
32 | It's possible to see how often messages are sent in a specific conversation at each hour of the day using `interaction_freq`. This returns a dict with each key being an hour in the day, and the corresponding value being the number of messages sent at that time over the history of the conversation.
33 | ```python
34 | me = BookSoup("facebook-data")
35 | convo = me.load_conversation("John Smith")
36 |
37 | times = convo.interaction_freq()
38 | ```
39 |
40 | Using the `demo_interaction_frequency.py` code, this can be visualised:
41 |
42 | 
43 |
44 | ### Interaction timeline
45 |
46 | It's also possible to view how many times a specific person within a conversation sent messages from the beginning to the last point
47 | of the conversation using `interaction_timeline(name)`. The following example shows how often I sent messages within a group conversation.
48 |
49 | ```python
50 | me = BookSoup("facebook-data")
51 | convo = me.load_conversation("Lewis, Andrew, Michelle and 4 others")
52 |
53 | times = convo.interaction_timeline(me.name)
54 | ```
55 |
56 | Using the `demo_interaction_timeline.py` code, I can visualise in one graph how often everyone in the conversation spoke by building a separate
57 | timeline for each person.
58 |
59 | 
60 |
61 | Another example below with one friend over a longer timeline:
62 |
63 | 
64 |
65 | ### Sentiment
66 |
67 | Booksoup can also perform [sentiment analysis](https://en.wikipedia.org/wiki/Sentiment_analysis). Average sentiment for a user in a specific conversation can be calculated using
68 | `Conversation.avg_sentiment(name)`, or a timeline of average sentiment can also be built using `Conversation.sentiment_timeline`.
69 |
70 | ```python
71 | convo = me.load_conversation("David Grocer")
72 |
73 | # Print the average sentiment of David Grocer in the conversation
74 | print(convo.avg_sentiment("David Grocer"))
75 |
76 | # Print the timeline dictionary of my average sentiment in the conversation
77 | print(convo.sentiment_timeline(me.name))
78 |
79 | ```
80 |
81 | ### Loading a conversation
82 | A conversation can be loaded using either the title of the conversation (as in all the previous examples) or the numerical
83 | ID of the conversation (the filename of the conversation's html file).
84 |
85 | ```python
86 | convo = me.load_conversation(40)
87 | ```
88 |
89 | ### Specifying interval duration
90 |
91 | In all of the timeline examples, the interval can be specified as either `month` or `day`, with the default being `month`. To switch to daily intervals
92 | for timeline operations, set the `interval` argument, e.g.:
93 |
94 | ```python
95 | convo = me.load_conversation("David Grocer", interval="day")
96 | ```
97 |
98 | ### Events
99 |
100 | Booksoup can extract and categorise event information. This includes title, description, location, timestamp and a 2-element array
101 | containing the latitude and longitude of the event if available.
102 |
103 | ```python
104 | me = BookSoup("facebook-data")
105 |
106 | events = me.load_all_events()
107 |
108 | # Events are organised into attending, maybe, declined and no_reply:
109 | for event in events.attending:
110 | print(event.title, event.description, event.location, event.timestamp, event.latlon)
111 | ```
112 |
--------------------------------------------------------------------------------
/t.py:
--------------------------------------------------------------------------------
1 | from booksoup import BookSoup
2 |
# Scratch script: print every event the user marked as attending.
me = BookSoup("facebook-data")

events = me.load_all_events()

# Events are organised into attending, maybe, declined and no_reply:
attending_events = events.attending
for event in attending_events:
    print(event.title, event.description, event.location, event.timestamp, event.latlon)
--------------------------------------------------------------------------------