├── .gitignore
├── emo_network.png
├── LICENSE
├── README.md
├── train_sentiment_classifier.py
├── TwSentiment.py
├── 01_Collecting_tweets_using_Twitter_API.ipynb
├── .ipynb_checkpoints
│   ├── 01_Collecting_tweets_using_Twitter_API-checkpoint.ipynb
│   ├── 00_Introduction_to_Jupyter_and_Python-checkpoint.ipynb
│   └── 02_Analysis_of_Twitter_Social_Network-checkpoint.ipynb
├── 00_Introduction_to_Jupyter_and_Python.ipynb
├── network_components.svg
├── 03_Introduction_To_Supervised_Machine_Learning.ipynb
├── 04_Twitter_Sentiment_Analysis.ipynb
└── 02_Analysis_of_Twitter_Social_Network.ipynb

/.gitignore:
--------------------------------------------------------------------------------
1 | credential.txt
2 | trumpTweets.txt
3 | keys.json
4 |
--------------------------------------------------------------------------------
/emo_network.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexbovet/network_lesson/HEAD/emo_network.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2017 Alexandre Bovet
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Lesson on Twitter Social Network and Sentiment Analysis
2 |
3 | This is an interactive introductory lesson that covers the following topics:
4 |
5 | 1. Very short introduction to [Python](https://www.python.org/)/[Jupyter](http://jupyter.org/)/[NumPy](http://www.numpy.org/) and [matplotlib](https://matplotlib.org/),
6 | 2. Tweet collection using the Twitter Streaming API (using [tweepy](http://www.tweepy.org/)),
7 | 3. Creation and analysis of the network of interactions between Twitter users (using [networkx](https://networkx.github.io/)),
8 | 4. Basics of supervised machine learning classification,
9 | 5. Sentiment analysis applied to the Twitter interaction network (using [scikit-learn](http://scikit-learn.org/) and [NLTK](http://www.nltk.org/)).
10 |
11 | I made it using [Jupyter notebooks](http://jupyter.org/). You can view the notebooks directly on GitHub or download the files and run them on your machine.
12 | You need to have Python 3 installed with [Anaconda](https://www.continuum.io/downloads).
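For example, you could set up a conda environment with the packages used in these notebooks like this (the package list is inferred from the notebooks and scripts, and the environment name is arbitrary; adjust both as needed):

```
conda create -n network_lesson python=3 numpy matplotlib pandas scikit-learn nltk networkx jupyter
conda activate network_lesson
pip install tweepy
```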
13 | 14 | And here is what you should get at the end: 15 | 16 | ![Twitter Network](emo_network.png "Twitter Network") 17 | 18 | http://alexbovet.github.io/ 19 | -------------------------------------------------------------------------------- /train_sentiment_classifier.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | """ 4 | Created on Fri May 7 10:08:07 2021 5 | 6 | @author: Alexandre Bovet 7 | """ 8 | 9 | 10 | from TwSentiment import CustomTweetTokenizer, bag_of_words_and_bigrams 11 | import pandas as pd 12 | 13 | from zipfile import ZipFile 14 | 15 | from sklearn.feature_extraction import DictVectorizer 16 | from sklearn.linear_model import SGDClassifier 17 | from sklearn.model_selection import KFold, GridSearchCV 18 | from sklearn.pipeline import Pipeline 19 | import joblib 20 | import pickle 21 | 22 | import numpy as np 23 | 24 | raise Exception 25 | #%% load training set 26 | with ZipFile('trainingandtestdata.zip', mode='r') as fopen: 27 | 28 | df_train = pd.read_csv(fopen.open('training.1600000.processed.noemoticon.csv'), 29 | encoding='latin1', header=None) 30 | 31 | df_train.columns = ['polarity', 'id','date','query','user','text'] 32 | 33 | tokenizer = CustomTweetTokenizer(preserve_case=False, # keep Upper cases 34 | reduce_len=True, # reduce repetition of letter to a maximum of three 35 | strip_handles=False, # remove usernames (@mentions) 36 | normalize_usernames=True, # replace all mentions to "@USER" 37 | normalize_urls=True, # replace all urls to "URL" 38 | keep_allupper=True) # keep upercase for words that are all in uppercase 39 | 40 | 41 | #%% features vect 42 | 43 | features = [bag_of_words_and_bigrams(tokenizer.tokenize(t)) for t in df_train.text.tolist()] 44 | 45 | labels = df_train.polarity.tolist() 46 | y = np.array([1 if l > 0 else 0 for l in labels]) 47 | 48 | vect = DictVectorizer(dtype=np.int8, sparse=True, sort=False) 49 | X = vect.fit_transform(features) 50 | 51 | # memmaping of the features 52 | joblib.dump(X, '_features_vect.memmap') 53 | joblib.dump(y, '_labels_vect.memmap') 54 | 55 | #%% cross-val 56 | X = joblib.load('_features_vect.memmap') 57 | y = joblib.load('_labels_vect.memmap') 58 | 59 | scoring = 'f1_micro' 60 | n_splits = 10 61 | loss = 'log' 62 | penalty = 'l2' 63 | grid_search_parameters = {'classifier__alpha' : np.logspace(-1,-7, num=20)} 64 | 65 | pipeline_list = [('classifier', SGDClassifier(verbose=True, 66 | loss=loss, 67 | penalty=penalty))] 68 | pipeline = Pipeline(pipeline_list) 69 | 70 | kfold = KFold(n_splits=n_splits, shuffle=True, random_state=34) 71 | 72 | grid_search = GridSearchCV(estimator=pipeline, param_grid=grid_search_parameters, 73 | cv=kfold, 74 | scoring=scoring, 75 | verbose=1 , 76 | n_jobs=4) 77 | 78 | grid_search.fit(X, y) 79 | 80 | alpha = grid_search.best_estimator_.get_params()['classifier__alpha'] 81 | 82 | #%% train classifier with the best parameters 83 | 84 | pipeline_list = [('feat_vectorizer', DictVectorizer(dtype=np.int8, sparse=True, sort=False)), 85 | ('classifier', SGDClassifier(loss=loss, 86 | alpha=alpha, 87 | penalty=penalty, 88 | random_state=42))] 89 | 90 | pipeline = Pipeline(pipeline_list) 91 | 92 | pipeline.fit(features, y) 93 | 94 | #%% save classifier 95 | 96 | with open('tweet_classifier_pipepline.pickle', 'wb') as fopen: 97 | pickle.dump(pipeline, fopen) 98 | -------------------------------------------------------------------------------- /TwSentiment.py: 
-------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on May 2nd 2017 4 | 5 | @author: Alexandre Bovet 6 | 7 | Sentiment Analysis of tweets 8 | """ 9 | 10 | 11 | import collections 12 | 13 | from nltk import ngrams 14 | from itertools import chain 15 | import numpy as np 16 | from string import punctuation 17 | from nltk.tokenize.casual import TweetTokenizer, _replace_html_entities, remove_handles, \ 18 | reduce_lengthening, HANG_RE, WORD_RE, EMOTICON_RE 19 | import re 20 | 21 | 22 | def bag_of_words(words): 23 | return dict([(word, True) for word in words]) 24 | 25 | def bag_of_words_and_bigrams(words): 26 | 27 | bigrams = ngrams(words, 2) 28 | 29 | return bag_of_words(chain(words, bigrams)) 30 | 31 | 32 | #============================================================================== 33 | # Custom Tokenizer for tweets 34 | #============================================================================== 35 | 36 | 37 | 38 | def normalize_mentions(text): 39 | """ 40 | Replace Twitter username handles with '@USER'. 41 | """ 42 | pattern = re.compile(r"(^|(?<=[^\w.-]))@[A-Za-z_]+\w+") 43 | return pattern.sub('@USER', text) 44 | 45 | 46 | def normalize_urls(text): 47 | """ 48 | Replace urls with 'URL'. 49 | """ 50 | pattern = re.compile(r"""(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))""") 51 | # first shorten consecutive punctuation to 3 52 | # to avoid the pattern to hang in exponential loop in extreme cases. 53 | text = HANG_RE.sub(r'\1\1\1', text) 54 | 55 | return pattern.sub('URL', text) 56 | 57 | 58 | def _lowerize(word, keep_all_upper=False): 59 | if EMOTICON_RE.search(word): 60 | return word 61 | elif word.isupper() and keep_all_upper: 62 | return word 63 | elif word == 'URL': 64 | return word 65 | elif word == '@USER': 66 | return word 67 | else: 68 | return word.lower() 69 | 70 | class CustomTweetTokenizer(TweetTokenizer): 71 | """ Custom tweet tokenizer based on NLTK TweetTokenizer""" 72 | 73 | def __init__(self, preserve_case=False, reduce_len=True, strip_handles=False, 74 | normalize_usernames=True, normalize_urls=True, keep_allupper=True): 75 | 76 | TweetTokenizer.__init__(self, preserve_case=preserve_case, reduce_len=reduce_len, 77 | strip_handles=strip_handles) 78 | 79 | self.keep_allupper = keep_allupper 80 | self.normalize_urls = normalize_urls 81 | self.normalize_usernames = normalize_usernames 82 | 83 | if normalize_usernames: 84 | self.strip_handles = False 85 | 86 | if self.preserve_case: 87 | self.keep_allupper = True 88 | 89 | 90 | def tokenize(self, text): 91 | """ 92 | :param text: str 93 | :rtype: list(str) 94 | :return: a tokenized list of strings; 95 | 96 | Normalizes URLs, usernames and word lengthening depending of the 97 | attributes of the instance. 
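        With the default settings, @mentions are replaced by '@USER', URLs by
        'URL', runs of repeated characters are shortened to at most three, and
        tokens are lower-cased unless they are emoticons or written entirely
        in upper case.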
98 | 99 | """ 100 | # Fix HTML character entities: 101 | text = _replace_html_entities(text) 102 | # Remove or replace username handles 103 | if self.strip_handles: 104 | text = remove_handles(text) 105 | elif self.normalize_usernames: 106 | text = normalize_mentions(text) 107 | 108 | if self.normalize_urls: 109 | # Shorten problematic sequences of characters 110 | text = normalize_urls(text) 111 | 112 | # Normalize word lengthening 113 | if self.reduce_len: 114 | text = HANG_RE.sub(r'\1\1\1', text) 115 | text = reduce_lengthening(text) 116 | 117 | # Tokenize: 118 | safe_text = HANG_RE.sub(r'\1\1\1', text) 119 | words = WORD_RE.findall(safe_text) 120 | 121 | # Possibly alter the case, but avoid changing emoticons like :D into :d: 122 | # lower words but keep words that are all upper cases 123 | if not self.preserve_case: 124 | words = [_lowerize(w, self.keep_allupper) for w in words] 125 | 126 | 127 | return words 128 | 129 | 130 | 131 | 132 | #============================================================================== 133 | # Emoticon classification 134 | #============================================================================== 135 | 136 | POS_EMOTICONS = [":D", ":-D", ":-)", ":=)", "=)", "XD", "=D", "=]", ":]", ":<)", 137 | ":>)", "=}", ":)",":}", ":o)","8D","8-)", 138 | ":]", ":-}", ":-]",":-.)","^_^", "^-^"] 139 | 140 | NEG_EMOTICONS = [":(", ":-(", ":'(", "=(", "={", 141 | ":-{", ":-{", ":-(", ":'{", "=[", ":["] 142 | 143 | POS_EMOJIS_RE = re.compile(u'[' 144 | u'\U0001F600-\U0001F606' 145 | u'\U0001F60A-\U0001F60E' 146 | u'\U0001F638-\U0001F63B' 147 | u'\U0001F642' 148 | u'\U0000263A-\U0000263B]+', 149 | re.UNICODE) 150 | 151 | NEG_EMOJIS_RE = re.compile(u'[' 152 | u'\U0001F61E-\U0001F622' 153 | u'\U0001F63E-\U0001F63F' 154 | u'\U0001F641' 155 | u'\U00002639]+', 156 | re.UNICODE) 157 | 158 | def classifyEmoticons(text): 159 | 160 | # find all emoticons 161 | emoticons = EMOTICON_RE.findall(text) 162 | 163 | pos = any([emo in POS_EMOTICONS for emo in emoticons]) or bool(POS_EMOJIS_RE.search(text)) 164 | neg = any([emo in NEG_EMOTICONS for emo in emoticons]) or bool(NEG_EMOJIS_RE.search(text)) 165 | 166 | if pos and neg: 167 | return 'N/A' 168 | elif pos and not neg: 169 | return 'pos' 170 | elif neg and not pos: 171 | return 'neg' 172 | elif not pos and not neg: 173 | return None 174 | 175 | 176 | 177 | 178 | class TweetClassifier(object): 179 | 180 | def __init__(self, classifier, 181 | tokenizer=CustomTweetTokenizer(preserve_case=False, 182 | reduce_len=True, 183 | strip_handles=False, 184 | normalize_usernames=False, 185 | normalize_urls=False, 186 | keep_allupper=False), 187 | feature_extractor=bag_of_words_and_bigrams, 188 | label_inv_mapper={0 : 'neg' , 1 : 'pos'}, 189 | polarity_threshold=0.5): 190 | 191 | self.classifier = classifier 192 | self.tokenizer = tokenizer 193 | self.feature_extractor = feature_extractor 194 | self.label_inv_mapper = label_inv_mapper 195 | self.polarity_threshold = polarity_threshold 196 | self.labels = [self.label_inv_mapper[c] for c in self.classifier.classes_] 197 | 198 | def classify_text(self, text, return_pred_labels=True): 199 | 200 | 201 | if isinstance(text, str): 202 | #single text 203 | 204 | tokens = self.tokenizer.tokenize(text) 205 | 206 | features = self.feature_extractor(tokens) 207 | 208 | proba = self.classifier.predict_proba(features) 209 | 210 | proba = proba.flatten() 211 | 212 | if return_pred_labels: 213 | if np.max(proba) > self.polarity_threshold: 214 | 215 | predicted_label = self.labels[np.argmax(proba)] 216 | 217 
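                # if no class probability exceeds polarity_threshold,
                # the prediction is marked as undecided ('N/A') below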
| else: 218 | 219 | predicted_label = 'N/A' 220 | 221 | 222 | elif isinstance(text, list): 223 | # list of multiple texts 224 | 225 | tokens = map(self.tokenizer.tokenize, text) 226 | features = map(self.feature_extractor, tokens) 227 | 228 | proba = self.classifier.predict_proba(features) 229 | 230 | if return_pred_labels: 231 | len_labels = max(len(l) for l in self.labels) 232 | 233 | predicted_label = np.zeros(len(text), dtype=' self.polarity_threshold 237 | 238 | predicted_label[mask] = [self.labels[i] for i in np.argmax(proba[mask], axis=1)] 239 | 240 | 241 | if return_pred_labels: 242 | return predicted_label, proba 243 | else: 244 | return proba 245 | -------------------------------------------------------------------------------- /01_Collecting_tweets_using_Twitter_API.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Collecting tweets using the Twitter API\n", 8 | "\n", 9 | "\n", 10 | "In this section we are going to see how to connect to the Twitter API to collect tweets and save them.\n", 11 | "\n", 12 | "\"In computer programming, an **Application Programming Interface (API)** is a set of subroutine definitions, protocols, and tools for building application software.\" [wikipedia](https://en.wikipedia.org/wiki/Application_programming_interface)\n", 13 | "\n", 14 | "The Twitter API is the tool we use to collect tweets from Twitter\n", 15 | "\n", 16 | "[Twitter APIs](https://developer.twitter.com/en/docs/twitter-api/getting-started/about-twitter-api) has different endpoints that allows one to preform different actions, such as:\n", 17 | "- Accessing a roughly 1% random sample of publicly available Tweets in real-time (https://developer.twitter.com/en/docs/twitter-api/v1/tweets/filter-realtime/overview).\n", 18 | "\n", 19 | "- Searching among historical tweets (https://developer.twitter.com/en/docs/twitter-api/v1/tweets/search/overview).\n", 20 | "\n", 21 | "To use the Twitter API from python, we will use the library [tweepy](http://www.tweepy.org/) which facilitate the access to the API.\n", 22 | "\n", 23 | "To install it run on of the following command in your terminal or execute the cell below:\n", 24 | "\n", 25 | "Intallation with pip:\n", 26 | "```\n", 27 | "pip install tweepy\n", 28 | "```\n", 29 | "\n", 30 | "Installation with conda:\n", 31 | "```\n", 32 | "conda install -c conda-forge tweepy\n", 33 | "```" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# this will install tweepy on your machine\n", 43 | "!pip install tweepy" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "In order to be able to use Twitter APIs, you need to apply for a developper account, create a [project](https://developer.twitter.com/en/docs/projects/overview) and an [app](https://developer.twitter.com/en/docs/apps/overview).\n", 51 | "\n", 52 | "Follow the instructions here: https://developer.twitter.com/en/docs/twitter-api/getting-started/getting-access-to-the-twitter-api\n", 53 | "\n", 54 | "Once you have created an app, create a new file in the lesson's folder named `keys.json` and copy paste your *Consumer Keys* (*API Key* and *API Secret Key*) and *Authentication Tokens* (*Access Token* and *Access Token Secret*) as shown below in this new file:" 55 | ] 56 | }, 57 | { 58 | "cell_type": "raw", 59 | "metadata": { 60 | 
"collapsed": true 61 | }, 62 | "source": [ 63 | "{\"api_key\": \"xxx\",\n", 64 | "\"api_secret_key\": \"xxx\",\n", 65 | "\"access_token\" : \"xxx\",\n", 66 | "\"access_token_secret\" : \"xxx\"}" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "It is important to keep your keys private and secure. See https://developer.twitter.com/en/docs/authentication/guides/authentication-best-practices" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "import json\n", 83 | "with open('keys.json', 'r') as fopen:\n", 84 | " keys = json.load(fopen)\n", 85 | "# print(keys)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### Authentificate with the Twitter API\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "import tweepy\n", 102 | "\n", 103 | "auth = tweepy.OAuthHandler(keys['api_key'], keys['api_secret_key'])\n", 104 | "auth.set_access_token(keys['access_token'], keys['access_token_secret'])\n", 105 | "\n", 106 | "# create the api object that we will use to interact with Twitter\n", 107 | "api = tweepy.API(auth)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "# example of an action:\n", 117 | "tweet = api.update_status('Hey @BovetAlexandre!')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "# the returned object is a tweepy Status object\n", 127 | "type(tweet)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "The tweet object contains all the attributes of a [tweet data dictionary](https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "print('Tweet text: ', tweet.text)\n", 144 | "print('Tweet author: ', tweet.author.screen_name)\n", 145 | "print('Tweet creation time: ', tweet.created_at)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "# It also contains a JSON version of the tweet object\n", 155 | "tweet._json" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "## Collecting tweets from the Streaming API\n", 163 | "source : https://docs.tweepy.org/en/v3.10.0/streaming_how_to.html \n", 164 | "\n", 165 | "### Step 1: Creating a StreamListener\n", 166 | "\n", 167 | "This simple stream listener prints status text. 
The on_data method of Tweepy’s StreamListener conveniently passes data from statuses to the on_status method.\n", 168 | "Create class MyStreamListener inheriting from StreamListener and overriding on_status.:" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "#override tweepy.StreamListener to make it print tweet content when new data arrives\n", 178 | "class MyStreamListener(tweepy.StreamListener):\n", 179 | "\n", 180 | " def on_status(self, status):\n", 181 | " print(status.text)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "### Step 2: Creating a Stream\n", 189 | "\n", 190 | "Using the api object we created and the StreamListener we can create a Stream Object:" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "myStreamListener = MyStreamListener()\n", 200 | "myStream = tweepy.Stream(auth = api.auth, listener=myStreamListener)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "### Step 3: Starting a Stream\n", 208 | "\n", 209 | "A number of twitter streams are available through Tweepy. Most cases will use filter, the user_stream, or the sitestream. For more information on the capabilities and limitations of the different streams see [Twitter Streaming API Documentation](https://developer.twitter.com/en/docs/twitter-api/v1/tweets/filter-realtime/overview)\n", 210 | "\n", 211 | "In this example we will use filter to stream all tweets containing the word python. The track parameter is an array of search terms to stream." 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "# this will start tracking tweets with the key word 'new york'.\n", 221 | "# to stop it, interrupt the kernel.\n", 222 | "# try with different keywords\n", 223 | "# you have to run the cell below to disconnect the stream before rerunning this one\n", 224 | "myStream.filter(track=['#cop26','#ClimateChange', 'climate'])" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "myStream.disconnect()" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "myStream.filter(track=['moderna'], languages=['en'])" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "myStream.disconnect()" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "scrolled": false 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "# streaming tweets from a given location\n", 263 | "# we need to provide a comma-separated list of longitude,latitude pairs specifying a set of bounding boxes\n", 264 | "# for example for New York\n", 265 | "myStream.filter(locations=[-74,40,-73,41])" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "myStream.disconnect()" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "### Saving the stream to a file\n", 282 | "Lets' define a new 
StreamListener that will save the collected data to a file" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "#override tweepy.StreamListener to make it save data to a file\n", 292 | "# and limit the maximum number of tweets we want to collect\n", 293 | "class StreamSaver(tweepy.StreamListener):\n", 294 | " def __init__(self, filename, max_num_tweets=2000, api=None):\n", 295 | " self.filename = filename\n", 296 | " \n", 297 | " self.num_tweets = 0\n", 298 | " \n", 299 | " self.max_num_tweets = max_num_tweets\n", 300 | " \n", 301 | " tweepy.StreamListener.__init__(self, api=api)\n", 302 | " \n", 303 | " \n", 304 | " def on_data(self, data):\n", 305 | " #print json directly to file\n", 306 | " \n", 307 | " with open(self.filename,'a') as tf:\n", 308 | " tf.write(data)\n", 309 | "\n", 310 | " self.num_tweets += 1\n", 311 | "\n", 312 | " if self.num_tweets%100 == 0:\n", 313 | " print(self.num_tweets)\n", 314 | "\n", 315 | " if self.num_tweets > self.max_num_tweets:\n", 316 | " return False\n", 317 | " \n", 318 | " \n", 319 | " def on_error(self, status):\n", 320 | " print(status)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "# create the new StreamListener and stream object that will save collected tweets to a file\n", 330 | "saveStream = StreamSaver(filename='tweets.txt', max_num_tweets=5000)\n", 331 | "mySaveStream = tweepy.Stream(auth = api.auth, listener=saveStream)\n" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "mySaveStream.filter(track=['coronavirus','covid-19',\n", 341 | " 'covid19','covid_19','corona virus',\n", 342 | " 'covid','vaccines','vaccine'],\n", 343 | " languages=['en'])\n" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "mySaveStream.disconnect()\n", 353 | "saveStream.num_tweets" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | " " 361 | ] 362 | } 363 | ], 364 | "metadata": { 365 | "anaconda-cloud": {}, 366 | "kernelspec": { 367 | "display_name": "Python 3", 368 | "language": "python", 369 | "name": "python3" 370 | }, 371 | "language_info": { 372 | "codemirror_mode": { 373 | "name": "ipython", 374 | "version": 3 375 | }, 376 | "file_extension": ".py", 377 | "mimetype": "text/x-python", 378 | "name": "python", 379 | "nbconvert_exporter": "python", 380 | "pygments_lexer": "ipython3", 381 | "version": "3.6.10" 382 | } 383 | }, 384 | "nbformat": 4, 385 | "nbformat_minor": 2 386 | } 387 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/01_Collecting_tweets_using_Twitter_API-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Collecting tweets using the Twitter API\n", 8 | "\n", 9 | "\n", 10 | "In this section we are going to see how to connect to the Twitter API to collect tweets and save them.\n", 11 | "\n", 12 | "\"In computer programming, an **Application Programming Interface (API)** is a set of subroutine definitions, protocols, and tools for building application software.\" 
[wikipedia](https://en.wikipedia.org/wiki/Application_programming_interface)\n", 13 | "\n", 14 | "The Twitter API is the tool we use to collect tweets from Twitter\n", 15 | "\n", 16 | "[Twitter APIs](https://developer.twitter.com/en/docs/twitter-api/getting-started/about-twitter-api) has different endpoints that allows one to preform different actions, such as:\n", 17 | "- Accessing a roughly 1% random sample of publicly available Tweets in real-time (https://developer.twitter.com/en/docs/twitter-api/v1/tweets/filter-realtime/overview).\n", 18 | "\n", 19 | "- Searching among historical tweets (https://developer.twitter.com/en/docs/twitter-api/v1/tweets/search/overview).\n", 20 | "\n", 21 | "To use the Twitter API from python, we will use the library [tweepy](http://www.tweepy.org/) which facilitate the access to the API.\n", 22 | "\n", 23 | "To install it run on of the following command in your terminal or execute the cell below:\n", 24 | "\n", 25 | "Intallation with pip:\n", 26 | "```\n", 27 | "pip install tweepy\n", 28 | "```\n", 29 | "\n", 30 | "Installation with conda:\n", 31 | "```\n", 32 | "conda install -c conda-forge tweepy\n", 33 | "```" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": null, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "# this will install tweepy on your machine\n", 43 | "!pip install tweepy" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "In order to be able to use Twitter APIs, you need to apply for a developper account, create a [project](https://developer.twitter.com/en/docs/projects/overview) and an [app](https://developer.twitter.com/en/docs/apps/overview).\n", 51 | "\n", 52 | "Follow the instructions here: https://developer.twitter.com/en/docs/twitter-api/getting-started/getting-access-to-the-twitter-api\n", 53 | "\n", 54 | "Once you have created an app, create a new file in the lesson's folder named `keys.json` and copy paste your *Consumer Keys* (*API Key* and *API Secret Key*) and *Authentication Tokens* (*Access Token* and *Access Token Secret*) as shown below in this new file:" 55 | ] 56 | }, 57 | { 58 | "cell_type": "raw", 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "source": [ 63 | "{\"api_key\": \"xxx\",\n", 64 | "\"api_secret_key\": \"xxx\",\n", 65 | "\"access_token\" : \"xxx\",\n", 66 | "\"access_token_secret\" : \"xxx\"}" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "It is important to keep your keys private and secure. 
See https://developer.twitter.com/en/docs/authentication/guides/authentication-best-practices" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": {}, 80 | "outputs": [], 81 | "source": [ 82 | "import json\n", 83 | "with open('keys.json', 'r') as fopen:\n", 84 | " keys = json.load(fopen)\n", 85 | "# print(keys)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### Authentificate with the Twitter API\n" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "import tweepy\n", 102 | "\n", 103 | "auth = tweepy.OAuthHandler(keys['api_key'], keys['api_secret_key'])\n", 104 | "auth.set_access_token(keys['access_token'], keys['access_token_secret'])\n", 105 | "\n", 106 | "# create the api object that we will use to interact with Twitter\n", 107 | "api = tweepy.API(auth)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "# example of an action:\n", 117 | "tweet = api.update_status('Hey @BovetAlexandre!')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "# the returned object is a tweepy Status object\n", 127 | "type(tweet)" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "The tweet object contains all the attributes of a [tweet data dictionary](https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet)" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "print('Tweet text: ', tweet.text)\n", 144 | "print('Tweet author: ', tweet.author.screen_name)\n", 145 | "print('Tweet creation time: ', tweet.created_at)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "# It also contains a JSON version of the tweet object\n", 155 | "tweet._json" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "## Collecting tweets from the Streaming API\n", 163 | "source : https://docs.tweepy.org/en/v3.10.0/streaming_how_to.html \n", 164 | "\n", 165 | "### Step 1: Creating a StreamListener\n", 166 | "\n", 167 | "This simple stream listener prints status text. 
The on_data method of Tweepy’s StreamListener conveniently passes data from statuses to the on_status method.\n", 168 | "Create class MyStreamListener inheriting from StreamListener and overriding on_status.:" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "#override tweepy.StreamListener to make it print tweet content when new data arrives\n", 178 | "class MyStreamListener(tweepy.StreamListener):\n", 179 | "\n", 180 | " def on_status(self, status):\n", 181 | " print(status.text)" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "### Step 2: Creating a Stream\n", 189 | "\n", 190 | "Using the api object we created and the StreamListener we can create a Stream Object:" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "myStreamListener = MyStreamListener()\n", 200 | "myStream = tweepy.Stream(auth = api.auth, listener=myStreamListener)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "### Step 3: Starting a Stream\n", 208 | "\n", 209 | "A number of twitter streams are available through Tweepy. Most cases will use filter, the user_stream, or the sitestream. For more information on the capabilities and limitations of the different streams see [Twitter Streaming API Documentation](https://developer.twitter.com/en/docs/twitter-api/v1/tweets/filter-realtime/overview)\n", 210 | "\n", 211 | "In this example we will use filter to stream all tweets containing the word python. The track parameter is an array of search terms to stream." 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "# this will start tracking tweets with the key word 'new york'.\n", 221 | "# to stop it, interrupt the kernel.\n", 222 | "# try with different keywords\n", 223 | "# you have to run the cell below to disconnect the stream before rerunning this one\n", 224 | "myStream.filter(track=['#cop26','#ClimateChange', 'climate'])" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "myStream.disconnect()" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "myStream.filter(track=['moderna'], languages=['en'])" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "metadata": {}, 249 | "outputs": [], 250 | "source": [ 251 | "myStream.disconnect()" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "metadata": { 258 | "scrolled": false 259 | }, 260 | "outputs": [], 261 | "source": [ 262 | "# streaming tweets from a given location\n", 263 | "# we need to provide a comma-separated list of longitude,latitude pairs specifying a set of bounding boxes\n", 264 | "# for example for New York\n", 265 | "myStream.filter(locations=[-74,40,-73,41])" 266 | ] 267 | }, 268 | { 269 | "cell_type": "code", 270 | "execution_count": null, 271 | "metadata": {}, 272 | "outputs": [], 273 | "source": [ 274 | "myStream.disconnect()" 275 | ] 276 | }, 277 | { 278 | "cell_type": "markdown", 279 | "metadata": {}, 280 | "source": [ 281 | "### Saving the stream to a file\n", 282 | "Lets' define a new 
StreamListener that will save the collected data to a file" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "#override tweepy.StreamListener to make it save data to a file\n", 292 | "# and limit the maximum number of tweets we want to collect\n", 293 | "class StreamSaver(tweepy.StreamListener):\n", 294 | " def __init__(self, filename, max_num_tweets=2000, api=None):\n", 295 | " self.filename = filename\n", 296 | " \n", 297 | " self.num_tweets = 0\n", 298 | " \n", 299 | " self.max_num_tweets = max_num_tweets\n", 300 | " \n", 301 | " tweepy.StreamListener.__init__(self, api=api)\n", 302 | " \n", 303 | " \n", 304 | " def on_data(self, data):\n", 305 | " #print json directly to file\n", 306 | " \n", 307 | " with open(self.filename,'a') as tf:\n", 308 | " tf.write(data)\n", 309 | "\n", 310 | " self.num_tweets += 1\n", 311 | "\n", 312 | " if self.num_tweets%100 == 0:\n", 313 | " print(self.num_tweets)\n", 314 | "\n", 315 | " if self.num_tweets > self.max_num_tweets:\n", 316 | " return False\n", 317 | " \n", 318 | " \n", 319 | " def on_error(self, status):\n", 320 | " print(status)" 321 | ] 322 | }, 323 | { 324 | "cell_type": "code", 325 | "execution_count": null, 326 | "metadata": {}, 327 | "outputs": [], 328 | "source": [ 329 | "# create the new StreamListener and stream object that will save collected tweets to a file\n", 330 | "saveStream = StreamSaver(filename='tweets.txt', max_num_tweets=5000)\n", 331 | "mySaveStream = tweepy.Stream(auth = api.auth, listener=saveStream)\n" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "mySaveStream.filter(track=['coronavirus','covid-19',\n", 341 | " 'covid19','covid_19','corona virus',\n", 342 | " 'covid','vaccines','vaccine'],\n", 343 | " languages=['en'])\n" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": null, 349 | "metadata": {}, 350 | "outputs": [], 351 | "source": [ 352 | "mySaveStream.disconnect()\n", 353 | "saveStream.num_tweets" 354 | ] 355 | }, 356 | { 357 | "cell_type": "markdown", 358 | "metadata": {}, 359 | "source": [ 360 | " " 361 | ] 362 | } 363 | ], 364 | "metadata": { 365 | "anaconda-cloud": {}, 366 | "kernelspec": { 367 | "display_name": "Python 3", 368 | "language": "python", 369 | "name": "python3" 370 | }, 371 | "language_info": { 372 | "codemirror_mode": { 373 | "name": "ipython", 374 | "version": 3 375 | }, 376 | "file_extension": ".py", 377 | "mimetype": "text/x-python", 378 | "name": "python", 379 | "nbconvert_exporter": "python", 380 | "pygments_lexer": "ipython3", 381 | "version": "3.6.10" 382 | } 383 | }, 384 | "nbformat": 4, 385 | "nbformat_minor": 2 386 | } 387 | -------------------------------------------------------------------------------- /00_Introduction_to_Jupyter_and_Python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Here, I introduce basic notions of Jupyter and Python. 
I assume that you have some knowledge of programming (in MATLAB for example).\n", 8 | "You will need to have [anaconda](https://www.anaconda.com/products/individual) for python 3 installed for the following.\n", 9 | "\n", 10 | "# Jupyter/IPython Notebook Quick Start Guide\n", 11 | "\n", 12 | "The following is partially taken from the [offical documentation](https://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/index.html):\n", 13 | "\n", 14 | "## 1. What is the Jupyter Notebook?\n", 15 | "Notebook documents (or “notebooks”, all lower case) are documents produced by the Jupyter Notebook App, which contain both *computer code* (e.g. python) and *rich text elements* (paragraph, equations, figures, links, etc...). Notebook documents are both human-readable documents containing the analysis description and the results (figures, tables, etc..) as well as executable documents which can be run to perform data analysis." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "# press Shift+Enter to execute this cell\n", 25 | "print('This is a cell containing python code')\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "#we can also make figures\n", 35 | "import matplotlib.pyplot as plt\n", 36 | "import numpy as np\n", 37 | "%matplotlib inline\n", 38 | "x = np.linspace(-np.pi, np.pi, 100)\n", 39 | "plt.plot(x, np.sin(x))\n", 40 | "\n", 41 | "# Use `Tab` for completion and `Shift-Tab` for code info" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "This is an equation formatted in LaTeX $y = \\sin(x)$\n", 49 | "\n", 50 | "double-click this cell to edit it" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## 2. Jupyter Notebook App\n", 58 | "The Jupyter Notebook App is a server-client application that allows editing and running notebook documents via a web browser.\n", 59 | "\n", 60 | "## 3. kernel\n", 61 | "A notebook kernel is a “computational engine” that executes the code contained in a Notebook document. The ipython kernel executes python code. Kernels for many other languages exist (official kernels).\n", 62 | "\n", 63 | "## 4. Notebook Dashboard\n", 64 | "\n", 65 | "The Notebook Dashboard is the component which is shown first when you launch Jupyter Notebook App. The Notebook Dashboard is mainly used to open notebook documents, and to manage the running kernels (visualize and shutdown).\n", 66 | "\n", 67 | "## 5. References\n", 68 | "\n", 69 | "- [Jupyter project ](https://jupyter.org/)\n", 70 | "- [Jupyter documentation](http://jupyter.readthedocs.io/en/latest/)\n", 71 | "- [Jupyter notebooks documentation](http://jupyter-notebook.readthedocs.io/en/latest/)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "source": [ 80 | "# (Very) Short Python Introduction\n", 81 | "\n", 82 | "In addition to \"standard\" data types such as integers, floats and strings, Python knows a number of compound data types, used to group together other values. We will briefly see * [Lists](https://welcomedata.wordpress.com/2015/01/11/python-lists-and-dictionaries/)* and *Dictionaries*\n", 83 | "\n", 84 | "## 1. 
Lists\n", 85 | "\n", 86 | "https://docs.python.org/3/tutorial/introduction.html#lists\n", 87 | "\n", 88 | "### Accessing by index:" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# define a new list\n", 98 | "zoo_animals = [\"pangolin\", \"cassowary\", \"sloth\", \"tiger\"]\n", 99 | "\n", 100 | "if len(zoo_animals) > 3:\n", 101 | " print(\"The first animal at the zoo is the \" + zoo_animals[0])\n", 102 | " print(\"The second animal at the zoo is the \" + zoo_animals[1])\n", 103 | " print(\"The third animal at the zoo is the \" + zoo_animals[2])\n", 104 | " print(\"The fourth animal at the zoo is the \" + zoo_animals[3])" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "### Appending to a list or extending a list:" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "# empty list\n", 121 | "suitcase = []\n", 122 | "suitcase.append(\"sunglasses\")\n", 123 | "\n", 124 | "# Your code here!\n", 125 | "suitcase.append(\"doll\")\n", 126 | "suitcase.append(\"ball\")\n", 127 | "suitcase.append(\"comb\")\n", 128 | "\n", 129 | "list_length = len(suitcase) # Set this to the length of suitcase\n", 130 | "\n", 131 | "print(\"There are %d items in the suitcase.\" % (list_length))\n", 132 | "print(suitcase)\n", 133 | "\n", 134 | "# we can also append an other list to a list\n", 135 | "# using the \"extend\" method\n", 136 | "numbers = [42,7,12]\n", 137 | "\n", 138 | "suitcase.extend(numbers)\n", 139 | "\n", 140 | "print(suitcase)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "### Slicing:" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "suitcase = [\"sunglasses\", \"hat\", \"passport\", \"laptop\", \"suit\", \"shoes\"]\n", 157 | "\n", 158 | "first = suitcase[0:2] # The first and second items (index zero and one)\n", 159 | "middle = suitcase[2:4] # Third and fourth items (index two and three)\n", 160 | "last = suitcase[4:6] # The last two items (index four and five)\n", 161 | "\n", 162 | "print(last)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "### Looping over the elements of a list:" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "my_list = [1,9,3,8,5,7]\n", 179 | "\n", 180 | "for number in my_list:\n", 181 | " print(2*number)\n", 182 | " # Your code here" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "## 2. 
Dictionaries\n", 190 | "\n", 191 | "https://docs.python.org/3/tutorial/datastructures.html#dictionaries\n", 192 | "\n", 193 | "Unlike sequences, which are indexed by a range of numbers, dictionaries are indexed by keys, which can be strings or numbers.\n", 194 | "\n", 195 | "It is best to think of a dictionary as an unordered set of key: value pairs, with the requirement that the keys are unique (within one dictionary).\n", 196 | "\n", 197 | "### Creating a Dictionary:" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "# Assigning a dictionary with three key-value pairs to residents:\n", 207 | "residents = {'Puffin' : 104, 'Sloth' : 105, 'Burmese Python' : 106}\n", 208 | "\n", 209 | "print(residents['Puffin']) # Prints Puffin's room number\n", 210 | "\n", 211 | "print(residents['Sloth'])\n", 212 | "print(residents['Burmese Python'])" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "### Adding entries:" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "menu = {} # Empty dictionary\n", 229 | "menu['Raclette'] = 14.50 # Adding new key-value pair\n", 230 | "print(menu['Raclette'])\n", 231 | "\n", 232 | "menu['Cheese Fondue'] = 10.50\n", 233 | "menu['Muesli'] = 13.50\n", 234 | "menu['Quiche'] = 19.50\n", 235 | "menu['Cervela'] = 17.50 # Your code here: Add some dish-price pairs to menu!\n", 236 | "\n", 237 | "print(\"There are \" + str(len(menu)) + \" items on the menu.\")\n", 238 | "print(menu)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "### Removing entries:" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "del menu['Muesli']\n", 255 | "print(menu)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "### Looping over entries:" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "for key, value in menu.items():\n", 272 | " print(key, value)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": { 278 | "collapsed": true 279 | }, 280 | "source": [ 281 | "### Accessing entries:" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "raclette = menu.get('Raclette')\n", 291 | "print(raclette)\n", 292 | "\n", 293 | "#accessing a non-existing key\n", 294 | "burger = menu.get('Burger')\n", 295 | "print(burger)\n" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "#listing keys\n", 305 | "menu.keys()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "#listing values\n", 315 | "menu.values()\n" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "# NumPy\n", 323 | "\n", 324 | "NumPy is the fundamental package for scientific computing with Python (http://www.numpy.org/).\n", 325 | "It is part of SciPy (https://www.scipy.org/).\n", 326 | "\n", 327 | "The numpy package provides 
functionalities that makes Python similar to MATLAB." 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "# first import the package\n", 337 | "import numpy as np\n", 338 | "\n", 339 | "# numpy works with array similarly to MATLAB \n", 340 | "x = np.array([1,2,4,5,6,8,10,4,3,5,6])\n", 341 | "\n", 342 | "# but indexing start at 0!!\n", 343 | "print(x[0])\n", 344 | "\n", 345 | "# arrays have useful methods\n", 346 | "print(x.size)\n", 347 | "\n", 348 | "print(x.mean())\n", 349 | "\n", 350 | "# we can also use numpy functions\n", 351 | "\n", 352 | "amax = np.max(x)\n", 353 | "\n", 354 | "print(amax)\n" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "And you can do much more with NumPy! see https://numpy.org/doc/stable/user/quickstart.html\n" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "# Matplotlib\n", 369 | "\n", 370 | "Matplotlib is a Python 2D plotting library which produces publication quality figures in a variety of hardcopy formats and interactive environments across platforms (https://matplotlib.org/).\n" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "# import matplotlib\n", 380 | "import matplotlib.pyplot as plt\n", 381 | "# make matplotlib create figures inline\n", 382 | "%matplotlib inline\n", 383 | "\n", 384 | "x = np.linspace(0,2,100)\n", 385 | "\n", 386 | "y1 = np.exp(x)\n", 387 | "y2 = np.sqrt(x)\n", 388 | "\n", 389 | "plt.plot(x,y1, 'r-', label='y = exp(x)')\n", 390 | "plt.plot(x,y2, 'b--', label='y = sqrt(x)')\n", 391 | "\n", 392 | "plt.xlabel('x')\n", 393 | "plt.ylabel('y')\n", 394 | "plt.legend()\n", 395 | "\n" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "# press Shift-Tab to see information about a function\n", 405 | "norm = np.random.randn(200)\n", 406 | "\n", 407 | "n, bins, patches = plt.hist(norm, density=True, bins=20)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [] 416 | } 417 | ], 418 | "metadata": { 419 | "anaconda-cloud": {}, 420 | "kernelspec": { 421 | "display_name": "Python 3", 422 | "language": "python", 423 | "name": "python3" 424 | }, 425 | "language_info": { 426 | "codemirror_mode": { 427 | "name": "ipython", 428 | "version": 3 429 | }, 430 | "file_extension": ".py", 431 | "mimetype": "text/x-python", 432 | "name": "python", 433 | "nbconvert_exporter": "python", 434 | "pygments_lexer": "ipython3", 435 | "version": "3.6.10" 436 | } 437 | }, 438 | "nbformat": 4, 439 | "nbformat_minor": 1 440 | } 441 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/00_Introduction_to_Jupyter_and_Python-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "Here, I introduce basic notions of Jupyter and Python. 
I assume that you have some knowledge of programming (in MATLAB for example).\n", 8 | "You will need to have [anaconda](https://www.anaconda.com/products/individual) for python 3 installed for the following.\n", 9 | "\n", 10 | "# Jupyter/IPython Notebook Quick Start Guide\n", 11 | "\n", 12 | "The following is partially taken from the [offical documentation](https://jupyter-notebook-beginner-guide.readthedocs.io/en/latest/index.html):\n", 13 | "\n", 14 | "## 1. What is the Jupyter Notebook?\n", 15 | "Notebook documents (or “notebooks”, all lower case) are documents produced by the Jupyter Notebook App, which contain both *computer code* (e.g. python) and *rich text elements* (paragraph, equations, figures, links, etc...). Notebook documents are both human-readable documents containing the analysis description and the results (figures, tables, etc..) as well as executable documents which can be run to perform data analysis." 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "# press Shift+Enter to execute this cell\n", 25 | "print('This is a cell containing python code')\n" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "#we can also make figures\n", 35 | "import matplotlib.pyplot as plt\n", 36 | "import numpy as np\n", 37 | "%matplotlib inline\n", 38 | "x = np.linspace(-np.pi, np.pi, 100)\n", 39 | "plt.plot(x, np.sin(x))\n", 40 | "\n", 41 | "# Use `Tab` for completion and `Shift-Tab` for code info" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "This is an equation formatted in LaTeX $y = \\sin(x)$\n", 49 | "\n", 50 | "double-click this cell to edit it" 51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": {}, 56 | "source": [ 57 | "## 2. Jupyter Notebook App\n", 58 | "The Jupyter Notebook App is a server-client application that allows editing and running notebook documents via a web browser.\n", 59 | "\n", 60 | "## 3. kernel\n", 61 | "A notebook kernel is a “computational engine” that executes the code contained in a Notebook document. The ipython kernel executes python code. Kernels for many other languages exist (official kernels).\n", 62 | "\n", 63 | "## 4. Notebook Dashboard\n", 64 | "\n", 65 | "The Notebook Dashboard is the component which is shown first when you launch Jupyter Notebook App. The Notebook Dashboard is mainly used to open notebook documents, and to manage the running kernels (visualize and shutdown).\n", 66 | "\n", 67 | "## 5. References\n", 68 | "\n", 69 | "- [Jupyter project ](https://jupyter.org/)\n", 70 | "- [Jupyter documentation](http://jupyter.readthedocs.io/en/latest/)\n", 71 | "- [Jupyter notebooks documentation](http://jupyter-notebook.readthedocs.io/en/latest/)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": { 77 | "collapsed": true 78 | }, 79 | "source": [ 80 | "# (Very) Short Python Introduction\n", 81 | "\n", 82 | "In addition to \"standard\" data types such as integers, floats and strings, Python knows a number of compound data types, used to group together other values. We will briefly see * [Lists](https://welcomedata.wordpress.com/2015/01/11/python-lists-and-dictionaries/)* and *Dictionaries*\n", 83 | "\n", 84 | "## 1. 
Lists\n", 85 | "\n", 86 | "https://docs.python.org/3/tutorial/introduction.html#lists\n", 87 | "\n", 88 | "### Accessing by index:" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# define a new list\n", 98 | "zoo_animals = [\"pangolin\", \"cassowary\", \"sloth\", \"tiger\"]\n", 99 | "\n", 100 | "if len(zoo_animals) > 3:\n", 101 | " print(\"The first animal at the zoo is the \" + zoo_animals[0])\n", 102 | " print(\"The second animal at the zoo is the \" + zoo_animals[1])\n", 103 | " print(\"The third animal at the zoo is the \" + zoo_animals[2])\n", 104 | " print(\"The fourth animal at the zoo is the \" + zoo_animals[3])" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "### Appending to a list or extending a list:" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "# empty list\n", 121 | "suitcase = []\n", 122 | "suitcase.append(\"sunglasses\")\n", 123 | "\n", 124 | "# Your code here!\n", 125 | "suitcase.append(\"doll\")\n", 126 | "suitcase.append(\"ball\")\n", 127 | "suitcase.append(\"comb\")\n", 128 | "\n", 129 | "list_length = len(suitcase) # Set this to the length of suitcase\n", 130 | "\n", 131 | "print(\"There are %d items in the suitcase.\" % (list_length))\n", 132 | "print(suitcase)\n", 133 | "\n", 134 | "# we can also append an other list to a list\n", 135 | "# using the \"extend\" method\n", 136 | "numbers = [42,7,12]\n", 137 | "\n", 138 | "suitcase.extend(numbers)\n", 139 | "\n", 140 | "print(suitcase)" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "### Slicing:" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "suitcase = [\"sunglasses\", \"hat\", \"passport\", \"laptop\", \"suit\", \"shoes\"]\n", 157 | "\n", 158 | "first = suitcase[0:2] # The first and second items (index zero and one)\n", 159 | "middle = suitcase[2:4] # Third and fourth items (index two and three)\n", 160 | "last = suitcase[4:6] # The last two items (index four and five)\n", 161 | "\n", 162 | "print(last)" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "### Looping over the elements of a list:" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "my_list = [1,9,3,8,5,7]\n", 179 | "\n", 180 | "for number in my_list:\n", 181 | " print(2*number)\n", 182 | " # Your code here" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "## 2. 
Dictionaries\n", 190 | "\n", 191 | "https://docs.python.org/3/tutorial/datastructures.html#dictionaries\n", 192 | "\n", 193 | "Unlike sequences, which are indexed by a range of numbers, dictionaries are indexed by keys, which can be strings or numbers.\n", 194 | "\n", 195 | "It is best to think of a dictionary as an unordered set of key: value pairs, with the requirement that the keys are unique (within one dictionary).\n", 196 | "\n", 197 | "### Creating a Dictionary:" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "# Assigning a dictionary with three key-value pairs to residents:\n", 207 | "residents = {'Puffin' : 104, 'Sloth' : 105, 'Burmese Python' : 106}\n", 208 | "\n", 209 | "print(residents['Puffin']) # Prints Puffin's room number\n", 210 | "\n", 211 | "print(residents['Sloth'])\n", 212 | "print(residents['Burmese Python'])" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "### Adding entries:" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": {}, 226 | "outputs": [], 227 | "source": [ 228 | "menu = {} # Empty dictionary\n", 229 | "menu['Raclette'] = 14.50 # Adding new key-value pair\n", 230 | "print(menu['Raclette'])\n", 231 | "\n", 232 | "menu['Cheese Fondue'] = 10.50\n", 233 | "menu['Muesli'] = 13.50\n", 234 | "menu['Quiche'] = 19.50\n", 235 | "menu['Cervela'] = 17.50 # Your code here: Add some dish-price pairs to menu!\n", 236 | "\n", 237 | "print(\"There are \" + str(len(menu)) + \" items on the menu.\")\n", 238 | "print(menu)" 239 | ] 240 | }, 241 | { 242 | "cell_type": "markdown", 243 | "metadata": {}, 244 | "source": [ 245 | "### Removing entries:" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "metadata": {}, 252 | "outputs": [], 253 | "source": [ 254 | "del menu['Muesli']\n", 255 | "print(menu)" 256 | ] 257 | }, 258 | { 259 | "cell_type": "markdown", 260 | "metadata": {}, 261 | "source": [ 262 | "### Looping over entries:" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "for key, value in menu.items():\n", 272 | " print(key, value)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "markdown", 277 | "metadata": { 278 | "collapsed": true 279 | }, 280 | "source": [ 281 | "### Accessing entries:" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": null, 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "raclette = menu.get('Raclette')\n", 291 | "print(raclette)\n", 292 | "\n", 293 | "#accessing a non-existing key\n", 294 | "burger = menu.get('Burger')\n", 295 | "print(burger)\n" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "#listing keys\n", 305 | "menu.keys()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "#listing values\n", 315 | "menu.values()\n" 316 | ] 317 | }, 318 | { 319 | "cell_type": "markdown", 320 | "metadata": {}, 321 | "source": [ 322 | "# NumPy\n", 323 | "\n", 324 | "NumPy is the fundamental package for scientific computing with Python (http://www.numpy.org/).\n", 325 | "It is part of SciPy (https://www.scipy.org/).\n", 326 | "\n", 327 | "The numpy package provides 
functionalities that makes Python similar to MATLAB." 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": null, 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "# first import the package\n", 337 | "import numpy as np\n", 338 | "\n", 339 | "# numpy works with array similarly to MATLAB \n", 340 | "x = np.array([1,2,4,5,6,8,10,4,3,5,6])\n", 341 | "\n", 342 | "# but indexing start at 0!!\n", 343 | "print(x[0])\n", 344 | "\n", 345 | "# arrays have useful methods\n", 346 | "print(x.size)\n", 347 | "\n", 348 | "print(x.mean())\n", 349 | "\n", 350 | "# we can also use numpy functions\n", 351 | "\n", 352 | "amax = np.max(x)\n", 353 | "\n", 354 | "print(amax)\n" 355 | ] 356 | }, 357 | { 358 | "cell_type": "markdown", 359 | "metadata": {}, 360 | "source": [ 361 | "And you can do much more with NumPy! see https://numpy.org/doc/stable/user/quickstart.html\n" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "# Matplotlib\n", 369 | "\n", 370 | "Matplotlib is a Python 2D plotting library which produces publication quality figures in a variety of hardcopy formats and interactive environments across platforms (https://matplotlib.org/).\n" 371 | ] 372 | }, 373 | { 374 | "cell_type": "code", 375 | "execution_count": null, 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "# import matplotlib\n", 380 | "import matplotlib.pyplot as plt\n", 381 | "# make matplotlib create figures inline\n", 382 | "%matplotlib inline\n", 383 | "\n", 384 | "x = np.linspace(0,2,100)\n", 385 | "\n", 386 | "y1 = np.exp(x)\n", 387 | "y2 = np.sqrt(x)\n", 388 | "\n", 389 | "plt.plot(x,y1, 'r-', label='y = exp(x)')\n", 390 | "plt.plot(x,y2, 'b--', label='y = sqrt(x)')\n", 391 | "\n", 392 | "plt.xlabel('x')\n", 393 | "plt.ylabel('y')\n", 394 | "plt.legend()\n", 395 | "\n" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [ 404 | "# press Shift-Tab to see information about a function\n", 405 | "norm = np.random.randn(200)\n", 406 | "\n", 407 | "n, bins, patches = plt.hist(norm, density=True, bins=20)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": null, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [] 416 | } 417 | ], 418 | "metadata": { 419 | "anaconda-cloud": {}, 420 | "kernelspec": { 421 | "display_name": "Python 3", 422 | "language": "python", 423 | "name": "python3" 424 | }, 425 | "language_info": { 426 | "codemirror_mode": { 427 | "name": "ipython", 428 | "version": 3 429 | }, 430 | "file_extension": ".py", 431 | "mimetype": "text/x-python", 432 | "name": "python", 433 | "nbconvert_exporter": "python", 434 | "pygments_lexer": "ipython3", 435 | "version": "3.6.10" 436 | } 437 | }, 438 | "nbformat": 4, 439 | "nbformat_minor": 1 440 | } 441 | -------------------------------------------------------------------------------- /network_components.svg: -------------------------------------------------------------------------------- 1 | 2 | 13 | 15 | 17 | 21 | 25 | 26 | 32 | 37 | 38 | 44 | 49 | 50 | 58 | 64 | 69 | 70 | 71 | 73 | 74 | 76 | image/svg+xml 77 | 79 | 80 | 81 | 82 | 83 | 86 | 90 | 94 | 97 | 103 | 110 | 117 | 124 | 125 | 128 | 135 | 142 | 149 | 156 | 163 | 170 | 177 | 178 | 185 | 192 | 199 | 206 | 213 | 220 | 227 | 230 | 234 | 238 | 242 | 246 | 250 | 254 | 258 | 262 | 266 | 270 | 274 | 278 | 282 | 286 | 287 | 297 | WCGC 307 | SCGC 317 | 322 | 326 | 327 | 328 | 
-------------------------------------------------------------------------------- /03_Introduction_To_Supervised_Machine_Learning.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "In this section we will see the basics of supervised machine learning with a logistic regression classifier. We will see a simple example and see how to evaluate the performance of a binary classifier and avoid over-fitting.\n", 8 | "# Supervised machine learning\n", 9 | "\n", 10 | "This section is partially inspired by the following Reference: https://see.stanford.edu/materials/aimlcs229/cs229-notes1.pdf\n", 11 | "\n", 12 | "Supervised learning consists of inferring a function from a labeled training set. On the other hand, unsupervised learning is a machine learning technique used when the input data is not labeled. Clustering is a example of unsupervised learning. \n", 13 | "\n", 14 | "For supervised learning, we define:\n", 15 | "\n", 16 | "- The **features** (input variables) $x^{(i)}\\in \\mathbb{X}$ \n", 17 | "- The **target** (output we are trying to predict) $y^{(i)} \\in \\mathbb{Y}$\n", 18 | "\n", 19 | "A pair $(x^{(i)},y^{(i)})$ is a **training example**.\n", 20 | "\n", 21 | "The set $\\{(x^{(i)},y^{(i)}); i = 1,...,m\\}$ is the **training set**:\n", 22 | "\n", 23 | "The goal of supervised learning is to learn a function $h: \\mathbb{X}\\mapsto\\mathbb{Y}$, called the hypothesis, so that $h(x)$ is a good \n", 24 | "predictor of the corresponding $y$.\n", 25 | "\n", 26 | "- **Regression** correspond to the case where $y$ is a continuous variable\n", 27 | "- **Classification** correspond to the case where $y$ can only take a small number of discrete values\n", 28 | "\n", 29 | "Examples: \n", 30 | "- Univariate Linear Regression: $h_w(x) = w_0+w_1x$, with $\\mathbb{X} = \\mathbb{Y} = \\mathbb{R}$\n", 31 | "- Multivariate Linear Regression: $$h_w(x) = w_0+w_1x_1 + ... + w_nx_n = \\sum_{i=0}^{n}w_ix_i = w^Tx,$$\n", 32 | "with $\\mathbb{Y} = \\mathbb{R}$ and $\\mathbb{X} = \\mathbb{R^n}$.\n", 33 | "Here $w_0$ is the intercept with the convention that $x_0=1$ to simplify notation.\n", 34 | "\n", 35 | "\n", 36 | "\n", 37 | "## Binary Classification with Logistic Regression\n", 38 | "\n", 39 | "- $y$ can take only two values, 0 or 1. For example, if $y$ is the sentiment associated with the tweet,\n", 40 | "$y=1$ if the tweet is \"positive\" and $y=0$ is the tweet is \"negative\".\n", 41 | "\n", 42 | "- $x^{(i)}$ represents the features of a tweet. 
For example the presence or absence of certain words.\n", 43 | "\n", 44 | "- $y^{(i)}$ is the **label** of the training example represented by $x^{(i)}$.\n", 45 | "\n", 46 | "\n", 47 | "Since $y\\in\\{0,1\\}$ we want to limit $h_w(x)$ between $[0,1]$.\n", 48 | "\n", 49 | "The **Logistic regression** consists of choosing $h_w(x)$ as\n", 50 | "\n", 51 | "$$\n", 52 | "h_w(x) = \\frac{1}{1+e^{-w^Tx}}\n", 53 | "$$\n", 54 | "\n", 55 | "where $w^Tx = \\sum_{i=0}^{n}w_ix_i$ and $h_w(x) = g(w^Tx)$ with\n", 56 | "\n", 57 | "$$\n", 58 | "g(x)=\\frac{1}{1+e^{-x}}.\n", 59 | "$$\n", 60 | "\n", 61 | "$g(x)$ is the **logistic function** or **sigmoid function**\n" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "import numpy as np\n", 71 | "import matplotlib.pyplot as plt\n", 72 | "%matplotlib inline\n", 73 | "\n", 74 | "x = np.linspace(-10,10)\n", 75 | "y = 1/(1+np.exp(-x))\n", 76 | "\n", 77 | "p = plt.plot(x,y)\n", 78 | "plt.grid(True)" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "- $g(x)\\rightarrow 1$ for $x\\rightarrow\\infty$\n", 86 | "- $g(x)\\rightarrow 0$ for $x\\rightarrow -\\infty$\n", 87 | "- $g(0) = 1/2$\n", 88 | "\n", 89 | "Finally, to go from the regression to the classification, we can simply apply the following condition:\n", 90 | "\n", 91 | "$$\n", 92 | "y=\\left\\{\n", 93 | " \\begin{array}{@{}ll@{}}\n", 94 | " 1, & \\text{if}\\ h_w(x)>=1/2 \\\\\n", 95 | " 0, & \\text{otherwise}\n", 96 | " \\end{array}\\right.\n", 97 | "$$\n", 98 | "\n", 99 | "Let's clarify the notation. We have **$m$ training samples** and **$n$ features**, our training examples can be represented by a **$m$-by-$n$ matrix** $\\underline{\\underline{X}}=(x_{ij})$ ($m$-by-$n+1$, if we include the intercept term) that contains the training examples, $x^{(i)}$, in its rows.\n", 100 | "\n", 101 | "The target values of the training set can be represented as a $m$-dimensional vector $\\underline{y}$ and the parameters \n", 102 | "of our model as\n", 103 | "a $n$-dimensional vector $\\underline{w}$ ($n+1$ if we take into account the intercept).\n", 104 | "\n", 105 | "Now, for a given training example $x^{(i)}$, the function that we want to learn (or fit) can be written:\n", 106 | "\n", 107 | "$$\n", 108 | "h_\\underline{w}(x^{(i)}) = \\frac{1}{1+e^{-\\sum_{j=0}^n w_j x_{ij}}}\n", 109 | "$$\n" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "# Simple example:\n", 119 | "# we have 20 students that took an exam and we want to know if we can use \n", 120 | "# the number of hours they studied to predict if they pass or fail the\n", 121 | "# exam\n", 122 | "\n", 123 | "# m = 20 training samples \n", 124 | "# n = 1 feature (number of hours)\n", 125 | "\n", 126 | "X = np.array([0.50, 0.75, 1.00, 1.25, 1.50, 1.75, 1.75, 2.00, 2.25, 2.50,\n", 127 | " 2.75, 3.00, 3.25, 3.50, 4.00, 4.25, 4.50, 4.75, 5.00, 5.50])\n", 128 | "# 1 = pass, 0 = fail\n", 129 | "y = np.array([0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1])\n", 130 | "\n", 131 | "print(X.shape)\n", 132 | "\n", 133 | "print(y.shape)\n", 134 | "\n", 135 | "p = plt.plot(X,y,'o')\n", 136 | "tx = plt.xlabel('x [h]')\n", 137 | "ty = plt.ylabel('y ')\n", 138 | "\n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "### Likelihood of the model\n", 146 | "\n", 147 | "How to find the 
parameters, also called *weights*, $\\underline{w}$ that best fit our training data?\n", 148 | "We want to find the weights $\\underline{w}$ that maximize the likelihood of observing the target $\\underline{y}$ given the observed features $\\underline{\\underline{X}}$.\n", 149 | "We need a probabilistic model that gives us the probability of observing the value $y^{(i)}$ given the features $x^{(i)}$.\n", 150 | "\n", 151 | "The function $h_\\underline{w}(x^{(i)})$ can be used precisely for that:\n", 152 | "\n", 153 | "$$\n", 154 | "P(y^{(i)}=1|x^{(i)};\\underline{w}) = h_\\underline{w}(x^{(i)})\n", 155 | "$$\n", 156 | "\n", 157 | "$$\n", 158 | "P(y^{(i)}=0|x^{(i)};\\underline{w}) = 1 - h_\\underline{w}(x^{(i)})\n", 159 | "$$\n", 160 | "\n", 161 | "\n", 162 | "we can write it more compactly as:\n", 163 | "\n", 164 | "$$\n", 165 | "P(y^{(i)}|x^{(i)};\\underline{w}) = (h_\\underline{w}(x^{(i)}))^{y^{(i)}} ( 1 - h_\\underline{w}(x^{(i)}))^{1-y^{(i)}}\n", 166 | "$$\n", 167 | "where $y^{(i)}\\in\\{0,1\\}$\n", 168 | "\n", 169 | "\n", 170 | "We see that $y^{(i)}$ is a random variable following a Bernouilli distribution with expectation $h_\\underline{w}(x^{(i)})$.\n", 171 | "\n", 172 | "\n", 173 | "\n", 174 | "The **Likelihood function** of a statistical model is defined as:\n", 175 | "$$\n", 176 | "\\mathcal{L}(\\underline{w}) = \\mathcal{L}(\\underline{w};\\underline{\\underline{X}},\\underline{y}) = P(\\underline{y}|\\underline{\\underline{X}};\\underline{w}).\n", 177 | "$$\n", 178 | "\n", 179 | "The likelihood takes into account all the $m$ training samples of our training dataset and estimates the likelihood \n", 180 | "of observing $\\underline{y}$ given $\\underline{\\underline{X}}$ and $\\underline{w}$. Assuming that the $m$ training examples were generated independently, we can write:\n", 181 | "\n", 182 | "$$\n", 183 | "\\mathcal{L}(\\underline{w}) = P(\\underline{y}|\\underline{\\underline{X}};\\underline{w}) = \\prod_{i=1}^m P(y^{(i)}|x^{(i)};\\underline{w}) = \\prod_{i=1}^m (h_\\underline{w}(x^{(i)}))^{y^{(i)}} ( 1 - h_\\underline{w}(x^{(i)}))^{1-y^{(i)}}.\n", 184 | "$$\n", 185 | "\n", 186 | "This is the function that we want to maximize. 
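Directly maximizing this product of many numbers between 0 and 1 is numerically inconvenient: the product quickly becomes extremely small and is awkward to differentiate.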
It is usually much simpler to maximize the logarithm of this function, which is equivalent.\n", 187 | "\n", 188 | "$$\n", 189 | "l(\\underline{w}) = \\log\\mathcal{L}(\\underline{w}) = \\sum_{i=1}^{m} \\left(y^{(i)} \\log h_\\underline{w}(x^{(i)}) + (1- y^{(i)})\\log\\left(1- h_\\underline{w}(x^{(i)})\\right) \\right)\n", 190 | "$$\n", 191 | "\n", 192 | "### Loss function and linear models\n", 193 | "\n", 194 | "An other way of formulating this problem is by defining a Loss function $L\\left(y^{(i)}, f(x^{(i)})\\right)$ such that:\n", 195 | "\n", 196 | "$$\n", 197 | "\\sum_{i=1}^{m} L\\left(y^{(i)}, f(x^{(i)})\\right) = - l(\\underline{w}).\n", 198 | "$$\n", 199 | "\n", 200 | "And now the problem consists of minimizing $\\sum_{i=1}^{m} L\\left(y^{(i)}, f(x^{(i)})\\right)$ over all the possible values of $\\underline{w}$.\n", 201 | "\n", 202 | "Using the definition of $h_\\underline{w}(x^{(i)})$ you can show that $L$ can be written as:\n", 203 | "$$\n", 204 | "L\\left(y^{(i)}=1, f(x^{(i)})\\right) = \\log_2\\left(1+e^{-f(x^{(i)})}\\right)\n", 205 | "$$\n", 206 | "and\n", 207 | "$$\n", 208 | "L\\left(y^{(i)}=0, f(x^{(i)})\\right) = \\log_2\\left(1+e^{-f(x^{(i)})}\\right) - \\log_2\\left(e^{-f(x^{(i)})}\\right)\n", 209 | "$$\n", 210 | "\n", 211 | "where $f(x^{(i)}) = \\sum_{j=0}^n w_j x_{ij}$ is called the **decision function**.\n" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": { 218 | "scrolled": false 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "import numpy as np\n", 223 | "import matplotlib.pyplot as plt\n", 224 | "%matplotlib inline\n", 225 | "\n", 226 | "fx = np.linspace(-5,5)\n", 227 | "Ly1 = np.log2(1+np.exp(-fx))\n", 228 | "Ly0 = np.log2(1+np.exp(-fx)) - np.log2(np.exp(-fx))\n", 229 | "\n", 230 | "p = plt.plot(fx,Ly1,label='L(1,f(x))')\n", 231 | "p = plt.plot(fx,Ly0,label='L(0,f(x))')\n", 232 | "tx = plt.xlabel('f(x)')\n", 233 | "ty = plt.ylabel('L')\n", 234 | "l = plt.legend()" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "# coming back to our simple example\n", 244 | "\n", 245 | "def Loss(x_i,y_i, w0, w1):\n", 246 | " fx = w0 + x_i*w1\n", 247 | " \n", 248 | " if y_i == 1:\n", 249 | " return np.log2(1+np.exp(-fx))\n", 250 | " if y_i == 0:\n", 251 | " return np.log2(1+np.exp(-fx)) - np.log2(np.exp(-fx))\n", 252 | " else:\n", 253 | " raise Exception('y_i must be 0 or 1')\n", 254 | " \n", 255 | "def sumLoss(x,y, w0, w1):\n", 256 | " sumloss = 0\n", 257 | " for x_i, y_i in zip(x,y):\n", 258 | " sumloss += Loss(x_i,y_i, w0, w1)\n", 259 | " return sumloss\n", 260 | " \n", 261 | "\n", 262 | "# lets compute the loss function for several values\n", 263 | "w0s = np.linspace(-10,20,100)\n", 264 | "w1s = np.linspace(-10,20,100)\n", 265 | "\n", 266 | "sumLoss_vals = np.zeros((w0s.size, w1s.size))\n", 267 | "for k, w0 in enumerate(w0s):\n", 268 | " for l, w1 in enumerate(w1s):\n", 269 | " sumLoss_vals[k,l] = sumLoss(X,y,w0,w1)\n", 270 | " \n" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "# let's find the values of w0 and w1 that minimize the loss\n", 280 | "ind0, ind1 = np.where(sumLoss_vals == sumLoss_vals.min())\n", 281 | "\n", 282 | "print('position of the minimum:', w0s[ind0], w1s[ind1])\n", 283 | "\n", 284 | "# plot the loss function\n", 285 | "p = plt.pcolor(w0s, w1s, sumLoss_vals, shading='auto')\n", 286 | "c = 
plt.colorbar()\n", 287 | "\n", 288 | "p2 = plt.plot(w1s[ind1], w0s[ind0], 'ro')\n", 289 | "\n", 290 | "tx = plt.xlabel('w1')\n", 291 | "ty = plt.ylabel('w0')\n", 292 | "\n", 293 | "\n" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "Here we found the minimum of the loss function simply by computing it over a large range of values. In practice, this approach is not possible when the dimensionality of the loss function (number of weights) is very large. To find the minimum of the loss function, the gradient descent algorithm (or [stochastic gradient descent](http://scikit-learn.org/stable/modules/sgd.html)) is often used." 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": { 307 | "scrolled": true 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "# plot the solution\n", 312 | "\n", 313 | "x = np.linspace(0,6,100)\n", 314 | "\n", 315 | "def h_w(x, w0=w0s[ind0], w1=w1s[ind1]):\n", 316 | " return 1/(1+np.exp(-(w0+x*w1)))\n", 317 | "\n", 318 | "p1 = plt.plot(x, h_w(x))\n", 319 | "p2 = plt.plot(X,y,'ro')\n", 320 | "tx = plt.xlabel('x [h]')\n", 321 | "ty = plt.ylabel('y ')\n" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "# probability of passing the exam if you worked 5 hours:\n", 331 | "print(h_w(5))" 332 | ] 333 | }, 334 | { 335 | "cell_type": "markdown", 336 | "metadata": {}, 337 | "source": [ 338 | "We will use the package sci-kit learn (http://scikit-learn.org/) that provide access to many tools for machine learning, data mining and data analysis." 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "# The same thing using the sklearn module\n", 348 | "from sklearn.linear_model import LogisticRegression\n", 349 | "\n", 350 | "model = LogisticRegression(C=1e10)\n", 351 | "\n", 352 | "# to train our model we use the \"fit\" method\n", 353 | "# we have to reshape X because we have only one feature here\n", 354 | "model.fit(X.reshape(-1,1),y)\n", 355 | "\n", 356 | "# to see the weights\n", 357 | "print('w1 =', model.coef_)\n", 358 | "print('w0 =', model.intercept_)\n", 359 | "\n", 360 | "# use the trained model to predict new values\n", 361 | "print('prediction probabilities:', model.predict_proba(np.array(5).reshape(-1,1)))\n", 362 | "print('predicted label:', model.predict((np.array(5).reshape(-1,1))))" 363 | ] 364 | }, 365 | { 366 | "cell_type": "markdown", 367 | "metadata": {}, 368 | "source": [ 369 | "Note that although the loss function is not linear, the decision function is a **linear function of the weights and features**. This is why the Logistic regression is called a **linear model**.\n", 370 | "\n", 371 | "Other linear models are defined by different loss functions. 
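In the expressions below the labels are encoded as $y^{(i)}\in\{-1,1\}$ instead of $\{0,1\}$, so that the product $y^{(i)}\cdot f(x^{(i)})$ is positive when the prediction agrees with the label and negative otherwise.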
For example:\n", 372 | "- Perceptron: $L \\left(y^{(i)}, f(x^{(i)})\\right) = \\max(0, -y^{(i)}\\cdot f(x^{(i)}))$\n", 373 | "- Hinge-loss (soft-margin Support vector machine (SVM) classification): $L \\left(y^{(i)}, f(x^{(i)})\\right) = \\max(0, 1-y^{(i)}\\cdot f(x^{(i)}))$\n", 374 | "\n", 375 | "See http://scikit-learn.org/stable/modules/sgd.html for more examples.\n" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "import numpy as np\n", 385 | "import matplotlib.pyplot as plt\n", 386 | "%matplotlib inline\n", 387 | "\n", 388 | "fx = np.linspace(-5,5, 200)\n", 389 | "Logit = np.log2(1+np.exp(-fx))\n", 390 | "Percep = np.maximum(0,- fx) \n", 391 | "Hinge = np.maximum(0, 1- fx)\n", 392 | "ZeroOne = np.ones(fx.size)\n", 393 | "ZeroOne[fx>=0] = 0\n", 394 | "\n", 395 | "p = plt.plot(fx,Logit,label='Logistic Regression')\n", 396 | "p = plt.plot(fx,Percep,label='Perceptron')\n", 397 | "p = plt.plot(fx,Hinge,label='Hinge-loss')\n", 398 | "p = plt.plot(fx,ZeroOne,label='Zero-One loss')\n", 399 | "plt.xlabel('f(x)')\n", 400 | "plt.ylabel('L')\n", 401 | "plt.legend()\n", 402 | "ylims = plt.ylim((0,7))" 403 | ] 404 | }, 405 | { 406 | "cell_type": "markdown", 407 | "metadata": {}, 408 | "source": [ 409 | "### Evaluating the performance of a binary classifier\n", 410 | "\n", 411 | "The confusion matrix allows to visualize the performance of a classifier:\n", 412 | "\n", 413 | "| | Predicted negative | Predicted positive |\n", 414 | "| --- |:---:|:---:|\n", 415 | "| real negative | TN | FP |\n", 416 | "| real positive | FN | TP | \n", 417 | "\n", 418 | "For each prediction $y_p$, we put it in one of the four categories based on the true value of $y$:\n", 419 | "- TP = True Positive\n", 420 | "- FP = False Positive\n", 421 | "- TN = True Negative\n", 422 | "- FN = False Negative\n", 423 | "\n", 424 | "We can then evalute several measures, for example:\n", 425 | "\n", 426 | "#### Accuracy:\n", 427 | "\n", 428 | "$\\text{Accuracy}=\\frac{TP+TN}{TP+TN+FP+FN}$\n", 429 | "\n", 430 | "Accuracy is the proportion of true results (both true positives and true negatives) among the total number of cases examined. However, accuracy is not necessarily a good measure of the predictive power of a model. See the example below:\n", 431 | "\n", 432 | "#### Accuracy paradox:\n", 433 | "A classifier with these results:\n", 434 | "\n", 435 | "| \t|Predicted Negative | \tPredicted Positive|\n", 436 | "| --- |---|---|\n", 437 | "|Negative Cases \t|9,700 |\t150|\n", 438 | "|Positive Cases \t|50 \t|100|\n", 439 | "\n", 440 | "has an accuracy = 98%.\n", 441 | "\n", 442 | "Now consider the results of a classifier that systematically predict a negative result independently of the input:\n", 443 | "\n", 444 | "| |Predicted Negative| \tPredicted Positive|\n", 445 | "|---|---|---|\n", 446 | "|Negative Cases| \t9,850 | \t0|\n", 447 | "|Positive Cases| \t150 |0 |\n", 448 | "\n", 449 | "The accuracy of this classifier is 98.5% while it is clearly useless. Here the less accurate model is more useful than the more accurate one. This is why accuracy should not be used (alone) to evaluate the performance of a classifier. 
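As a quick check of these numbers, note that both tables contain 10,000 cases in total, so the first classifier reaches $\text{Accuracy}=\frac{9700+100}{10000}=98\%$ while the always-negative classifier reaches $\frac{9850+0}{10000}=98.5\%$.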
\n", 450 | "Precision and Recall are usually prefered:\n", 451 | "\n", 452 | "#### Precision:\n", 453 | "\n", 454 | "$\\text{Precision}=\\frac{TP}{TP+FP}$\n", 455 | "\n", 456 | "Precision measures the fraction of correct positive or the lack of false positive.\n", 457 | "It answers the question: \"Given a positive prediction from the classifier, how likely is it to be correct ?\"\n", 458 | "\n", 459 | "#### Recall:\n", 460 | "\n", 461 | "$\\text{Recall}=\\frac{TP}{TP+FN}$\n", 462 | "\n", 463 | "Recall measures the proportion of positives that are correctly identified as such or the lack of false negative.\n", 464 | "It answers the question: \"Given a positive example, will the classifier detect it ?\"\n", 465 | "\n", 466 | "#### $F_1$ score:\n", 467 | "\n", 468 | "In order to account for the precision and recall of a classifier, $F_1$ score takes the harmonic mean of both measures:\n", 469 | "\n", 470 | "$F_1 = 2 \\cdot \\frac{\\mathrm{precision} \\cdot \\mathrm{recall}}{ \\mathrm{precision} + \\mathrm{recall}} = 2 \\frac{TP}{2TP +FP+FN}$" 471 | ] 472 | }, 473 | { 474 | "cell_type": "markdown", 475 | "metadata": {}, 476 | "source": [ 477 | "When evaluating the performance of a classifier it is important to test is on a different set of values than then set we used to train it. Indeed, we want to know how the classifier performs on new data not on the training data. For this purpose we separate the training set in two: a part that we use to train the model and a part that we use to test it. This method is called **cross-validation**. Usually, we split the training set in N parts (typically 3 or 10), train the model on N-1 parts and test it on the remaining part. We then repeat this procedure with all the combination of training and testing parts and average the performance metrics from each tests. Sci-kit learn allows to easily perform cross-validation: http://scikit-learn.org/stable/modules/cross_validation.html" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": { 483 | "collapsed": true 484 | }, 485 | "source": [ 486 | "### Regularization and over-fitting\n", 487 | "Overfitting happens when your model is too complicated to generalise for new data. When your model fits your data perfectly, it is unlikely to fit new data well.\n", 488 | "\n", 489 | "\n", 490 | "\n", 491 | "The model in green is over-fitted. It performs very well on the training set, but it does not generalize well to new data compared to the model in black.\n", 492 | "\n", 493 | "To avoid over-fitting, it is important to have a large training set and to use cross-validation to evaluate the performance of a model. 
Additionally, **regularization** is used to make the model less \"complex\" and more general.\n", 494 | "\n", 495 | "Regularization consists in adding a term $R(\\underline{w})$, that penalizes too \"complex\" models, to the loss function, so that the training error that we want to minimize is:\n", 496 | "\n", 497 | "$E(\\underline{w}) = \\sum_{i=1}^{m} L\\left(y^{(i)}, f(x^{(i)})\\right) + \\lambda R(\\underline{w})$,\n", 498 | "\n", 499 | "where $\\lambda$ is a parameter that controls the strength of the regularization.\n", 500 | "\n", 501 | "Usual choices for $R(\\underline{w})$ are:\n", 502 | "- L2 norm of the weights: $R(\\underline{w}) := \\frac{1}{2} \\sum_{i=1}^{n} w_j^2$, which forces small weights in the solution,\n", 503 | "- L1 norm of the weights: $R(\\underline{w}) := \\sum_{i=1}^{n} |w_j|$, (also refered as Lasso) which leads to sparse solutions (with several zero weights).\n", 504 | "\n", 505 | "The choice of the regularization and of the its strength are usually done by selecting the best choice during the cross-validation." 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [ 514 | "# for example\n", 515 | "from sklearn.model_selection import cross_val_predict\n", 516 | "from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix\n", 517 | "\n", 518 | "# logistic regression with L2 regularization, C controls the strength of the regularization\n", 519 | "# C = 1/lambda\n", 520 | "model = LogisticRegression(C=1, penalty='l2')\n", 521 | "\n", 522 | "# cross validation using 10 folds\n", 523 | "y_pred = cross_val_predict(model, X.reshape(-1,1), y=y, cv=10)\n", 524 | "\n", 525 | "print(confusion_matrix(y,y_pred))\n", 526 | "\n", 527 | "\n", 528 | "print('Accuracy = ' + str(accuracy_score(y, y_pred)))\n", 529 | "print('Precision = ' + str(precision_score(y, y_pred)))\n", 530 | "print('Recall = ' + str(precision_score(y, y_pred)))\n", 531 | "print('F_1 = ' + str(f1_score(y, y_pred)))\n", 532 | "\n", 533 | "# try to run it with different number of folds for the cross-validation \n", 534 | "# and different values of the regularization strength\n", 535 | "\n" 536 | ] 537 | } 538 | ], 539 | "metadata": { 540 | "anaconda-cloud": {}, 541 | "kernelspec": { 542 | "display_name": "Python 3", 543 | "language": "python", 544 | "name": "python3" 545 | }, 546 | "language_info": { 547 | "codemirror_mode": { 548 | "name": "ipython", 549 | "version": 3 550 | }, 551 | "file_extension": ".py", 552 | "mimetype": "text/x-python", 553 | "name": "python", 554 | "nbconvert_exporter": "python", 555 | "pygments_lexer": "ipython3", 556 | "version": "3.6.10" 557 | } 558 | }, 559 | "nbformat": 4, 560 | "nbformat_minor": 1 561 | } 562 | -------------------------------------------------------------------------------- /04_Twitter_Sentiment_Analysis.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": true 7 | }, 8 | "source": [ 9 | "## Tweet sentiment analysis\n", 10 | "\n", 11 | "In this section we will see how to extract features from tweets and use a classifier to classify the tweet as positive or negative.\n", 12 | "\n", 13 | "We will use a pandas DataFrames (http://pandas.pydata.org/) to store tweets and process them.\n", 14 | "Pandas DataFrames are very powerful python data-structures, like excel spreadsheets with the power of python.\n" 15 | ] 16 | 
}, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# Let's create a DataFrame with each tweet using pandas\n", 24 | "import pandas as pd\n", 25 | "import json\n", 26 | "import numpy as np\n", 27 | "\n", 28 | "\n", 29 | "def getTweetID(tweet):\n", 30 | " \"\"\" If properly included, get the ID of the tweet \"\"\"\n", 31 | " return tweet.get('id')\n", 32 | " \n", 33 | "def getUserIDandScreenName(tweet):\n", 34 | " \"\"\" If properly included, get the tweet \n", 35 | " user ID and Screen Name \"\"\"\n", 36 | " user = tweet.get('user')\n", 37 | " if user is not None:\n", 38 | " uid = user.get('id')\n", 39 | " screen_name = user.get('screen_name')\n", 40 | " return uid, screen_name\n", 41 | " else:\n", 42 | " return (None, None)\n", 43 | " \n", 44 | "\n", 45 | " \n", 46 | "filename = 'tweets_covid.txt'\n", 47 | "\n", 48 | "# create a list of dictionaries with the data that interests us\n", 49 | "tweet_data_list = []\n", 50 | "with open(filename, 'r') as fopen:\n", 51 | " # each line correspond to a tweet\n", 52 | " for line in fopen:\n", 53 | " if line != '\\n':\n", 54 | " tweet = json.loads(line.strip('\\n'))\n", 55 | " tweet_id = getTweetID(tweet)\n", 56 | " user_id = getUserIDandScreenName(tweet)[0]\n", 57 | " text = tweet.get('text')\n", 58 | " if tweet_id is not None:\n", 59 | " tweet_data_list.append({'tweet_id' : tweet_id,\n", 60 | " 'user_id' : user_id,\n", 61 | " 'text' : text})\n", 62 | "\n", 63 | "# put everything in a dataframe\n", 64 | "tweet_df = pd.DataFrame.from_dict(tweet_data_list)\n", 65 | "\n" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "print('shape:', tweet_df.shape)\n", 75 | "print('columns:', tweet_df.columns)\n", 76 | "\n", 77 | "#print 5 first element of one of the column\n", 78 | "print(tweet_df.text.iloc[:5])\n", 79 | "# or\n", 80 | "print(tweet_df['text'].iloc[:5])\n" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "#show the first 10 rows\n", 90 | "tweet_df.head(10)" 91 | ] 92 | }, 93 | { 94 | "cell_type": "markdown", 95 | "metadata": {}, 96 | "source": [ 97 | "### Extracting features from the tweets\n", 98 | "\n", 99 | "#### 1) Tokenize the tweet in a list of words\n", 100 | "\n", 101 | "This part uses concepts from [Naltural Langage Processing](https://en.wikipedia.org/wiki/Natural_language_processing).\n", 102 | "We will use a tweet tokenizer I built based on TweetTokenizer from NLTK (http://www.nltk.org/).\n", 103 | "You can see how it works by opening the file TwSentiment.py. The goal is to process any tweets and extract a list of words taking into account usernames, hashtags, urls, emoticons and all the informal text we can find in tweets. We also want to reduce the number of features by doing some transformations such as putting all the words in lower cases." 
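,
"\n",
"\n",
"As a rough illustration (a sketch of typical behaviour rather than the exact output, which depends on the tokenizer options chosen below), a tweet like `@alice this is sooooo COOL :) http://example.com` would be turned into a token list close to `['@USER', 'this', 'is', 'sooo', 'COOL', ':)', 'URL']`: the mention and the url are normalized, the elongated word is shortened, and the all-uppercase word keeps its case.\n"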
104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "from TwSentiment import CustomTweetTokenizer" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "tokenizer = CustomTweetTokenizer(preserve_case=False, # keep Upper cases\n", 122 | " reduce_len=True, # reduce repetition of letter to a maximum of three\n", 123 | " strip_handles=False, # remove usernames (@mentions)\n", 124 | " normalize_usernames=True, # replace all mentions to \"@USER\"\n", 125 | " normalize_urls=True, # replace all urls to \"URL\"\n", 126 | " keep_allupper=True) # keep upercase for words that are all in uppercase" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "# example\n", 136 | "tweet_df.text.iloc[0]" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "tokenizer.tokenize(tweet_df.text.iloc[0])" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "# other examples\n", 155 | "tokenizer.tokenize('Hey! This is SO cooooooooooooooooool! :)')" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "tokenizer.tokenize('Hey! This is so cooooooool! :)')" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "#### 2) Define the features that will represent the tweet\n", 172 | "We will use the occurrence of words and pair of words (bigrams) as features.\n", 173 | "\n", 174 | "This corresponds to a bag-of-words representation (https://en.wikipedia.org/wiki/Bag-of-words_model): we just count each words (or [n-grams](https://en.wikipedia.org/wiki/N-gram)) without taking account their order. For document classification, the frequency of occurence of each words is usually taken as a feature. In the case of tweets, they are so short that we can just count each words once.\n", 175 | "\n", 176 | "Using pair of words allows to capture some of the context in which each words appear. This helps capturing the correct meaning of words." 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": { 183 | "scrolled": true 184 | }, 185 | "outputs": [], 186 | "source": [ 187 | "from TwSentiment import bag_of_words_and_bigrams\n", 188 | "\n", 189 | "# this will return a dictionary of features,\n", 190 | "# we just list the features present in this tweet\n", 191 | "bag_of_words_and_bigrams(tokenizer.tokenize(tweet_df.text.iloc[0]))" 192 | ] 193 | }, 194 | { 195 | "cell_type": "markdown", 196 | "metadata": {}, 197 | "source": [ 198 | "#### Download the logistic regression classifier\n", 199 | "\n", 200 | "https://www.dropbox.com/s/xtbgr7ve6bya7sy/tweet_classifier_pipepline.pickle.gz?dl=1\n", 201 | "\n", 202 | "I trained this classifier on this dataset: http://help.sentiment140.com/for-students/, following the approach from this paper: http://cs.stanford.edu/people/alecmgo/papers/TwitterDistantSupervision09.pdf\n", 203 | "\n", 204 | "This is a set of 14 million tweets with emoticons. 
Tweets containing \"sad\" emoticons (7 million) are considered negative and tweets with \"happy\" emoticons (7 million) are considered positive.\n", 205 | "\n", 206 | "I used a Logistic Regression classifier with L2 regularization that I optimized with a 10 fold cross-validation using $F_1$ score as a metric. Use see how I trained it in the file [train_sentiment_classifier.py](train_sentiment_classifier.py).\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "# the classifier is saved in a \"pickle\" file\n", 216 | "import gzip\n", 217 | "import pickle\n", 218 | "\n", 219 | "with gzip.open('tweet_classifier_pipepline.pickle.gz') as gzfile:\n", 220 | " \n", 221 | " classifier_pipepline = pickle.load(gzfile)\n", 222 | "\n" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": null, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "classifier_dict" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "scrolled": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "# classifier_pipepline contains the classifier and the feature vectorizer\n", 243 | "classifier_pipepline" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "The classifier is in fact contained in a pipeline.\n", 251 | "A sklearn pipeline allows to assemble several transformation of your data (http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "markdown", 256 | "metadata": {}, 257 | "source": [ 258 | "In our case we have two steps: \n", 259 | "\n", 260 | "- Vectorize the textual features (using http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html)\n", 261 | "- Classify the vectorized features (using http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "classifier_pipepline.steps" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "# this the step that will transform a list of textual features to a vector of zeros and ones\n", 280 | "dict_vect = classifier_pipepline.steps[0][1]" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "dict_vect.feature_names_" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "# number of features\n", 299 | "len(dict_vect.feature_names_)" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": {}, 306 | "outputs": [], 307 | "source": [ 308 | "# a little example\n", 309 | "text = 'Hi all, I am very happy today'\n", 310 | "# first tokenize\n", 311 | "tokens = tokenizer.tokenize(text)\n", 312 | "print('tokens:', tokens)\n", 313 | "\n", 314 | "# list features\n", 315 | "features = bag_of_words_and_bigrams(tokens)\n", 316 | "print('features:',features)\n", 317 | "\n", 318 | "# vectorize features\n", 319 | "X = dict_vect.transform(features)\n", 320 | "print('features vector type:', type(X))\n", 321 | "print('features 
vector:', X)\n", 322 | "print('feature vector size:', X.shape)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "# X is a special kind of numpy array. beacause it is extremely sparse\n", 332 | "# it can be encoded to take less space in memory\n", 333 | "# if we want to see it fully, we can use .toarray()\n", 334 | "\n", 335 | "# number of non-zero values in X:\n", 336 | "X.nnz\n" 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "metadata": {}, 342 | "source": [ 343 | "The mapping between the list of features and the vector of zeros and ones is done when you train the pipeline with its `.fit` method." 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": { 349 | "collapsed": true 350 | }, 351 | "source": [ 352 | "### Classifing the tweet\n", 353 | "Now that we have vector representing the presence of features in a tweet, we can apply our logistic regression classifier to compute the probability that a tweet belong to the \"sad\" or \"happy\" category" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "classifier = classifier_pipepline.steps[1][1]" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": null, 368 | "metadata": {}, 369 | "outputs": [], 370 | "source": [ 371 | "classifier" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": null, 377 | "metadata": {}, 378 | "outputs": [], 379 | "source": [ 380 | "# access the weights of the logistic regression\n", 381 | "classifier.coef_" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "# we have as many weights as features\n", 391 | "classifier.coef_.shape" 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "# plus the intrecept \n", 401 | "classifier.intercept_" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "metadata": {}, 408 | "outputs": [], 409 | "source": [ 410 | "# let's check the weight associated with a given feature\n", 411 | "x = dict_vect.transform({('sad'): True})\n", 412 | "_, ind = np.where(x.todense())\n", 413 | "print(classifier.coef_[0,ind])" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [ 422 | "x = dict_vect.transform({('good'): True})\n", 423 | "_, ind = np.where(x.todense())\n", 424 | "print(classifier.coef_[0,ind])" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "x = dict_vect.transform({('not', 'sad'): True})\n", 434 | "_, ind = np.where(x.todense())\n", 435 | "print(classifier.coef_[0,ind])" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "# find the probability for a specific tweet\n", 445 | "classifier.predict_proba(X)" 446 | ] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "metadata": {}, 451 | "source": [ 452 | "Using the sklearn pipeline to group the two last steps:" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": {}, 459 | "outputs": [], 460 | 
"source": [ 461 | "classifier_pipepline.predict_proba(features)" 462 | ] 463 | }, 464 | { 465 | "cell_type": "markdown", 466 | "metadata": {}, 467 | "source": [ 468 | "We see two numbers, the first one is the probability of the tweet being sad, the second one is the probability of the tweet being happy." 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "metadata": {}, 475 | "outputs": [], 476 | "source": [ 477 | "# note that:\n", 478 | "classifier_pipepline.predict_proba(features).sum()" 479 | ] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": {}, 484 | "source": [ 485 | "### Putting it all together:\n", 486 | "\n", 487 | "We will use the class `TweetClassifier` from TwSentiment.py that puts together this process for us:" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "from TwSentiment import TweetClassifier" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "twClassifier = TweetClassifier(classifier_pipepline,\n", 506 | " tokenizer=tokenizer,\n", 507 | " feature_extractor=bag_of_words_and_bigrams)" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [ 516 | "# example\n", 517 | "text = 'Hi all, I am very happy today'\n", 518 | "print(twClassifier.classify_text(text))" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "# the classify text method also accepts a list of text as input\n", 528 | "print(twClassifier.classify_text(['great day today!', \"bad day today...\"]))" 529 | ] 530 | }, 531 | { 532 | "cell_type": "code", 533 | "execution_count": null, 534 | "metadata": {}, 535 | "outputs": [], 536 | "source": [ 537 | "# you'll see that if the sentence becomes more complicated, \n", 538 | "# the classifier is not as accurate\n", 539 | "print(twClassifier.classify_text([\"I am not sad\"]))" 540 | ] 541 | }, 542 | { 543 | "cell_type": "code", 544 | "execution_count": null, 545 | "metadata": {}, 546 | "outputs": [], 547 | "source": [ 548 | "print(twClassifier.classify_text([\"I am not bad\"]))" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "### We can now classify our tweets:" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": null, 561 | "metadata": {}, 562 | "outputs": [], 563 | "source": [ 564 | "emo_clas, prob = twClassifier.classify_text(tweet_df.text.tolist())\n", 565 | "# for retweets, we should use the text of the original tweet, but for this example we'll skip this step." 
566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": null, 571 | "metadata": { 572 | "scrolled": true 573 | }, 574 | "outputs": [], 575 | "source": [ 576 | "# add the result to the dataframe" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": {}, 583 | "outputs": [], 584 | "source": [ 585 | "tweet_df['pos_class'] = (emo_clas == 'pos')\n", 586 | "tweet_df['pos_prob'] = prob[:,1]" 587 | ] 588 | }, 589 | { 590 | "cell_type": "code", 591 | "execution_count": null, 592 | "metadata": {}, 593 | "outputs": [], 594 | "source": [ 595 | "tweet_df.head()" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": null, 601 | "metadata": {}, 602 | "outputs": [], 603 | "source": [ 604 | "# plot the distribution of probability\n", 605 | "import matplotlib.pyplot as plt\n", 606 | "%matplotlib inline\n", 607 | "h = plt.hist(tweet_df.pos_prob, bins=50)\n" 608 | ] 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "metadata": {}, 613 | "source": [ 614 | "We want to classify users based on the class of their tweets.\n", 615 | "Pandas allows to easily group tweets per users using the [groupy](http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.groupby.html) method of DataFrames:" 616 | ] 617 | }, 618 | { 619 | "cell_type": "code", 620 | "execution_count": null, 621 | "metadata": {}, 622 | "outputs": [], 623 | "source": [ 624 | "user_group = tweet_df.groupby('user_id')" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": null, 630 | "metadata": {}, 631 | "outputs": [], 632 | "source": [ 633 | "print(type(user_group))" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": null, 639 | "metadata": {}, 640 | "outputs": [], 641 | "source": [ 642 | "# let's look at one of the group\n", 643 | "groups = user_group.groups\n", 644 | "uid = list(groups.keys())[1]\n", 645 | "user_group.get_group(uid)" 646 | ] 647 | }, 648 | { 649 | "cell_type": "code", 650 | "execution_count": null, 651 | "metadata": {}, 652 | "outputs": [], 653 | "source": [ 654 | "# we need to make a function that takes the dataframe of tweets grouped by users and return the class of the users\n", 655 | "def get_user_emo(group):\n", 656 | " num_pos = group.pos_class.sum()\n", 657 | " num_tweets = group.pos_class.size\n", 658 | " if num_pos/num_tweets > 0.5:\n", 659 | " return 'pos'\n", 660 | " elif num_pos/num_tweets < 0.5:\n", 661 | " return 'neg'\n", 662 | " else:\n", 663 | " return 'NA'" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": null, 669 | "metadata": {}, 670 | "outputs": [], 671 | "source": [ 672 | "# apply the function to each group\n", 673 | "user_df = user_group.apply(get_user_emo)" 674 | ] 675 | }, 676 | { 677 | "cell_type": "code", 678 | "execution_count": null, 679 | "metadata": {}, 680 | "outputs": [], 681 | "source": [ 682 | "# This is a pandas Series where the index are the user_id\n", 683 | "user_df.head(10)" 684 | ] 685 | }, 686 | { 687 | "cell_type": "markdown", 688 | "metadata": {}, 689 | "source": [ 690 | "### Let's add this information to the graph we created earlier" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": null, 696 | "metadata": {}, 697 | "outputs": [], 698 | "source": [ 699 | "import networkx as nx\n", 700 | "\n", 701 | "G = nx.read_gexf('twitter_lcc.gexf', node_type=int)\n", 702 | "\n", 703 | "for n in G.nodes():\n", 704 | " if n in user_df.index:\n", 705 | " # here we look at the value of the user_df 
series at the position where the index \n", 706 | " # is equal to the user_id of the node\n", 707 | " G.nodes[n]['emotion'] = user_df.loc[user_df.index == n].values[0]\n", 708 | " \n", 709 | "#we can also add the emotion associated with tweets to the edges of the graph\n", 710 | "for u,v, tweet_id in G.edges(data='tweet_id'):\n", 711 | " if tweet_df.tweet_id.isin([tweet_id]).any():\n", 712 | " G[u][v]['pos_class'] = int(tweet_df.loc[tweet_df.tweet_id == tweet_id].pos_class.values[0])\n", 713 | " G[u][v]['pos_prob'] = float(tweet_df.loc[tweet_df.tweet_id == tweet_id].pos_prob.values[0])" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": null, 719 | "metadata": { 720 | "scrolled": false 721 | }, 722 | "outputs": [], 723 | "source": [ 724 | "# we have added an attribute 'emotion' to the nodes\n", 725 | "G.nodes[n]" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": null, 731 | "metadata": {}, 732 | "outputs": [], 733 | "source": [ 734 | "G[u][v]" 735 | ] 736 | }, 737 | { 738 | "cell_type": "code", 739 | "execution_count": null, 740 | "metadata": {}, 741 | "outputs": [], 742 | "source": [ 743 | "# save the graph to open it with Gephi\n", 744 | "nx.write_gexf(G, 'twitter_lcc_emo.gexf')" 745 | ] 746 | }, 747 | { 748 | "cell_type": "markdown", 749 | "metadata": { 750 | "collapsed": true 751 | }, 752 | "source": [ 753 | "We can now open this file with [Gephi](https://gephi.org/) to vizualize it." 754 | ] 755 | }, 756 | { 757 | "cell_type": "markdown", 758 | "metadata": {}, 759 | "source": [ 760 | "Here is an example where the size of nodes is proportional to their in-degree, their color indicate their out-degree (from white to dark green) and the color of edges indicates the probability of the tweet carrying an \"happy\" sentiment (blue = sad, red = happy).\n", 761 | "\n", 762 | "" 763 | ] 764 | }, 765 | { 766 | "cell_type": "markdown", 767 | "metadata": {}, 768 | "source": [ 769 | "A very inclomplete list of references to go further:\n", 770 | "\n", 771 | "- Perkins, J. Python 3 Text Processing With NLTK 3 Cookbook. Python 3 Text Processing With NLTK 3 Cookbook (2014).\n", 772 | "- Hastie, T., Tibshirani, R. & Friedman, J. The Elements of Statistical Learning. Elements 1, (Springer New York, 2009).\n", 773 | "- Serrano-Guerrero, J., Olivas, J. A., Romero, F. P. & Herrera-Viedma, E. Sentiment analysis: A review and comparative analysis of web services. Inf. Sci. (Ny). 311, 18–38 (2015).\n", 774 | "- Go, A., Bhayani, R. & Huang, L. Twitter Sentiment Classification using Distant Supervision. Tech. Rep. 150, 1–6 (2009).\n", 775 | "- O’Connor, B., Balasubramanyan, R., Routledge, B. R. & Smith, N. a. From tweets to polls: Linking text sentiment to public opinion time series. Proc. 4h Int. AAAI Conf. Weblogs Soc. Media 122–129 (2010)-\n", 776 | "- Hannak, A., Anderson, E., Barrett, L. F., Lehmann, S., Mislove, A. & Riedewald, M. Tweetin’ in the Rain: Exploring societal-scale effects of weather on mood. in Proc. of the 6th International AAAI Conference on Weblogs and Social Media 479–482 (2012).\n", 777 | "- Jungherr, A., Schoen, H., Posegga, O. & Ju rgens, P. Digital Trace Data in the Study of Public Opinion: An Indicator of Attention Toward Politics Rather Than Political Support. Soc. Sci. Comput. Rev. 894439316631043 (2016).\n", 778 | "- Gayo-Avello, D. A Meta-Analysis of State-of-the-Art Electoral Prediction From Twitter Data. Soc. Sci. Comput. Rev. 31, 649–679 (2013).\n", 779 | "- Ceron, A., Curini, L. & Iacus, S. M. 
ISA: A fast, scalable and accurate algorithm for sentiment analysis of social media content. Inf. Sci. (Ny). 367–368, 105–124 (2016).\n", 780 | "- Bohannon, J. The pulse of the people. Science (80). 355, 470–472 (2017).\n", 781 | "- Bovet, A., Morone, F. & Makse, H. A. Validation of Twitter opinion trends with national polling aggregates: Hillary Clinton vs Donald Trump. Sci. Rep. 8, 8673 (2018)." 782 | ] 783 | } 784 | ], 785 | "metadata": { 786 | "anaconda-cloud": {}, 787 | "kernelspec": { 788 | "display_name": "Python 3", 789 | "language": "python", 790 | "name": "python3" 791 | }, 792 | "language_info": { 793 | "codemirror_mode": { 794 | "name": "ipython", 795 | "version": 3 796 | }, 797 | "file_extension": ".py", 798 | "mimetype": "text/x-python", 799 | "name": "python", 800 | "nbconvert_exporter": "python", 801 | "pygments_lexer": "ipython3", 802 | "version": "3.6.10" 803 | } 804 | }, 805 | "nbformat": 4, 806 | "nbformat_minor": 1 807 | } 808 | -------------------------------------------------------------------------------- /02_Analysis_of_Twitter_Social_Network.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Analysis of a Twitter Social Network\n", 8 | "\n", 9 | "In this section we are going to parse the tweets we collected and build the social network of interactions between Twitter users. We will also see how to analyze the network using NetworkX. We will look at the different component of the network and at percolation processes on this network.\n", 10 | "\n", 11 | "## Parsing tweets\n", 12 | "\n", 13 | "Tweets are saved in JSON format ([JavaScript Object Notation](https://www.w3schools.com/js/js_json_intro.asp))\n", 14 | "JSON is text, written with JavaScript object notation.\n", 15 | "\n", 16 | "The `json` python module allows to easily import json file into python [Dictonairies](https://docs.python.org/3/tutorial/datastructures.html#dictionaries)\n", 17 | "\n" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "#load tweets \n", 27 | "\n", 28 | "import json\n", 29 | "\n", 30 | "\n", 31 | "filename = 'tweets_covid.txt'\n", 32 | "\n", 33 | "tweet_list = []\n", 34 | "\n", 35 | "with open(filename, 'r') as fopen:\n", 36 | " # each line correspond to a tweet\n", 37 | " for line in fopen:\n", 38 | " tweet_list.append(json.loads(line))\n", 39 | " " 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "Let's look at the informations contained in a tweet" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# take the first tweet of the list\n", 56 | "tweet = tweet_list[0]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# each tweet is a python dictionary\n", 66 | "type(tweet)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# all the 'entries' of the dictionary\n", 76 | "tweet.keys()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "you can find a description of the fields in the Twitter API documentation: https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet" 84 | ] 
85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "#creation time\n", 93 | "tweet['created_at']" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# text of the tweet\n", 103 | "print(tweet['text'])" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# user info\n", 113 | "tweet['user']" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "# user is itslef a dict\n", 123 | "print(type(tweet['user']))\n", 124 | "\n", 125 | "tweet['user']['name']" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "# unique id of the user\n", 135 | "tweet['user']['id']" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "#is the tweet a retweet?\n", 145 | "'retweeted_status' in tweet" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "if 'retweeted_status' in tweet:\n", 155 | " print(tweet['retweeted_status'])\n", 156 | "# the `retweeted_status` is also a tweet dictionary " 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "if 'retweeted_status' in tweet:\n", 166 | " print(tweet['retweeted_status']['text'])" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "# user id and name of the retweeted user?\n", 176 | "if 'retweeted_status' in tweet:\n", 177 | " print(tweet['retweeted_status']['user']['id'])\n", 178 | " print(tweet['retweeted_status']['user']['name'])" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "# is the tweet a reply?\n", 188 | "'in_reply_to_user_id' in tweet and tweet['in_reply_to_user_id'] is not None" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# 'entities' contains the hashtags, urls and usernames used in the tweet\n", 198 | "tweet['entities']" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "# user id of the mentioned users\n", 208 | "for mention in tweet['entities']['user_mentions']:\n", 209 | " print(mention['id'])" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "# is the tweet a quote?\n", 219 | "'quoted_status' in tweet" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "# Building the network of interactions\n", 227 | "\n", 228 | "We will use the python module [`NetworkX`](https://networkx.readthedocs.io/en/stable/index.html) to construct and analyze the social network.\n", 229 | "\n", 230 | "A short introduction to networkx: 
https://networkx.org/documentation/stable/reference/introduction.html\n" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "There are four types of interactions between two users in Twitter:\n", 238 | "- Retweet\n", 239 | "- Quote\n", 240 | "- Reply\n", 241 | "- Mention" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "# let's define some functions to extract the interactions from tweets\n", 251 | "\n", 252 | "def getTweetID(tweet):\n", 253 | " \"\"\" If properly included, get the ID of the tweet \"\"\"\n", 254 | " return tweet.get('id')\n", 255 | " \n", 256 | "def getUserIDandScreenName(tweet):\n", 257 | " \"\"\" If properly included, get the tweet \n", 258 | " user ID and Screen Name \"\"\"\n", 259 | " user = tweet.get('user')\n", 260 | " if user is not None:\n", 261 | " uid = user.get('id')\n", 262 | " screen_name = user.get('screen_name')\n", 263 | " return uid, screen_name\n", 264 | " else:\n", 265 | " return (None, None)\n", 266 | "\n", 267 | "def getRetweetedUserIDandSreenName(tweet):\n", 268 | " \"\"\" If properly included, get the retweet \n", 269 | " source user ID and Screen Name\"\"\"\n", 270 | " \n", 271 | " retweet = tweet.get('retweeted_status')\n", 272 | " if retweet is not None:\n", 273 | " return getUserIDandScreenName(retweet)\n", 274 | " else:\n", 275 | " return (None, None)\n", 276 | " \n", 277 | "def getRepliedUserIDandScreenName(tweet):\n", 278 | " \"\"\" If properly included, get the ID and Screen Name \n", 279 | " of the user the tweet replies to \"\"\"\n", 280 | " \n", 281 | " reply_id = tweet.get('in_reply_to_user_id')\n", 282 | " reply_screenname = tweet.get('in_reply_to_screen_name')\n", 283 | " return reply_id, reply_screenname\n", 284 | " \n", 285 | "def getUserMentionsIDandScreenName(tweet):\n", 286 | " \"\"\" If properly included, return a list of IDs and Screen Names tuple\n", 287 | " of all user mentions, including retweeted and replied users \"\"\"\n", 288 | " \n", 289 | " mentions = []\n", 290 | " entities = tweet.get('entities')\n", 291 | " if entities is not None:\n", 292 | " user_mentions = entities.get('user_mentions')\n", 293 | " for mention in user_mentions:\n", 294 | " mention_id = mention.get('id')\n", 295 | " screen_name = mention.get('screen_name')\n", 296 | " mentions.append((mention_id, screen_name))\n", 297 | " \n", 298 | " return mentions\n", 299 | "\n", 300 | " \n", 301 | "def getQuotedUserIDandScreenName(tweet):\n", 302 | " \"\"\" If properly included, get the ID of the user the tweet is quoting\"\"\"\n", 303 | " \n", 304 | " quoted_status = tweet.get('quoted_status')\n", 305 | " \n", 306 | " if quoted_status is not None:\n", 307 | " return getUserIDandScreenName(quoted_status)\n", 308 | " else:\n", 309 | " return (None, None)\n", 310 | " \n", 311 | "def getAllInteractions(tweet):\n", 312 | " \"\"\" Get all the interactions from this tweet\n", 313 | " \n", 314 | " returns : (tweeter_id, tweeter_screenname), list of (interacting_id, interacting_screenname)\n", 315 | " \"\"\"\n", 316 | " \n", 317 | " # Get the tweeter\n", 318 | " tweeter = getUserIDandScreenName(tweet)\n", 319 | " \n", 320 | " # Nothing to do if we couldn't get the tweeter\n", 321 | " if tweeter[0] is None:\n", 322 | " return (None, None), []\n", 323 | " \n", 324 | " # a python set is a collection of unique items\n", 325 | " # we use a set to avoid duplicated ids\n", 326 | " interacting_users = set()\n", 327 | " \n", 328 
| " # Add person they're replying to\n", 329 | " interacting_users.add(getRepliedUserIDandScreenName(tweet))\n", 330 | " \n", 331 | " # Add person they retweeted\n", 332 | " interacting_users.add(getRetweetedUserIDandSreenName(tweet))\n", 333 | " \n", 334 | " # Add person they quoted\n", 335 | " interacting_users.add(getQuotedUserIDandScreenName(tweet))\n", 336 | " \n", 337 | " # Add mentions\n", 338 | " interacting_users.update(getUserMentionsIDandScreenName(tweet))\n", 339 | " \n", 340 | " # remove the tweeter if he is in the set\n", 341 | " interacting_users.discard(tweeter)\n", 342 | " # remove the None case\n", 343 | " interacting_users.discard((None,None))\n", 344 | " \n", 345 | " # Return our tweeter and their influencers\n", 346 | " return tweeter, list(interacting_users)\n", 347 | " \n" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "print(getUserIDandScreenName(tweet_list[3]))\n", 357 | "print(getAllInteractions(tweet_list[4]))\n", 358 | "\n", 359 | "tweet_list[100].get('text')" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "#### Let's build the network" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "import networkx as nx\n", 376 | "\n", 377 | "# define an empty Directed Graph\n", 378 | "# A directed graph is a graph where edges have a direction\n", 379 | "# in our case the edges goes from user that sent the tweet to\n", 380 | "# the user with whom they interacted (retweeted, mentioned or quoted)\n", 381 | "G = nx.DiGraph()\n", 382 | "\n", 383 | "# loop over all the tweets and add edges if the tweet include some interactions\n", 384 | "for tweet in tweet_list:\n", 385 | " # find all influencers in the tweet\n", 386 | " tweeter, interactions = getAllInteractions(tweet)\n", 387 | " tweeter_id, tweeter_name = tweeter\n", 388 | " tweet_id = getTweetID(tweet)\n", 389 | " \n", 390 | " # add an edge to the Graph for each influencer\n", 391 | " for interaction in interactions:\n", 392 | " interact_id, interact_name = interaction\n", 393 | " \n", 394 | " # add edges between the two user ids\n", 395 | " # this will create new nodes if the nodes are not already in the network\n", 396 | " # we also add an attribute the to edge equal to the id of the tweet\n", 397 | " G.add_edge(tweeter_id, interact_id, tweet_id=tweet_id)\n", 398 | " \n", 399 | " # add name as a property to each node\n", 400 | " # with networkX each node is a dictionary\n", 401 | " G.nodes[tweeter_id]['name'] = tweeter_name\n", 402 | " G.nodes[interact_id]['name'] = interact_name\n", 403 | " " 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "# The graph's node are contained in a NodeView which has a dict-like interface\n", 413 | "print(type(G.nodes))" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [ 422 | "# the keys are the user_id\n", 423 | "nodelist = list(G.nodes.keys())\n", 424 | "print(nodelist)" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "# each node is itself a dictionary with node attributes as key,value pairs\n", 434 | "print(type(G.nodes[nodelist[0]]))\n", 
435 | "print(G.nodes[nodelist[0]])" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "# edges are contained in a EdgeView with a set-like interface\n", 445 | "print(type(G.edges))\n", 446 | "print(G.edges())" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": {}, 453 | "outputs": [], 454 | "source": [ 455 | "# we can see all the edges going out of this node\n", 456 | "# each edge is a dictionary inside this dictionary with a key \n", 457 | "# corresponding to the target user_id\n", 458 | "e = G.out_edges(nodelist[11], data=True)\n", 459 | "print(nodelist[11])\n", 460 | "print(e)" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [ 469 | "# we can iterate over the out-edges \n", 470 | "for s,t,data in e:\n", 471 | " print(s,t,data)" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "#### Some basic properties of the Network:" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "G.number_of_nodes()" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "G.number_of_edges()" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "# listing all nodes \n", 506 | "nodelist = list(G.nodes())\n", 507 | "\n", 508 | "nodelist[:3]" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "# degree of a node\n", 518 | "print(G.degree(nodelist[2]))\n", 519 | "print(G.in_degree(nodelist[2]))\n", 520 | "print(G.out_degree(nodelist[2]))" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "# dictionary with the degree of all nodes\n", 530 | "all_degrees = [G.degree(n) for n in nodelist] # this is the degree for undirected edges\n", 531 | "in_degrees = [G.in_degree(n) for n in nodelist]\n", 532 | "out_degrees = [G.out_degree(n) for n in nodelist]" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "# average degree\n", 542 | "2*G.number_of_edges()/G.number_of_nodes()" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "import numpy as np\n", 552 | "np.array(all_degrees).mean()" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": null, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "np.array(in_degrees).mean()" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": null, 567 | "metadata": {}, 568 | "outputs": [], 569 | "source": [ 570 | "np.array(out_degrees).mean()" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [ 579 | "# maximum degree\n", 580 | "max(all_degrees)" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": {}, 587 | "outputs": [], 588 
| "source": [ 589 | "# we want to make a list with (user_id, username, degree) for all nodes\n", 590 | "degree_node_list = []\n", 591 | "for node in nodelist:\n", 592 | " degree_node_list.append((node, G.nodes[node]['name'], G.degree(node)))\n", 593 | " \n", 594 | "print('Unordered user, degree list') \n", 595 | "print(degree_node_list[:10])\n", 596 | "\n", 597 | "# sort the list according the degree in descinding order\n", 598 | "degree_node_list = sorted(degree_node_list, key=lambda x:x[2], reverse=True)\n", 599 | "print('Ordered user, degree list') \n", 600 | "print(degree_node_list[:10])" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": null, 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "# we need to import matplolib for making plots\n", 610 | "# and numpy for numerical computations\n", 611 | "import numpy as np\n", 612 | "import matplotlib.pyplot as plt\n", 613 | "%matplotlib inline" 614 | ] 615 | }, 616 | { 617 | "cell_type": "markdown", 618 | "metadata": {}, 619 | "source": [ 620 | "### Network components\n", 621 | "\n", 622 | "For **directed** graphs we can define two types of components:\n", 623 | "- Weakly connected components\n", 624 | "- Strongly connected components\n", 625 | "\n", 626 | "Weakly connected component (WCC): maximal set of nodes where there exists a path in at least one direction between each pair of nodes.\n", 627 | "\n", 628 | "Strongly connected component (SCC): maximal set of nodes where there exists a path in both directions between each pair of nodes.\n", 629 | "\n", 630 | "Weakly connected giant (largest) component (WCGC): Largest WCC\n", 631 | "Strongly connected giant (largest) component (SCGC): Largest SCC\n", 632 | "\n", 633 | "" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": null, 639 | "metadata": {}, 640 | "outputs": [], 641 | "source": [ 642 | "# this returns a list of set of nodes belonging to the \n", 643 | "# different (weakly) connected components\n", 644 | "components = list(nx.weakly_connected_components(G))\n", 645 | "\n", 646 | "# sort the component according to their size\n", 647 | "components = list(sorted(components, key=lambda x:len(x), reverse=True))" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [ 656 | "# make a list with the size of each component\n", 657 | "comp_sizes = []\n", 658 | "for comp in components:\n", 659 | " comp_sizes.append(len(comp))" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "metadata": {}, 666 | "outputs": [], 667 | "source": [ 668 | "# plot the histogram of component sizes\n", 669 | "hist = plt.hist(comp_sizes, bins=100)" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": null, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [ 678 | "# histogram with logarithmic y scale\n", 679 | "hist = plt.hist(comp_sizes, bins=100, log=True)\n", 680 | "tx = plt.xlabel('component size')\n", 681 | "ty = plt.ylabel('number of components')" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": null, 687 | "metadata": {}, 688 | "outputs": [], 689 | "source": [ 690 | "# sizes of the ten largest components\n", 691 | "comp_sizes[:10]" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": null, 697 | "metadata": {}, 698 | "outputs": [], 699 | "source": [ 700 | "# let's make a new graph which is the subgraph of G corresponding to \n", 701 
| "# the largest connected component\n", 702 | "# let's find the largest component\n", 703 | "largest_comp = components[0]\n", 704 | "LCC = G.subgraph(largest_comp)" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": null, 710 | "metadata": {}, 711 | "outputs": [], 712 | "source": [ 713 | "G.number_of_nodes()" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": null, 719 | "metadata": {}, 720 | "outputs": [], 721 | "source": [ 722 | "LCC.number_of_nodes()" 723 | ] 724 | }, 725 | { 726 | "cell_type": "code", 727 | "execution_count": null, 728 | "metadata": {}, 729 | "outputs": [], 730 | "source": [ 731 | "# let's plot the degree distribution inside the LCC\n", 732 | "degrees = [LCC.degree(n) for n in LCC.nodes()]\n", 733 | "degrees" 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": null, 739 | "metadata": {}, 740 | "outputs": [], 741 | "source": [ 742 | "degree_array = np.array(degrees)\n", 743 | "hist = plt.hist(degree_array, bins=100)" 744 | ] 745 | }, 746 | { 747 | "cell_type": "code", 748 | "execution_count": null, 749 | "metadata": {}, 750 | "outputs": [], 751 | "source": [ 752 | "# using logarithmic scales\n", 753 | "hist = plt.hist(degree_array, bins=100, log=True)\n", 754 | "plt.xscale('log')\n" 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": null, 760 | "metadata": {}, 761 | "outputs": [], 762 | "source": [ 763 | "# logarithmic scale with logarithmic bins\n", 764 | "N, bins, patches = plt.hist(degree_array, bins=np.logspace(0,np.log10(degree_array.max()+1), 20), log=True)\n", 765 | "plt.xscale('log')\n", 766 | "tx = plt.xlabel('k - degree')\n", 767 | "ty= plt.ylabel('number of nodes')\n" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": null, 773 | "metadata": {}, 774 | "outputs": [], 775 | "source": [ 776 | "# Degree probability distribution (P(k))\n", 777 | "\n", 778 | "# since we have logarithmic bins, we need to\n", 779 | "# take into account the fact that the bins \n", 780 | "# have different lenghts when normalizing\n", 781 | "bin_lengths = np.diff(bins) # lenght of each bin\n", 782 | "\n", 783 | "summ = np.sum(N*bin_lengths)\n", 784 | "normalized_degree_dist = N/summ\n", 785 | "\n", 786 | "# check normalization:\n", 787 | "print(np.sum(normalized_degree_dist*bin_lengths))\n", 788 | "\n", 789 | "hist = plt.bar(bins[:-1], normalized_degree_dist, width=np.diff(bins))\n", 790 | "plt.xscale('log')\n", 791 | "plt.yscale('log')\n", 792 | "tx = plt.xlabel('k (degree)')\n", 793 | "ty = plt.ylabel('P(k)')" 794 | ] 795 | }, 796 | { 797 | "cell_type": "markdown", 798 | "metadata": {}, 799 | "source": [ 800 | "### Exercise: do the same for the Graph comprising only retweet, replies, quote and mentions" 801 | ] 802 | }, 803 | { 804 | "cell_type": "markdown", 805 | "metadata": {}, 806 | "source": [ 807 | "### Percolation of the Giant Component" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": null, 813 | "metadata": {}, 814 | "outputs": [], 815 | "source": [ 816 | "import random\n", 817 | "\n", 818 | "def getGCsize(G):\n", 819 | " \"\"\" returns the size of the largest component of G\"\"\"\n", 820 | " \n", 821 | " return len(max(nx.connected_components(G), key=len))\n", 822 | " \n" 823 | ] 824 | }, 825 | { 826 | "cell_type": "markdown", 827 | "metadata": {}, 828 | "source": [ 829 | "#### Random Attack:" 830 | ] 831 | }, 832 | { 833 | "cell_type": "code", 834 | "execution_count": null, 835 | "metadata": {}, 836 | "outputs": [], 837 
| "source": [ 838 | "# list that will contain the size of the GC as we remove nodes\n", 839 | "rnd_attack_GC_sizes = []\n", 840 | "\n", 841 | "# we take into account the undirected version of the graph\n", 842 | "LCCundirected = nx.Graph(LCC)\n", 843 | "\n", 844 | "nodes_list = list(LCCundirected.nodes())\n", 845 | "\n", 846 | "\n", 847 | "while len(nodes_list) > 1:\n", 848 | " # add the size of the current GC\n", 849 | " rnd_attack_GC_sizes.append(getGCsize(LCCundirected))\n", 850 | " \n", 851 | " # pick a random node\n", 852 | " rnd_node = random.choice(nodes_list)\n", 853 | " # remove from graph\n", 854 | " LCCundirected.remove_node(rnd_node)\n", 855 | " # remove from node list\n", 856 | " nodes_list.remove(rnd_node)\n" 857 | ] 858 | }, 859 | { 860 | "cell_type": "code", 861 | "execution_count": null, 862 | "metadata": {}, 863 | "outputs": [], 864 | "source": [ 865 | "# convert list to numpy array\n", 866 | "rnd_attack_GC_sizes = np.array(rnd_attack_GC_sizes)\n", 867 | "\n", 868 | "# normalize by the initial size of the GC\n", 869 | "GC_rnd = rnd_attack_GC_sizes/rnd_attack_GC_sizes[0]\n", 870 | "\n", 871 | "# fraction of removed nodes\n", 872 | "q = np.linspace(0,1,num=GC_rnd.size)\n", 873 | "\n", 874 | "plt.plot(q,GC_rnd)\n", 875 | "tx = plt.xlabel('q')\n", 876 | "ty = plt.ylabel('GC')\n" 877 | ] 878 | }, 879 | { 880 | "cell_type": "markdown", 881 | "metadata": {}, 882 | "source": [ 883 | "#### High degree attack:" 884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": null, 889 | "metadata": {}, 890 | "outputs": [], 891 | "source": [ 892 | "# high degree attack\n", 893 | "LCCundirected = nx.Graph(LCC)\n", 894 | "\n", 895 | "# list of pairs (node, degree) sorted according the degree\n", 896 | "node_deg_dict = dict(nx.degree(LCCundirected))\n", 897 | "nodes_sorted = sorted(node_deg_dict, key=node_deg_dict.get)\n", 898 | "\n", 899 | "# list that will contain the size of the GC as we remove nodes\n", 900 | "hd_attack_GC_sizes = []\n", 901 | "\n", 902 | "while len(nodes_sorted) > 1:\n", 903 | " \n", 904 | " hd_attack_GC_sizes.append(getGCsize(LCCundirected))\n", 905 | " \n", 906 | " #remove node according to their degree\n", 907 | " node = nodes_sorted.pop() # pop() removes and returns the last element\n", 908 | " LCCundirected.remove_node(node)\n", 909 | " \n", 910 | " \n", 911 | "\n" 912 | ] 913 | }, 914 | { 915 | "cell_type": "code", 916 | "execution_count": null, 917 | "metadata": {}, 918 | "outputs": [], 919 | "source": [ 920 | "hd_attack_GC_sizes = np.array(hd_attack_GC_sizes)\n", 921 | "GC_hd = hd_attack_GC_sizes/hd_attack_GC_sizes[0]\n", 922 | "q = np.linspace(0,1,num=GC_hd.size)\n", 923 | "\n", 924 | "plt.plot(q,GC_rnd, label='random attack')\n", 925 | "plt.plot(q,GC_hd, label='High-Degree attack')\n", 926 | "tx = plt.xlabel('q')\n", 927 | "ty = plt.ylabel('GC')\n", 928 | "_ = plt.legend()\n" 929 | ] 930 | }, 931 | { 932 | "cell_type": "markdown", 933 | "metadata": {}, 934 | "source": [ 935 | "#### Exercise: implement the High-Degree Adaptative (HDA) attack where at each step the node with the highest degree of the remaining graph is removed." 936 | ] 937 | }, 938 | { 939 | "cell_type": "markdown", 940 | "metadata": {}, 941 | "source": [ 942 | "### PageRank\n", 943 | "\n", 944 | "The *PageRank* centrality modifies the classical random walk by introducing a \"teleportation\" probability, i.e. at each step, the walkers have a given probability to teleport uniformly at random to any other nodes of the network.\n", 945 | "This makes the random walk ergodic, i.e. 
it converges to a stationary distribution, even in directed and disconnected networks.\n", 946 | "\n", 947 | "The update equation for the PageRank probability density is given by \n", 948 | "\n", 949 | "$\\mathbf{p}(n+1) = (1-\\alpha)\\mathbf{p}(n)\\mathbf{D}_\\text{out}^{-1}\\mathbf{A} + \\frac{\\alpha}{N}\\mathbf{1} = \\mathbf{p}(n) \\mathbf{M}$\n", 950 | "\n", 951 | "with\n", 952 | "\n", 953 | "$ \\mathbf{M} = (1-\\alpha)\\mathbf{D}_\\text{out}^{-1}\\mathbf{A} + \\frac{\\alpha}{N}\\mathbf{1}^T\\mathbf{1}$" 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 958 | "execution_count": null, 959 | "metadata": {}, 960 | "outputs": [], 961 | "source": [ 962 | "#teleportation probability\n", 963 | "alpha = 0.15\n", 964 | "\n", 965 | "#adjacency matrix\n", 966 | "nodelist = list(G.nodes())\n", 967 | "A = nx.to_numpy_array(G, nodelist=nodelist)\n", 968 | "\n", 969 | "#diagonal matrix of out degrees\n", 970 | "deg_out_vect = np.array([float(max(G.out_degree(n),1)) for n in nodelist])\n", 971 | "D_out_inv = np.diag(1/deg_out_vect)\n", 972 | "\n", 973 | "# teleportation transition matrix\n", 974 | "N = A.shape[1]\n", 975 | "S = np.ones((N,N))*1/N\n", 976 | "\n", 977 | "# full transition matrix\n", 978 | "M = (1-alpha)*D_out_inv @ A + alpha*S\n", 979 | "\n", 980 | "# for dangling nodes (nodes without out-edges), we force the random teleportation\n", 981 | "dangling_nodes = np.where(A.sum(1) == 0)[0]\n", 982 | "M[dangling_nodes,:] = S[dangling_nodes,:]\n", 983 | "\n", 984 | "#initial walker distribution and 1st iteration\n", 985 | "p_last = np.ones(N)*1/N\n", 986 | "p = np.matmul(p_last, M)\n", 987 | "\n", 988 | "# iterate until sufficient convergence\n", 989 | "eps = 1.0e-8\n", 990 | "i = 1\n", 991 | "while np.linalg.norm(p - p_last, 2) > eps:\n", 992 | " p_last = p\n", 993 | " p = np.matmul(p, M)\n", 994 | " i += 1\n", 995 | "\n", 996 | "print(i)" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "code", 1001 | "execution_count": null, 1002 | "metadata": {}, 1003 | "outputs": [], 1004 | "source": [ 1005 | "pg_ranking = np.array(np.argsort(p)[::-1])\n", 1006 | "\n", 1007 | "pagerank_values = p[pg_ranking]\n", 1008 | "nodes_pagerank = [nodelist[r] for r in pg_ranking]\n", 1009 | "nodes_pagerank[:10]" 1010 | ] 1011 | }, 1012 | { 1013 | "cell_type": "code", 1014 | "execution_count": null, 1015 | "metadata": {}, 1016 | "outputs": [], 1017 | "source": [ 1018 | "names_pagerank = [G.nodes[n]['name'] for n in nodes_pagerank]\n", 1019 | "names_pagerank[:10]" 1020 | ] 1021 | }, 1022 | { 1023 | "cell_type": "code", 1024 | "execution_count": null, 1025 | "metadata": {}, 1026 | "outputs": [], 1027 | "source": [ 1028 | "hist = plt.bar(np.arange(p.shape[0]),np.sort(p)[::-1])\n", 1029 | "ty = plt.ylabel('PageRank value')\n", 1030 | "tx = plt.xlabel('PageRank ranking')" 1031 | ] 1032 | }, 1033 | { 1034 | "cell_type": "code", 1035 | "execution_count": null, 1036 | "metadata": {}, 1037 | "outputs": [], 1038 | "source": [ 1039 | "# pagerank is a probability density\n", 1040 | "pagerank_values.sum()\n" 1041 | ] 1042 | }, 1043 | { 1044 | "cell_type": "code", 1045 | "execution_count": null, 1046 | "metadata": {}, 1047 | "outputs": [], 1048 | "source": [ 1049 | "# draw the network of the top 100 nodes\n", 1050 | "nx.draw(G, nodelist=nodes_pagerank[:100], node_size=8000*pagerank_values[:100],width=0.5, arrows=False)" 1051 | ] 1052 | }, 1053 | { 1054 | "cell_type": "markdown", 1055 | "metadata": {}, 1056 | "source": [ 1057 | "### Save the graph to a GEXF file:\n", 1058 | "\n", 1059 | "GEFX is file format based on XML useful for 
exchanging files between software.\n", 1060 | "\n", 1061 | "https://gephi.org/gexf/format/" 1062 | ] 1063 | }, 1064 | { 1065 | "cell_type": "code", 1066 | "execution_count": null, 1067 | "metadata": {}, 1068 | "outputs": [], 1069 | "source": [ 1070 | "# First let's add the pagerank value as a node attribute\n", 1071 | "for n, pr in zip(nodes_pagerank,pagerank_values):\n", 1072 | "    if n in LCC:\n", 1073 | "        LCC.nodes[n]['page_rank'] = pr\n" 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "code", 1078 | "execution_count": null, 1079 | "metadata": {}, 1080 | "outputs": [], 1081 | "source": [ 1082 | "nx.write_gexf(LCC, 'twitter_lcc.gexf')" 1083 | ] 1084 | }, 1085 | { 1086 | "cell_type": "markdown", 1087 | "metadata": {}, 1088 | "source": [ 1089 | "We can now open the file with [Gephi](https://gephi.org/) to visualize the graph" 1090 | ] 1091 | } 1092 | ], 1093 | "metadata": { 1094 | "anaconda-cloud": {}, 1095 | "kernelspec": { 1096 | "display_name": "Python 3", 1097 | "language": "python", 1098 | "name": "python3" 1099 | }, 1100 | "language_info": { 1101 | "codemirror_mode": { 1102 | "name": "ipython", 1103 | "version": 3 1104 | }, 1105 | "file_extension": ".py", 1106 | "mimetype": "text/x-python", 1107 | "name": "python", 1108 | "nbconvert_exporter": "python", 1109 | "pygments_lexer": "ipython3", 1110 | "version": "3.6.10" 1111 | } 1112 | }, 1113 | "nbformat": 4, 1114 | "nbformat_minor": 2 1115 | } 1116 | --------------------------------------------------------------------------------
We will look at the different component of the network and at percolation processes on this network.\n", 10 | "\n", 11 | "## Parsing tweets\n", 12 | "\n", 13 | "Tweets are saved in JSON format ([JavaScript Object Notation](https://www.w3schools.com/js/js_json_intro.asp))\n", 14 | "JSON is text, written with JavaScript object notation.\n", 15 | "\n", 16 | "The `json` python module allows to easily import json file into python [Dictonairies](https://docs.python.org/3/tutorial/datastructures.html#dictionaries)\n", 17 | "\n" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "#load tweets \n", 27 | "\n", 28 | "import json\n", 29 | "\n", 30 | "\n", 31 | "filename = 'tweets_covid.txt'\n", 32 | "\n", 33 | "tweet_list = []\n", 34 | "\n", 35 | "with open(filename, 'r') as fopen:\n", 36 | " # each line correspond to a tweet\n", 37 | " for line in fopen:\n", 38 | " tweet_list.append(json.loads(line))\n", 39 | " " 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "Let's look at the informations contained in a tweet" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "# take the first tweet of the list\n", 56 | "tweet = tweet_list[0]" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "# each tweet is a python dictionary\n", 66 | "type(tweet)" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# all the 'entries' of the dictionary\n", 76 | "tweet.keys()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "you can find a description of the fields in the Twitter API documentation: https://developer.twitter.com/en/docs/twitter-api/v1/data-dictionary/object-model/tweet" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "#creation time\n", 93 | "tweet['created_at']" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# text of the tweet\n", 103 | "print(tweet['text'])" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# user info\n", 113 | "tweet['user']" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "# user is itslef a dict\n", 123 | "print(type(tweet['user']))\n", 124 | "\n", 125 | "tweet['user']['name']" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "# unique id of the user\n", 135 | "tweet['user']['id']" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "#is the tweet a retweet?\n", 145 | "'retweeted_status' in tweet" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "if 'retweeted_status' in tweet:\n", 155 | " print(tweet['retweeted_status'])\n", 156 | "# the `retweeted_status` is 
also a tweet dictionary " 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "if 'retweeted_status' in tweet:\n", 166 | " print(tweet['retweeted_status']['text'])" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "# user id and name of the retweeted user?\n", 176 | "if 'retweeted_status' in tweet:\n", 177 | " print(tweet['retweeted_status']['user']['id'])\n", 178 | " print(tweet['retweeted_status']['user']['name'])" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "# is the tweet a reply?\n", 188 | "'in_reply_to_user_id' in tweet and tweet['in_reply_to_user_id'] is not None" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "# 'entities' contains the hashtags, urls and usernames used in the tweet\n", 198 | "tweet['entities']" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": null, 204 | "metadata": {}, 205 | "outputs": [], 206 | "source": [ 207 | "# user id of the mentioned users\n", 208 | "for mention in tweet['entities']['user_mentions']:\n", 209 | " print(mention['id'])" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "# is the tweet a quote?\n", 219 | "'quoted_status' in tweet" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "# Building the network of interactions\n", 227 | "\n", 228 | "We will use the python module [`NetworkX`](https://networkx.readthedocs.io/en/stable/index.html) to construct and analyze the social network.\n", 229 | "\n", 230 | "A short introduction to networkx: https://networkx.org/documentation/stable/reference/introduction.html\n" 231 | ] 232 | }, 233 | { 234 | "cell_type": "markdown", 235 | "metadata": {}, 236 | "source": [ 237 | "There are four types of interactions between two users in Twitter:\n", 238 | "- Retweet\n", 239 | "- Quote\n", 240 | "- Reply\n", 241 | "- Mention" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": null, 247 | "metadata": {}, 248 | "outputs": [], 249 | "source": [ 250 | "# let's define some functions to extract the interactions from tweets\n", 251 | "\n", 252 | "def getTweetID(tweet):\n", 253 | " \"\"\" If properly included, get the ID of the tweet \"\"\"\n", 254 | " return tweet.get('id')\n", 255 | " \n", 256 | "def getUserIDandScreenName(tweet):\n", 257 | " \"\"\" If properly included, get the tweet \n", 258 | " user ID and Screen Name \"\"\"\n", 259 | " user = tweet.get('user')\n", 260 | " if user is not None:\n", 261 | " uid = user.get('id')\n", 262 | " screen_name = user.get('screen_name')\n", 263 | " return uid, screen_name\n", 264 | " else:\n", 265 | " return (None, None)\n", 266 | "\n", 267 | "def getRetweetedUserIDandSreenName(tweet):\n", 268 | " \"\"\" If properly included, get the retweet \n", 269 | " source user ID and Screen Name\"\"\"\n", 270 | " \n", 271 | " retweet = tweet.get('retweeted_status')\n", 272 | " if retweet is not None:\n", 273 | " return getUserIDandScreenName(retweet)\n", 274 | " else:\n", 275 | " return (None, None)\n", 276 | " \n", 277 | "def getRepliedUserIDandScreenName(tweet):\n", 278 | " \"\"\" If 
properly included, get the ID and Screen Name \n", 279 | " of the user the tweet replies to \"\"\"\n", 280 | " \n", 281 | " reply_id = tweet.get('in_reply_to_user_id')\n", 282 | " reply_screenname = tweet.get('in_reply_to_screen_name')\n", 283 | " return reply_id, reply_screenname\n", 284 | " \n", 285 | "def getUserMentionsIDandScreenName(tweet):\n", 286 | " \"\"\" If properly included, return a list of IDs and Screen Names tuple\n", 287 | " of all user mentions, including retweeted and replied users \"\"\"\n", 288 | " \n", 289 | " mentions = []\n", 290 | " entities = tweet.get('entities')\n", 291 | " if entities is not None:\n", 292 | " user_mentions = entities.get('user_mentions')\n", 293 | " for mention in user_mentions:\n", 294 | " mention_id = mention.get('id')\n", 295 | " screen_name = mention.get('screen_name')\n", 296 | " mentions.append((mention_id, screen_name))\n", 297 | " \n", 298 | " return mentions\n", 299 | "\n", 300 | " \n", 301 | "def getQuotedUserIDandScreenName(tweet):\n", 302 | " \"\"\" If properly included, get the ID of the user the tweet is quoting\"\"\"\n", 303 | " \n", 304 | " quoted_status = tweet.get('quoted_status')\n", 305 | " \n", 306 | " if quoted_status is not None:\n", 307 | " return getUserIDandScreenName(quoted_status)\n", 308 | " else:\n", 309 | " return (None, None)\n", 310 | " \n", 311 | "def getAllInteractions(tweet):\n", 312 | " \"\"\" Get all the interactions from this tweet\n", 313 | " \n", 314 | " returns : (tweeter_id, tweeter_screenname), list of (interacting_id, interacting_screenname)\n", 315 | " \"\"\"\n", 316 | " \n", 317 | " # Get the tweeter\n", 318 | " tweeter = getUserIDandScreenName(tweet)\n", 319 | " \n", 320 | " # Nothing to do if we couldn't get the tweeter\n", 321 | " if tweeter[0] is None:\n", 322 | " return (None, None), []\n", 323 | " \n", 324 | " # a python set is a collection of unique items\n", 325 | " # we use a set to avoid duplicated ids\n", 326 | " interacting_users = set()\n", 327 | " \n", 328 | " # Add person they're replying to\n", 329 | " interacting_users.add(getRepliedUserIDandScreenName(tweet))\n", 330 | " \n", 331 | " # Add person they retweeted\n", 332 | " interacting_users.add(getRetweetedUserIDandSreenName(tweet))\n", 333 | " \n", 334 | " # Add person they quoted\n", 335 | " interacting_users.add(getQuotedUserIDandScreenName(tweet))\n", 336 | " \n", 337 | " # Add mentions\n", 338 | " interacting_users.update(getUserMentionsIDandScreenName(tweet))\n", 339 | " \n", 340 | " # remove the tweeter if he is in the set\n", 341 | " interacting_users.discard(tweeter)\n", 342 | " # remove the None case\n", 343 | " interacting_users.discard((None,None))\n", 344 | " \n", 345 | " # Return our tweeter and their influencers\n", 346 | " return tweeter, list(interacting_users)\n", 347 | " \n" 348 | ] 349 | }, 350 | { 351 | "cell_type": "code", 352 | "execution_count": null, 353 | "metadata": {}, 354 | "outputs": [], 355 | "source": [ 356 | "print(getUserIDandScreenName(tweet_list[3]))\n", 357 | "print(getAllInteractions(tweet_list[4]))\n", 358 | "\n", 359 | "tweet_list[100].get('text')" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "metadata": {}, 365 | "source": [ 366 | "#### Let's build the network" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "import networkx as nx\n", 376 | "\n", 377 | "# define an empty Directed Graph\n", 378 | "# A directed graph is a graph where edges have a direction\n", 379 | "# in our 
case the edges goes from user that sent the tweet to\n", 380 | "# the user with whom they interacted (retweeted, mentioned or quoted)\n", 381 | "G = nx.DiGraph()\n", 382 | "\n", 383 | "# loop over all the tweets and add edges if the tweet include some interactions\n", 384 | "for tweet in tweet_list:\n", 385 | " # find all influencers in the tweet\n", 386 | " tweeter, interactions = getAllInteractions(tweet)\n", 387 | " tweeter_id, tweeter_name = tweeter\n", 388 | " tweet_id = getTweetID(tweet)\n", 389 | " \n", 390 | " # add an edge to the Graph for each influencer\n", 391 | " for interaction in interactions:\n", 392 | " interact_id, interact_name = interaction\n", 393 | " \n", 394 | " # add edges between the two user ids\n", 395 | " # this will create new nodes if the nodes are not already in the network\n", 396 | " # we also add an attribute the to edge equal to the id of the tweet\n", 397 | " G.add_edge(tweeter_id, interact_id, tweet_id=tweet_id)\n", 398 | " \n", 399 | " # add name as a property to each node\n", 400 | " # with networkX each node is a dictionary\n", 401 | " G.nodes[tweeter_id]['name'] = tweeter_name\n", 402 | " G.nodes[interact_id]['name'] = interact_name\n", 403 | " " 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": null, 409 | "metadata": {}, 410 | "outputs": [], 411 | "source": [ 412 | "# The graph's node are contained in a NodeView which has a dict-like interface\n", 413 | "print(type(G.nodes))" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [ 422 | "# the keys are the user_id\n", 423 | "nodelist = list(G.nodes.keys())\n", 424 | "print(nodelist)" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": null, 430 | "metadata": {}, 431 | "outputs": [], 432 | "source": [ 433 | "# each node is itself a dictionary with node attributes as key,value pairs\n", 434 | "print(type(G.nodes[nodelist[0]]))\n", 435 | "print(G.nodes[nodelist[0]])" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": {}, 442 | "outputs": [], 443 | "source": [ 444 | "# edges are contained in a EdgeView with a set-like interface\n", 445 | "print(type(G.edges))\n", 446 | "print(G.edges())" 447 | ] 448 | }, 449 | { 450 | "cell_type": "code", 451 | "execution_count": null, 452 | "metadata": {}, 453 | "outputs": [], 454 | "source": [ 455 | "# we can see all the edges going out of this node\n", 456 | "# each edge is a dictionary inside this dictionary with a key \n", 457 | "# corresponding to the target user_id\n", 458 | "e = G.out_edges(nodelist[11], data=True)\n", 459 | "print(nodelist[11])\n", 460 | "print(e)" 461 | ] 462 | }, 463 | { 464 | "cell_type": "code", 465 | "execution_count": null, 466 | "metadata": {}, 467 | "outputs": [], 468 | "source": [ 469 | "# we can iterate over the out-edges \n", 470 | "for s,t,data in e:\n", 471 | " print(s,t,data)" 472 | ] 473 | }, 474 | { 475 | "cell_type": "markdown", 476 | "metadata": {}, 477 | "source": [ 478 | "#### Some basic properties of the Network:" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "metadata": {}, 485 | "outputs": [], 486 | "source": [ 487 | "G.number_of_nodes()" 488 | ] 489 | }, 490 | { 491 | "cell_type": "code", 492 | "execution_count": null, 493 | "metadata": {}, 494 | "outputs": [], 495 | "source": [ 496 | "G.number_of_edges()" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": null, 
502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "# listing all nodes \n", 506 | "nodelist = list(G.nodes())\n", 507 | "\n", 508 | "nodelist[:3]" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": null, 514 | "metadata": {}, 515 | "outputs": [], 516 | "source": [ 517 | "# degree of a node\n", 518 | "print(G.degree(nodelist[2]))\n", 519 | "print(G.in_degree(nodelist[2]))\n", 520 | "print(G.out_degree(nodelist[2]))" 521 | ] 522 | }, 523 | { 524 | "cell_type": "code", 525 | "execution_count": null, 526 | "metadata": {}, 527 | "outputs": [], 528 | "source": [ 529 | "# dictionary with the degree of all nodes\n", 530 | "all_degrees = [G.degree(n) for n in nodelist] # this is the degree for undirected edges\n", 531 | "in_degrees = [G.in_degree(n) for n in nodelist]\n", 532 | "out_degrees = [G.out_degree(n) for n in nodelist]" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "# average degree\n", 542 | "2*G.number_of_edges()/G.number_of_nodes()" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "import numpy as np\n", 552 | "np.array(all_degrees).mean()" 553 | ] 554 | }, 555 | { 556 | "cell_type": "code", 557 | "execution_count": null, 558 | "metadata": {}, 559 | "outputs": [], 560 | "source": [ 561 | "np.array(in_degrees).mean()" 562 | ] 563 | }, 564 | { 565 | "cell_type": "code", 566 | "execution_count": null, 567 | "metadata": {}, 568 | "outputs": [], 569 | "source": [ 570 | "np.array(out_degrees).mean()" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [ 579 | "# maximum degree\n", 580 | "max(all_degrees)" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": {}, 587 | "outputs": [], 588 | "source": [ 589 | "# we want to make a list with (user_id, username, degree) for all nodes\n", 590 | "degree_node_list = []\n", 591 | "for node in nodelist:\n", 592 | " degree_node_list.append((node, G.nodes[node]['name'], G.degree(node)))\n", 593 | " \n", 594 | "print('Unordered user, degree list') \n", 595 | "print(degree_node_list[:10])\n", 596 | "\n", 597 | "# sort the list according the degree in descinding order\n", 598 | "degree_node_list = sorted(degree_node_list, key=lambda x:x[2], reverse=True)\n", 599 | "print('Ordered user, degree list') \n", 600 | "print(degree_node_list[:10])" 601 | ] 602 | }, 603 | { 604 | "cell_type": "code", 605 | "execution_count": null, 606 | "metadata": {}, 607 | "outputs": [], 608 | "source": [ 609 | "# we need to import matplolib for making plots\n", 610 | "# and numpy for numerical computations\n", 611 | "import numpy as np\n", 612 | "import matplotlib.pyplot as plt\n", 613 | "%matplotlib inline" 614 | ] 615 | }, 616 | { 617 | "cell_type": "markdown", 618 | "metadata": {}, 619 | "source": [ 620 | "### Network components\n", 621 | "\n", 622 | "For **directed** graphs we can define two types of components:\n", 623 | "- Weakly connected components\n", 624 | "- Strongly connected components\n", 625 | "\n", 626 | "Weakly connected component (WCC): maximal set of nodes where there exists a path in at least one direction between each pair of nodes.\n", 627 | "\n", 628 | "Strongly connected component (SCC): maximal set of nodes where there exists a path in both directions between each pair of 
nodes.\n", 629 | "\n", 630 | "Weakly connected giant (largest) component (WCGC): Largest WCC\n", 631 | "Strongly connected giant (largest) component (SCGC): Largest SCC\n", 632 | "\n", 633 | "" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": null, 639 | "metadata": {}, 640 | "outputs": [], 641 | "source": [ 642 | "# this returns a list of set of nodes belonging to the \n", 643 | "# different (weakly) connected components\n", 644 | "components = list(nx.weakly_connected_components(G))\n", 645 | "\n", 646 | "# sort the component according to their size\n", 647 | "components = list(sorted(components, key=lambda x:len(x), reverse=True))" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": null, 653 | "metadata": {}, 654 | "outputs": [], 655 | "source": [ 656 | "# make a list with the size of each component\n", 657 | "comp_sizes = []\n", 658 | "for comp in components:\n", 659 | " comp_sizes.append(len(comp))" 660 | ] 661 | }, 662 | { 663 | "cell_type": "code", 664 | "execution_count": null, 665 | "metadata": {}, 666 | "outputs": [], 667 | "source": [ 668 | "# plot the histogram of component sizes\n", 669 | "hist = plt.hist(comp_sizes, bins=100)" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": null, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [ 678 | "# histogram with logarithmic y scale\n", 679 | "hist = plt.hist(comp_sizes, bins=100, log=True)\n", 680 | "tx = plt.xlabel('component size')\n", 681 | "ty = plt.ylabel('number of components')" 682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": null, 687 | "metadata": {}, 688 | "outputs": [], 689 | "source": [ 690 | "# sizes of the ten largest components\n", 691 | "comp_sizes[:10]" 692 | ] 693 | }, 694 | { 695 | "cell_type": "code", 696 | "execution_count": null, 697 | "metadata": {}, 698 | "outputs": [], 699 | "source": [ 700 | "# let's make a new graph which is the subgraph of G corresponding to \n", 701 | "# the largest connected component\n", 702 | "# let's find the largest component\n", 703 | "largest_comp = components[0]\n", 704 | "LCC = G.subgraph(largest_comp)" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": null, 710 | "metadata": {}, 711 | "outputs": [], 712 | "source": [ 713 | "G.number_of_nodes()" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": null, 719 | "metadata": {}, 720 | "outputs": [], 721 | "source": [ 722 | "LCC.number_of_nodes()" 723 | ] 724 | }, 725 | { 726 | "cell_type": "code", 727 | "execution_count": null, 728 | "metadata": {}, 729 | "outputs": [], 730 | "source": [ 731 | "# let's plot the degree distribution inside the LCC\n", 732 | "degrees = [LCC.degree(n) for n in LCC.nodes()]\n", 733 | "degrees" 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": null, 739 | "metadata": {}, 740 | "outputs": [], 741 | "source": [ 742 | "degree_array = np.array(degrees)\n", 743 | "hist = plt.hist(degree_array, bins=100)" 744 | ] 745 | }, 746 | { 747 | "cell_type": "code", 748 | "execution_count": null, 749 | "metadata": {}, 750 | "outputs": [], 751 | "source": [ 752 | "# using logarithmic scales\n", 753 | "hist = plt.hist(degree_array, bins=100, log=True)\n", 754 | "plt.xscale('log')\n" 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": null, 760 | "metadata": {}, 761 | "outputs": [], 762 | "source": [ 763 | "# logarithmic scale with logarithmic bins\n", 764 | "N, bins, patches = plt.hist(degree_array, 
bins=np.logspace(0,np.log10(degree_array.max()+1), 20), log=True)\n", 765 | "plt.xscale('log')\n", 766 | "tx = plt.xlabel('k - degree')\n", 767 | "ty= plt.ylabel('number of nodes')\n" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": null, 773 | "metadata": {}, 774 | "outputs": [], 775 | "source": [ 776 | "# Degree probability distribution (P(k))\n", 777 | "\n", 778 | "# since we have logarithmic bins, we need to\n", 779 | "# take into account the fact that the bins \n", 780 | "# have different lenghts when normalizing\n", 781 | "bin_lengths = np.diff(bins) # lenght of each bin\n", 782 | "\n", 783 | "summ = np.sum(N*bin_lengths)\n", 784 | "normalized_degree_dist = N/summ\n", 785 | "\n", 786 | "# check normalization:\n", 787 | "print(np.sum(normalized_degree_dist*bin_lengths))\n", 788 | "\n", 789 | "hist = plt.bar(bins[:-1], normalized_degree_dist, width=np.diff(bins))\n", 790 | "plt.xscale('log')\n", 791 | "plt.yscale('log')\n", 792 | "tx = plt.xlabel('k (degree)')\n", 793 | "ty = plt.ylabel('P(k)')" 794 | ] 795 | }, 796 | { 797 | "cell_type": "markdown", 798 | "metadata": {}, 799 | "source": [ 800 | "### Exercise: do the same for the Graph comprising only retweet, replies, quote and mentions" 801 | ] 802 | }, 803 | { 804 | "cell_type": "markdown", 805 | "metadata": {}, 806 | "source": [ 807 | "### Percolation of the Giant Component" 808 | ] 809 | }, 810 | { 811 | "cell_type": "code", 812 | "execution_count": null, 813 | "metadata": {}, 814 | "outputs": [], 815 | "source": [ 816 | "import random\n", 817 | "\n", 818 | "def getGCsize(G):\n", 819 | " \"\"\" returns the size of the largest component of G\"\"\"\n", 820 | " \n", 821 | " return len(max(nx.connected_components(G), key=len))\n", 822 | " \n" 823 | ] 824 | }, 825 | { 826 | "cell_type": "markdown", 827 | "metadata": {}, 828 | "source": [ 829 | "#### Random Attack:" 830 | ] 831 | }, 832 | { 833 | "cell_type": "code", 834 | "execution_count": null, 835 | "metadata": {}, 836 | "outputs": [], 837 | "source": [ 838 | "# list that will contain the size of the GC as we remove nodes\n", 839 | "rnd_attack_GC_sizes = []\n", 840 | "\n", 841 | "# we take into account the undirected version of the graph\n", 842 | "LCCundirected = nx.Graph(LCC)\n", 843 | "\n", 844 | "nodes_list = list(LCCundirected.nodes())\n", 845 | "\n", 846 | "\n", 847 | "while len(nodes_list) > 1:\n", 848 | " # add the size of the current GC\n", 849 | " rnd_attack_GC_sizes.append(getGCsize(LCCundirected))\n", 850 | " \n", 851 | " # pick a random node\n", 852 | " rnd_node = random.choice(nodes_list)\n", 853 | " # remove from graph\n", 854 | " LCCundirected.remove_node(rnd_node)\n", 855 | " # remove from node list\n", 856 | " nodes_list.remove(rnd_node)\n" 857 | ] 858 | }, 859 | { 860 | "cell_type": "code", 861 | "execution_count": null, 862 | "metadata": {}, 863 | "outputs": [], 864 | "source": [ 865 | "# convert list to numpy array\n", 866 | "rnd_attack_GC_sizes = np.array(rnd_attack_GC_sizes)\n", 867 | "\n", 868 | "# normalize by the initial size of the GC\n", 869 | "GC_rnd = rnd_attack_GC_sizes/rnd_attack_GC_sizes[0]\n", 870 | "\n", 871 | "# fraction of removed nodes\n", 872 | "q = np.linspace(0,1,num=GC_rnd.size)\n", 873 | "\n", 874 | "plt.plot(q,GC_rnd)\n", 875 | "tx = plt.xlabel('q')\n", 876 | "ty = plt.ylabel('GC')\n" 877 | ] 878 | }, 879 | { 880 | "cell_type": "markdown", 881 | "metadata": {}, 882 | "source": [ 883 | "#### High degree attack:" 884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": null, 889 | "metadata": 
{}, 890 | "outputs": [], 891 | "source": [ 892 | "# high degree attack\n", 893 | "LCCundirected = nx.Graph(LCC)\n", 894 | "\n", 895 | "# list of pairs (node, degree) sorted according the degree\n", 896 | "node_deg_dict = dict(nx.degree(LCCundirected))\n", 897 | "nodes_sorted = sorted(node_deg_dict, key=node_deg_dict.get)\n", 898 | "\n", 899 | "# list that will contain the size of the GC as we remove nodes\n", 900 | "hd_attack_GC_sizes = []\n", 901 | "\n", 902 | "while len(nodes_sorted) > 1:\n", 903 | " \n", 904 | " hd_attack_GC_sizes.append(getGCsize(LCCundirected))\n", 905 | " \n", 906 | " #remove node according to their degree\n", 907 | " node = nodes_sorted.pop() # pop() removes and returns the last element\n", 908 | " LCCundirected.remove_node(node)\n", 909 | " \n", 910 | " \n", 911 | "\n" 912 | ] 913 | }, 914 | { 915 | "cell_type": "code", 916 | "execution_count": null, 917 | "metadata": {}, 918 | "outputs": [], 919 | "source": [ 920 | "hd_attack_GC_sizes = np.array(hd_attack_GC_sizes)\n", 921 | "GC_hd = hd_attack_GC_sizes/hd_attack_GC_sizes[0]\n", 922 | "q = np.linspace(0,1,num=GC_hd.size)\n", 923 | "\n", 924 | "plt.plot(q,GC_rnd, label='random attack')\n", 925 | "plt.plot(q,GC_hd, label='High-Degree attack')\n", 926 | "tx = plt.xlabel('q')\n", 927 | "ty = plt.ylabel('GC')\n", 928 | "_ = plt.legend()\n" 929 | ] 930 | }, 931 | { 932 | "cell_type": "markdown", 933 | "metadata": {}, 934 | "source": [ 935 | "#### Exercise: implement the High-Degree Adaptative (HDA) attack where at each step the node with the highest degree of the remaining graph is removed." 936 | ] 937 | }, 938 | { 939 | "cell_type": "markdown", 940 | "metadata": {}, 941 | "source": [ 942 | "### PageRank\n", 943 | "\n", 944 | "The *PageRank* centrality modifies the classical random walk by introducing a \"teleportation\" probability, i.e. at each step, the walkers have a given probability to teleport uniformly at random to any other nodes of the network.\n", 945 | "This makes the random walk ergodic, i.e. 
938 | { 939 | "cell_type": "markdown", 940 | "metadata": {}, 941 | "source": [ 942 | "### PageRank\n", 943 | "\n", 944 | "The *PageRank* centrality modifies the classical random walk by introducing a \"teleportation\" probability, i.e. at each step, the walkers have a given probability of teleporting uniformly at random to any other node of the network.\n", 945 | "This makes the random walk ergodic, i.e. it converges to a stationary distribution, even in directed and disconnected networks.\n", 946 | "\n", 947 | "The update equation for the PageRank probability density is given by \n", 948 | "\n", 949 | "$\mathbf{p}(n+1) = (1-\alpha)\mathbf{p}(n)\mathbf{D}_\text{out}^{-1}\mathbf{A} + \frac{\alpha}{N}\mathbf{1} = \mathbf{p}(n) \mathbf{M}$\n", 950 | "\n", 951 | "with\n", 952 | "\n", 953 | "$ \mathbf{M} = (1-\alpha)\mathbf{D}_\text{out}^{-1}\mathbf{A} + \frac{\alpha}{N}\mathbf{1}^T\mathbf{1}$" 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 958 | "execution_count": null, 959 | "metadata": {}, 960 | "outputs": [], 961 | "source": [ 962 | "# teleportation probability\n", 963 | "alpha = 0.15\n", 964 | "\n", 965 | "# adjacency matrix\n", 966 | "nodelist = list(G.nodes())\n", 967 | "A = nx.to_numpy_array(G, nodelist=nodelist)\n", 968 | "\n", 969 | "# inverse of the diagonal matrix of out-degrees (degrees clipped to at least 1)\n", 970 | "deg_out_vect = np.array([float(max(G.out_degree(n),1)) for n in nodelist])\n", 971 | "D_out_inv = np.diag(1/deg_out_vect)\n", 972 | "\n", 973 | "# teleportation transition matrix\n", 974 | "N = A.shape[1]\n", 975 | "S = np.ones((N,N))*1/N\n", 976 | "\n", 977 | "# full transition matrix\n", 978 | "M = (1-alpha)*D_out_inv @ A + alpha*S\n", 979 | "\n", 980 | "# for dangling nodes (nodes without out-edges), we force random teleportation\n", 981 | "dangling_nodes = np.where(A.sum(1) == 0)[0]\n", 982 | "M[dangling_nodes,:] = S[dangling_nodes,:]\n", 983 | "\n", 984 | "# initial walker distribution and first iteration\n", 985 | "p_last = np.ones(N)*1/N\n", 986 | "p = np.matmul(p_last, M)\n", 987 | "\n", 988 | "# iterate until sufficient convergence\n", 989 | "eps = 1.0e-8\n", 990 | "i = 1\n", 991 | "while np.linalg.norm(p - p_last, 2) > eps:\n", 992 | " p_last = p\n", 993 | " p = np.matmul(p, M)\n", 994 | " i += 1\n", 995 | "\n", 996 | "print(i)" 997 | ] 998 | }, 999 | { 1000 | "cell_type": "code", 1001 | "execution_count": null, 1002 | "metadata": {}, 1003 | "outputs": [], 1004 | "source": [ 1005 | "pg_ranking = np.array(np.argsort(p)[::-1])\n", 1006 | "\n", 1007 | "pagerank_values = p[pg_ranking]\n", 1008 | "nodes_pagerank = [nodelist[r] for r in pg_ranking]\n", 1009 | "nodes_pagerank[:10]" 1010 | ] 1011 | }, 1012 | { 1013 | "cell_type": "code", 1014 | "execution_count": null, 1015 | "metadata": {}, 1016 | "outputs": [], 1017 | "source": [ 1018 | "names_pagerank = [G.nodes[n]['name'] for n in nodes_pagerank]\n", 1019 | "names_pagerank[:10]" 1020 | ] 1021 | }, 1022 | { 1023 | "cell_type": "code", 1024 | "execution_count": null, 1025 | "metadata": {}, 1026 | "outputs": [], 1027 | "source": [ 1028 | "hist = plt.bar(np.arange(p.shape[0]),np.sort(p)[::-1])\n", 1029 | "ty = plt.ylabel('PageRank value')\n", 1030 | "tx = plt.xlabel('PageRank ranking')" 1031 | ] 1032 | }, 1033 | { 1034 | "cell_type": "code", 1035 | "execution_count": null, 1036 | "metadata": {}, 1037 | "outputs": [], 1038 | "source": [ 1039 | "# PageRank is a probability distribution: the values sum to 1\n", 1040 | "pagerank_values.sum()\n" 1041 | ] 1042 | }, 1043 | { 1044 | "cell_type": "code", 1045 | "execution_count": null, 1046 | "metadata": {}, 1047 | "outputs": [], 1048 | "source": [ 1049 | "# draw only the top 100 nodes, with node size proportional to PageRank\n", 1050 | "nx.draw(G, nodelist=nodes_pagerank[:100], node_size=8000*pagerank_values[:100],width=0.5, arrows=False)" 1051 | ] 1052 | },
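{ "cell_type": "markdown", "metadata": {}, "source": [ "*Added sanity check:* the next cell is an addition that compares our power-iteration result with networkx's built-in `pagerank` function. Note that networkx's `alpha` parameter is the damping factor, i.e. the probability of following an out-edge, so it corresponds to `1 - alpha` in the notation used above. Assuming `G` is a simple `DiGraph`, the two rankings should (almost) coincide; for a `MultiDiGraph` the handling of parallel edges may differ slightly." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# networkx's alpha is the damping factor, i.e. 1 - (our teleportation probability)\n", "pr_nx = nx.pagerank(G, alpha=1 - alpha)\n", "\n", "# top 10 nodes according to networkx, to compare with our nodes_pagerank[:10]\n", "top10_nx = sorted(pr_nx, key=pr_nx.get, reverse=True)[:10]\n", "print(top10_nx)\n", "print(nodes_pagerank[:10])" ] },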
1053 | { 1054 | "cell_type": "markdown", 1055 | "metadata": {}, 1056 | "source": [ 1057 | "### Save the graph to a GEXF file:\n", 1058 | "\n", 1059 | "GEXF is an XML-based file format useful for exchanging graphs between software tools.\n", 1060 | "\n", 1061 | "https://gephi.org/gexf/format/" 1062 | ] 1063 | }, 1064 | { 1065 | "cell_type": "code", 1066 | "execution_count": null, 1067 | "metadata": {}, 1068 | "outputs": [], 1069 | "source": [ 1070 | "# First, let's add the PageRank value as a node attribute\n", 1071 | "for n, pr in zip(nodes_pagerank,pagerank_values):\n", 1072 | " if n in LCC:\n", 1073 | " LCC.nodes[n]['page_rank'] = pr\n" 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "code", 1078 | "execution_count": null, 1079 | "metadata": {}, 1080 | "outputs": [], 1081 | "source": [ 1082 | "nx.write_gexf(LCC, 'twitter_lcc.gexf')" 1083 | ] 1084 | }, 1085 | { 1086 | "cell_type": "markdown", 1087 | "metadata": {}, 1088 | "source": [ 1089 | "We can now open the file with [Gephi](https://gephi.org/) to visualize the graph." 1090 | ] 1091 | } 1092 | ], 1093 | "metadata": { 1094 | "anaconda-cloud": {}, 1095 | "kernelspec": { 1096 | "display_name": "Python 3", 1097 | "language": "python", 1098 | "name": "python3" 1099 | }, 1100 | "language_info": { 1101 | "codemirror_mode": { 1102 | "name": "ipython", 1103 | "version": 3 1104 | }, 1105 | "file_extension": ".py", 1106 | "mimetype": "text/x-python", 1107 | "name": "python", 1108 | "nbconvert_exporter": "python", 1109 | "pygments_lexer": "ipython3", 1110 | "version": "3.6.10" 1111 | } 1112 | }, 1113 | "nbformat": 4, 1114 | "nbformat_minor": 2 1115 | } 1116 | --------------------------------------------------------------------------------