├── Requirements.txt ├── Report.pdf ├── Screenshots ├── brain.png ├── ImageSample.png ├── crawl_images.PNG ├── crawl_texts.PNG └── texts_exmpl.PNG ├── Scripts ├── Images_functions │ ├── run_npz.py │ ├── run_pexels.py │ ├── run_save_images_crawler.py │ ├── run_API_unsplash.py │ ├── npz.py │ ├── Save_images.py │ ├── UnsplashAPI.py │ └── pexels.py └── Twitter_Crawler │ ├── run_TWINT.py │ └── Twint_pkge.py ├── data ├── TwitterP.py ├── Images_load.py ├── __init__.py └── c3d.py ├── Notebooks ├── Remove_Duplicates.ipynb ├── Explore_Twint.ipynb ├── Bibliography.ipynb └── TwitterscraperDemo.ipynb ├── models └── bilstm.py ├── utils.py ├── README.md ├── LICENSE ├── ResNet-Transfer.ipynb ├── GLove+Bilstm.ipynb ├── ResNet.ipynb └── Preprocessing_Texts.ipynb /Requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | scikit-learn 3 | tqdm 4 | beautifulsoup4 -------------------------------------------------------------------------------- /Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BouzidiImen/Social_media_Prediction_depression/HEAD/Report.pdf -------------------------------------------------------------------------------- /Screenshots/brain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BouzidiImen/Social_media_Prediction_depression/HEAD/Screenshots/brain.png -------------------------------------------------------------------------------- /Screenshots/ImageSample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BouzidiImen/Social_media_Prediction_depression/HEAD/Screenshots/ImageSample.png -------------------------------------------------------------------------------- /Screenshots/crawl_images.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BouzidiImen/Social_media_Prediction_depression/HEAD/Screenshots/crawl_images.PNG -------------------------------------------------------------------------------- /Screenshots/crawl_texts.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BouzidiImen/Social_media_Prediction_depression/HEAD/Screenshots/crawl_texts.PNG -------------------------------------------------------------------------------- /Screenshots/texts_exmpl.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BouzidiImen/Social_media_Prediction_depression/HEAD/Screenshots/texts_exmpl.PNG -------------------------------------------------------------------------------- /Scripts/Images_functions/run_npz.py: -------------------------------------------------------------------------------- 1 | from npz import create_npz 2 | import numpy as np 3 | create_npz(path='Data/Unsplash_Pexels_Data/Depression/') 4 | create_npz(path='Data/Unsplash_Pexels_Data/Happiness/',label=np.array([0,1])) 5 | -------------------------------------------------------------------------------- /Scripts/Images_functions/run_pexels.py: -------------------------------------------------------------------------------- 1 | from pexels import get_pexels_images 2 | 3 | DepressionKeywords = ['Suicide', 'Sad', 'Depression', 'Stress', 'Anxiety', 'Grief', 'Despair', 'Crying'] 4 | HappyKeywords = ['Happy', 'excited'] 5 | for key_word in HappyKeywords: 6 | 
get_pexels_images(key_word) 7 | #for key_word in DepressionKeywords: 8 | #get_pexels_images(key_word) 9 | -------------------------------------------------------------------------------- /Scripts/Twitter_Crawler/run_TWINT.py: -------------------------------------------------------------------------------- 1 | from Twint_pkge import clean_data, get_profile_infos, get_timeline_by_username 2 | from tqdm import trange 3 | key_words = ["#lovemylife", "#lifeisgood", "#happyme"] 4 | data = clean_data(key_words) 5 | for i in trange(len(data)): 6 | get_profile_infos(data['username'].iloc[i]) 7 | print('Okay profile'+str(i)) 8 | get_timeline_by_username(data['username'].iloc[i]) 9 | -------------------------------------------------------------------------------- /Scripts/Images_functions/run_save_images_crawler.py: -------------------------------------------------------------------------------- 1 | from Save_images import save_images 2 | file_names_Deprssion_unsplash = ['Depression_unsplash'] 3 | for file_name in file_names_Deprssion_unsplash: 4 | save_images(file_name=file_names_Deprssion_unsplash) 5 | file_names_Deprssion_Pexels = ['Depression_Pexels'] 6 | for file_name in file_names_Deprssion_unsplash: 7 | save_images(website_name='Pexels_',file_name=file_name, folder_path='Pexels/Depressions/') 8 | -------------------------------------------------------------------------------- /Scripts/Images_functions/run_API_unsplash.py: -------------------------------------------------------------------------------- 1 | from UnsplashAPI import get_unsplash_images 2 | import json 3 | 4 | with open('Token.json') as t: 5 | api_key = json.load(t) 6 | token = api_key[ 'unsplash' ] 7 | DepressionKeywords = [ 'Suicide', 'Sad', 'Depression', 'Stress', 'Anxiety', 'Grief', 'Despair', 'Crying' ] 8 | for key_word in DepressionKeywords: 9 | get_unsplash_images(token, key_word) 10 | DepressionKeywords = [ 'Suicide', 'Sad', 'Depression', 'Stress', 'Anxiety', 'Grief', 'Despair', 'Crying' ] 11 | for key_word in DepressionKeywords: 12 | get_unsplash_images(token, key_word) 13 | -------------------------------------------------------------------------------- /Scripts/Images_functions/npz.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import os 4 | from tqdm import tqdm 5 | 6 | DEFAULT_DATA_NAME = 'data.npz' 7 | DEFAULT_SCALE = .1 8 | DEFAULT_LABEL=np.array([1,0]) #for happiness changed to [0,1] 9 | DEFALUT_KEY='dep_' 10 | 11 | def _resize_to_np(filepath, label, scale): 12 | src = cv2.imread(filepath, cv2.IMREAD_UNCHANGED) 13 | width = int(src.shape[1] * scale) 14 | height = int(src.shape[0] * scale) 15 | output = cv2.resize(src, (width, height)) 16 | img = np.array([output, label]) 17 | return img.reshape((2, 1)) 18 | 19 | 20 | def create_npz(path,key=DEFALUT_KEY, data_name=DEFAULT_DATA_NAME, scale=DEFAULT_SCALE,label=DEFAULT_LABEL): 21 | filepath = path + data_name 22 | if os.path.exists(filepath): 23 | print('Deleting existing data...') 24 | os.remove(filepath) 25 | pictures = os.listdir(path) 26 | all_imgs = [] 27 | names=[] 28 | i = 0 29 | for pic in tqdm(pictures): 30 | all_imgs.append(_resize_to_np(path + pic, label, scale)) 31 | names.append(key+str(i)) 32 | i += 1 33 | print('all images were resized ') 34 | np.savez(filepath, **{name:value for name,value in zip(names,all_imgs)}) 35 | print(f"Data saved into '{filepath}'") 36 | -------------------------------------------------------------------------------- 
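A minimal sketch (not part of the scripts above) of reading back an archive written by `create_npz`, assuming the default `data.npz` name, the `dep_` key prefix and the paths used in `run_npz.py`:

```python
import numpy as np

# Hedged example: path and key prefix follow the defaults in npz.py / run_npz.py.
archive = np.load('Data/Unsplash_Pexels_Data/Depression/data.npz', allow_pickle=True)
for key in archive.files:                 # keys look like 'dep_0', 'dep_1', ...
    entry = archive[key]                  # object array of shape (2, 1): [image, label]
    image = entry[0][0]                   # resized image as read by OpenCV
    label = entry[1][0]                   # one-hot label, [1, 0] for the depression folder
    print(key, image.shape, label)
```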
/Scripts/Images_functions/Save_images.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from requests import get 3 | from tqdm import trange 4 | import time 5 | from socket import error as socket_error 6 | import errno 7 | import pathlib 8 | from random import randint 9 | from urllib.request import urlcleanup 10 | 11 | DEFAULT_FILE_NAME = 'Suicide' 12 | DEFAULT_FOLDER_PATH = 'Unsplash/Depression/' 13 | DEFAULT_NAME = 'Unsplash_' 14 | 15 | headers = { 16 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Cafari/537.36'} 17 | 18 | 19 | def _save_image(website_name,Name, Extension, Link, path_to_folder): 20 | try: 21 | name = website_name + Name 22 | pic = get(Link, headers=headers) 23 | with open(path_to_folder + name + "." + Extension, 'wb') as photo: 24 | photo.write(pic.content) 25 | except socket_error as e: 26 | if e.errno != errno.ECONNRESET: 27 | raise 28 | urlcleanup() 29 | 30 | 31 | def save_images(website_name=DEFAULT_NAME,file_name=DEFAULT_FILE_NAME, folder_path=DEFAULT_FOLDER_PATH): 32 | missed = [] 33 | data = pd.read_csv(file_name + ".csv") 34 | pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True) 35 | for i in trange(len(data)): 36 | wait = randint(5, 15) 37 | time.sleep(wait) 38 | print(f'\nWaiting {wait}s...') 39 | try: 40 | _save_image(website_name,data['Name'].iloc[i], data['Extension'].iloc[i], data['Links'].iloc[i], folder_path) 41 | except: 42 | missed.append(data['Links'].iloc[i]) 43 | missed_data = pd.DataFrame({'Missed_Links': missed}, index=None) 44 | missed_data.to_csv('MissedData.csv') 45 | -------------------------------------------------------------------------------- /data/TwitterP.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import json 4 | import os 5 | from tqdm import tqdm 6 | from utils import maybe_download_and_extract, getlink 7 | 8 | SOURCE_URL = "http://www.mediafire.com/file/xp2jp8ezm3ynpc1/Twitter_Data%25282%2529.zip/file" 9 | DATA_DEFAULT_PATH = '~/.datasets/Crawled-Twitter-Data/' 10 | DEFAULT_TEST_SIZE = .25 11 | DEFAULT_SEED = 2020 12 | 13 | 14 | def load_data(data_path=DATA_DEFAULT_PATH, link=SOURCE_URL, test_size=DEFAULT_TEST_SIZE): 15 | """ 16 | load data from website and return train, test and validation data 17 | :param data_path where data will be saved 18 | :param link to data 19 | :param test_size 20 | 21 | """ 22 | data_path = os.path.expanduser(data_path) 23 | # Download files 24 | maybe_download_and_extract(getlink(link), data_path) 25 | # read data 26 | Twitter_data = pd.read_csv(data_path + "Twitter_data/Profiles.csv") 27 | L = len(Twitter_data) 28 | train, val, test = np.split(Twitter_data, [ int(L * (1 - test_size) * (1 - test_size)), int(L * (1 - test_size)) ]) 29 | return train, val, test 30 | 31 | 32 | def get_username_profile(username, data_path=DATA_DEFAULT_PATH): 33 | """ 34 | load user's profile for a given username 35 | :param data_path where data is saved 36 | :param username 37 | 38 | """ 39 | timelines = os.listdir(data_path + 'Twitter_data/Timelines') 40 | clean_usernames = [s.strip('.csv') for s in timelines] 41 | for i in range(len(clean_usernames)): 42 | if clean_usernames[i] == username: 43 | return (pd.read_csv(data_path + 'Twitter_data/Timelines/' + username + '.csv')) 44 | return (print("user's timeline not found")) 45 | -------------------------------------------------------------------------------- 
/Scripts/Images_functions/UnsplashAPI.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import requests as requests 3 | from urllib.request import urlretrieve 4 | from tqdm import trange 5 | import time 6 | from socket import error as socket_error 7 | import errno 8 | 9 | DEFAULT_KEY_WORD = 'depression' 10 | DEFAULT_PATH = "Unsplash/Depression/" 11 | 12 | def get_unsplash_images(token, key_word=DEFAULT_KEY_WORD, path_to_folder=DEFAULT_PATH): 13 | """ 14 | Using Unsplash API to save url of photos in a csv 15 | :param path_to_folder: Path there images will be saved 16 | :param browser: webdriver 17 | :param key_word: key word to be searched 18 | :param token 19 | 20 | """ 21 | r = requests.get(f'https://api.unsplash.com/search/photos?query={key_word}&client_id={token}') 22 | infos = r.json() 23 | total_pages = infos['total_pages'] 24 | total_images = infos['total'] 25 | links = [] 26 | num_per_page = 200 27 | for pg in range(1, total_pages + 1): 28 | new_r = requests.get(f'https://api.unsplash.com/search/photos?query={key_word}' + 29 | f'&page={pg}&per_page={num_per_page}&client_id={token}') 30 | data = new_r.json() 31 | for img_data in data['results']: 32 | img_url = img_data['urls']['raw'] 33 | links.append(img_url) 34 | if len(links) == total_images: 35 | print('Links for all images') 36 | else: 37 | print('Missing links') 38 | Clean_links = [] 39 | for l in links: 40 | Clean_links.append(l.split('?')[0]) 41 | pathlib.Path(path_to_folder).mkdir(parents=True, exist_ok=True) 42 | try: 43 | for i in trange(len(Clean_links)): 44 | name = 'Unsplash_' + key_word + '_' + str(i) 45 | time.sleep(5) 46 | urlretrieve(links[i], path_to_folder+ name + "." + 'jpeg') 47 | except socket_error as e: 48 | if e.errno != errno.ECONNRESET: 49 | raise 50 | pass 51 | -------------------------------------------------------------------------------- /Scripts/Twitter_Crawler/Twint_pkge.py: -------------------------------------------------------------------------------- 1 | import twint 2 | import pandas as pd 3 | from tqdm import trange 4 | import sys, os 5 | 6 | DEFAULT_KEYWORD = "I suffer from depression" 7 | DEFAULT_KEYWORDS = [ "I am diagnosed with depression", 'I am fighting depression', 'I suffer from depression' ] 8 | DEFAULT_LIMIT = 3000 9 | DEFAULT_USERNAME = 'Imen' 10 | DEFAULT_LIMIT_PROFILE_STATUS = 100 11 | 12 | 13 | def clean_data(key_words=DEFAULT_KEYWORDS): 14 | data = pd.DataFrame() 15 | for key in key_words: 16 | data = data.append(pd.read_csv(key + ".csv")) 17 | data = data.drop_duplicates(subset=['user_id'], keep='first') 18 | return data 19 | 20 | 21 | def get_keywords_tweets(keyword=DEFAULT_KEYWORD, limit=DEFAULT_LIMIT): 22 | """ 23 | This function returns a csv data set with tweets containing the keyword 24 | :param keyword is the key word to search for 25 | :param limit number of tweets to retrieve 26 | """ 27 | c = twint.Config() 28 | c.Search = keyword 29 | c.Limit = limit 30 | c.Store_csv = True 31 | c.Output = keyword + ".csv" 32 | sys.stdout = open(os.devnull, 'w') 33 | twint.run.Search(c) 34 | 35 | 36 | def get_profile_infos(user_name=DEFAULT_USERNAME): 37 | """ 38 | This function returns a csv file with users pesonal information (bio/location/followers/following) 39 | :param user_name Username of a twitter user 40 | """ 41 | c = twint.Config() 42 | c.Username = user_name 43 | c.Store_csv = True 44 | c.Output = ("Profileinfos.csv") 45 | sys.stdout = open(os.devnull, 'w') 46 | twint.run.Lookup(c) 47 | 48 | 49 | def 
get_timeline_by_username(user_name=DEFAULT_USERNAME, limit=DEFAULT_LIMIT_PROFILE_STATUS): 50 | """ 51 | This function returns csv files each contain 100 recent post of a user 52 | :param user_name user_name Username of a twitter user 53 | """ 54 | c = twint.Config() 55 | c.Username = user_name 56 | c.Profile = True 57 | c.Retweets = True 58 | c.Limit = limit 59 | c.Store_csv = True 60 | c.Output = ('Timelines/' + user_name + ".csv") 61 | sys.stdout = open(os.devnull, 'w') 62 | twint.run.Search(c) 63 | -------------------------------------------------------------------------------- /data/Images_load.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | from tqdm import trange 4 | from utils import maybe_download_and_extract, getlink 5 | from data import DataSet, DataSets 6 | 7 | SOURCE_NEGATIVE_URL = "http://www.mediafire.com/file/17l0aqdbqhbvlfu/negative.npz/file" 8 | SOURCE_POSITIVE_URL = "http://www.mediafire.com/file/v1s3tqqzriuuq61/positive.npz/file" 9 | DATA_DEFAULT_PATH = '~/.datasets/Images_From_Unsplash_And_Pexels/' 10 | DEFAULT_TEST_SIZE = .25 11 | DEFAULT_SEED=2020 12 | 13 | 14 | def _extract(Pdata, Ndata): 15 | ''' 16 | Extract features and labels of images 17 | :param Pdata: data for depressed users' images 18 | :param Ndata: data for non depressed users' images 19 | :return: features and labels 20 | ''' 21 | # Extract features and labels 22 | features = list(Pdata[ Pdata.files[ 0 ] ][ 0 ]) 23 | labels = list(Pdata[ Pdata.files[ 0 ] ][ 1 ]) 24 | for i in trange(1,len(Pdata)): 25 | features.append(Pdata[ Pdata.files[ i ] ][ 0 ][ 0 ]) 26 | labels.append(Pdata[ Pdata.files[ i ] ][ 1 ][ 0 ]) 27 | for i in trange(len(Ndata)): 28 | features.append(Ndata[ Ndata.files[ i ] ][ 0 ][ 0 ]) 29 | labels.append(Ndata[ Ndata.files[ i ] ][ 1 ][ 0 ]) 30 | return features, labels 31 | 32 | 33 | def load_data(data_path=DATA_DEFAULT_PATH, seed=DEFAULT_SEED, test_size=DEFAULT_TEST_SIZE): 34 | """ 35 | Loads dataset. 
36 | :param data_path: string 37 | the path of the directory that contains the dataset 38 | """ 39 | 40 | data_path = os.path.expanduser(data_path) 41 | 42 | # Download files 43 | maybe_download_and_extract(getlink(SOURCE_NEGATIVE_URL), data_path) 44 | maybe_download_and_extract(getlink(SOURCE_POSITIVE_URL), data_path) 45 | # read data 46 | P = np.load(data_path + 'positive.npz',allow_pickle=True) # load data for depressed users 47 | N = np.load(data_path + 'negative.npz',allow_pickle=True) # load data for not depressed users 48 | features, labels = _extract(P, N) 49 | 50 | data = DataSet(features, labels, seed=seed) 51 | train, test = data.split(split_size=test_size) 52 | train, validation = train.split(split_size=test_size) 53 | return DataSets(train, validation, test) 54 | -------------------------------------------------------------------------------- /Notebooks/Remove_Duplicates.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### To remove duplicate images using fdupes for linux \n", 8 | "* fdupes is a Linux utility for identifying or deleting duplicate files by comparing md5sum then running a byte-to-byte comparaison" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Install fdupes for linux " 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "sudo apt-get update && apt-get install fdupes " 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | " ### Search duplicate photos in a folder " 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "fdupes Path_To_folder" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "### Number of duplicates in a folder " 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "fdupes -m Path_To_folder" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### Delete files and preserve the first one" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "fdupes -dN Path_To_folder" 73 | ] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.6.10" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 4 97 | } 98 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import collections 3 | import numpy as np 4 | from sklearn.model_selection import train_test_split 5 | 6 | DataSets = collections.namedtuple('DataSets', ['train', 'validation', 'test']) 7 | class DataSet(object): 8 | def __init__(self, features, labels, seed=None, name=None): 9 | if name is None: 10 | name = 
self.__class__.__name__ 11 | self._logger = logging.getLogger(self.__class__.__name__) 12 | self._features = features 13 | self._labels = labels 14 | self._seed = seed 15 | self._name = name 16 | 17 | self._epoch_completed = 0 18 | self._index_in_epoch = 0 19 | 20 | 21 | @property 22 | def features(self): 23 | return self._features 24 | 25 | @property 26 | def labels(self): 27 | return self._labels 28 | 29 | @property 30 | def num_examples(self): 31 | return len(self._features) 32 | 33 | @property 34 | def input_dim(self): 35 | if len(self._features.shape) == 2: 36 | return self._features.shape[1] 37 | else: 38 | return self._features.shape[1:] 39 | 40 | @property 41 | def output_dim(self): 42 | return self.n_classes 43 | 44 | @property 45 | def n_classes(self): 46 | if len(self.labels.shape) == 1: 47 | _n_classes = len(np.unique(self.labels)) 48 | else: 49 | _n_classes = self.labels.shape[1] 50 | return _n_classes 51 | 52 | def split(self, split_size): 53 | assert split_size > 0 and not split_size > 1 54 | features_train, features_test, labels_train, labels_test = train_test_split(self._features, self._labels, 55 | test_size=split_size, 56 | random_state=self._seed) 57 | 58 | train = DataSet(features_train, labels_train, seed=self._seed) 59 | test = DataSet(features_test, labels_test, seed=self._seed) 60 | return train, test 61 | 62 | def shuffle(self): 63 | idx = np.arange(0, self.num_examples) 64 | np.random.seed(self._seed) 65 | np.random.shuffle(idx) 66 | self._features = self.features[idx] 67 | self._labels = self.labels[idx] -------------------------------------------------------------------------------- /data/c3d.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import json 4 | import os 5 | from tqdm import tqdm 6 | import logging 7 | from utils import maybe_download_and_extract 8 | from data import DataSet, DataSets 9 | from utils import maybe_download_and_extract, getlink 10 | 11 | SOURCE_URL = "http://www.mediafire.com/file/33gw4n73pyoa2ea/data.zip/file" 12 | DATA_DEFAULT_PATH = '~/.datasets/Cross-Domain_Depression_Detection_via Harvesting_Social_Media/' 13 | DEFAULT_TEST_SIZE=.25 14 | DEFAULT_SEED=2020 15 | 16 | def _read_data(data_path): 17 | ''' 18 | Read downloaded data 19 | :param data_path: path for the dataset json files 20 | :return: tweets and labels 21 | ''' 22 | # Import data 23 | tweets=[] 24 | labels=[] 25 | labels_path = data_path+'data/' 26 | for cathegory in os.listdir(labels_path): 27 | data_files=labels_path+cathegory 28 | files = os.listdir(data_files) 29 | for file in tqdm(files): 30 | with open(data_files+'/'+file) as json_d: 31 | tmp = json.load(json_d) 32 | tweets.append(tmp['text']) 33 | if cathegory=='negative': 34 | labels.append(0) 35 | if cathegory=='positive': 36 | labels.append(0) 37 | x, y = np.array(tweets), np.array(labels) 38 | return x, y 39 | 40 | 41 | def load_data(data_path=DATA_DEFAULT_PATH, test_size=DEFAULT_TEST_SIZE, seed=DEFAULT_SEED): 42 | """ 43 | Loads dataset. 44 | 45 | Args: 46 | data_path: string 47 | the path of the directory that contains the dataset 48 | test_size: float 49 | Value between 0 and 1 that indicated the proportion to use for the test set. This is calculated from 50 | the train set. 51 | seed: integer 52 | initialization of the random number generator 53 | Returns: 54 | DataSets object 55 | A named tuple of type Datasets containing the train and test sets all of them of type dataset. 
56 | """ 57 | 58 | data_path = os.path.expanduser(data_path) 59 | 60 | # Download files 61 | maybe_download_and_extract(getlink(SOURCE_URL), data_path) 62 | 63 | # read data to memory 64 | x, y = _read_data(data_path) 65 | 66 | data = DataSet(x, y, seed=seed) 67 | train, test = data.split(split_size=test_size) 68 | train, validation= train.split(split_size=test_size) 69 | return DataSets(train,validation,test) 70 | -------------------------------------------------------------------------------- /Scripts/Images_functions/pexels.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import time 3 | from tqdm import trange 4 | import urllib.request 5 | from socket import error as socket_error 6 | import errno 7 | import pathlib 8 | 9 | 10 | DEFAULT_key_word = "Depression" 11 | DEFAULT_PATH_FOLDER = "Pexels/Happy/" 12 | DEFAULT_WEB_DRIVER = webdriver.Firefox() 13 | DEFAULT_TIME_SLEEP_SCROLL = 5 14 | 15 | 16 | class AppURLopener(urllib.request.FancyURLopener): 17 | version = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.69 " \ 18 | "Safari/537.36 " 19 | 20 | 21 | def _choose(messy): 22 | clean = [] 23 | for elt in messy: 24 | link = elt.get_attribute('src') 25 | if not (link.find('https://images.pexels.com/photos/')): 26 | clean.append(link.split("?")[0]) 27 | return clean 28 | def _scroll_down(browser, time_sleep): 29 | """A method for scrolling the page.""" 30 | # Get scroll height. 31 | last_height = browser.execute_script("return document.body.scrollHeight") 32 | while True: 33 | # Scroll down to the bottom. 34 | browser.execute_script("window.scrollTo(0, document.body.scrollHeight);") 35 | # Wait to load the page, this depends on the speed of internet connection 36 | time.sleep(time_sleep) 37 | # Calculate new scroll height and compare with last scroll height. 38 | new_height = browser.execute_script("return document.body.scrollHeight") 39 | if new_height == last_height: 40 | break 41 | last_height = new_height 42 | 43 | def get_pexels_images(key_word=DEFAULT_key_word, browser=DEFAULT_WEB_DRIVER, path_to_folder=DEFAULT_PATH_FOLDER, time_sleep_for_scroll = DEFAULT_TIME_SLEEP_SCROLL): 44 | """ 45 | Crawl Pexel Website and download all images with a key word 46 | :param path_to_folder: Path there images will be saved 47 | :param browser: webdriver 48 | :param key_word: key word to be searched 49 | 50 | """ 51 | pathlib.Path(path_to_folder).mkdir(parents=True, exist_ok=True) 52 | browser.get("https://www.pexels.com/search/" + key_word) 53 | _scroll_down(browser, time_sleep_for_scroll) 54 | browser.implicitly_wait(10) # seconds 55 | columns = browser.find_elements_by_class_name('photos__column') 56 | all_imgs = [] 57 | for column in columns: 58 | all_imgs.append(column.find_elements_by_tag_name("img")) 59 | imgs = [] 60 | for col in all_imgs: 61 | for img in col: 62 | imgs.append(img) 63 | img_links = _choose(imgs) 64 | try: # Because an exception occurred while running the code (three times) 65 | urllib_urlopener = AppURLopener() 66 | for i in trange(len(img_links)): 67 | extension = img_links[i].split('.')[-1] 68 | name = key_word + str(i) 69 | time.sleep(5) 70 | urllib_urlopener.retrieve(img_links[i], path_to_folder + name + "." 
+ extension) 71 | except socket_error as e: 72 | if e.errno != errno.ECONNRESET: 73 | raise 74 | pass 75 | browser.close() 76 | -------------------------------------------------------------------------------- /models/bilstm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from keras.preprocessing.text import Tokenizer 4 | from keras.preprocessing.sequence import pad_sequences 5 | from keras.models import Sequential 6 | from keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional 7 | from keras.utils.np_utils import to_categorical 8 | from keras.optimizers import Adam 9 | from keras.callbacks import EarlyStopping, ModelCheckpoint 10 | import os 11 | 12 | DEFAULT_MAX_FEATURES = 2000 13 | DEFAULT_MAX_LENGTH = 28 14 | DEFAULT_EMBED = 128 15 | DEFAULT_LSTM_UNITS = 400 16 | DEFAULT_BATCH = 300 17 | DEFAULT_EPOCHS = 200 18 | DEFAULT_LR = .001 19 | DEFAULT_PATIENCE=10 20 | 21 | 22 | def _preprocess_data(train,validation, test, max_features=DEFAULT_MAX_FEATURES, max_len=DEFAULT_MAX_LENGTH): 23 | """ 24 | Prepare data sequentially to feed it to the neural network 25 | :param train: 26 | train data 27 | :param test: 28 | test data 29 | :param max_features: 30 | maximum number that sentence may contain 31 | :param max_len: 32 | padding size 33 | :return: 34 | train and test, features and their labels 35 | """ 36 | train_tokenizer = Tokenizer(num_words=max_features, split=' ') 37 | train_tokenizer.fit_on_texts(train.features) 38 | x_train = train_tokenizer.texts_to_sequences(train.features) 39 | x_train = pad_sequences(x_train, maxlen=max_len) 40 | 41 | test_tokenizer = Tokenizer(num_words=max_features, split=' ') 42 | test_tokenizer.fit_on_texts(test.features) 43 | x_test = test_tokenizer.texts_to_sequences(test.features) 44 | x_test = pad_sequences(x_test, maxlen=max_len) 45 | 46 | validation_tokenizer = Tokenizer(num_words=max_features, split=' ') 47 | validation_tokenizer.fit_on_texts(validation.features) 48 | x_validation = validation_tokenizer.texts_to_sequences(validation.features) 49 | x_validation = pad_sequences(x_validation, maxlen=max_len) 50 | 51 | y_train = to_categorical(train.labels) 52 | y_test = to_categorical(test.labels) 53 | y_validation = to_categorical(validation.labels) 54 | 55 | return x_train, y_train, x_test, y_test, x_validation, y_validation 56 | 57 | 58 | def model(train, validation, test, embed_dim=DEFAULT_EMBED, lstm_units=DEFAULT_LSTM_UNITS, batch_size=DEFAULT_BATCH, 59 | lr=DEFAULT_LR,patience=DEFAULT_PATIENCE,epochs=DEFAULT_EPOCHS, max_features=DEFAULT_MAX_FEATURES, max_len=DEFAULT_MAX_LENGTH): 60 | """ 61 | LSTM MODEL FOR BINARY CLASSIFICATION 62 | :param train: 63 | train data 64 | :param validation 65 | validation data 66 | :param test: 67 | test data 68 | :param embed_dim: 69 | embedding dimension 70 | :param lstm_units: 71 | number of units in an lstm cell 72 | :param batch_size: 73 | batch size 74 | :param epochs: 75 | number of epochs 76 | :param max_features: 77 | maximum number that sentence may contain 78 | :param max_len: 79 | padding size 80 | :return: 81 | """ 82 | adam = Adam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False) 83 | 84 | file_path = "weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5" 85 | 86 | check_point = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, 87 | save_weights_only=True, mode='auto', period=1) 88 | early_stop = EarlyStopping(monitor='val_loss', patience=patience, verbose=1, 89 | mode='auto', 
restore_best_weights=True) 90 | 91 | out_dim = len(np.unique(train.labels)) 92 | x_train, y_train, x_test, y_test, x_validation, y_validation = _preprocess_data(train, validation, test, max_features, max_len) 93 | loss = 'binary_crossentropy' 94 | if y_train.shape[1]>2: 95 | loss = 'categorical_crossentropy' 96 | 97 | model = Sequential() 98 | model.add(Embedding(max_features, embed_dim, input_length=x_train.shape[1])) 99 | model.add(Dropout(.2)) 100 | model.add(Bidirectional(LSTM(lstm_units, dropout=.8, recurrent_dropout=.8))= 101 | model.add(Dropout(.8)) 102 | model.add(Dense(out_dim, activation='softmax')) 103 | model.compile(loss=loss, optimizer=adam, metrics=['accuracy']) 104 | model.fit(x_train, y_train, batch_size,epochs, verbose=True, 105 | validation_data=(x_validation,y_validation), 106 | callbacks=[check_point,early_stop]) 107 | loss, train_accuracy = model.evaluate(x_train, y_train, verbose=False) 108 | print("Training Accuracy: {:.4f}".format(train_accuracy)) 109 | loss, test_accuracy = model.evaluate(x_test, y_test, verbose=False) 110 | print("Testing Accuracy: {:.4f}".format(test_accuracy)) 111 | return train_accuracy,test_accuracy 112 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import urllib.request 4 | from bs4 import BeautifulSoup as soup 5 | import zipfile 6 | import tarfile 7 | import logging 8 | 9 | def maybe_download(file_name, data_path, url): 10 | """ 11 | Download file from url if not found. 12 | 13 | This function will check if the data_path directory exists otherwise it will create it. it will check if file_name 14 | exists in data_path directory otherwise it will download it from url. 15 | 16 | Args: 17 | file_name: string 18 | The name of file after download 19 | data_path: string 20 | The folder where where data should be downloaded 21 | url: string 22 | The url of the file to download 23 | 24 | Returns: 25 | string 26 | the name of the downloaded file 27 | 28 | """ 29 | logger = logging.getLogger(__name__ + '.maybe_download') 30 | 31 | file_path = data_path + file_name 32 | logger.debug(('Checking {} into {}'.format(file_name, data_path))) 33 | 34 | # Check data dir exists 35 | if not os.path.exists(data_path): 36 | logger.debug('Folder {} not found, creating it'.format(data_path)) 37 | os.makedirs(data_path) 38 | 39 | # Check data file exists 40 | if os.path.exists(file_path): 41 | logger.debug('File {} found'.format(file_path)) 42 | return file_path 43 | 44 | # Otherwise download it 45 | logger.info('Downloading file {} from {}'.format(file_path, url)) 46 | temp_file_name, _ = urllib.request.urlretrieve(url, file_path) 47 | logger.info('Successfully downloaded file {}, {} bites'.format(temp_file_name, os.stat(temp_file_name).st_size)) 48 | 49 | return file_path 50 | 51 | 52 | def _print_download_progress(count, block_size, total_size): 53 | """ 54 | Function used for printing the download progress. 55 | Used as a call-back function in maybe_download_and_extract(). 56 | """ 57 | 58 | # Percentage completion. 59 | pct_complete = float(count * block_size) / total_size 60 | 61 | # Limit it because rounding errors may cause it to exceed 100%. 62 | pct_complete = min(1.0, pct_complete) 63 | 64 | # Status-message. Note the \r which means the line should overwrite itself. 65 | msg = "\r- Download progress: {0:.1%}".format(pct_complete) 66 | 67 | # Print it. 
68 | sys.stdout.write(msg) 69 | 70 | 71 | sys.stdout.flush() 72 | 73 | 74 | def maybe_download_and_extract(url, download_dir): 75 | """ 76 | Download and extract the data if it doesn't already exist. 77 | Assumes the url is a tar-ball file. 78 | :param url: 79 | Internet URL for the tar-file to download. 80 | Example: "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" 81 | :param download_dir: 82 | Directory where the downloaded file is saved. 83 | Example: "data/CIFAR-10/" 84 | :return: 85 | Nothing. 86 | """ 87 | 88 | # Filename for saving the file downloaded from the internet. 89 | # Use the filename from the URL and add it to the download_dir. 90 | filename = url.split('/')[-1] 91 | filename = filename.split('?')[0] 92 | file_path = os.path.join(download_dir, filename) 93 | 94 | # Check if the file already exists. 95 | # If it exists then we assume it has also been extracted, 96 | # otherwise we need to download and extract it now. 97 | if not os.path.exists(file_path): 98 | # Check if the download directory exists, otherwise create it. 99 | if not os.path.exists(download_dir): 100 | os.makedirs(download_dir) 101 | 102 | # Download the file from the internet. 103 | file_path, _ = urllib.request.urlretrieve(url=url, 104 | filename=file_path, 105 | reporthook=_print_download_progress) 106 | 107 | print() 108 | print("Download finished. Extracting files.") 109 | 110 | if file_path.endswith(".zip"): 111 | # Unpack the zip-file. 112 | zipfile.ZipFile(file=file_path, mode="r").extractall(download_dir) 113 | elif file_path.endswith((".tar.gz", ".tgz")): 114 | # Unpack the tar-ball. 115 | tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) 116 | 117 | print("Done.") 118 | else: 119 | print("Data has apparently already been downloaded and unpacked.") 120 | 121 | def getlink(link): 122 | driver = urllib.request.urlopen(link) 123 | content = driver.read() 124 | driver.close() 125 | page = soup(content, "html.parser") 126 | download = page.find("div",{"class":"download_link"}) 127 | return download.find("a",{"class":"input"})['href'] -------------------------------------------------------------------------------- /Notebooks/Explore_Twint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Twint Package" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import twint\n", 17 | "import nest_asyncio\n", 18 | "import pandas as pd\n", 19 | "import sys, os\n", 20 | "import time\n", 21 | "from tqdm import trange" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# Solve compatibility issues with notebooks and RunTime errors.\n", 31 | "nest_asyncio.apply()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# I was diagnosed with deprssion\n", 41 | "# I am fighting depression \n", 42 | "# I suffer from depression\n", 43 | "DEFAULT_KEYWORD=\"I was diagnosed with depression\"\n", 44 | "DEFAULT_LIMIT=400\n", 45 | "def get_keywords_tweets(keyword=DEFAULT_KEYWORD, limit=DEFAULT_LIMIT):\n", 46 | " c = twint.Config()\n", 47 | " c.Search = keyword\n", 48 | " c.Limit = limit\n", 49 | " c.Store_csv = True\n", 50 | " c.Output = (keyword+\".csv\")\n", 51 | " sys.stdout = open(os.devnull, 'w')\n", 52 | " 
print(twint.run.Search(c))" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 5, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "get_keywords_tweets()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "key_words = [\"I am diagnosed with depression\",'I am fighting depression','I suffer from depression']\n", 71 | "data = pd.DataFrame() \n", 72 | "for key in key_words:\n", 73 | " data =data.append( pd.read_csv(key+\".csv\")) " 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/plain": [ 84 | "6000" 85 | ] 86 | }, 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "len(data)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 5, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "data = data.drop_duplicates(subset=['user_id'], keep='first') #To drop duplicate usernames " 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 6, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/plain": [ 113 | "2877" 114 | ] 115 | }, 116 | "execution_count": 6, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "len(data)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 7, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "data": { 132 | "text/plain": [ 133 | "('i have been officially diagnosed with clinical depression and anxiety after many years of suffering in silence, not understanding what’s wrong with me. i have reflected my insecurities on my friendships and relationships for the longest time and i am sorry to whoever i hurt.',\n", 134 | " 'slaysiah')" 135 | ] 136 | }, 137 | "execution_count": 7, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "data['tweet'].iloc[0],data['username'].iloc[0]" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 8, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "user_name=data['username'].iloc[0]\n", 153 | "def get_profile_infos(user_name=user_name):\n", 154 | " c = twint.Config()\n", 155 | " c.Username = user_name\n", 156 | " c.Store_csv = True\n", 157 | " c.Output = (\"Profileinfos.csv\")\n", 158 | " sys.stdout = open(os.devnull, 'w')\n", 159 | " twint.run.Lookup(c)\n" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 17, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "get_profile_infos()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "profiles=pd.read_csv(\"Profileinfos.csv\")" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "for i in trange(len(data)):\n", 187 | " get_profile_infos(data['username'].iloc[i])" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 10, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "user_name=data['username'].iloc[0]\n", 197 | "def get_timeline_by_usernames(user_name=user_name):\n", 198 | " c = twint.Config()\n", 199 | " c.Username =user_name\n", 200 | " c.Retweets = True\n", 201 | " c.Limit=100\n", 
202 | " c.Store_csv = True\n", 203 | " c.Output = (\"Timelines/\"+user_name+\".csv\")\n", 204 | " sys.stdout = open(os.devnull, 'w')\n", 205 | " twint.run.Search(c)\n", 206 | " " 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "for i in trange(len(data)):\n", 216 | " get_timeline_by_usernames(data['username'].iloc[i])" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [] 225 | } 226 | ], 227 | "metadata": { 228 | "kernelspec": { 229 | "display_name": "Python 3", 230 | "language": "python", 231 | "name": "python3" 232 | }, 233 | "language_info": { 234 | "codemirror_mode": { 235 | "name": "ipython", 236 | "version": 3 237 | }, 238 | "file_extension": ".py", 239 | "mimetype": "text/x-python", 240 | "name": "python", 241 | "nbconvert_exporter": "python", 242 | "pygments_lexer": "ipython3", 243 | "version": "3.6.10" 244 | } 245 | }, 246 | "nbformat": 4, 247 | "nbformat_minor": 4 248 | } 249 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
# Depression Detection

![Logo-brain](Screenshots/brain.png)

This project aims to detect early indicators of depression by analyzing data from a range of social media platforms, including images and texts.

---
## Table of Contents

1. Data collection
   1. Visual Data
   2. Textual Data
2. Models
   1. Models for images
   2. Models for texts
3. Software and technologies
4. Hardware

---

## Data collection
Data were collected from Pexels, Unsplash and Twitter.
Pexels and Unsplash are two freely usable image platforms, and the tweets used are publicly available.

### Visual Data:
The overall process of scraping images from Unsplash and Pexels is presented as follows:
![Image crawling process](Screenshots/crawl_images.PNG)

Images were crawled from Pexels using Selenium and from Unsplash using the Unsplash API. This is a sample of the dataset:

![Image sample](Screenshots/ImageSample.png)

Images can be loaded as shown in the Project Cheat Sheet; the crawling scripts live in `Scripts/Images_functions` and the loader in `data/Images_load.py`. A minimal loading sketch follows.
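This sketch assumes the repository root is the working directory (so `data` and `utils` are importable) and that the image archives are still downloadable:

```python
from data.Images_load import load_data

images = load_data()    # DataSets(train, validation, test) of DataSet objects
print(images.train.num_examples, images.validation.num_examples, images.test.num_examples)
print(images.train.features[0].shape, images.train.labels[0])  # one resized image and its one-hot label
```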
### Textual Data:

The hashtags used were trending hashtags built from keywords inspired by the DSM-5 (Diagnostic and Statistical Manual of Mental Disorders). Textual data were collected with twint from Twitter users who share their posts publicly.
Overall, 5,460 tweets were collected. The process was as follows (a minimal crawling sketch is shown after the figure):

![Text crawling process](Screenshots/crawl_texts.PNG)
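A minimal sketch of this crawling pipeline, assuming it is run from `Scripts/Twitter_Crawler` so the keyword CSVs produced by twint land next to `Twint_pkge.py`:

```python
from Twint_pkge import get_keywords_tweets, clean_data, get_profile_infos, get_timeline_by_username

key_words = ["I am diagnosed with depression", "I am fighting depression", "I suffer from depression"]
for key in key_words:
    get_keywords_tweets(key)             # one CSV of tweets per keyword

users = clean_data(key_words)            # merge the CSVs and drop duplicate users
for username in users['username']:
    get_profile_infos(username)          # bio/location/follower counts -> Profileinfos.csv
    get_timeline_by_username(username)   # recent posts -> Timelines/<username>.csv
```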
You can check the result of the texts loader in the Project Cheat Sheet; the loading code lives in the `data` package, and a minimal loading sketch follows the sample below.

This is a sample of the dataset:

![Texts example](Screenshots/texts_exmpl.PNG)
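A minimal loading sketch for the text data, assuming the loader in `data/c3d.py` and the repository root as working directory (a related loader for the crawled tweets lives in `data/TwitterP.py`):

```python
from data.c3d import load_data

texts = load_data()    # DataSets(train, validation, test) of tweets and their labels
print(texts.train.num_examples, 'training tweets')
print(texts.train.features[0], texts.train.labels[0])
```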
## Models
#### Models for Images:
Five different types of models were trained.

#### Models for Texts:
Two different types of models were trained. A minimal sketch of driving the BiLSTM text model in `models/bilstm.py` is shown below.
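This training sketch assumes the text DataSets from `data/c3d.py`, the default hyper-parameters in `models/bilstm.py`, and a Keras installation matching the repository's code:

```python
from data.c3d import load_data
from models.bilstm import model

texts = load_data()
train_acc, test_acc = model(texts.train, texts.validation, texts.test)
print(f'train accuracy {train_acc:.3f}, test accuracy {test_acc:.3f}')
```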
For the best models I actually chose, you can find three notebooks at the repository root:
* `ResNet.ipynb`
* `ResNet-Transfer.ipynb`
* `GLove+Bilstm.ipynb`

You can find the saved weights for the best image model and the best text model here.

## Software and technologies:

## Hardware
In the process of implementing our solution we used two main machines: a local machine for refactoring code, testing models and research, and a virtual machine (VM) on Google Cloud Platform (GCP) to run models and code that are heavy in terms of computation and time. Following are the specifications of these machines: