├── Requirements.txt ├── Report.pdf ├── Screenshots ├── brain.png ├── ImageSample.png ├── crawl_images.PNG ├── crawl_texts.PNG └── texts_exmpl.PNG ├── Scripts ├── Images_functions │ ├── run_npz.py │ ├── run_pexels.py │ ├── run_save_images_crawler.py │ ├── run_API_unsplash.py │ ├── npz.py │ ├── Save_images.py │ ├── UnsplashAPI.py │ └── pexels.py └── Twitter_Crawler │ ├── run_TWINT.py │ └── Twint_pkge.py ├── data ├── TwitterP.py ├── Images_load.py ├── __init__.py └── c3d.py ├── Notebooks ├── Remove_Duplicates.ipynb ├── Explore_Twint.ipynb ├── Bibliography.ipynb └── TwitterscraperDemo.ipynb ├── models └── bilstm.py ├── utils.py ├── README.md ├── LICENSE ├── ResNet-Transfer.ipynb ├── GLove+Bilstm.ipynb ├── ResNet.ipynb └── Preprocessing_Texts.ipynb /Requirements.txt: -------------------------------------------------------------------------------- 1 | pandas 2 | scikit-learn 3 | tqdm 4 | beautifulsoup4 -------------------------------------------------------------------------------- /Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BouzidiImen/Social_media_Prediction_depression/HEAD/Report.pdf -------------------------------------------------------------------------------- /Screenshots/brain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BouzidiImen/Social_media_Prediction_depression/HEAD/Screenshots/brain.png -------------------------------------------------------------------------------- /Screenshots/ImageSample.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BouzidiImen/Social_media_Prediction_depression/HEAD/Screenshots/ImageSample.png -------------------------------------------------------------------------------- /Screenshots/crawl_images.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BouzidiImen/Social_media_Prediction_depression/HEAD/Screenshots/crawl_images.PNG -------------------------------------------------------------------------------- /Screenshots/crawl_texts.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BouzidiImen/Social_media_Prediction_depression/HEAD/Screenshots/crawl_texts.PNG -------------------------------------------------------------------------------- /Screenshots/texts_exmpl.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BouzidiImen/Social_media_Prediction_depression/HEAD/Screenshots/texts_exmpl.PNG -------------------------------------------------------------------------------- /Scripts/Images_functions/run_npz.py: -------------------------------------------------------------------------------- 1 | from npz import create_npz 2 | import numpy as np 3 | create_npz(path='Data/Unsplash_Pexels_Data/Depression/') 4 | create_npz(path='Data/Unsplash_Pexels_Data/Happiness/',label=np.array([0,1])) 5 | -------------------------------------------------------------------------------- /Scripts/Images_functions/run_pexels.py: -------------------------------------------------------------------------------- 1 | from pexels import get_pexels_images 2 | 3 | DepressionKeywords = ['Suicide', 'Sad', 'Depression', 'Stress', 'Anxiety', 'Grief', 'Despair', 'Crying'] 4 | HappyKeywords = ['Happy', 'excited'] 5 | for key_word in HappyKeywords: 6 | 
get_pexels_images(key_word) 7 | #for key_word in DepressionKeywords: 8 | #get_pexels_images(key_word) 9 | -------------------------------------------------------------------------------- /Scripts/Twitter_Crawler/run_TWINT.py: -------------------------------------------------------------------------------- 1 | from Twint_pkge import clean_data, get_profile_infos, get_timeline_by_username 2 | from tqdm import trange 3 | key_words = ["#lovemylife", "#lifeisgood", "#happyme"] 4 | data = clean_data(key_words) 5 | for i in trange(len(data)): 6 | get_profile_infos(data['username'].iloc[i]) 7 | print('Okay profile'+str(i)) 8 | get_timeline_by_username(data['username'].iloc[i]) 9 | -------------------------------------------------------------------------------- /Scripts/Images_functions/run_save_images_crawler.py: -------------------------------------------------------------------------------- 1 | from Save_images import save_images 2 | file_names_Deprssion_unsplash = ['Depression_unsplash'] 3 | for file_name in file_names_Deprssion_unsplash: 4 | save_images(file_name=file_names_Deprssion_unsplash) 5 | file_names_Deprssion_Pexels = ['Depression_Pexels'] 6 | for file_name in file_names_Deprssion_unsplash: 7 | save_images(website_name='Pexels_',file_name=file_name, folder_path='Pexels/Depressions/') 8 | -------------------------------------------------------------------------------- /Scripts/Images_functions/run_API_unsplash.py: -------------------------------------------------------------------------------- 1 | from UnsplashAPI import get_unsplash_images 2 | import json 3 | 4 | with open('Token.json') as t: 5 | api_key = json.load(t) 6 | token = api_key[ 'unsplash' ] 7 | DepressionKeywords = [ 'Suicide', 'Sad', 'Depression', 'Stress', 'Anxiety', 'Grief', 'Despair', 'Crying' ] 8 | for key_word in DepressionKeywords: 9 | get_unsplash_images(token, key_word) 10 | DepressionKeywords = [ 'Suicide', 'Sad', 'Depression', 'Stress', 'Anxiety', 'Grief', 'Despair', 'Crying' ] 11 | for key_word in DepressionKeywords: 12 | get_unsplash_images(token, key_word) 13 | -------------------------------------------------------------------------------- /Scripts/Images_functions/npz.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | import os 4 | from tqdm import tqdm 5 | 6 | DEFAULT_DATA_NAME = 'data.npz' 7 | DEFAULT_SCALE = .1 8 | DEFAULT_LABEL=np.array([1,0]) #for happiness changed to [0,1] 9 | DEFALUT_KEY='dep_' 10 | 11 | def _resize_to_np(filepath, label, scale): 12 | src = cv2.imread(filepath, cv2.IMREAD_UNCHANGED) 13 | width = int(src.shape[1] * scale) 14 | height = int(src.shape[0] * scale) 15 | output = cv2.resize(src, (width, height)) 16 | img = np.array([output, label]) 17 | return img.reshape((2, 1)) 18 | 19 | 20 | def create_npz(path,key=DEFALUT_KEY, data_name=DEFAULT_DATA_NAME, scale=DEFAULT_SCALE,label=DEFAULT_LABEL): 21 | filepath = path + data_name 22 | if os.path.exists(filepath): 23 | print('Deleting existing data...') 24 | os.remove(filepath) 25 | pictures = os.listdir(path) 26 | all_imgs = [] 27 | names=[] 28 | i = 0 29 | for pic in tqdm(pictures): 30 | all_imgs.append(_resize_to_np(path + pic, label, scale)) 31 | names.append(key+str(i)) 32 | i += 1 33 | print('all images were resized ') 34 | np.savez(filepath, **{name:value for name,value in zip(names,all_imgs)}) 35 | print(f"Data saved into '{filepath}'") 36 | -------------------------------------------------------------------------------- 
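A minimal sketch (not part of the scripts above) of reading back an archive written by `create_npz`, assuming the default `data.npz` name, the `dep_` key prefix and the paths used in `run_npz.py`:

```python
import numpy as np

# Hedged example: path and key prefix follow the defaults in npz.py / run_npz.py.
archive = np.load('Data/Unsplash_Pexels_Data/Depression/data.npz', allow_pickle=True)
for key in archive.files:                 # keys look like 'dep_0', 'dep_1', ...
    entry = archive[key]                  # object array of shape (2, 1): [image, label]
    image = entry[0][0]                   # resized image as read by OpenCV
    label = entry[1][0]                   # one-hot label, [1, 0] for the depression folder
    print(key, image.shape, label)
```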
/Scripts/Images_functions/Save_images.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from requests import get 3 | from tqdm import trange 4 | import time 5 | from socket import error as socket_error 6 | import errno 7 | import pathlib 8 | from random import randint 9 | from urllib.request import urlcleanup 10 | 11 | DEFAULT_FILE_NAME = 'Suicide' 12 | DEFAULT_FOLDER_PATH = 'Unsplash/Depression/' 13 | DEFAULT_NAME = 'Unsplash_' 14 | 15 | headers = { 16 | 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5) AppleWebKit/537.36 (KHTML, like Gecko) Cafari/537.36'} 17 | 18 | 19 | def _save_image(website_name,Name, Extension, Link, path_to_folder): 20 | try: 21 | name = website_name + Name 22 | pic = get(Link, headers=headers) 23 | with open(path_to_folder + name + "." + Extension, 'wb') as photo: 24 | photo.write(pic.content) 25 | except socket_error as e: 26 | if e.errno != errno.ECONNRESET: 27 | raise 28 | urlcleanup() 29 | 30 | 31 | def save_images(website_name=DEFAULT_NAME,file_name=DEFAULT_FILE_NAME, folder_path=DEFAULT_FOLDER_PATH): 32 | missed = [] 33 | data = pd.read_csv(file_name + ".csv") 34 | pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True) 35 | for i in trange(len(data)): 36 | wait = randint(5, 15) 37 | time.sleep(wait) 38 | print(f'\nWaiting {wait}s...') 39 | try: 40 | _save_image(website_name,data['Name'].iloc[i], data['Extension'].iloc[i], data['Links'].iloc[i], folder_path) 41 | except: 42 | missed.append(data['Links'].iloc[i]) 43 | missed_data = pd.DataFrame({'Missed_Links': missed}, index=None) 44 | missed_data.to_csv('MissedData.csv') 45 | -------------------------------------------------------------------------------- /data/TwitterP.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import json 4 | import os 5 | from tqdm import tqdm 6 | from utils import maybe_download_and_extract, getlink 7 | 8 | SOURCE_URL = "http://www.mediafire.com/file/xp2jp8ezm3ynpc1/Twitter_Data%25282%2529.zip/file" 9 | DATA_DEFAULT_PATH = '~/.datasets/Crawled-Twitter-Data/' 10 | DEFAULT_TEST_SIZE = .25 11 | DEFAULT_SEED = 2020 12 | 13 | 14 | def load_data(data_path=DATA_DEFAULT_PATH, link=SOURCE_URL, test_size=DEFAULT_TEST_SIZE): 15 | """ 16 | load data from website and return train, test and validation data 17 | :param data_path where data will be saved 18 | :param link to data 19 | :param test_size 20 | 21 | """ 22 | data_path = os.path.expanduser(data_path) 23 | # Download files 24 | maybe_download_and_extract(getlink(link), data_path) 25 | # read data 26 | Twitter_data = pd.read_csv(data_path + "Twitter_data/Profiles.csv") 27 | L = len(Twitter_data) 28 | train, val, test = np.split(Twitter_data, [ int(L * (1 - test_size) * (1 - test_size)), int(L * (1 - test_size)) ]) 29 | return train, val, test 30 | 31 | 32 | def get_username_profile(username, data_path=DATA_DEFAULT_PATH): 33 | """ 34 | load user's profile for a given username 35 | :param data_path where data is saved 36 | :param username 37 | 38 | """ 39 | timelines = os.listdir(data_path + 'Twitter_data/Timelines') 40 | clean_usernames = [s.strip('.csv') for s in timelines] 41 | for i in range(len(clean_usernames)): 42 | if clean_usernames[i] == username: 43 | return (pd.read_csv(data_path + 'Twitter_data/Timelines/' + username + '.csv')) 44 | return (print("user's timeline not found")) 45 | -------------------------------------------------------------------------------- 
/Scripts/Images_functions/UnsplashAPI.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | import requests as requests 3 | from urllib.request import urlretrieve 4 | from tqdm import trange 5 | import time 6 | from socket import error as socket_error 7 | import errno 8 | 9 | DEFAULT_KEY_WORD = 'depression' 10 | DEFAULT_PATH = "Unsplash/Depression/" 11 | 12 | def get_unsplash_images(token, key_word=DEFAULT_KEY_WORD, path_to_folder=DEFAULT_PATH): 13 | """ 14 | Using Unsplash API to save url of photos in a csv 15 | :param path_to_folder: Path there images will be saved 16 | :param browser: webdriver 17 | :param key_word: key word to be searched 18 | :param token 19 | 20 | """ 21 | r = requests.get(f'https://api.unsplash.com/search/photos?query={key_word}&client_id={token}') 22 | infos = r.json() 23 | total_pages = infos['total_pages'] 24 | total_images = infos['total'] 25 | links = [] 26 | num_per_page = 200 27 | for pg in range(1, total_pages + 1): 28 | new_r = requests.get(f'https://api.unsplash.com/search/photos?query={key_word}' + 29 | f'&page={pg}&per_page={num_per_page}&client_id={token}') 30 | data = new_r.json() 31 | for img_data in data['results']: 32 | img_url = img_data['urls']['raw'] 33 | links.append(img_url) 34 | if len(links) == total_images: 35 | print('Links for all images') 36 | else: 37 | print('Missing links') 38 | Clean_links = [] 39 | for l in links: 40 | Clean_links.append(l.split('?')[0]) 41 | pathlib.Path(path_to_folder).mkdir(parents=True, exist_ok=True) 42 | try: 43 | for i in trange(len(Clean_links)): 44 | name = 'Unsplash_' + key_word + '_' + str(i) 45 | time.sleep(5) 46 | urlretrieve(links[i], path_to_folder+ name + "." + 'jpeg') 47 | except socket_error as e: 48 | if e.errno != errno.ECONNRESET: 49 | raise 50 | pass 51 | -------------------------------------------------------------------------------- /Scripts/Twitter_Crawler/Twint_pkge.py: -------------------------------------------------------------------------------- 1 | import twint 2 | import pandas as pd 3 | from tqdm import trange 4 | import sys, os 5 | 6 | DEFAULT_KEYWORD = "I suffer from depression" 7 | DEFAULT_KEYWORDS = [ "I am diagnosed with depression", 'I am fighting depression', 'I suffer from depression' ] 8 | DEFAULT_LIMIT = 3000 9 | DEFAULT_USERNAME = 'Imen' 10 | DEFAULT_LIMIT_PROFILE_STATUS = 100 11 | 12 | 13 | def clean_data(key_words=DEFAULT_KEYWORDS): 14 | data = pd.DataFrame() 15 | for key in key_words: 16 | data = data.append(pd.read_csv(key + ".csv")) 17 | data = data.drop_duplicates(subset=['user_id'], keep='first') 18 | return data 19 | 20 | 21 | def get_keywords_tweets(keyword=DEFAULT_KEYWORD, limit=DEFAULT_LIMIT): 22 | """ 23 | This function returns a csv data set with tweets containing the keyword 24 | :param keyword is the key word to search for 25 | :param limit number of tweets to retrieve 26 | """ 27 | c = twint.Config() 28 | c.Search = keyword 29 | c.Limit = limit 30 | c.Store_csv = True 31 | c.Output = keyword + ".csv" 32 | sys.stdout = open(os.devnull, 'w') 33 | twint.run.Search(c) 34 | 35 | 36 | def get_profile_infos(user_name=DEFAULT_USERNAME): 37 | """ 38 | This function returns a csv file with users pesonal information (bio/location/followers/following) 39 | :param user_name Username of a twitter user 40 | """ 41 | c = twint.Config() 42 | c.Username = user_name 43 | c.Store_csv = True 44 | c.Output = ("Profileinfos.csv") 45 | sys.stdout = open(os.devnull, 'w') 46 | twint.run.Lookup(c) 47 | 48 | 49 | def 
get_timeline_by_username(user_name=DEFAULT_USERNAME, limit=DEFAULT_LIMIT_PROFILE_STATUS): 50 | """ 51 | This function returns csv files each contain 100 recent post of a user 52 | :param user_name user_name Username of a twitter user 53 | """ 54 | c = twint.Config() 55 | c.Username = user_name 56 | c.Profile = True 57 | c.Retweets = True 58 | c.Limit = limit 59 | c.Store_csv = True 60 | c.Output = ('Timelines/' + user_name + ".csv") 61 | sys.stdout = open(os.devnull, 'w') 62 | twint.run.Search(c) 63 | -------------------------------------------------------------------------------- /data/Images_load.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | from tqdm import trange 4 | from utils import maybe_download_and_extract, getlink 5 | from data import DataSet, DataSets 6 | 7 | SOURCE_NEGATIVE_URL = "http://www.mediafire.com/file/17l0aqdbqhbvlfu/negative.npz/file" 8 | SOURCE_POSITIVE_URL = "http://www.mediafire.com/file/v1s3tqqzriuuq61/positive.npz/file" 9 | DATA_DEFAULT_PATH = '~/.datasets/Images_From_Unsplash_And_Pexels/' 10 | DEFAULT_TEST_SIZE = .25 11 | DEFAULT_SEED=2020 12 | 13 | 14 | def _extract(Pdata, Ndata): 15 | ''' 16 | Extract features and labels of images 17 | :param Pdata: data for depressed users' images 18 | :param Ndata: data for non depressed users' images 19 | :return: features and labels 20 | ''' 21 | # Extract features and labels 22 | features = list(Pdata[ Pdata.files[ 0 ] ][ 0 ]) 23 | labels = list(Pdata[ Pdata.files[ 0 ] ][ 1 ]) 24 | for i in trange(1,len(Pdata)): 25 | features.append(Pdata[ Pdata.files[ i ] ][ 0 ][ 0 ]) 26 | labels.append(Pdata[ Pdata.files[ i ] ][ 1 ][ 0 ]) 27 | for i in trange(len(Ndata)): 28 | features.append(Ndata[ Ndata.files[ i ] ][ 0 ][ 0 ]) 29 | labels.append(Ndata[ Ndata.files[ i ] ][ 1 ][ 0 ]) 30 | return features, labels 31 | 32 | 33 | def load_data(data_path=DATA_DEFAULT_PATH, seed=DEFAULT_SEED, test_size=DEFAULT_TEST_SIZE): 34 | """ 35 | Loads dataset. 
36 | :param data_path: string 37 | the path of the directory that contains the dataset 38 | """ 39 | 40 | data_path = os.path.expanduser(data_path) 41 | 42 | # Download files 43 | maybe_download_and_extract(getlink(SOURCE_NEGATIVE_URL), data_path) 44 | maybe_download_and_extract(getlink(SOURCE_POSITIVE_URL), data_path) 45 | # read data 46 | P = np.load(data_path + 'positive.npz',allow_pickle=True) # load data for depressed users 47 | N = np.load(data_path + 'negative.npz',allow_pickle=True) # load data for not depressed users 48 | features, labels = _extract(P, N) 49 | 50 | data = DataSet(features, labels, seed=seed) 51 | train, test = data.split(split_size=test_size) 52 | train, validation = train.split(split_size=test_size) 53 | return DataSets(train, validation, test) 54 | -------------------------------------------------------------------------------- /Notebooks/Remove_Duplicates.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "### To remove duplicate images using fdupes for linux \n", 8 | "* fdupes is a Linux utility for identifying or deleting duplicate files by comparing md5sum then running a byte-to-byte comparaison" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Install fdupes for linux " 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "sudo apt-get update && apt-get install fdupes " 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | " ### Search duplicate photos in a folder " 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "fdupes Path_To_folder" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "### Number of duplicates in a folder " 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "fdupes -m Path_To_folder" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "### Delete files and preserve the first one" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "fdupes -dN Path_To_folder" 73 | ] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.6.10" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 4 97 | } 98 | -------------------------------------------------------------------------------- /data/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import collections 3 | import numpy as np 4 | from sklearn.model_selection import train_test_split 5 | 6 | DataSets = collections.namedtuple('DataSets', ['train', 'validation', 'test']) 7 | class DataSet(object): 8 | def __init__(self, features, labels, seed=None, name=None): 9 | if name is None: 10 | name = 
self.__class__.__name__ 11 | self._logger = logging.getLogger(self.__class__.__name__) 12 | self._features = features 13 | self._labels = labels 14 | self._seed = seed 15 | self._name = name 16 | 17 | self._epoch_completed = 0 18 | self._index_in_epoch = 0 19 | 20 | 21 | @property 22 | def features(self): 23 | return self._features 24 | 25 | @property 26 | def labels(self): 27 | return self._labels 28 | 29 | @property 30 | def num_examples(self): 31 | return len(self._features) 32 | 33 | @property 34 | def input_dim(self): 35 | if len(self._features.shape) == 2: 36 | return self._features.shape[1] 37 | else: 38 | return self._features.shape[1:] 39 | 40 | @property 41 | def output_dim(self): 42 | return self.n_classes 43 | 44 | @property 45 | def n_classes(self): 46 | if len(self.labels.shape) == 1: 47 | _n_classes = len(np.unique(self.labels)) 48 | else: 49 | _n_classes = self.labels.shape[1] 50 | return _n_classes 51 | 52 | def split(self, split_size): 53 | assert split_size > 0 and not split_size > 1 54 | features_train, features_test, labels_train, labels_test = train_test_split(self._features, self._labels, 55 | test_size=split_size, 56 | random_state=self._seed) 57 | 58 | train = DataSet(features_train, labels_train, seed=self._seed) 59 | test = DataSet(features_test, labels_test, seed=self._seed) 60 | return train, test 61 | 62 | def shuffle(self): 63 | idx = np.arange(0, self.num_examples) 64 | np.random.seed(self._seed) 65 | np.random.shuffle(idx) 66 | self._features = self.features[idx] 67 | self._labels = self.labels[idx] -------------------------------------------------------------------------------- /data/c3d.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import json 4 | import os 5 | from tqdm import tqdm 6 | import logging 7 | from utils import maybe_download_and_extract 8 | from data import DataSet, DataSets 9 | from utils import maybe_download_and_extract, getlink 10 | 11 | SOURCE_URL = "http://www.mediafire.com/file/33gw4n73pyoa2ea/data.zip/file" 12 | DATA_DEFAULT_PATH = '~/.datasets/Cross-Domain_Depression_Detection_via Harvesting_Social_Media/' 13 | DEFAULT_TEST_SIZE=.25 14 | DEFAULT_SEED=2020 15 | 16 | def _read_data(data_path): 17 | ''' 18 | Read downloaded data 19 | :param data_path: path for the dataset json files 20 | :return: tweets and labels 21 | ''' 22 | # Import data 23 | tweets=[] 24 | labels=[] 25 | labels_path = data_path+'data/' 26 | for cathegory in os.listdir(labels_path): 27 | data_files=labels_path+cathegory 28 | files = os.listdir(data_files) 29 | for file in tqdm(files): 30 | with open(data_files+'/'+file) as json_d: 31 | tmp = json.load(json_d) 32 | tweets.append(tmp['text']) 33 | if cathegory=='negative': 34 | labels.append(0) 35 | if cathegory=='positive': 36 | labels.append(0) 37 | x, y = np.array(tweets), np.array(labels) 38 | return x, y 39 | 40 | 41 | def load_data(data_path=DATA_DEFAULT_PATH, test_size=DEFAULT_TEST_SIZE, seed=DEFAULT_SEED): 42 | """ 43 | Loads dataset. 44 | 45 | Args: 46 | data_path: string 47 | the path of the directory that contains the dataset 48 | test_size: float 49 | Value between 0 and 1 that indicated the proportion to use for the test set. This is calculated from 50 | the train set. 51 | seed: integer 52 | initialization of the random number generator 53 | Returns: 54 | DataSets object 55 | A named tuple of type Datasets containing the train and test sets all of them of type dataset. 
56 | """ 57 | 58 | data_path = os.path.expanduser(data_path) 59 | 60 | # Download files 61 | maybe_download_and_extract(getlink(SOURCE_URL), data_path) 62 | 63 | # read data to memory 64 | x, y = _read_data(data_path) 65 | 66 | data = DataSet(x, y, seed=seed) 67 | train, test = data.split(split_size=test_size) 68 | train, validation= train.split(split_size=test_size) 69 | return DataSets(train,validation,test) 70 | -------------------------------------------------------------------------------- /Scripts/Images_functions/pexels.py: -------------------------------------------------------------------------------- 1 | from selenium import webdriver 2 | import time 3 | from tqdm import trange 4 | import urllib.request 5 | from socket import error as socket_error 6 | import errno 7 | import pathlib 8 | 9 | 10 | DEFAULT_key_word = "Depression" 11 | DEFAULT_PATH_FOLDER = "Pexels/Happy/" 12 | DEFAULT_WEB_DRIVER = webdriver.Firefox() 13 | DEFAULT_TIME_SLEEP_SCROLL = 5 14 | 15 | 16 | class AppURLopener(urllib.request.FancyURLopener): 17 | version = "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.69 " \ 18 | "Safari/537.36 " 19 | 20 | 21 | def _choose(messy): 22 | clean = [] 23 | for elt in messy: 24 | link = elt.get_attribute('src') 25 | if not (link.find('https://images.pexels.com/photos/')): 26 | clean.append(link.split("?")[0]) 27 | return clean 28 | def _scroll_down(browser, time_sleep): 29 | """A method for scrolling the page.""" 30 | # Get scroll height. 31 | last_height = browser.execute_script("return document.body.scrollHeight") 32 | while True: 33 | # Scroll down to the bottom. 34 | browser.execute_script("window.scrollTo(0, document.body.scrollHeight);") 35 | # Wait to load the page, this depends on the speed of internet connection 36 | time.sleep(time_sleep) 37 | # Calculate new scroll height and compare with last scroll height. 38 | new_height = browser.execute_script("return document.body.scrollHeight") 39 | if new_height == last_height: 40 | break 41 | last_height = new_height 42 | 43 | def get_pexels_images(key_word=DEFAULT_key_word, browser=DEFAULT_WEB_DRIVER, path_to_folder=DEFAULT_PATH_FOLDER, time_sleep_for_scroll = DEFAULT_TIME_SLEEP_SCROLL): 44 | """ 45 | Crawl Pexel Website and download all images with a key word 46 | :param path_to_folder: Path there images will be saved 47 | :param browser: webdriver 48 | :param key_word: key word to be searched 49 | 50 | """ 51 | pathlib.Path(path_to_folder).mkdir(parents=True, exist_ok=True) 52 | browser.get("https://www.pexels.com/search/" + key_word) 53 | _scroll_down(browser, time_sleep_for_scroll) 54 | browser.implicitly_wait(10) # seconds 55 | columns = browser.find_elements_by_class_name('photos__column') 56 | all_imgs = [] 57 | for column in columns: 58 | all_imgs.append(column.find_elements_by_tag_name("img")) 59 | imgs = [] 60 | for col in all_imgs: 61 | for img in col: 62 | imgs.append(img) 63 | img_links = _choose(imgs) 64 | try: # Because an exception occurred while running the code (three times) 65 | urllib_urlopener = AppURLopener() 66 | for i in trange(len(img_links)): 67 | extension = img_links[i].split('.')[-1] 68 | name = key_word + str(i) 69 | time.sleep(5) 70 | urllib_urlopener.retrieve(img_links[i], path_to_folder + name + "." 
+ extension) 71 | except socket_error as e: 72 | if e.errno != errno.ECONNRESET: 73 | raise 74 | pass 75 | browser.close() 76 | -------------------------------------------------------------------------------- /models/bilstm.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from keras.preprocessing.text import Tokenizer 4 | from keras.preprocessing.sequence import pad_sequences 5 | from keras.models import Sequential 6 | from keras.layers import Dense, Embedding, LSTM, Dropout, Bidirectional 7 | from keras.utils.np_utils import to_categorical 8 | from keras.optimizers import Adam 9 | from keras.callbacks import EarlyStopping, ModelCheckpoint 10 | import os 11 | 12 | DEFAULT_MAX_FEATURES = 2000 13 | DEFAULT_MAX_LENGTH = 28 14 | DEFAULT_EMBED = 128 15 | DEFAULT_LSTM_UNITS = 400 16 | DEFAULT_BATCH = 300 17 | DEFAULT_EPOCHS = 200 18 | DEFAULT_LR = .001 19 | DEFAULT_PATIENCE=10 20 | 21 | 22 | def _preprocess_data(train,validation, test, max_features=DEFAULT_MAX_FEATURES, max_len=DEFAULT_MAX_LENGTH): 23 | """ 24 | Prepare data sequentially to feed it to the neural network 25 | :param train: 26 | train data 27 | :param test: 28 | test data 29 | :param max_features: 30 | maximum number that sentence may contain 31 | :param max_len: 32 | padding size 33 | :return: 34 | train and test, features and their labels 35 | """ 36 | train_tokenizer = Tokenizer(num_words=max_features, split=' ') 37 | train_tokenizer.fit_on_texts(train.features) 38 | x_train = train_tokenizer.texts_to_sequences(train.features) 39 | x_train = pad_sequences(x_train, maxlen=max_len) 40 | 41 | test_tokenizer = Tokenizer(num_words=max_features, split=' ') 42 | test_tokenizer.fit_on_texts(test.features) 43 | x_test = test_tokenizer.texts_to_sequences(test.features) 44 | x_test = pad_sequences(x_test, maxlen=max_len) 45 | 46 | validation_tokenizer = Tokenizer(num_words=max_features, split=' ') 47 | validation_tokenizer.fit_on_texts(validation.features) 48 | x_validation = validation_tokenizer.texts_to_sequences(validation.features) 49 | x_validation = pad_sequences(x_validation, maxlen=max_len) 50 | 51 | y_train = to_categorical(train.labels) 52 | y_test = to_categorical(test.labels) 53 | y_validation = to_categorical(validation.labels) 54 | 55 | return x_train, y_train, x_test, y_test, x_validation, y_validation 56 | 57 | 58 | def model(train, validation, test, embed_dim=DEFAULT_EMBED, lstm_units=DEFAULT_LSTM_UNITS, batch_size=DEFAULT_BATCH, 59 | lr=DEFAULT_LR,patience=DEFAULT_PATIENCE,epochs=DEFAULT_EPOCHS, max_features=DEFAULT_MAX_FEATURES, max_len=DEFAULT_MAX_LENGTH): 60 | """ 61 | LSTM MODEL FOR BINARY CLASSIFICATION 62 | :param train: 63 | train data 64 | :param validation 65 | validation data 66 | :param test: 67 | test data 68 | :param embed_dim: 69 | embedding dimension 70 | :param lstm_units: 71 | number of units in an lstm cell 72 | :param batch_size: 73 | batch size 74 | :param epochs: 75 | number of epochs 76 | :param max_features: 77 | maximum number that sentence may contain 78 | :param max_len: 79 | padding size 80 | :return: 81 | """ 82 | adam = Adam(lr=lr, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False) 83 | 84 | file_path = "weights-improvement-{epoch:02d}-{val_acc:.2f}.hdf5" 85 | 86 | check_point = ModelCheckpoint(file_path, monitor='val_loss', verbose=1, save_best_only=True, 87 | save_weights_only=True, mode='auto', period=1) 88 | early_stop = EarlyStopping(monitor='val_loss', patience=patience, verbose=1, 89 | mode='auto', 
restore_best_weights=True) 90 | 91 | out_dim = len(np.unique(train.labels)) 92 | x_train, y_train, x_test, y_test, x_validation, y_validation = _preprocess_data(train, validation, test, max_features, max_len) 93 | loss = 'binary_crossentropy' 94 | if y_train.shape[1]>2: 95 | loss = 'categorical_crossentropy' 96 | 97 | model = Sequential() 98 | model.add(Embedding(max_features, embed_dim, input_length=x_train.shape[1])) 99 | model.add(Dropout(.2)) 100 | model.add(Bidirectional(LSTM(lstm_units, dropout=.8, recurrent_dropout=.8))= 101 | model.add(Dropout(.8)) 102 | model.add(Dense(out_dim, activation='softmax')) 103 | model.compile(loss=loss, optimizer=adam, metrics=['accuracy']) 104 | model.fit(x_train, y_train, batch_size,epochs, verbose=True, 105 | validation_data=(x_validation,y_validation), 106 | callbacks=[check_point,early_stop]) 107 | loss, train_accuracy = model.evaluate(x_train, y_train, verbose=False) 108 | print("Training Accuracy: {:.4f}".format(train_accuracy)) 109 | loss, test_accuracy = model.evaluate(x_test, y_test, verbose=False) 110 | print("Testing Accuracy: {:.4f}".format(test_accuracy)) 111 | return train_accuracy,test_accuracy 112 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import urllib.request 4 | from bs4 import BeautifulSoup as soup 5 | import zipfile 6 | import tarfile 7 | import logging 8 | 9 | def maybe_download(file_name, data_path, url): 10 | """ 11 | Download file from url if not found. 12 | 13 | This function will check if the data_path directory exists otherwise it will create it. it will check if file_name 14 | exists in data_path directory otherwise it will download it from url. 15 | 16 | Args: 17 | file_name: string 18 | The name of file after download 19 | data_path: string 20 | The folder where where data should be downloaded 21 | url: string 22 | The url of the file to download 23 | 24 | Returns: 25 | string 26 | the name of the downloaded file 27 | 28 | """ 29 | logger = logging.getLogger(__name__ + '.maybe_download') 30 | 31 | file_path = data_path + file_name 32 | logger.debug(('Checking {} into {}'.format(file_name, data_path))) 33 | 34 | # Check data dir exists 35 | if not os.path.exists(data_path): 36 | logger.debug('Folder {} not found, creating it'.format(data_path)) 37 | os.makedirs(data_path) 38 | 39 | # Check data file exists 40 | if os.path.exists(file_path): 41 | logger.debug('File {} found'.format(file_path)) 42 | return file_path 43 | 44 | # Otherwise download it 45 | logger.info('Downloading file {} from {}'.format(file_path, url)) 46 | temp_file_name, _ = urllib.request.urlretrieve(url, file_path) 47 | logger.info('Successfully downloaded file {}, {} bites'.format(temp_file_name, os.stat(temp_file_name).st_size)) 48 | 49 | return file_path 50 | 51 | 52 | def _print_download_progress(count, block_size, total_size): 53 | """ 54 | Function used for printing the download progress. 55 | Used as a call-back function in maybe_download_and_extract(). 56 | """ 57 | 58 | # Percentage completion. 59 | pct_complete = float(count * block_size) / total_size 60 | 61 | # Limit it because rounding errors may cause it to exceed 100%. 62 | pct_complete = min(1.0, pct_complete) 63 | 64 | # Status-message. Note the \r which means the line should overwrite itself. 65 | msg = "\r- Download progress: {0:.1%}".format(pct_complete) 66 | 67 | # Print it. 
68 | sys.stdout.write(msg) 69 | 70 | 71 | sys.stdout.flush() 72 | 73 | 74 | def maybe_download_and_extract(url, download_dir): 75 | """ 76 | Download and extract the data if it doesn't already exist. 77 | Assumes the url is a tar-ball file. 78 | :param url: 79 | Internet URL for the tar-file to download. 80 | Example: "https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz" 81 | :param download_dir: 82 | Directory where the downloaded file is saved. 83 | Example: "data/CIFAR-10/" 84 | :return: 85 | Nothing. 86 | """ 87 | 88 | # Filename for saving the file downloaded from the internet. 89 | # Use the filename from the URL and add it to the download_dir. 90 | filename = url.split('/')[-1] 91 | filename = filename.split('?')[0] 92 | file_path = os.path.join(download_dir, filename) 93 | 94 | # Check if the file already exists. 95 | # If it exists then we assume it has also been extracted, 96 | # otherwise we need to download and extract it now. 97 | if not os.path.exists(file_path): 98 | # Check if the download directory exists, otherwise create it. 99 | if not os.path.exists(download_dir): 100 | os.makedirs(download_dir) 101 | 102 | # Download the file from the internet. 103 | file_path, _ = urllib.request.urlretrieve(url=url, 104 | filename=file_path, 105 | reporthook=_print_download_progress) 106 | 107 | print() 108 | print("Download finished. Extracting files.") 109 | 110 | if file_path.endswith(".zip"): 111 | # Unpack the zip-file. 112 | zipfile.ZipFile(file=file_path, mode="r").extractall(download_dir) 113 | elif file_path.endswith((".tar.gz", ".tgz")): 114 | # Unpack the tar-ball. 115 | tarfile.open(name=file_path, mode="r:gz").extractall(download_dir) 116 | 117 | print("Done.") 118 | else: 119 | print("Data has apparently already been downloaded and unpacked.") 120 | 121 | def getlink(link): 122 | driver = urllib.request.urlopen(link) 123 | content = driver.read() 124 | driver.close() 125 | page = soup(content, "html.parser") 126 | download = page.find("div",{"class":"download_link"}) 127 | return download.find("a",{"class":"input"})['href'] -------------------------------------------------------------------------------- /Notebooks/Explore_Twint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Twint Package" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import twint\n", 17 | "import nest_asyncio\n", 18 | "import pandas as pd\n", 19 | "import sys, os\n", 20 | "import time\n", 21 | "from tqdm import trange" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "# Solve compatibility issues with notebooks and RunTime errors.\n", 31 | "nest_asyncio.apply()" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "# I was diagnosed with deprssion\n", 41 | "# I am fighting depression \n", 42 | "# I suffer from depression\n", 43 | "DEFAULT_KEYWORD=\"I was diagnosed with depression\"\n", 44 | "DEFAULT_LIMIT=400\n", 45 | "def get_keywords_tweets(keyword=DEFAULT_KEYWORD, limit=DEFAULT_LIMIT):\n", 46 | " c = twint.Config()\n", 47 | " c.Search = keyword\n", 48 | " c.Limit = limit\n", 49 | " c.Store_csv = True\n", 50 | " c.Output = (keyword+\".csv\")\n", 51 | " sys.stdout = open(os.devnull, 'w')\n", 52 | " 
print(twint.run.Search(c))" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 5, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "get_keywords_tweets()" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 3, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "key_words = [\"I am diagnosed with depression\",'I am fighting depression','I suffer from depression']\n", 71 | "data = pd.DataFrame() \n", 72 | "for key in key_words:\n", 73 | " data =data.append( pd.read_csv(key+\".csv\")) " 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/plain": [ 84 | "6000" 85 | ] 86 | }, 87 | "execution_count": 4, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "len(data)" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 5, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "data = data.drop_duplicates(subset=['user_id'], keep='first') #To drop duplicate usernames " 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": 6, 108 | "metadata": {}, 109 | "outputs": [ 110 | { 111 | "data": { 112 | "text/plain": [ 113 | "2877" 114 | ] 115 | }, 116 | "execution_count": 6, 117 | "metadata": {}, 118 | "output_type": "execute_result" 119 | } 120 | ], 121 | "source": [ 122 | "len(data)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 7, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "data": { 132 | "text/plain": [ 133 | "('i have been officially diagnosed with clinical depression and anxiety after many years of suffering in silence, not understanding what’s wrong with me. i have reflected my insecurities on my friendships and relationships for the longest time and i am sorry to whoever i hurt.',\n", 134 | " 'slaysiah')" 135 | ] 136 | }, 137 | "execution_count": 7, 138 | "metadata": {}, 139 | "output_type": "execute_result" 140 | } 141 | ], 142 | "source": [ 143 | "data['tweet'].iloc[0],data['username'].iloc[0]" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 8, 149 | "metadata": {}, 150 | "outputs": [], 151 | "source": [ 152 | "user_name=data['username'].iloc[0]\n", 153 | "def get_profile_infos(user_name=user_name):\n", 154 | " c = twint.Config()\n", 155 | " c.Username = user_name\n", 156 | " c.Store_csv = True\n", 157 | " c.Output = (\"Profileinfos.csv\")\n", 158 | " sys.stdout = open(os.devnull, 'w')\n", 159 | " twint.run.Lookup(c)\n" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 17, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "get_profile_infos()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [ 177 | "profiles=pd.read_csv(\"Profileinfos.csv\")" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "for i in trange(len(data)):\n", 187 | " get_profile_infos(data['username'].iloc[i])" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 10, 193 | "metadata": {}, 194 | "outputs": [], 195 | "source": [ 196 | "user_name=data['username'].iloc[0]\n", 197 | "def get_timeline_by_usernames(user_name=user_name):\n", 198 | " c = twint.Config()\n", 199 | " c.Username =user_name\n", 200 | " c.Retweets = True\n", 201 | " c.Limit=100\n", 
202 | " c.Store_csv = True\n", 203 | " c.Output = (\"Timelines/\"+user_name+\".csv\")\n", 204 | " sys.stdout = open(os.devnull, 'w')\n", 205 | " twint.run.Search(c)\n", 206 | " " 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "for i in trange(len(data)):\n", 216 | " get_timeline_by_usernames(data['username'].iloc[i])" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [] 225 | } 226 | ], 227 | "metadata": { 228 | "kernelspec": { 229 | "display_name": "Python 3", 230 | "language": "python", 231 | "name": "python3" 232 | }, 233 | "language_info": { 234 | "codemirror_mode": { 235 | "name": "ipython", 236 | "version": 3 237 | }, 238 | "file_extension": ".py", 239 | "mimetype": "text/x-python", 240 | "name": "python", 241 | "nbconvert_exporter": "python", 242 | "pygments_lexer": "ipython3", 243 | "version": "3.6.10" 244 | } 245 | }, 246 | "nbformat": 4, 247 | "nbformat_minor": 4 248 | } 249 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
# Depression Detection

![Logo-brain](Screenshots/brain.png)

This project aims to detect early indicators of depression by analyzing data from a range of social media platforms, including images and texts.

---
## Table of Contents

1. Data collection
   1. Visual Data
   2. Textual Data
2. Models
   1. Models for images
   2. Models for texts
3. Software and technologies
4. Hardware

---

## Data collection
Data were collected from Pexels, Unsplash and Twitter.
Pexels and Unsplash are two freely usable image platforms, and the tweets used are publicly available.

### Visual Data:
The overall process of scraping images from Unsplash and Pexels is presented as follows:
![Image crawling process](Screenshots/crawl_images.PNG)

Images were crawled from Pexels using Selenium and from Unsplash using the Unsplash API. This is a sample of the dataset:

![Image sample](Screenshots/ImageSample.png)

Images can be loaded as shown in the Project Cheat Sheet; the crawling scripts live in `Scripts/Images_functions` and the loader in `data/Images_load.py`. A minimal loading sketch follows.
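This sketch assumes the repository root is the working directory (so `data` and `utils` are importable) and that the image archives are still downloadable:

```python
from data.Images_load import load_data

images = load_data()    # DataSets(train, validation, test) of DataSet objects
print(images.train.num_examples, images.validation.num_examples, images.test.num_examples)
print(images.train.features[0].shape, images.train.labels[0])  # one resized image and its one-hot label
```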
### Textual Data:

The hashtags used were trending hashtags built from keywords inspired by the DSM-5 (Diagnostic and Statistical Manual of Mental Disorders). Textual data were collected with twint from Twitter users who share their posts publicly.
Overall, 5,460 tweets were collected. The process was as follows (a minimal crawling sketch is shown after the figure):

![Text crawling process](Screenshots/crawl_texts.PNG)
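A minimal sketch of this crawling pipeline, assuming it is run from `Scripts/Twitter_Crawler` so the keyword CSVs produced by twint land next to `Twint_pkge.py`:

```python
from Twint_pkge import get_keywords_tweets, clean_data, get_profile_infos, get_timeline_by_username

key_words = ["I am diagnosed with depression", "I am fighting depression", "I suffer from depression"]
for key in key_words:
    get_keywords_tweets(key)             # one CSV of tweets per keyword

users = clean_data(key_words)            # merge the CSVs and drop duplicate users
for username in users['username']:
    get_profile_infos(username)          # bio/location/follower counts -> Profileinfos.csv
    get_timeline_by_username(username)   # recent posts -> Timelines/<username>.csv
```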
You can check the result of the texts loader in the Project Cheat Sheet; the loading code lives in the `data` package, and a minimal loading sketch follows the sample below.

This is a sample of the dataset:

![Texts example](Screenshots/texts_exmpl.PNG)
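A minimal loading sketch for the text data, assuming the loader in `data/c3d.py` and the repository root as working directory (a related loader for the crawled tweets lives in `data/TwitterP.py`):

```python
from data.c3d import load_data

texts = load_data()    # DataSets(train, validation, test) of tweets and their labels
print(texts.train.num_examples, 'training tweets')
print(texts.train.features[0], texts.train.labels[0])
```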
## Models
#### Models for Images:
Five different types of models were trained.

#### Models for Texts:
Two different types of models were trained. A minimal sketch of driving the BiLSTM text model in `models/bilstm.py` is shown below.
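This training sketch assumes the text DataSets from `data/c3d.py`, the default hyper-parameters in `models/bilstm.py`, and a Keras installation matching the repository's code:

```python
from data.c3d import load_data
from models.bilstm import model

texts = load_data()
train_acc, test_acc = model(texts.train, texts.validation, texts.test)
print(f'train accuracy {train_acc:.3f}, test accuracy {test_acc:.3f}')
```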
For the best models I actually chose, you can find three notebooks at the repository root:
* `ResNet.ipynb`
* `ResNet-Transfer.ipynb`
* `GLove+Bilstm.ipynb`

You can find the saved weights for the best image model and the best text model here.

## Software and technologies:

## Hardware
In the process of implementing our solution we used two main machines: a local machine for refactoring code, testing models and research, and a virtual machine (VM) on Google Cloud Platform (GCP) to run models and code that are heavy in terms of computation and time. Following are the specifications of these machines: