├── .gitignore
├── Holidays
│   ├── data
│   │   └── country_data.csv
│   └── scripts
│       └── Holidays.R
├── Twitter Sentiment Analysis
│   ├── app
│   │   └── www
│   │       ├── Page2.PNG
│   │       ├── Panels.PNG
│   │       ├── Loading1.gif
│   │       ├── Loading2.gif
│   │       ├── Nextgraph.PNG
│   │       ├── busyIndicator.css
│   │       └── style.css
│   └── scripts
│       └── Sentiment Analysis - Telcos.R
├── Consitent Billionaire Guide
│   ├── app
│   │   ├── www
│   │   │   ├── Loading.gif
│   │   │   ├── Loading1.gif
│   │   │   ├── Loading2.gif
│   │   │   ├── Loading9.gif
│   │   │   ├── Typing.gif
│   │   │   ├── The Ghost.PNG
│   │   │   ├── The Ghosts.PNG
│   │   │   ├── The Newbie.png
│   │   │   ├── The Hustler.PNG
│   │   │   ├── The Consistent.PNG
│   │   │   ├── busyIndicator.css
│   │   │   ├── Loading1.htm
│   │   │   └── style.css
│   │   └── app.R
│   ├── data
│   │   ├── year_list.csv
│   │   └── billionaire_data.csv
│   └── scripts
│       ├── Machine Learning for App.R
│       ├── billionaire_functions.R
│       ├── Billionaires.R
│       └── Billionaires - Compressed.R
├── Friends Analysis Laughter Prediction
│   ├── S1
│   │   ├── Friends - 1x03 - The One With The Thumb.en.srt
│   │   ├── Friends - 1x23 - The One With The Birth.en.srt
│   │   ├── Friends - 1x18 - The One With All The Poker.en.srt
│   │   ├── Friends - 1x08 - The One Where Nana Dies Twice.en.srt
│   │   ├── Friends - 1x21 - The One With The Fake Monica.en.srt
│   │   ├── Friends - 1x22 - The One With The Ick Factor.en.srt
│   │   ├── Friends - 1x24 - The One Where Rachel Finds Out.en.srt
│   │   ├── Friends - 1x04 - The One With George Stephanopoulos.en.srt
│   │   ├── Friends - 1x07 - The One With The Blackout.720p HDTV.TvR.en.srt
│   │   ├── Friends - 1x08 - The One Where Nana Dies Twice.720p HDTV.Morphz.en.srt
│   │   ├── Friends - 1x04 - The One With George Stephanopoulos.720p HDTV.TvR.en.srt
│   │   ├── Friends - 1x05 - The One With The East German Laundry Detergent.720p HDTV.TvR.en.srt
│   │   └── Friends - 1x07 - The One With The Blackout.en.srt
│   ├── Laughter_Detection.csv
│   ├── Audio File to Dataset Conversion.ipynb
│   └── canned_laughter_detection.R
├── Twitter Crawler
│   └── scripts
│       ├── Database Initial Setup.R
│       └── Telecoms twitter crawler.R
├── README.md
├── The Making of Great Music
│   ├── data
│   │   └── topic_dataset.csv
│   ├── README.md
│   └── scripts
│       ├── music_sentiment.py
│       └── music_sentiment.R
├── Football Analysis
│   ├── scripts
│   │   ├── FootballAnalysis - 2017_functions.R
│   │   ├── FootballAnalysis - 2017.R
│   │   └── FootballAnalysis - 2016.R
│   └── data
│       └── epl_data.csv
└── Others
    └── CBN.R
/.gitignore:
--------------------------------------------------------------------------------
1 | /packrat
2 | /.spyproject
3 | .httr-oauth
4 | .RData
5 | .Rhistory
6 | .Rproj.user
7 | README.md
8 |
--------------------------------------------------------------------------------
/Holidays/data/country_data.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Holidays/data/country_data.csv
--------------------------------------------------------------------------------
/Twitter Sentiment Analysis/app/www/Page2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Twitter Sentiment Analysis/app/www/Page2.PNG
--------------------------------------------------------------------------------
/Twitter Sentiment Analysis/app/www/Panels.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Twitter Sentiment Analysis/app/www/Panels.PNG
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/app/www/Loading.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Consitent Billionaire Guide/app/www/Loading.gif
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/app/www/Loading1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Consitent Billionaire Guide/app/www/Loading1.gif
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/app/www/Loading2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Consitent Billionaire Guide/app/www/Loading2.gif
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/app/www/Loading9.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Consitent Billionaire Guide/app/www/Loading9.gif
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/app/www/Typing.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Consitent Billionaire Guide/app/www/Typing.gif
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/data/year_list.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Consitent Billionaire Guide/data/year_list.csv
--------------------------------------------------------------------------------
/Twitter Sentiment Analysis/app/www/Loading1.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Twitter Sentiment Analysis/app/www/Loading1.gif
--------------------------------------------------------------------------------
/Twitter Sentiment Analysis/app/www/Loading2.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Twitter Sentiment Analysis/app/www/Loading2.gif
--------------------------------------------------------------------------------
/Twitter Sentiment Analysis/app/www/Nextgraph.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Twitter Sentiment Analysis/app/www/Nextgraph.PNG
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/app/www/The Ghost.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Consitent Billionaire Guide/app/www/The Ghost.PNG
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/app/www/The Ghosts.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Consitent Billionaire Guide/app/www/The Ghosts.PNG
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/app/www/The Newbie.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Consitent Billionaire Guide/app/www/The Newbie.png
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/app/www/The Hustler.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Consitent Billionaire Guide/app/www/The Hustler.PNG
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/data/billionaire_data.csv:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Consitent Billionaire Guide/data/billionaire_data.csv
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/app/www/The Consistent.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Consitent Billionaire Guide/app/www/The Consistent.PNG
--------------------------------------------------------------------------------
/Friends Analysis Laughter Prediction/S1/Friends - 1x03 - The One With The Thumb.en.srt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Friends Analysis Laughter Prediction/S1/Friends - 1x03 - The One With The Thumb.en.srt
--------------------------------------------------------------------------------
/Friends Analysis Laughter Prediction/S1/Friends - 1x23 - The One With The Birth.en.srt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Friends Analysis Laughter Prediction/S1/Friends - 1x23 - The One With The Birth.en.srt
--------------------------------------------------------------------------------
/Friends Analysis Laughter Prediction/S1/Friends - 1x18 - The One With All The Poker.en.srt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Friends Analysis Laughter Prediction/S1/Friends - 1x18 - The One With All The Poker.en.srt
--------------------------------------------------------------------------------
/Friends Analysis Laughter Prediction/S1/Friends - 1x08 - The One Where Nana Dies Twice.en.srt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Friends Analysis Laughter Prediction/S1/Friends - 1x08 - The One Where Nana Dies Twice.en.srt
--------------------------------------------------------------------------------
/Friends Analysis Laughter Prediction/S1/Friends - 1x21 - The One With The Fake Monica.en.srt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Friends Analysis Laughter Prediction/S1/Friends - 1x21 - The One With The Fake Monica.en.srt
--------------------------------------------------------------------------------
/Friends Analysis Laughter Prediction/S1/Friends - 1x22 - The One With The Ick Factor.en.srt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Friends Analysis Laughter Prediction/S1/Friends - 1x22 - The One With The Ick Factor.en.srt
--------------------------------------------------------------------------------
/Friends Analysis Laughter Prediction/S1/Friends - 1x24 - The One Where Rachel Finds Out.en.srt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Friends Analysis Laughter Prediction/S1/Friends - 1x24 - The One Where Rachel Finds Out.en.srt
--------------------------------------------------------------------------------
/Friends Analysis Laughter Prediction/S1/Friends - 1x04 - The One With George Stephanopoulos.en.srt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Friends Analysis Laughter Prediction/S1/Friends - 1x04 - The One With George Stephanopoulos.en.srt
--------------------------------------------------------------------------------
/Friends Analysis Laughter Prediction/S1/Friends - 1x07 - The One With The Blackout.720p HDTV.TvR.en.srt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Friends Analysis Laughter Prediction/S1/Friends - 1x07 - The One With The Blackout.720p HDTV.TvR.en.srt
--------------------------------------------------------------------------------
/Twitter Crawler/scripts/Database Initial Setup.R:
--------------------------------------------------------------------------------
1 | #Telecoms-tweets-database/Database initial setup.R
2 | #Setup of sqlite database to store tweets
3 | library(dplyr)
4 | telecoms_db = src_sqlite("Telecoms tweets database",create = T)
5 | copy_to(telecoms_db,final_file,temporary = F) #final_file is the combined tweets data frame built by the Telecoms twitter crawler script
--------------------------------------------------------------------------------
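Note that dplyr::src_sqlite() is deprecated in current dplyr/dbplyr releases. A minimal sketch of the same setup with DBI/RSQLite, assuming the final_file data frame produced by the crawler script (this is not the repo's original code):

    library(DBI)
    library(RSQLite)

    #Create (or open) the SQLite database file and persist the tweets table
    con <- dbConnect(RSQLite::SQLite(), "Telecoms tweets database")
    dbWriteTable(con, "final_file", final_file, overwrite = TRUE)
    dbDisconnect(con)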
/Friends Analysis Laughter Prediction/S1/Friends - 1x08 - The One Where Nana Dies Twice.720p HDTV.Morphz.en.srt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Friends Analysis Laughter Prediction/S1/Friends - 1x08 - The One Where Nana Dies Twice.720p HDTV.Morphz.en.srt
--------------------------------------------------------------------------------
/Friends Analysis Laughter Prediction/S1/Friends - 1x04 - The One With George Stephanopoulos.720p HDTV.TvR.en.srt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Friends Analysis Laughter Prediction/S1/Friends - 1x04 - The One With George Stephanopoulos.720p HDTV.TvR.en.srt
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TheArtandScienceofData
2 | This is the code repository for my blog, The Art and Science of Data.
3 | 
4 | Feel free to message me about any issues you have running the scripts or any suggestions you would like to make.
5 |
6 | The link for my blog is: https://theartandscienceofdata.wordpress.com/
7 |
--------------------------------------------------------------------------------
/Friends Analysis Laughter Prediction/S1/Friends - 1x05 - The One With The East German Laundry Detergent.720p HDTV.TvR.en.srt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RosebudAnwuri/TheArtandScienceofData/HEAD/Friends Analysis Laughter Prediction/S1/Friends - 1x05 - The One With The East German Laundry Detergent.720p HDTV.TvR.en.srt
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/app/www/busyIndicator.css:
--------------------------------------------------------------------------------
1 | div.shinysky-busy-indicator {
2 |
3 | top: 20%;
4 | left: 40%;
5 | margin-top: 30%;
6 | margin-left: auto;
7 | display:none;
8 | background: none;
9 | text-align: center;
10 | padding-top: 20px;
11 | padding-left: 30px;
12 | padding-bottom: 40px;
13 | padding-right: 30px;
14 | border-radius: 5px;
15 | }
--------------------------------------------------------------------------------
/The Making of Great Music/data/topic_dataset.csv:
--------------------------------------------------------------------------------
1 | document,topic,gamma
2 | 60s,1,0.9999971261483629
3 | 50s,1,0.9999926749082607
4 | 70s,1,0.9999174345245359
5 | 80s,1,0.9998679405308054
6 | 00s,2,0.9985746025827739
7 | 10s,2,0.9945805765167997
8 | 90s,1,0.5665976165494422
9 | 90s,2,0.43340238345055787
10 | 10s,1,0.00541942348320034
11 | 00s,1,0.0014253974172261596
12 | 80s,2,1.320594691946073e-4
13 | 70s,2,8.256547546413914e-5
14 | 50s,2,7.325091739302442e-6
15 | 60s,2,2.8738516370438335e-6
16 |
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/app/www/Loading1.htm:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/The Making of Great Music/README.md:
--------------------------------------------------------------------------------
1 | # The Making of Great Music
2 | 
3 | ## Dataset Explanations
4 | 
5 | 
6 | music_df.csv: Music dataset with features from Spotify's API and Kevin Schaich's Billboard Hot 100 dataset.
7 | 
8 | features_dataset.csv: Contains features and genres for all artist and featured-artist combinations.
9 | 
10 | topic_dataset.csv: Dataset with the most frequent topics in each decade.
11 | 
12 | Feel free to message me about any issues you have running the scripts or any suggestions you would like to make.
13 | 
14 | The link for my blog is: https://theartandscienceofdata.wordpress.com/
15 | 
--------------------------------------------------------------------------------
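A quick sketch of how topic_dataset.csv above can be read and summarised in R (repo-relative path assumed; this is not part of the original scripts):

    library(readr)
    library(dplyr)

    topics <- read_csv("The Making of Great Music/data/topic_dataset.csv")

    #Dominant topic per decade: the topic with the highest gamma for each document
    topics %>%
      group_by(document) %>%
      slice_max(gamma, n = 1) %>%
      arrange(document)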
/Twitter Sentiment Analysis/app/www/busyIndicator.css:
--------------------------------------------------------------------------------
1 | div.shinysky-busy-indicator {
2 |
3 | top: 20% !important;
4 | left: 3% !important;
5 | position:absolute;
6 | margin-left: auto;
7 | display:none;
8 | background: none;
9 | text-align: center;
10 | margin-top: 0% !important;
11 | margin-left: auto;
12 | display: none;
13 | background: none;
14 | text-align: center;
15 | padding-top: 0% !important;
16 | padding-left: 0% !important;
17 | padding-bottom: 0% !important;
18 | padding-right: 0% !important;
19 | border-radius: 5px;
20 |
21 |
22 |
23 | }
24 | #shiny-tab-Table > div > div:nth-child(2) > div > div.box-body > div.shinysky-busy-indicator > img{
25 |
26 | width:500px;
27 | }
--------------------------------------------------------------------------------
/Twitter Sentiment Analysis/app/www/style.css:
--------------------------------------------------------------------------------
1 | @import url('https://fonts.googleapis.com/css?family=Raleway:300,400');
2 | body{
3 | font-family:'Raleway', sans-serif ;
4 | }
5 | .logo{
6 | font-family:'Raleway', sans-serif !important;
7 | }
8 | h1,h2,h3,h4,h5,h6{
9 | font-family:'Raleway', sans-serif ;
10 |
11 | }
12 | #Sarch1, #Sarch{
13 | border-radius:0px;
14 | border-color:#689FB0;
15 | background-color:#689FB0;
16 | animation-duration:2s;
17 | -moz-animation-duration: 2s;
18 | -webkit-animation-duration: 2s;
19 | -o-animation-duration:.2s;
20 | -ms-animation-duration:.2s;
21 |
22 | }
23 | #hider{
24 | font-size:14pt;
25 | }
26 | #severalPlot{
27 | box-sizing:unset;
28 | }
29 |
30 | #twitterTable{
31 | margin-left: 5%;
32 | }
33 | .shinysky-busy-indicator{
34 | width:300px;
35 | height:300px;
36 | }
--------------------------------------------------------------------------------
/Friends Analysis Laughter Prediction/Laughter_Detection.csv:
--------------------------------------------------------------------------------
1 | File,Start,End,Length,Talk Over
2 | Friends.S01.E01.480p,63,65,2,
3 | Friends.S01.E01.480p,68,70,2,
4 | Friends.S01.E01.480p,72,72,0,
5 | Friends.S01.E01.480p,76,78,2,1
6 | Friends.S01.E01.480p,87,89,2,
7 | Friends.S01.E01.480p,105,107,2,
8 | Friends.S01.E01.480p,109,110,1,1
9 | Friends.S01.E01.480p,121,122,1,
10 | Friends.S01.E01.480p,125,126,1,
11 | Friends.S01.E01.480p,135,136,1,
12 | Friends.S01.E01.480p,141,143,2,
13 | Friends.S01.E01.480p,148,149,1,
14 | Friends.S01.E01.480p,158,159,1,
15 | Friends.S01.E01.480p,173,173,0,
16 | Friends.S01.E01.480p,178,178,0,
17 | Friends.S01.E01.480p,186,187,1,
18 | Friends.S01.E01.480p,189,192,3,
19 | Friends.S01.E01.480p,194,195,1,1
20 | Friends.S01.E01.480p,202,203,1,
21 | Friends.S01.E01.480p,211,213,2,
22 | Friends.S01.E01.480p,226,226,0,
23 | Friends.S01.E01.480p,240,241,1,
24 | Friends.S01.E01.480p,244,244,0,
25 | Friends.S01.E01.480p,252,254,2,
26 | Friends.S01.E01.480p,256,261,5,
27 | Friends.S01.E01.480p,275,276,1,
28 | Friends.S01.E01.480p,288,294,6,
29 | Friends.S01.E01.480p,301,302,1,
30 | Friends.S01.E01.480p,320,321,1,
31 | Friends.S01.E01.480p,331,332,1,1
32 | Friends.S01.E01.480p,335,337,2,
33 | Friends.S01.E01.480p,358,360,2,
34 | Friends.S01.E01.480p,379,380,1,
35 | Friends.S01.E01.480p,384,385,1,
36 | Friends.S01.E01.480p,388,389,1,
37 | Friends.S01.E01.480p,400,403,3,
38 | Friends.S01.E01.480p,408,409,1,
39 | Friends.S01.E01.480p,413,414,1,
40 | Friends.S01.E01.480p,447,449,2,
41 | Friends.S01.E01.480p,463,464,1,
42 | Friends.S01.E01.480p,467,468,1,
43 | Friends.S01.E01.480p,476,478,2,
44 | Friends.S01.E01.480p,493,494,1,
45 | Friends.S01.E01.480p,498,499,1,
46 | Friends.S01.E01.480p,505,506,1,
47 | Friends.S01.E01.480p,508,510,2,
48 | Friends.S01.E01.480p,527,528,1,
49 | Friends.S01.E01.480p,534,535,1,
50 | Friends.S01.E01.480p,560,561,1,
51 | Friends.S01.E01.480p,573,574,1,
52 | Friends.S01.E01.480p,582,583,1,
53 | Friends.S01.E01.480p,588,588,0,
54 | Friends.S01.E01.480p,595,596,1,
55 | Friends.S01.E01.480p,602,603,1,
56 | Friends.S01.E01.480p,613,618,5,
57 | Friends.S01.E01.480p,629,630,1,
58 | Friends.S01.E01.480p,631,633,2,
59 | Friends.S01.E01.480p,641,642,1,
60 | Friends.S01.E01.480p,654,658,4,
61 | Friends.S01.E01.480p,667,667,0,
62 |
--------------------------------------------------------------------------------
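A small sketch of how this annotation file can be summarised (repo-relative path assumed; Start, End, and Length appear to be second offsets, matching the per-second MFCCs in the conversion notebook below):

    library(readr)
    library(dplyr)

    laughs <- read_csv("Friends Analysis Laughter Prediction/Laughter_Detection.csv")

    #Number of laughs, total annotated laughter length, and talk-over count per episode file
    laughs %>%
      group_by(File) %>%
      summarise(laughs = n(),
                total_length = sum(Length),
                talk_over = sum(`Talk Over`, na.rm = TRUE))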
/Consitent Billionaire Guide/scripts/Machine Learning for App.R:
--------------------------------------------------------------------------------
1 | library(h2o)
2 | library(nnet)
3 | library(plyr)
4 | library(dplyr)
5 | library(purrr)
6 | library(scales)
7 | library(caTools)
8 | library(lime)
9 | h2o.init()
10 |
11 |
12 |
13 | #############################
14 | ## Pre Processing ##
15 | #############################
16 | dat = read.csv("C:/Users/rose.anwuri/Documents/TheArtandScienceofData/Consitent Billionaire Guide/data/billionaire_data2.csv")
17 | dat1=dat %>%
18 | filter(Cluster !="The Newbie")
19 | data = dat1 %>%
20 | select(-Detail1,-Detail2) %>%
21 | as.h2o()
22 |
23 | splits = h2o.splitFrame(data,ratios = c(0.6,0.2),destination_frames = c("train", "valid", "test"), seed = 1234)
24 | train = h2o.getFrame("train")
25 | val = h2o.getFrame("valid")
26 | test = h2o.getFrame("test")
27 | #After some feature selection, these are the final features
28 | features=2:21
29 | response=22
30 |
31 |
32 | ##########################################
33 | ##Model Selection, Tuning and Validation##
34 | ## in one step! ##
35 | #########################################
36 |
37 | #Select the best model (already tuned) using the automl function
38 | #Cap the maximum number of models to be trained at 10
39 | model_selection=h2o.automl(x=features,y=response,training_frame = train,validation_frame = val,max_models = 10,stopping_metric = "AUC")
40 |
41 |
42 | #Extract the best model
43 | final_model = model_selection@leader
44 |
45 | model = do.call(h2o.gbm,
46 | {
47 | p <- final_model@parameters
48 | p$model_id = NULL ## do not overwrite the original grid model
49 | p$training_frame = data ## use the full dataset
50 | p$validation_frame = NULL ## no validation frame
51 | p$nfolds = 5 ## cross-validation
52 | p
53 | })
54 |
55 | #Save Model to be loaded into Shiny App!
56 | h2o.saveModel(model,"local_model1")
57 |
58 | #Lime Predictions [Optional]
59 | df_lime = as.data.frame(data)[,c(features,response)]
60 | df_lime=read.csv("C:/Users/rose.anwuri/Documents/TheArtandScienceofData/Consitent Billionaire Guide/app/billionaire_data_for_ml.csv")
61 | model=h2o.loadModel("C:/Users/rose.anwuri/Documents/TheArtandScienceofData/Consitent Billionaire Guide/app/local_model")
62 | df_lime$prediction =predict(model,data)[,1] %>% as.vector()
63 |
64 | explainer <- lime::lime(
65 | df_lime,
66 | model = model,
67 | bin_continuous = T)
68 |
69 | explanation <- lime::explain(
70 | df_lime[c(7)],
71 | explainer = explainer,
72 | n_labels = 1,
73 | n_features = 8,
74 | kernel_width = 0.5)
75 |
76 | plot_features(explanation)
77 |
78 |
--------------------------------------------------------------------------------
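For the Shiny side, a sketch (not the repo's app.R) of reloading the saved model and scoring one new observation; new_billionaire is a hypothetical one-row data frame with the same feature columns used in training:

    library(h2o)
    h2o.init()

    #h2o.saveModel() returns the path of the saved model; pass that path to h2o.loadModel()
    saved_path <- h2o.saveModel(model, path = "local_model1", force = TRUE)
    loaded_model <- h2o.loadModel(saved_path)

    #Score a hypothetical new row with the training feature columns
    pred <- h2o.predict(loaded_model, as.h2o(new_billionaire))
    as.data.frame(pred)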
/Friends Analysis Laughter Prediction/Audio File to Dataset Conversion.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import pandas as pd\n",
11 | "import librosa\n",
12 | "import os\n",
13 | "import re\n",
14 | "import requests\n",
15 | "import urllib\n",
16 | "import io\n",
17 | "import gzip\n",
18 | "import warnings\n",
19 | "import time\n",
20 | "warnings.filterwarnings('ignore')"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": null,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "def create_mfcc(path, offset=0, duration=1, n_mfcc=40):\n",
30 | " start_time = time.time() \n",
31 | " audio_array = librosa.load(path, offset=offset, duration=duration)[0]\n",
32 | " audio_array = audio_array.reshape(duration, -1)\n",
33 | " mfcc = np.array([np.mean(librosa.feature.mfcc(ary, n_mfcc=n_mfcc), axis=1) for ary in audio_array])\n",
34 | " episode=re.search('E\\d{2}',url).group(0)\n",
35 | " season=re.search('S\\d{2}',url).group(0)\n",
36 | " duration=np.arange(0,duration,1).reshape(duration,-1)\n",
37 | " extras = np.full((mfcc.shape[0], 2), (season, episode))\n",
38 | " elapsed_time = int(time.time() - start_time)\n",
39 | " print('{:02d}:{:02d}:{:02d}'.format(elapsed_time // 3600, (elapsed_time % 3600 // 60), elapsed_time % 60))\n",
40 | " return np.hstack((mfcc, extras,duration))"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "list_of_urls=['FRIENDS.S02E'+\"{:02d}\".format(i)+'.480p.mkv' for i in range(1,25,1)]\n",
50 | "list_of_urls"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": null,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "season_two_audio=[]\n",
60 | "for url in list_of_urls:\n",
61 | " Y, sample_rate=librosa.load(url,res_type='kaiser_fast')\n",
62 | " print('Reading File for Episode ' + re.search('E\\d{2}',url).group(0)+'...')\n",
63 | " duration=(np.floor(len(Y)/sample_rate))\n",
64 | " result=create_mfcc(url,duration=int(duration))\n",
65 | " season_two_audio.append(result)\n"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": null,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "pd.DataFrame(np.vstack(season_two_audio)).to_csv('season_two_audio.csv')\n"
75 | ]
76 | }
77 | ],
78 | "metadata": {
79 | "kernelspec": {
80 | "display_name": "Python 3",
81 | "language": "python",
82 | "name": "python3"
83 | },
84 | "language_info": {
85 | "codemirror_mode": {
86 | "name": "ipython",
87 | "version": 3
88 | },
89 | "file_extension": ".py",
90 | "mimetype": "text/x-python",
91 | "name": "python",
92 | "nbconvert_exporter": "python",
93 | "pygments_lexer": "ipython3",
94 | "version": "3.7.3"
95 | }
96 | },
97 | "nbformat": 4,
98 | "nbformat_minor": 2
99 | }
100 |
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/app/www/style.css:
--------------------------------------------------------------------------------
1 | @import url('https://fonts.googleapis.com/css?family=Raleway:300,400');
2 |
3 | .navbar-default {
4 | background-color: teal;
5 | border-color: #E7E7E7;
6 | }
7 | .navbar-default .navbar-nav > li > a {
8 | color: white;
9 | }
10 | .navbar-default .navbar-nav > .active > a, .navbar-default .navbar-nav > .active > a:hover, .navbar-default .navbar-nav > .active > a:focus {
11 | background-color: #00897b;
12 | color: white;
13 | }
14 | body > nav > div{
15 | background-color: #009688;
16 | }
17 | body{
18 | font-family:'Raleway', sans-serif;
19 | }
20 | .selectize-input {border-radius:0px;
21 | background-color: rgba(236, 240, 241,0) !important;
22 | border-width:1px;
23 |
24 | border-top: none;
25 | border-right: none;
26 | border-left: none;
27 | border-bottom-color: #546e7a;
28 | border-radius: 0;}
29 |
30 | .selectize-input.dropdown-active {border-radius:0px;
31 | background-color: rgba(236, 240, 241,0);
32 | border-width:1px;
33 | border-top: none;
34 | border-right: none;
35 | border-left: none;
36 | border-bottom-color: #546e7a;
37 | border-radius: 0}
38 | .selectize-dropdown {border-radius:0px;
39 | background-color: rgba(236, 240, 241,0);
40 | border-width:1px;
41 | border-top: none;
42 | border-right: none;
43 | border-left: none;
44 | border-bottom-color: #546e7a;
45 | border-radius: 0}
46 |
47 |
48 |
49 | body > nav{
50 | background-color:#009688;
51 | }
52 | .card-action {
53 | box-shadow: 0 1px 3px rgba(0,0,0,0.12), 0 1px 2px rgba(0,0,0,0.24);
54 | transition: all 0.3s cubic-bezier(.25,.8,.25,1);
55 | height:50px;
56 | text-align:center;
57 | }
58 | .card-action:hover {
59 | box-shadow: 0 14px 28px rgba(0,0,0,0.25), 0 10px 10px rgba(0,0,0,0.22);
60 | }
61 | @media only screen and (max-width : 500px) {
62 | img{
63 | width:100%;
64 | min-width:100%;
65 | }
66 | }
67 |
68 | #run{
69 | animation-duration:2s;
70 | -moz-animation-duration: 2s;
71 | -webkit-animation-duration: 2s;
72 | -o-animation-duration:.2s;
73 | -ms-animation-duration:.2s;
74 | border-radius: 0px;
75 | }
76 | #run1{
77 | animation-duration:2s;
78 | -moz-animation-duration: 2s;
79 | -webkit-animation-duration: 2s;
80 | -o-animation-duration:.2s;
81 | -ms-animation-duration:.2s;
82 | }
83 | .pulse{
84 | animation-duration:2s;
85 | -moz-animation-duration: 2s;
86 | -webkit-animation-duration: 2s;
87 | -o-animation-duration:.2s;
88 | -ms-animation-duration:.2s;
89 | }
90 | #close-button{
91 | animation-duration:2s;
92 | -moz-animation-duration: 2s;
93 | -webkit-animation-duration: 2s;
94 | -o-animation-duration:.2s;
95 | -ms-animation-duration:.2s;
96 | }
97 | .modal-content{
98 | background: none;
99 | border: none;
100 | box-shadow: none;
101 | }
102 | .modal-header{
103 | border:none;
104 | }
105 | .modal .close{
106 | color:white;
107 | }
108 | .modal-body{
109 | display:none;
110 | }
111 | .modal-footer{
112 | display:none;
113 | }
114 |
115 | #prefix_final{
116 | font-size: 6vh; text-align:center;
117 | }
118 | #prediction_final{
119 | font-size: 6vh; text-align:center;
120 | }
121 | #sidebar{background-color: rgba(236, 240, 241,0.4);}
122 | .form-group label{}
123 | .form-group{color: #90a4ae ;text-transform:uppercase;font-size:10pt;}
124 | #Age,#founding_year{background-color: rgba(236, 240, 241,0);
125 | border-width:1px;
126 |
127 | border-top: none;
128 | border-right: none;
129 | border-left: none;
130 | border-bottom-color: #546e7a;
131 | border-radius: 0}
--------------------------------------------------------------------------------
/The Making of Great Music/scripts/music_sentiment.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Jan 19 17:18:58 2018
4 |
5 | @author: rose.anwuri
6 | """
7 | import numpy as np
8 | import pandas as pd
9 | import matplotlib.pyplot as plt
10 | import nltk
11 | from nltk.corpus import stopwords
12 | nltk.download('stopwords')
13 | music_df = pd.read_csv('https://raw.githubusercontent.com/walkerkq/musiclyrics/master/billboard_lyrics_1964-2015.csv')
14 | music_df.head(n=5)
15 | music_df['Lyrics']=music_df.Lyrics.fillna("")
16 | from afinn import Afinn
17 | afinn = Afinn()
18 | music_df['sentiment_score']=list(map(afinn.score,music_df['Lyrics']))
19 | music_df.head(n=5)
20 | year_and_avg_sentiment=music_df[["Year","sentiment_score"]].groupby('Year').mean()
21 | plt.plot( year_and_avg_sentiment.index,year_and_avg_sentiment.sentiment_score, label='linear')
22 | plt.title("Sentiment of Popular Music between 1965-2015")
23 | plt.show()
24 | nrc_db = pd.read_table("https://raw.githubusercontent.com/mhbashari/NRC-Persian-Lexicon/master/NRC-emotion-lexicon-wordlevel-alphabetized-v0.92.txt",
25 | names = ["Word", "Sentiment", "Exists"])
26 | nrc_db.head(n=5)
27 | nrc_db=nrc_db[nrc_db.Exists >0]
28 | nrc_db.head(n=5)
29 | sentiment=list(set(nrc_db.Sentiment))
30 | for i in sentiment:
31 | music_df[i]=0
32 |
33 | music_df.head(n=5)
34 | stop_words=set(stopwords.words('english'))
35 | def add_sentiment(lyrics):
36 | words=lyrics.split()
37 | words = set(words) - stop_words
38 | words = list(words)
39 | for i in words:
40 | sentiment=list(nrc_db[nrc_db.Word==i]["Sentiment"])
41 | if len(sentiment)>0:
42 | music_df.loc[music_df.Lyrics==lyrics,sentiment] +=1
43 | else:
44 | music_df.loc[music_df.Lyrics==lyrics,list(set(nrc_db.Sentiment))]+=0
45 |
46 | for i in range(5100):
47 | add_sentiment(music_df.Lyrics[i])
48 | print(i)
49 |
50 | import spotipy
51 | from spotipy.oauth2 import SpotifyClientCredentials
52 | #Authenticate against the Spotify Web API with client credentials
53 | client_credentials_manager = SpotifyClientCredentials(client_id="769ef3519e8444238fde9c8981c6371c",client_secret="b17e4a7ca0b4426f9962645ba5c74a63")
54 | sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)
55 | from collections import OrderedDict
56 | def get_spotify_features(track, artist):
57 | #Search for Spofity song ID
58 | songs=sp.search(q='track:'+track+' '+'artist:'+artist+'*' , type='track')
59 | items = songs['tracks']['items']
60 | if len(items) ==0:
61 | return([0]*len(features))
62 | else:
63 | track = items[0]
64 | song_id = str(track["id"])
65 | #Use ID to get Song features
66 | track_features=sp.audio_features(song_id)
67 | if len(track_features[0]) <18:
68 | return([0]*len(features))
69 | else:
70 | features_to_df = np.array(track_features)[0]
71 | #Order Dictionary
72 | features_to_df = OrderedDict(features_to_df)
73 | #Get Dictionary values
74 | feature_values = features_to_df.values()
75 | return(feature_values)
76 |
77 | music_df.loc[:,features]= music_df.loc[:,].apply(lambda row: pd.Series(get_spotify_features(row["Song"],row["artist_shortened"]),index=features) ,axis=1)
78 |
79 | ind=np.linspace(0,5100,num=5101-1)
80 | for i in ind:
81 | music_df.loc[i,features]=pd.Series(get_spotify_features(music_df.loc[i,"Song"],music_df.loc[i,"artist_shortened"]),index=features)
82 |     print(i)
83 |
84 | music_df = pd.read_csv('C:/Users/rose.anwuri/Documents/TheArtandScienceofData/Music Sentiment Analysis/data/music_data.csv')
85 |
86 | def gather( df, key, value, cols ):
87 | id_vars = [ col for col in df.columns if col not in cols ]
88 | id_values = cols
89 | var_name = key
90 | value_name = value
91 | return pd.melt( df, id_vars, id_values, var_name, value_name )
92 |
93 | music_df_gathered = gather(music_df,"Sentiment","Score",list(music_df.columns[9:19]))
94 | music_df_gathered = gather(music_df,"audio_feature","feature_value",list(music_df_gathered.columns[10:28]))
95 |
--------------------------------------------------------------------------------
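The AFINN scoring step above has an R counterpart in the repo (music_sentiment.R, not shown in this dump); a sketch of how that step might look with tidytext, assuming the same Billboard lyrics CSV and its Lyrics column:

    library(readr)
    library(dplyr)
    library(tidytext)

    music_df <- read_csv("https://raw.githubusercontent.com/walkerkq/musiclyrics/master/billboard_lyrics_1964-2015.csv")

    #One row per word, joined to the AFINN lexicon, then summed back to one score per song
    afinn <- get_sentiments("afinn")
    sentiment_scores <- music_df %>%
      mutate(song_id = row_number()) %>%
      unnest_tokens(word, Lyrics) %>%
      inner_join(afinn, by = "word") %>%
      group_by(song_id) %>%
      summarise(sentiment_score = sum(value))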
/Twitter Crawler/scripts/Telecoms twitter crawler.R:
--------------------------------------------------------------------------------
1 | library(twitteR)
2 | library(base64enc)
3 | library(RCurl)
4 | library(httr)
5 | library(RJSONIO)
6 | library(stringr)
7 |
8 |
9 | api_key <- "UE0sCwrNmxHb8YL759R7SuLEc" # From dev.twitter.com
10 | api_secret <- "C9OxWPmBAOwzQ6G4VXbCKeXd3XEHG5XvJjzTA1AVLoKbtwnpJy" # From dev.twitter.com
11 | token <- "370018889-WKxIRFsc8OJhvdtW3BOOdgIy1qGco48d7QlUO0in" # From dev.twitter.com
12 | token_secret <- "7Ah8qplWJf5ey4zB4IPTTBlypMCUenXnQsrCH7808UbRE" # From dev.twitter.com
13 |
14 | # Create Twitter Connection
15 | setup_twitter_oauth(api_key, api_secret, token, token_secret)
16 | set_config(config (ssl_verifypeer= 0L))
17 | date_time = Sys.Date()-1
18 | date_time = as.character.Date(date_time)
19 | tweetdetails <- searchTwitter("etisalat network -from:etisalat_9ja", n=1500, lang="en", since = date_time,until= date_time, geocode = '10,9, 200km')# Transform tweets list into a data frame
20 | tweetData_network = twListToDF(tweetData)
21 | tweetdetails <- searchTwitter("etisalat data -from:etisalat_9ja", n=1500, lang="en", since = "2016-05-03",until= date_time, geocode = '10,9, 200km')# Transform tweets list into a data frame
22 | tweetData = append(tweetData, tweetdetails)
23 |
24 | tweetData = twListToDF(tweetData)
25 | tweetDataFinal = rbind(tweetData, tweetData_network)
26 | tweetdetails <- searchTwitter("etisalat call -from:etisalat_9ja", n=1500, lang="en", since = "2016-05-03",until= date_time, geocode = '10,9, 200km')# Transform tweets list into a data frame
27 | tweetData = twListToDF(tweetData)
28 | etisalat = rbind(tweetData, tweetDataFinal)
29 | etisalat$service_provider = "Etisalat"
30 |
31 | #MTN
32 | tweetdetails <- searchTwitter("mtn network -from:mtnng", n=1500, lang="en", since = date_time,until= date_time, geocode = '10,9, 200km')# Transform tweets list into a data frame
33 | tweetData_network = twListToDF(tweetData)
34 | tweetdetails <- searchTwitter("mtn data -from:mtnng", n=1500, lang="en", since = "2016-05-03",until= date_time, geocode = '10,9, 200km')# Transform tweets list into a data frame
35 | tweetData = append(tweetData, tweetdetails)
36 |
37 | tweetData = twListToDF(tweetData)
38 | tweetDataFinal = rbind(tweetData, tweetData_network)
39 | tweetdetails <- searchTwitter("mtn call -from:mtnng", n=1500, lang="en", since = "2016-05-03",until= date_time, geocode = '10,9, 200km')# Transform tweets list into a data frame
40 | tweetData = twListToDF(tweetData)
41 | mtn = rbind(tweetData, tweetDataFinal)
42 | mtn$serviceProvider = "MTN"
43 |
44 | #Airtel
45 | tweetdetails <- searchTwitter("airtel network -from:airtelnigeria", n=1500, lang="en", since = date_time,until= date_time, geocode = '10,9, 200km')# Transform tweets list into a data frame
46 | tweetData_network = twListToDF(tweetData)
47 | tweetdetails <- searchTwitter("airtel data -from:airtelnigeria", n=1500, lang="en", since = "2016-05-03",until= date_time, geocode = '10,9, 200km')# Transform tweets list into a data frame
48 | tweetData = append(tweetData, tweetdetails)
49 |
50 | tweetData = twListToDF(tweetData)
51 | tweetDataFinal = rbind(tweetData, tweetData_network)
52 | tweetdetails <- searchTwitter("airtel call -from:airtelnigeria", n=1500, lang="en", since = "2016-05-03",until= date_time, geocode = '10,9, 200km')# Transform tweets list into a data frame
53 | tweetData = twListToDF(tweetData)
54 | airtel = rbind(tweetData, tweetDataFinal)
55 | airtel$service_provider = "Airtel"
56 | #Glo
57 | tweetdetails <- searchTwitter("glo network -from:gloworld", n=1500, lang="en", since = date_time,until= date_time, geocode = '10,9, 200km')# Transform tweets list into a data frame
58 | tweetData_network = twListToDF(tweetData)
59 | tweetdetails <- searchTwitter("glo data -from:gloworld", n=1500, lang="en", since = "2016-05-03",until= date_time, geocode = '10,9, 200km')# Transform tweets list into a data frame
60 | tweetData = append(tweetData, tweetdetails)
61 |
62 | tweetData = twListToDF(tweetData)
63 | tweetDataFinal = rbind(tweetData, tweetData_network)
64 | tweetdetails <- searchTwitter("glo call -from:gloworld", n=1500, lang="en", since = "2016-05-03",until= date_time, geocode = '10,9, 200km')# Transform tweets list into a data frame
65 | tweetData = twListToDF(tweetData)
66 | glo = rbind(tweetData, tweetDataFinal)
67 | glo$service_provider = "Glo"
68 |
69 | final_file = rbind(glo,etisalat,mtn,airtel)
70 | write.csv(final_file,paste0("CMT DATA_",Sys.Date(),".csv"),row.names = F)
--------------------------------------------------------------------------------
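The four provider blocks above repeat the same three searches; a compact sketch of that pattern as a helper (a hypothetical crawl_provider(), built only from the twitteR calls already used in the script):

    #Hypothetical helper: run the "network", "data" and "call" searches for one provider
    crawl_provider <- function(provider, exclude_handle) {
      topics <- c("network", "data", "call")
      frames <- lapply(topics, function(topic) {
        query <- paste0(provider, " ", topic, " -from:", exclude_handle)
        twListToDF(searchTwitter(query, n = 1500, lang = "en",
                                 since = date_time, until = date_time,
                                 geocode = '10,9, 200km'))
      })
      out <- do.call(rbind, frames)
      out$service_provider <- provider
      out
    }

    #e.g. etisalat <- crawl_provider("etisalat", "etisalat_9ja")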
/Football Analysis/scripts/FootballAnalysis - 2017_functions.R:
--------------------------------------------------------------------------------
1 | #Apply moving average
2 |
3 | roll_apply = function(pts, no_of_lags){
4 | if(length(pts) ul > li:nth-child(3) > a')
15 | remDr$mouseMoveToLocation(webElement = clicktype)
16 | clicktype$click()
17 | doc = remDr$getPageSource()[[1]]
18 | current_doc = remDr$getPageSource()[[1]] %>%
19 | read_html(doc) %>%
20 | html_table(fill=T)
21 | current_doc= current_doc[c(1,3,6)]
22 | team_stat= current_doc %>%
23 | Reduce(function(x,y) merge(x,y,by="Team"),.) %>%
24 | select(-R.x,-R.y,-R)
25 | return(team_stat)
26 |
27 | }
28 | percent_to_numberic = function(percent){
29 | require(stringr)
30 | percent = str_replace(percent,"%","") %>%
31 | as.numeric
32 | return(percent/100)
33 |
34 | }
35 | defensive_gathering_pipeline = function(number){
36 | remDr$navigate(paste0("https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/",number,"/England-Premier-League"))
37 | #Go to Team Statistics
38 | clicktype = remDr$findElement(using = "css selector", '#sub-navigation > ul > li:nth-child(3) > a')
39 | remDr$mouseMoveToLocation(webElement = clicktype)
40 | clicktype$click()
41 | clicktype = remDr$findElement(using = "css selector", '#stage-team-stats-options > li:nth-child(2) > a')
42 | remDr$mouseMoveToLocation(webElement = clicktype)
43 | clicktype$click()
44 | Sys.sleep(2)
45 | doc = remDr$findElement(using = "css selector", '#statistics-team-table-defensive')
46 | doc =doc$getElementAttribute("innerHTML")[[1]] %>%
47 | read_html %>%
48 | html_table()
49 | defense_stat= doc[[1]] %>%
50 | select(-R)
51 | return(defense_stat)
52 |
53 | }
54 |
55 | gathering_pipeline = function(type,number){
56 | remDr$navigate(paste0("https://www.whoscored.com/Regions/252/Tournaments/2/Seasons/",number,"/England-Premier-League"))
57 | #Go to Team Statistics
58 | clicktype = remDr$findElement(using = "css selector", '#sub-navigation > ul > li:nth-child(3) > a')
59 | remDr$mouseMoveToLocation(webElement = clicktype)
60 | clicktype$click()
61 | if(type=="Offensive"){
62 | clicktype = remDr$findElement(using = "css selector", '#stage-team-stats-options > li:nth-child(3) > a')
63 | remDr$mouseMoveToLocation(webElement = clicktype)
64 | clicktype$click()
65 | }
66 | else if(type=="Detailed"){
67 | clicktype = remDr$findElement(using = "css selector", '#stage-team-stats-options > li:nth-child(4) > a')
68 | remDr$mouseMoveToLocation(webElement = clicktype)
69 | clicktype$click()
70 | }
71 | else if(type=="Pass Types"){
72 | clicktype = remDr$findElement(using = "css selector", '#stage-situation-stats-options > li:nth-child(2) > a')
73 | remDr$mouseMoveToLocation(webElement = clicktype)
74 | clicktype$click()
75 | }
76 | else if(type=="Card Situations"){
77 | clicktype = remDr$findElement(using = "css selector", '#stage-situation-stats-options > li:nth-child(3) > a')
78 | remDr$mouseMoveToLocation(webElement = clicktype)
79 | clicktype$click()
80 | }
81 | Sys.sleep(2)
82 | if(type=="Offensive"){
83 | doc = remDr$findElement(using = "css selector", '#statistics-team-table-offensive')
84 | }
85 | else if(type=="Detailed"){
86 | doc = remDr$findElement(using = "css selector", '#top-team-stats-summary-grid')
87 | }
88 | else if(type=="Pass Types"){
89 | doc = remDr$findElement(using = "css selector", '#stage-passes-grid')
90 | }
91 | else if(type=="Card Situations"){
92 | doc = remDr$findElement(using = "css selector", '#stage-cards-grid')
93 | }
94 | doc =doc$getElementAttribute("outerHTML")[[1]] %>%
95 | read_html %>%
96 | html_table()
97 | stat= doc[[1]] %>%
98 | select(-R)
99 | return(stat)
100 |
101 | }
102 |
103 | cummean_fn = function(df, no_of_cols,no_of_rolls){
104 | if(nrow(df_split[[df]])% as.data.frame
124 | test.df$rank_pred=rank(-test.df$test_pred,ties.method = "min")
125 | rmse= RMSE(test.df$rank_pred,as.numeric(as.character(test.df$rank)),na.rm = T)
126 | return(rmse)
127 | }
128 |
129 | getID = function(model,newID){
130 | ID=model@model_id
131 | return(ID)
132 | }
--------------------------------------------------------------------------------
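The bodies of roll_apply() and cummean_fn() are truncated in this dump. A minimal sketch of a lag-window rolling average in the spirit of the surviving signature, offered as an assumption about intent rather than a reconstruction of the original code:

    library(zoo)

    #Assumed intent: right-aligned rolling mean of points over the previous no_of_lags matches
    roll_apply <- function(pts, no_of_lags) {
      if (length(pts) < no_of_lags) {
        return(cumsum(pts) / seq_along(pts))   #cumulative mean when there are too few points
      }
      zoo::rollapply(pts, width = no_of_lags, FUN = mean, align = "right", partial = TRUE)
    }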
/Holidays/scripts/Holidays.R:
--------------------------------------------------------------------------------
1 | library(rvest)
2 | library(stringr)
3 | library(dplyr)
4 | library(doParallel)
5 | library(foreach)
6 | library(readr)
7 | library(tidyr)
8 | url1 <- "https://www.timeanddate.com/holidays/"
9 |
10 | links <- url1 %>% read_html %>% html_nodes(".main-content-div a") %>% html_attr("href")
11 | links <- links [4:231]
12 | links <- paste0("https://www.timeanddate.com",links)
13 | country <- str_replace_all(str_replace(links,"https://www.timeanddate.com/holidays/",""),"-"," ")
14 | first_word_cap <- function(word){
15 | require(stringr)
16 | words <- unlist(str_split(word, " "))
17 | words <- lapply(words, function(x) str_replace(x,"^\\w",toupper(substr(x,1,1))))
18 | word <- paste(words, collapse = " ")
19 | }
20 | country <-unlist(lapply(country,first_word_cap))
21 |
22 | no_of_holiday = function(x){
23 | require(rvest)
24 | require(dplyr)
25 | ifelse(is.list(try(x %>%
26 | read_html %>%
27 | html_node(".zebra") %>%
28 | html_table %>%
29 | filter (grepl("Holiday",`Holiday type`,ignore.case = T)),silent = T))
30 | ,nrow(x %>% read_html %>% html_node(".zebra") %>% html_table %>% filter (grepl("Holiday",`Holiday type`,ignore.case = T)))-1,0)
31 | }
32 | cores=detectCores()-1
33 | cl = makeCluster(cores)
34 | registerDoParallel(cl)
35 | no_of_holidays = unlist(foreach(n=links,.packages=c("rvest","dplyr")) %dopar% no_of_holiday(x=n))
36 | country_holidays <- cbind(data.frame(country),data.frame(no_of_holidays))
37 | hist(country_holidays$no_of_holidays)
38 |
39 | #Investigated outliers and realized that some countries have county and regional holidays.
40 | #I wanted to focus on more general holidays, so I extracted countries that had a column stating in what region the holiday is observed.
41 | outliers = function(x) {
42 | if("Where it is observed" %in% names(x %>%
43 | read_html %>%
44 | html_node(".zebra") %>%
45 | html_table) == T){
46 | x
47 | }
48 |
49 | }
50 | outliers = unlist(foreach(n=links,.packages=c("rvest","dplyr")) %dopar% outliers(x=n))
51 |
52 | #After extracting that, I extracted the new number of holidays.
53 | updated_holidays= function (x){
54 | nrow(x %>%
55 | read_html %>%
56 | html_node(".zebra") %>%
57 | html_table %>%
58 | filter( grepl('holiday',`Holiday type`,ignore.case =T)) %>%
59 | filter(Encoding(`Where it is observed`) == "UTF-8"))
60 | }
61 | updated_holidays = unlist(foreach(n=outliers,.packages=c("rvest","dplyr")) %dopar% updated_holidays(x=n))
62 |
63 | #Housekeeping... getting the country names and putting them in a consistent format
64 | outliers_country <- str_replace(outliers,"https://www.timeanddate.com/holidays/","")
65 | outliers_country <- str_replace_all(outliers_country,"-"," ")
66 | outliers_country <-unlist(lapply(outliers_country,first_word_cap))
67 |
68 | #Updated the dataframe with the new holidays
69 | country_holidays$no_of_holidays[country_holidays$country %in% outliers_country]=unlist(lapply(1:10,
70 | function(x)
71 | country_holidays$no_of_holidays[country_holidays$country==outliers_country[x]]=updated_holidays[x])
72 | )
73 | #remove countries with no holidays! 0_0 (Actually not countries. just the UN)
74 | country_holidays = subset(country_holidays,no_of_holidays>0)
75 |
76 | GDP.Data <- read.csv("~/Data Analysis/Datasets/GDP Data.csv")
77 | ###Merging the GDP data frame with the holiday dataframe
78 | #At this point, I should point out that I really had to clean the country name formats outside R
79 | #It would have been terribly cumbersome doing it any other way
80 | country_holidays=rename(country_holidays,Country_Name = country)
81 | country_data = merge(country_holidays,GDP.Data,by = "Country_Name",all.y = T)
82 | country_data$no_of_holidays[210] = 9
83 | country_data$no_of_holidays[213] = 13
84 | country_data$no_of_holidays[214] = 10
85 | country_data$no_of_holidays[219] = 15
86 | country_data = country_data[complete.cases(country_data),]
87 |
88 | #Health Spend
89 | health_spend = read_csv("~/Data Analysis/Datasets/Health expenditure, total (current US).csv")
90 |
91 | #Reshaping the data into a better format
92 | health_spend=health_spend %>% gather(Year, GDP,YR2007:YR2015)
93 | health_spend$Year = str_replace(health_spend$Year,"YR",'')
94 | avg_health_spend=health_spend %>%
95 | filter(Year != 2015) %>%
96 | group_by(Country_Name) %>%
97 | summarise(Average_Total = mean(GDP))
98 | country_data = merge(avg_health_spend,country_data,by = "Country_Name",all.y = T)
99 | country_data = country_data%>% select(Country_Name,Average,no_of_holidays,Avg_seven)
100 |
101 | Health_expenditure_PPP <- read.csv("~/Data Analysis/Datasets/Health expenditure per capita, PPP.csv")
102 | Health_expenditure_PPP=Health_expenditure_PPP %>% gather(Year, GDP,YR2007:YR2014)
103 | Health_expenditure_PPP$Year = str_replace(Health_expenditure_PPP$Year,"YR",'')
104 | Health_expenditure_PPP = rename(Health_expenditure_PPP,Country_Name=Country.Name)
105 | avg_health_PPP= Health_expenditure_PPP %>%
106 | filter(Year != 2015) %>%
107 | group_by(Country_Name) %>%
108 | summarise(Average_PPP = mean(GDP))
109 |
110 | country_data = merge(avg_health_PPP,country_data,by = "Country_Name",all.y = T)
111 | country_data = country_data%>% select(Country_Name,Average,no_of_holidays,Avg_seven,Average_PPP)
112 |
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/scripts/billionaire_functions.R:
--------------------------------------------------------------------------------
1 | #All Functions for Billionaires.R
2 | get_historical_ranks = function(year){
3 | year_update = ifelse(year <10,paste0(0,year),year)
4 | url = paste0("http://stats.areppim.com/listes/list_billionairesx",year_update,"xwor.htm")
5 | tbl="http://stats.areppim.com/listes/list_billionairesx17xwor.htm" %>%
6 | read_html() %>%
7 | html_node("body > table") %>%
8 | html_table(trim = T)
9 | names(tbl) = tbl[2,]
10 | tbl = tbl[3:nrow(tbl),]
11 | tbl = tbl[,-6]
12 | names(tbl) = c("Rank","Name","Citizenship","Age","Net Worth")
13 | tbl=tbl %>% mutate(Year= paste0(20,year_update))
14 | }
15 |
16 | joining_year = function(row){
17 | yr=names(full_list)[(which(is.na(full_list[row,3:13])==F))[1]+2]
18 | return(yr)
19 | }
20 |
21 | leavers = function(row){
22 | total =0
23 | for(i in 1:(length(row)-1)){
24 | if(is.na(as.character(row[i+1])) ==T && is.na(as.character(row[i])) ==F ){
25 | total = total+1
26 | }
27 | }
28 | return(total)
29 | }
30 |
31 | comers = function(row){
32 | total =0
33 | first_time = min(which(is.na(row)==F))
34 |
35 | if (first_time>=10){
36 | return(0)
37 | }
38 | else{
39 | for(i in first_time:(length(row)-1)){
40 | if(is.na(as.character(row[i+1])) ==F && is.na(as.character(row[i])) ==T ){
41 | total = total+1
42 | }
43 | }
44 | }
45 | return(total)
46 | }
47 |
48 | max_distance = function(row){
49 | lst=which(is.na(row)==F)
50 | lt = NULL
51 | if( length(lst)==1){
52 | return(0)
53 | }
54 | for (i in 1:(length(lst)-1)){
55 | h= lst[i+1]-(lst[i]+1)
56 | lt = append(lt,h)
57 | }
58 | return(max(lt))
59 | }
60 |
61 |
62 | last_year_on_list = function(row){
63 | years_onList=which(is.na(row)==F)
64 | year_left = names(full_list[,5:15])[max(years_onList)]
65 | return(year_left)
66 |
67 | }
68 | last_year_left = function(row){
69 | lst=NULL
70 | if (length(which(is.na(row)==T))==0){
71 | return ("")
72 | }
73 | else{
74 | for (i in 2:length(row)){
75 | if (is.na(row[i])==T&is.na(row[i-1])==F){
76 | lst = append(lst,i)
77 | }
78 | }
79 | year_left = names(full_list[,5:15])[max(lst)]
80 | return(year_left)
81 | }
82 | }
83 |
84 | #Make the names into URLs
85 | names_to_URLs = function(name){
86 | name = as.character(name)
87 | name = str_trim(name)
88 | name=str_replace_all(name,"[^0-9A-Za-z\\- ]",'')
89 | name=str_replace_all(name,"\\s+",' ')
90 | name = str_replace_all(name," ","-")
91 | name=tolower(name)
92 | url = paste0("https://www.forbes.com/profile/",name,"/")
93 | return(url)
94 | }
95 |
96 | #Making a function to display name backwards e.g Joy Jones as Jones Joy
97 | #This will come in handy when getting the billionaire's info
98 | backward_names = function(name){
99 | name = unlist(str_split(name," "))
100 | name = rev(name)
101 | name = paste(name,collapse = ' ')
102 | name = str_trim(name)
103 | name=names_to_URLs(name)
104 | return(name)
105 | }
106 |
107 |
108 | get_billionaire_info = function(names_of_billionaires){
109 | url=names_to_URLs(names_of_billionaires)
110 | data=try(url %>%
111 | read_html %>%
112 | html_nodes(".stats li") %>%
113 | html_text(trim=T)
114 | ,silent = T)
115 | if(class(data) =="try-error"){
116 | url = backward_names(names_of_billionaires)
117 | data=try(url %>%
118 | read_html %>%
119 | html_nodes(".stats li") %>%
120 | html_text(trim=T)
121 | ,silent = T)
122 | }
123 | sector = ifelse(length(data[grepl("Source of Wealth",data,T)])>0,data[grepl("Source of Wealth",data,T)],NA)
124 | education = ifelse(length(data[grepl("Education",data,T)])>0,data[grepl("Education",data,T)],NA)
125 | sector = str_replace_all(sector,".*\t|.*\n","")
126 | education = str_replace_all(education,".*\t|.*\n","")
127 | return(c(sector,education))
128 | }
129 |
130 | education_columns_creator=function(df){
131 | df$Self_Made = ifelse(grepl("self made",df$detail1,ignore.case = T),"Y","N")
132 | df$dropped_out = ifelse(grepl("drop out",df$detail2,ignore.case = T),"Y",ifelse(is.na(df$detail2),NA,"N"))
133 | df$bachelors_degree = ifelse(grepl("bachelor",df$detail2,ignore.case = T),"Y",ifelse(is.na(df$detail2),NA,"N"))
134 | df$masters_degree = ifelse(grepl("master of arts|master of science",df$detail2,ignore.case = T),"Y",ifelse(is.na(df$detail2),NA,"N"))
135 | df$MBA = ifelse(grepl("Master of Business Administration",df$detail2,ignore.case = T),"Y",ifelse(is.na(df$detail2),NA,"N"))
136 | df$phd_or_professional_degree = ifelse(grepl("doctor|llb",df$detail2,ignore.case = T),"Y",ifelse(is.na(df$detail2),NA,"N"))
137 | return(df)
138 | }
139 |
140 | #Extract Model ID from model
141 | getID = function(model){
142 | ID=model@model_id
143 | return(ID)
144 | }
145 |
146 | #Function to check validation set accuracy
147 | test_accuracy = function(model){
148 | table_accuracy=h2o.hit_ratio_table(model,valid = T)[1,2]
149 | return(table_accuracy)
150 | }
151 |
152 |
153 | model_type.H2OMultinomialModel = function(x, ...) {
154 | return("classification")
155 |
156 | }
157 |
158 | predict_model.H2OMultinomialModel = function(x, newdata, type, ...) {
159 | # Function performs prediction and returns dataframe with Response
160 | #
161 | # x is h2o model
162 | # newdata is data frame
163 | # type is only setup for data frame
164 |
165 | pred <- h2o.predict(x, as.h2o(newdata))
166 |
167 | # return classification probabilities only
168 | return(as.data.frame(pred[,-1]))
169 |
170 | }
171 |
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/scripts/Billionaires.R:
--------------------------------------------------------------------------------
1 | library(RSelenium)
2 | library(rvest)
3 | library(stringr)
4 | library(dplyr)
5 | library(lubridate)
6 | library(readr)
7 | library(purrr)
8 | library(lime)
9 | library(tidyr)
10 | #List of billionaires from 2007-2017
11 |
12 | years = 7:17
13 |
14 |
15 | full_list = lapply(years,get_historical_ranks)
16 | full_list = do.call("rbind",full_list)
17 |
18 | #At this point I had to do some Excel work to harmonize the names, e.g. in some years Bill Gates appears as William Gates III
19 | #This is where I spent most of my time
20 | #Mark X for each billionaire in each year (this will come in handy later)
21 | full_list = full_list %>% mutate(markings= "X")
22 |
23 | #Three billionaires have exactly the same names - Wang Wei, Robert Miller and Jim Davis
24 | full_list[9865,2] = "Jim Davis, new balance"
25 | full_list[12711,2] ="Jim Davis, recruitment"
26 | full_list[11497,2] = "Jim Davis, new balance"
27 | full_list[11030,2] ="Jim Davis, recruitment"
28 | full_list[9783,2] = "Wang Wei, delivery"
29 | full_list[10981,2] = "Wang Wei, computer hardware"
30 |
31 | #Use spread to make the year columns - the markings are useful here
32 | full_list=full_list %>%
33 | select(Name_updated,Citizenship,Year,markings) %>%
34 | spread(Year,markings)
35 |
36 | #Create year billionaire joined
37 |
38 | full_list$joining_year = unlist(lapply(1:nrow(full_list),joining_year))
39 |
40 | #No of time billionaire has been on the list
41 | full_list=full_list %>%
42 | mutate(no_of_times_on_list =rowSums(is.na(full_list[,3:13])==F))
43 |
44 | #No of times billionaires left
45 |
46 |
47 | full_list$no_of_times_left =apply(full_list[,3:13],1,leavers)
48 |
49 | #No of times billionaires have comeback to the list
50 |
51 |
52 |
53 | full_list$no_of_comebacks=apply(full_list[,3:13],1,comers)
54 |
55 | #Longest time away from the list
56 |
57 | full_list$longest_time_away = apply(full_list[,3:13],1,max_distance)
58 |
59 | #Last year on list and last year off it
60 |
61 | full_list$last_off_list = apply(full_list[,5:15],1,last_year_left)
62 |
63 |
64 | #Make the names into URLs
65 |
66 |
67 | #Making a function to display name backwards e.g Joy Jones as Jones Joy
68 | #This will come in handy when getting the billionaire's info
69 |
70 | #Get the billionaires info
71 |
72 |
73 | #Setting up the columns for the billionaires' details
74 | full_list$detail1 =""
75 | full_list$detail2 =""
76 |
77 | #I opted for a for loop as opposed to the sexy apply function as this gives me the ability
78 | #to store intermediate results
79 | for (i in 1:nrow(full_list)){
80 | dat=get_billionaire_info(full_list$Name[i])
81 | #dat = list(dat)
82 | full_list$detail1[i]=dat[1]
83 | full_list$detail2[i]=dat[2]
84 | print(paste0(round((i/nrow(full_list))*100,2),"% done at ",i))
85 | }
86 |
87 | #Create columns for educational information
88 |
89 | full_list = education_columns_creator(full_list)
90 | full_list$detail1 = unlist(full_list$detail1)
91 | full_list$detail2 = unlist(full_list$detail2)
92 |
93 | #Data Munging complete! :)
94 |
95 | #Clusters were made in Tableau using Sets and imported back into the csv file
96 | #================================Machine Learning======================================#
97 | library(h2o)
98 | library(plyr)
99 | library(dplyr)
100 | library(purrr)
101 | library(scales)
102 | library(lime)
103 | #Initialize H2o cluster
104 | h2o.init()
105 |
106 | dat1 = full_list %>%
107 | filter(Cluster !="The Newbie")
108 |
109 | data = dat1 %>%
110 | select(-Detail1,-Detail2) %>%
111 | as.h2o()
112 |
113 | splits = h2o.splitFrame(data,ratios = c(0.6,0.2),destination_frames = c("train", "valid", "test"), seed = 1234)
114 | train = h2o.getFrame("train")
115 | val = h2o.getFrame("valid")
116 | test = h2o.getFrame("test")
117 |
118 |
119 |
120 | #Column index numbers
121 | features=c(5,7,9,18,25,26,28)
122 | #c(4:11,18:19,25:26,28)
123 |
124 | #
125 | #2,4:11,18:19,25:26,29
126 | response=27
127 |
128 | #Models we would like to train and test the accuracy
129 | models = c("h2o.randomForest", "h2o.deeplearning" ,"h2o.gbm")
130 | names_of_models = c("Random Forest", "Deep Learning" ,"GBM")
131 |
132 | #The map function will invoke the functions in the "models" vector on the parameters below
133 | list_of_models =invoke_map(models, x=features,y=response,training_frame =train,validation_frame = val)
134 |
135 |
136 | #Store IDs for retrieval later
137 | IDs = list_of_models %>%
138 | map_chr(getID)
139 |
140 |
141 | #check accuracy on validation set
142 | list_of_models %>%
143 | map_dbl(test_accuracy) %>%
144 | set_names(.,names_of_models)
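#Illustrative aside (not part of the original analysis): the winning model could
#also be picked programmatically from the same validation-accuracy vector
best_id = IDs[which.max(list_of_models %>% map_dbl(test_accuracy))]
best_model = h2o.getModel(best_id)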
145 | #GBM performed best so let's compute that variable importance
146 | model_gbm = h2o.getModel(IDs[3])
147 | h2o.varimp_plot(h2o.getModel(IDs[3]))
148 | #Store variable importance in csv for visualization
149 | var_imp_rf=h2o.varimp(model_gbm)
150 | write.csv(var_imp_rf,"Variable Importance GBM.csv",row.names = F)
151 | model_gbm=h2o.loadModel("C:/Users/rose.anwuri/Documents/TheArtandScienceofData/Consitent Billionaire Guide/app/GBM_Model")
152 | features_lime=model_gbm@parameters$x
153 | train_lime = train[,c(features_lime,"Cluster")] %>% h2o.na_omit()
154 | test_lime = val[,c(features_lime,"Cluster")]%>% h2o.na_omit()
155 |
156 | predict_model(x = model_gbm, newdata =test_lime[,-9], type = 'raw') %>%
157 | tibble::as_tibble()
158 | train_lime_df=as.data.frame(train_lime[,-9])
159 | train_lime_df=train_lime_df[complete.cases(train_lime_df),]
160 |
161 | explainer <- lime::lime(
162 | train_lime_df,
163 | model = model_gbm,
164 | bin_continuous = FALSE)
165 | test_lime_df = as.data.frame(test_lime) %>% filter(Cluster=="The Hustler") %>%as_tibble()
166 | test_lime_df=test_lime_df[complete.cases(test_lime_df),]
167 | explanation <- lime::explain(
168 | as.data.frame(test_lime_df[,-9]),
169 | explainer = explainer,
170 | n_labels = 1,
171 | n_features = 4,
172 | kernel_width = 0.5)
173 | plot_features(explanation)
174 |
--------------------------------------------------------------------------------
/Football Analysis/scripts/FootballAnalysis - 2017.R:
--------------------------------------------------------------------------------
1 | library(plyr)
2 | library(dplyr)
3 | library(stringr)
4 | library(RSelenium)
5 | library(rvest)
6 | library(XML)
7 | library(zoo)
8 | library(h2o)
9 | library(purrr)
10 | library(caret)
11 |
12 | options(stringsAsFactors = F)
13 | #Get standard league table from skysports.com
14 | Dat = NULL
15 | for (i in 2009:2016){
16 | teamStats = readHTMLTable(paste0('http://www.skysports.com/premier-league-table/',i))
17 | names(teamStats) = NULL
18 | teamStats = data.frame(teamStats)
19 | teamStats$Year = i
20 | Dat = rbind(Dat,teamStats)
21 | print(i)
22 | flush.console()
23 | }
24 |
25 | ####Housekeeping!#####
26 | Dat= subset(Dat, Year !=2006)
27 | Dat$X. = NULL
28 | Dat$Last.6 = NULL
29 | Dat[,2:9] = apply(Dat[,2:9],2,function(x) as.integer(as.character(x)))
30 | Dat$Team = factor(str_replace_all(as.character(Dat$Team),pattern = "[*]",''))
31 | Dat$Team = str_trim(Dat$Team)
32 |
33 | Dat1=Dat
34 | Dat$rank = rep(1:20,length.out=320)
35 |
36 | epl_db = ddply(Dat, ~Team,transform, W=roll_apply(W,2),
37 | L=roll_apply(L,2),
38 | Cummulative.Pts=roll_apply(Pts,2),
39 | Cummulative.rank=roll_apply(rank,2),
40 | GD=roll_apply(GD,2),
41 | D=roll_apply(D,2),
42 | F=roll_apply(F,2),
43 | A=roll_apply(A,2),
44 | Pts = lead(Pts),
45 | rank=lead(rank))
46 |
47 | ####Rendering the whoscored.com website using a remote Selenium driver and getting the data####
48 |
49 | rD=rsDriver()
50 | remDr = rD[["client"]]
51 |
52 | #Whoscored's weird way of encoding their year variables. These codes correspond to 2009-2016
53 | year_coding=c(1849,2458,2935,3389,3853,4311,5826,6335)
54 |
55 | remDr$setImplicitWaitTimeout(30000)
56 |
57 | team_statistics = ldply(year_coding,summary_gathering_pipeline)
58 |
59 | team_statistics[,14:16] = apply(team_statistics[,14:16],2,percent_to_numberic)
60 |
61 | defense_statistics = ldply(year_coding,defensive_gathering_pipeline)
62 |
63 | offense_stat = ldply(year_coding,function(x) gathering_pipeline(type = "Offensive",x))
64 | detailed_stat = ldply(year_coding,function(x) gathering_pipeline(type = "Detailed",x))
65 | pass_stat = ldply(year_coding,function(x) gathering_pipeline(type = "Pass Types",x))
66 | card_stat = ldply(year_coding,function(x) gathering_pipeline(type = "Card Situations",x))
67 |
68 |
69 | remDr$close()
70 |
71 | #Adding a Year column to the final table
72 | team_statistics$Year = rep(2009:2016, each=20, length.out = 160)
73 | defense_statistics$Year = rep(2009:2016, each=20, length.out = 160)
74 | offense_stat$Year = rep(2009:2016, each=20, length.out = 160)
75 | detailed_stat$Year = rep(2009:2016, each=20, length.out = 160)
76 | pass_stat$Year = rep(2009:2016, each=20, length.out = 160)
77 | card_stat$Year = rep(2009:2016, each=20, length.out = 160)
78 | offense_stat$`Shots pg` = NULL
79 | names(defense_statistics)[2] = "Shots.conceded.pg"
80 | in_c = left_join(team_statistics,defense_statistics,by=c("Team","Year"))
81 | in_c = left_join(in_c,offense_stat,by=c("Team","Year"))
82 | in_c = left_join(in_c,pass_stat,by=c("Team","Year"))
83 | epl_database = left_join(in_c,card_stat,by=c("Team","Year"))
84 | epl_database=epl_database %>%
85 | select(-matches("Rating"),-matches("\\X."),-X)
86 | rm(in_c)
87 | #Create cumulative means for all numerical variables
88 | df_split=split(epl_database,epl_database$Team)
89 | epl_data =ldply(names(df_split),function(x) cummean_fn(x,c(2:15,17:32),2))
90 | epl_data$Team = str_trim(epl_data$Team)
91 |
92 | ####Housekeeping! Again! I am about to join two tables based on their team names. Hence, I have to ensure that they are of the same format####
93 | epl_db$Team = as.character(epl_db$Team)
94 | epl_db$Team[epl_db$Team == "Birmingham City"] = "Birmingham"
95 | epl_db$Team[epl_db$Team == "Blackburn Rovers"] = "Blackburn"
96 | epl_db$Team[epl_db$Team == "Bolton Wanderers"] = "Bolton"
97 | epl_db$Team[epl_db$Team == "Cardiff City"] = "Cardiff"
98 | epl_db$Team[epl_db$Team == "Norwich City"] = "Norwich"
99 | epl_db$Team[epl_db$Team == "Swansea City"] = "Swansea"
100 | epl_db$Team[epl_db$Team == "Stoke City"] = "Stoke"
101 | epl_db$Team[epl_db$Team == "Tottenham Hotspur"] = "Tottenham"
102 | epl_db$Team[epl_db$Team == "West Ham United"] = "West Ham"
103 | epl_db$Team[epl_db$Team == "Wigan Athletic"] = "Wigan"
104 | epl_db$Team[epl_db$Team == "Hull City"] = "Hull"
105 | epl_db$Team[epl_db$Team == "Leicester City"] = "Leicester"
106 |
107 | #Merge final tables from skysports and whoscored together and remove duplicate columns
108 | epl_database = right_join(epl_db,epl_data,by = c("Year","Team"))
109 | write.csv(epl_database,"Football Analysis/data/epl_data.csv",row.names = F)
110 |
111 | #####Preprocessing#####
112 | h2o.init()
113 | model_data = epl_database %>%
114 | select(-Pl)
115 |
116 | train = model_data %>% filter(Year<2016) %>%
117 | as.h2o %>%
118 | h2o.na_omit()
119 | test = model_data %>% filter(Year == 2016) %>%
120 | as.h2o
121 |
122 | train$Team = as.factor(train$Team)
123 | test$Team = as.factor(test$Team)
124 | cor=h2o.cor(model_data[,2:42] %>% as.h2o %>% h2o.na_omit)
125 | row.names(cor) = colnames(cor)
126 | cor$Pts = abs(cor$Pts)
127 | features = c(2,4,7,11:13)
128 | #7,13,6,11
129 |
130 | response = 8
131 |
132 | #####Model Selection########
133 | models = c("h2o.glm","h2o.gbm","h2o.deeplearning","h2o.randomForest")
134 |
135 | list_of_models = invoke_map(models,x=features,y=response,training_frame =train, validation_frame = test)
136 |
137 | #Extract Model ID from model
138 |
139 | #Store IDs for retrieval later
140 | IDs = list_of_models %>%
141 | map_chr(getID)
142 |
143 | #######Model Validation########
144 | list_of_models %>%
145 | map_dbl(test.r2) %>%
146 | set_names(.,models)
147 |
148 | list_of_models %>%
149 | map_dbl(test.rmse) %>%
150 | set_names(.,models)
151 |
152 |
153 |
154 |
155 | #######Model Tuning#######
156 |
157 | ##GLM###
158 | hyper_params =list(alpha=seq(0,1,0.1))
159 |
160 |
161 | glm_Grid = h2o.grid(algorithm = "glm",
162 | grid_id = "glm_tuning1",
163 | hyper_params = hyper_params,
164 | x=features,
165 | y=response,training_frame = train,
166 | validation_frame = test,
167 | seed=5.388119e+18,lambda_search=T)
168 |
169 |
170 | grid_sorted=h2o.getGrid("glm_tuning1",sort_by = "r2",decreasing = T)
171 | rmse=ldply(grid_sorted@model_ids %>% unlist, function(x) h2o.getModel(x) %>% test.rmse)
172 | cbind(alpha=grid_sorted@summary_table$alpha,rmse=rmse) %>% arrange(V1)
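#Illustrative aside (not in the original tuning run): since h2o.getGrid returns
#its models already sorted by the chosen metric, the best grid model can be
#pulled out directly
best_glm = h2o.getModel(grid_sorted@model_ids[[1]])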
173 |
174 | #GBM
175 | hyper_params =list(learn_rate=seq(0.2,0.3,0.01),ntrees=seq(5,100,5),min_rows=1:10)
176 | gbm_Grid = h2o.grid(algorithm = "gbm",
177 |
178 | hyper_params = hyper_params,
179 | x=features,seed=8.973598e+18,
180 | y=response,training_frame = train,
181 | validation_frame = test
182 |
183 | )
184 |
185 | grid_sorted=h2o.getGrid(gbm_Grid@grid_id,sort_by = "r2",decreasing = T)
186 | rmse=ldply(grid_sorted@model_ids %>% unlist, function(x) h2o.getModel(x) %>% test.rmse)
187 | cbind(val=grid_sorted@summary_table[,1],rmse=rmse) %>% arrange(V1) %>%top_n(-5)
188 | model_gbm=h2o.getModel("gbm_grid7_model_0")
189 | hyper_params =list(col_sample_rate_per_tree=seq(0.1,1,0.1),min_rows=1:10,ntrees=seq(5,100,5),col_sample_rate_change_per_level=seq(1,2,0.1))
190 | rf_Grid = h2o.grid(algorithm = "randomForest",
191 | grid_id = "rfD_tuning8",
192 | hyper_params = hyper_params,
193 | x=features,
194 | y=response,training_frame = train,
195 | validation_frame = test,
196 | seed=-4.02063e+18
197 |
198 | )
199 |
200 |
201 | rmse=ldply(grid_sorted@model_ids %>% unlist, function(x) h2o.getModel(x) %>% test.rmse)
202 | cbind(val=grid_sorted@summary_table[,1],rmse=rmse) %>% arrange(V1) %>%top_n(-5)
203 |
204 |
205 |
206 | #####Final Model######
207 | #What if I told you all that tuning was of no use?
208 | model_gbm=h2o.gbm(x=features,
209 | y=response,training_frame = train,
210 | validation_frame = test
211 | )
212 | test$predict = h2o.predict(model_gbm,test)
213 | test.df = as.data.frame(test)
214 | test.df$pred_rank = rank(-test.df$predict,ties.method = "first")
215 |
--------------------------------------------------------------------------------
/Others/CBN.R:
--------------------------------------------------------------------------------
1 | library(h2o)
2 | library(shiny)
3 | library(shinyjs)
4 | library(shinydashboard)
5 | library(shinythemes)
6 | library(plyr)
7 | library(dplyr)
8 | library(wesanderson)
9 | library(scales)
10 | library(lubridate)
11 | #library(shinysky)
12 | library(extrafont)
13 | library(stringr)
14 | h2o.init()
15 |
16 | df = read.csv("C:/Users/rose.anwuri/OneDrive/TheArtandScienceofData/billionaire_data.csv")
17 | dat = df %>%
18 | filter(Cluster !="The Newbie") %>%
19 | filter(is.na(Age)==F)
20 |
21 | #model = h2o.loadModel("C:\\Users\\rose.anwuri\\Documents\\TheArtandScienceofData\\DRF_model_R_1501438266398_1")
22 | Countries=levels(dat$Country)
23 | Sectors = levels(dat$Sector)
24 | Relations = levels(dat$relation)
25 | ui = shinyUI(
26 | navbarPage("Billion Dollar Questions",inverse = F,collapsible = T,fluid = T,
27 | theme = shinytheme("flatly"),
28 | tabPanel("What Type of Billionaire Are You?", icon = icon("money"),
29 | sidebarLayout(position = 'left',
30 | sidebarPanel(id = "sidebar",
31 | selectizeInput("Country",label = h4("What country are you from?",style ="font-size: 12pt;"),choices=Countries),
32 | numericInput('Age',h4("How Old Are You?",style ="font-size: 12pt;"),value = 20,min = 12,max = 100),
33 | selectizeInput("selfMade",h4("Do you have your own company (or plan to have one)?",style ="font-size: 12pt;"),choices = c("Yes","No")),
34 | conditionalPanel("input.selfMade=='Yes'",selectizeInput("relation",h4("Choose one below that best describes your role in the business:",style ="font-size: 12pt;"),choices = Relations)),
35 | conditionalPanel("input.selfMade=='No'",selectizeInput("buffer",h4("That's okay. Think of a business where you are likely to make money from. What best describes your role in that company?",style ="font-size: 12pt;"),choices = Relations)),
36 | numericInput("founding_year",h4("When was/will this business (be) established?",style ="font-size: 12pt;"),value = 1999,min=1600,max=year(Sys.Date())+10),
37 | selectizeInput("Sector",h4("What Sector is this business?",style ="font-size: 12pt;"),choices = Sectors),
38 | selectizeInput("Bsc",h4("Do you have a Bachelor's Degree?",style ="font-size: 12pt;"),choices = c("Yes","No")),
39 | tags$head(
40 | tags$style(HTML('#run{border-radius: 0px;}'))
41 | ),
42 | div(actionButton("run",div("PREDICT",icon("flask"),style="text-align:center;font-size:10pt;"),styleclass = "success",size = "mini",css.class = "z-depth-5"),style="text-align:left;")
43 |
44 |
45 | ),
46 | mainPanel(position="left",
47 | br(),
48 |
49 | busyIndicator(text = h4("Running Model...",style="font-size: 40px; font-family:Papyrus;"),img = "shinysky/busyIndicator/Loading3.gif"),
50 |
51 | textOutput("prefix_final"),
52 | div(imageOutput("image_final"), style="text-align: center;"),
53 | textOutput("prediction_final"),
54 | br(),
55 | tags$head(tags$style("#prefix_final{font-size: 40px; text-align:center;}")),
56 | tags$head(tags$style("#prediction_final{font-size: 40px; text-align:center;}")),
57 | conditionalPanel("typeof output.image_final !== 'undefined'",div(uiOutput("urlInput"),style="text-align:center;"))
58 |
59 | ))),
60 | tabPanel("About", icon = icon("info"),
61 |
62 | box(width = 12,title= h2(strong("The Art and Science of Data",img(src="Typing.gif",height=60)),style= "font-size: 36pt;text-align:center; font-family:Papyrus; color:#009688;"),
63 | br(),
64 | div("This application was humbly created by me (Rosebud Anwuri). It's purely for fun and in no way guarantees you'd become a billionaire (I'm sorry). If you like this and would like to see more of the stuff I work on, you can visit my blog here:",a("The Art and Science of Data.",style= "font-size: 16pt; display: inline;color:#009688;",href="http://theartandscienceofdata.wordpress.com",target="_blank"),"My blog is mostly focused on the application of Data Science to everyday life and culture to make it more accessible, less technical and much more fun to a wider audience. Feel free to add a comment or drop me a message for questions or suggestions. Thank you!",style= "font-size: 14pt; font-family:Helvetica;")
65 | ))
66 | ))
67 |
68 |
69 |
70 |
71 |
72 |
73 | server = shinyServer(function(input, output, session){
74 | category_predictors = function(){
75 | self_made=ifelse(input$selfMade=="Yes","Y","N")
76 | year_born = year(Sys.Date())-input$Age
77 |     age_at_start = ifelse(input$founding_year
--------------------------------------------------------------------------------
/Football Analysis/scripts/FootballAnalysis - 2016.R:
--------------------------------------------------------------------------------
46 | remDr$mouseMoveToLocation(webElement = clicktype)
47 | clicktype$click()
48 | clicktype = remDr$findElement(using = "css selector", '#sub-navigation > ul:nth-child(1) > li:nth-child(3) > a:nth-child(1)')
49 | remDr$mouseMoveToLocation(webElement = clicktype)
50 | clicktype$click()
51 | doc = remDr$getPageSource()[[1]]
52 | current_doc = read_html(doc)
53 | firstTable = htmlParse(remDr$getPageSource()[[1]])
54 |
55 | #Housekeeping!
56 | v=readHTMLTable(firstTable, as.data.frame = T)
57 | v= v[c(1,3,6)]
58 | names(v) = NULL
59 | v=lapply(v, function(x){data.frame(x)})
60 | testData =Reduce(function(x,y) merge(x,y,by="Team", all = T),v)
61 | testData[,2:15] = apply(testData[,2:15],2,function(x) as.numeric(as.character(x)))
62 | testData$RedCards = substring(as.character(testData$Discipline),nchar(as.character(testData$Discipline)))
63 | testData$YellowCards = substring(as.character(testData$Discipline),1,nchar(as.character(testData$Discipline))-1)
64 | testData$Discipline = NULL
65 |
66 |
67 | #Defense Table
68 | clicktype = remDr$findElement(using = "css selector", '#stage-team-stats-options > li:nth-child(2) > a:nth-child(1)')
69 |
70 | remDr$mouseMoveToLocation(webElement = clicktype)
71 | clicktype$click()
72 | webElem = remDr$findElement(using = "css selector", '#statistics-team-table-defensive')
73 |
74 |   #Dealing with the web can be a real pain, especially living in a place with terrible internet
75 |   #This area is for error handling. Whenever the table is not populated but instead has class try-error (which means the read failed), we retry all the steps for getting that table until it works
76 |   #Crude, I know - I will keep thinking of ways to refine this.
77 | webElemtext = try(webElem$getElementAttribute("outerHTML")[[1]])
78 | defenseTable = try(readHTMLTable(webElemtext, header = T,as.data.frame = T)[[1]])
79 | while (class(defenseTable) == "try-error"){
80 | clicktype = remDr$findElement(using = "css selector", '#stage-team-stats-options > li:nth-child(2) > a:nth-child(1)')
81 | remDr$mouseMoveToLocation(webElement = clicktype)
82 | clicktype$click()
83 | webElem = remDr$findElement(using = "css selector", '#statistics-team-table-defensive')
84 | webElemtext = try(webElem$getElementAttribute("outerHTML")[[1]],silent = F)
85 | defenseTable = try(readHTMLTable(webElemtext, header = T,as.data.frame = T)[[1]])
86 | }
87 |
88 | defenseTable[,3:8] = apply(defenseTable[,3:8],2,function(x) as.numeric(x))
89 | bigTable = merge(testData,defenseTable,by = "Team",all = T)
90 |
91 |
92 | #Offensive
93 | clicktype1 = remDr$findElement(using = "css selector", '#stage-team-stats-options > li:nth-child(3) > a:nth-child(1)')
94 | remDr$mouseMoveToLocation(webElement = clicktype1)
95 | clicktype1$click()
96 | webElem1 = remDr$findElement(using = "css selector", '#statistics-team-table-offensive')
97 | webElemtext = try(webElem1$getElementAttribute("outerHTML")[[1]],silent = F)
98 | offenseTable = try(readHTMLTable(webElemtext, header = T,as.data.frame = T)[[1]])
99 | while(class(offenseTable) == "try-error"){
100 | clicktype1 = remDr$findElement(using = "css selector", '#stage-team-stats-options > li:nth-child(3) > a:nth-child(1)')
101 | remDr$mouseMoveToLocation(webElement = clicktype1)
102 | clicktype1$click()
103 | webElem1 = remDr$findElement(using = "css selector", '#statistics-team-table-offensive')
104 | webElemtext = try(webElem1$getElementAttribute("outerHTML")[[1]], silent = F)
105 | offenseTable = try(readHTMLTable(webElemtext, header = T,as.data.frame = T)[[1]])
106 | }
107 |
108 | offenseTable[,3:7] = apply(offenseTable[,3:7],2,function(x) as.numeric(x))
109 |
110 | bigTable = merge(bigTable,offenseTable,by = "Team",all = T)
111 |
112 |
113 |
114 | #Detailed
115 | clicktype3 = remDr$findElement(using = "css selector", '#stage-team-stats-options > li:nth-child(4) > a:nth-child(1)')
116 | remDr$mouseMoveToLocation(webElement = clicktype3)
117 | clicktype3$click()
118 | webElem2 = remDr$findElement(using = "css selector", '#statistics-team-table-detailed')
119 | webElemtext = try(webElem$getElementAttribute("outerHTML")[[1]])
120 | detailedTable = try(readHTMLTable(webElemtext, header = T,as.data.frame = T)[[1]])
121 | while (class(detailedTable) == "try-error"){
122 | clicktype3 = remDr$findElement(using = "css selector", '#stage-team-stats-options > li:nth-child(4) > a:nth-child(1)')
123 | remDr$mouseMoveToLocation(webElement = clicktype3)
124 | clicktype3$click()
125 | webElem2 = remDr$findElement(using = "css selector", '#statistics-team-table-detailed')
126 | webElemtext = try(webElem$getElementAttribute("outerHTML")[[1]])
127 | detailedTable = try(readHTMLTable(webElemtext, header = T,as.data.frame = T)[[1]])
128 | }
129 | bigTable = merge(bigTable,detailedTable,by = "Team",all = T)
130 | inTable = rbind(inTable,bigTable)
131 | print(i)
132 | flush.console()
133 | }
134 | remDr$close()
135 | browseURL("http://localhost:4444/selenium-server/driver/?cmd=shutDownSeleniumServer")
136 |
137 | #Adding a Year column to the final table
138 | inTable$Year = rep(2015:2009, each=20, length.out = 140)
139 |
140 | ####Housekeeping! Again! I am about to join two tables based on their team names. Hence, I have to ensure that they are of the same format####
141 | inTable$Team[inTable$Team == "Birmingham City"] = "Birmingham"
142 | inTable$Team[inTable$Team == "Blackburn Rovers"] = "Blackburn"
143 | inTable$Team[inTable$Team == "Bolton Wanderers"] = "Bolton"
144 | inTable$Team[inTable$Team == "Cardiff City"] = "Cardiff"
145 | inTable$Team[inTable$Team == "Norwich City"] = "Norwich"
146 | inTable$Team[inTable$Team == "Swansea City"] = "Swansea"
147 | inTable$Team[inTable$Team == "Stoke City"] = "Stoke"
148 | inTable$Team[inTable$Team == "Tottenham Hotspur"] = "Tottenham"
149 | inTable$Team[inTable$Team == "West Ham United"] = "West Ham"
150 | inTable$Team[inTable$Team == "Wigan Athletic"] = "Wigan"
151 |
152 | #Removing trailing and leading spaces in the team names
153 | inTable$Team = gsub("^\\s+|\\s+$", "",inTable$Team)
154 |
155 | #Merge final tables from skysports and whoscored together and remove duplicate columns
156 | Datasetfinal = merge(inTable,datasetLagged,by = c("Year","Team"),all = T)
157 | Datasetfinal <- Datasetfinal[, !duplicated(colnames(Datasetfinal), fromLast = TRUE)]
158 |
159 | #Converting the percentages to numbers
160 | Datasetfinal$Left.Side = as.numeric(str_replace(Datasetfinal$Left.Side,"%",""))/100
161 | Datasetfinal$Middle.of.the.pitch = as.numeric(str_replace(Datasetfinal$Middle.of.the.pitch,"%",""))/100
162 | Datasetfinal$Right.Side = as.numeric(str_replace(Datasetfinal$Right.Side,"%",""))/100
163 |
164 | #Removing not so useful columns
165 | Datasetfinal$R.x = NULL
166 | Datasetfinal$Rating.x = NULL
167 | Datasetfinal$R.y = NULL
168 | Datasetfinal$Rating.y = NULL
169 | Datasetfinal$Pl = NULL
170 |
171 | #Convert column types. Again.
172 | Datasetfinal[,3:37] = apply(Datasetfinal[,3:37],2,function(x) as.numeric(x))
173 |
174 | #Now, we have our data!
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/scripts/Billionaires - Compressed.R:
--------------------------------------------------------------------------------
1 | library(RSelenium)
2 | library(rvest)
3 | library(stringr)
4 | library(dplyr)
5 | library(lubridate)
6 | library(readr)
7 | library(purrr)
8 | library(lime)
9 |
10 | #All Functions
11 | get_historical_ranks = function(year){
12 | year_update = ifelse(year <10,paste0(0,year),year)
13 | url = paste0("http://stats.areppim.com/listes/list_billionairesx",year_update,"xwor.htm")
14 |   tbl=url %>%
15 | read_html() %>%
16 | html_node("body > table") %>%
17 | html_table(trim = T)
18 | names(tbl) = tbl[2,]
19 | tbl = tbl[3:nrow(tbl),]
20 | tbl = tbl[,-6]
21 | names(tbl) = c("Rank","Name","Citizenship","Age","Net Worth")
22 | tbl=tbl %>% mutate(Year= paste0(20,year_update))
23 | }
24 |
25 | joining_year = function(row){
26 | yr=names(full_list)[(which(is.na(full_list[row,3:13])==F))[1]+2]
27 | return(yr)
28 | }
29 |
30 | leavers = function(row){
31 | total =0
32 | for(i in 1:(length(row)-1)){
33 | if(is.na(as.character(row[i+1])) ==T && is.na(as.character(row[i])) ==F ){
34 | total = total+1
35 | }
36 | }
37 | return(total)
38 | }
39 |
40 | comers = function(row){
41 | total =0
42 | first_time = min(which(is.na(row)==F))
43 |
44 | if (first_time>=10){
45 | return(0)
46 | }
47 | else{
48 | for(i in first_time:(length(row)-1)){
49 | if(is.na(as.character(row[i+1])) ==F && is.na(as.character(row[i])) ==T ){
50 | total = total+1
51 | }
52 | }
53 | }
54 | return(total)
55 | }
56 |
57 | max_distance = function(row){
58 | lst=which(is.na(row)==F)
59 | lt = NULL
60 | if( length(lst)==1){
61 | return(0)
62 | }
63 | for (i in 1:(length(lst)-1)){
64 | h= lst[i+1]-(lst[i]+1)
65 | lt = append(lt,h)
66 | }
67 | return(max(lt))
68 | }
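#Worked example (illustrative, not in the original script): for year markings
#c("X","X",NA,NA,"X",NA,"X"), leavers() counts the X -> NA transitions (2),
#comers() counts the NA -> X returns after the first appearance (2), and
#max_distance() is the longest run of NAs between two appearances (2).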
69 |
70 |
71 | last_year_on_list = function(row){
72 | years_onList=which(is.na(row)==F)
73 | year_left = names(full_list[,5:15])[max(years_onList)]
74 | return(year_left)
75 |
76 | }
77 | last_year_left = function(row){
78 | lst=NULL
79 | if (length(which(is.na(row)==T))==0){
80 | return ("")
81 | }
82 | else{
83 | for (i in 2:length(row)){
84 | if (is.na(row[i])==T&is.na(row[i-1])==F){
85 | lst = append(lst,i)
86 | }
87 | }
88 | year_left = names(full_list[,5:15])[max(lst)]
89 | return(year_left)
90 | }
91 | }
92 |
93 | #Make the names into URLs
94 | names_to_URLs = function(name){
95 | name = as.character(name)
96 | name = str_trim(name)
97 | name=str_replace_all(name,"[^0-9A-Za-z\\- ]",'')
98 | name=str_replace_all(name,"\\s+",' ')
99 | name = str_replace_all(name," ","-")
100 | name=tolower(name)
101 | url = paste0("https://www.forbes.com/profile/",name,"/")
102 | return(url)
103 | }
104 |
105 | #Making a function to display name backwards e.g Joy Jones as Jones Joy
106 | #This will come in handy when getting the billionaire's info
107 | backward_names = function(name){
108 | name = unlist(str_split(name," "))
109 | name = rev(name)
110 | name = paste(name,collapse = ' ')
111 | name = str_trim(name)
112 | name=names_to_URLs(name)
113 | return(name)
114 | }
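#For illustration (not in the original script): names_to_URLs("Bill Gates")
#returns "https://www.forbes.com/profile/bill-gates/", while
#backward_names("Joy Jones") returns "https://www.forbes.com/profile/jones-joy/",
#the reversed form tried when scraping the forward-name URL fails.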
115 |
116 |
117 | get_billionaire_info = function(names_of_billionaires){
118 | url=names_to_URLs(names_of_billionaires)
119 | data=try(url %>%
120 | read_html %>%
121 | html_nodes(".stats li") %>%
122 | html_text(trim=T)
123 | ,silent = T)
124 | if(class(data) =="try-error"){
125 | url = backward_names(names_of_billionaires)
126 | data=try(url %>%
127 | read_html %>%
128 | html_nodes(".stats li") %>%
129 | html_text(trim=T)
130 | ,silent = T)
131 | }
132 | sector = ifelse(length(data[grepl("Source of Wealth",data,T)])>0,data[grepl("Source of Wealth",data,T)],NA)
133 | education = ifelse(length(data[grepl("Education",data,T)])>0,data[grepl("Education",data,T)],NA)
134 | sector = str_replace_all(sector,".*\t|.*\n","")
135 | education = str_replace_all(education,".*\t|.*\n","")
136 | return(c(sector,education))
137 | }
138 |
139 | education_columns_creator=function(df){
140 | df$Self_Made = ifelse(grepl("self made",df$detail1,ignore.case = T),"Y","N")
141 | df$dropped_out = ifelse(grepl("drop out",df$detail2,ignore.case = T),"Y",ifelse(is.na(df$detail2),NA,"N"))
142 | df$bachelors_degree = ifelse(grepl("bachelor",df$detail2,ignore.case = T),"Y",ifelse(is.na(df$detail2),NA,"N"))
143 | df$masters_degree = ifelse(grepl("master of arts|master of science",df$detail2,ignore.case = T),"Y",ifelse(is.na(df$detail2),NA,"N"))
144 | df$MBA = ifelse(grepl("Master of Business Administration",df$detail2,ignore.case = T),"Y",ifelse(is.na(df$detail2),NA,"N"))
145 | df$phd_or_professional_degree = ifelse(grepl("doctor|llb",df$detail2,ignore.case = T),"Y",ifelse(is.na(df$detail2),NA,"N"))
146 | return(df)
147 | }
148 |
149 | #Extract Model ID from model
150 | getID = function(model){
151 | ID=model@model_id
152 | return(ID)
153 | }
154 |
155 | #Function to check validation set accuracy
156 | test_accuracy = function(model){
157 | table_accuracy=h2o.hit_ratio_table(model,valid = T)[1,2]
158 | return(table_accuracy)
159 | }
160 |
161 |
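#The two functions below are S3 methods that the lime package dispatches on:
#model_type() declares the H2O multinomial model as a classifier and
#predict_model() tells lime how to get class probabilities out of it.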
162 | model_type.H2OMultinomialModel = function(x, ...) {
163 | return("classification")
164 |
165 | }
166 |
167 | predict_model.H2OMultinomialModel = function(x, newdata, type, ...) {
168 | # Function performs prediction and returns dataframe with Response
169 | #
170 | # x is h2o model
171 | # newdata is data frame
172 | # type is only setup for data frame
173 |
174 | pred <- h2o.predict(x, as.h2o(newdata))
175 |
176 | # return classification probabilities only
177 | return(as.data.frame(pred[,-1]))
178 |
179 | }
180 |
181 | #List of billionaires from 2007-2017
182 |
183 | years = 7:17
184 |
185 |
186 | full_list = lapply(years,get_historical_ranks)
187 | full_list = do.call("rbind",full_list)
188 |
189 | #At this point I had to do some Excel work to harmonize the names e.g. in some years Bill Gates was listed as William Gates III
190 | #This is where I spent most of my time
191 | #Mark X for each billionaire in each year (this will come in handy later)
192 | full_list = full_list %>% mutate(markings= "X")
193 |
194 | #Three billionaires have exactly the same names - Wang Wei, Robert Miller and Jim Davis
195 | full_list[9865,2] = "Jim Davis, new balance"
196 | full_list[12711,2] ="Jim Davis, recruitment"
197 | full_list[11497,2] = "Jim Davis, new balance"
198 | full_list[11030,2] ="Jim Davis, recruitment"
199 | full_list[9783,2] = "Wang Wei, delivery"
200 | full_list[10981,2] = "Wang Wei, computer hardware"
201 |
202 | #Use spread to make the year columns - the markings are useful here
203 | full_list=full_list %>%
204 | select(Name_updated,Citizenship,Year,markings) %>%
205 | spread(Year,markings)
206 |
207 | #Create year billionaire joined
208 |
209 | full_list$joining_year = unlist(lapply(1:nrow(full_list),joining_year))
210 |
211 | #No of times the billionaire has been on the list
212 | full_list=full_list %>%
213 | mutate(no_of_times_on_list =rowSums(is.na(full_list[,3:13])==F))
214 |
215 | #No of times billionaires left
216 |
217 |
218 | full_list$no_of_times_left =apply(full_list[,3:13],1,leavers)
219 |
220 | #No of times billionaires have come back to the list
221 |
222 |
223 |
224 | full_list$no_of_comebacks=apply(full_list[,3:13],1,comers)
225 |
226 | #Longest time away from the list
227 |
228 | full_list$longest_time_away = apply(full_list[,3:13],1,max_distance)
229 |
230 | #Last year on list and last year off it
231 |
232 | full_list$last_off_list = apply(full_list[,5:15],1,last_year_left)
233 |
234 |
235 | #Make the names into URLs
236 |
237 |
238 | #Making a function to display name backwards e.g Joy Jones as Jones Joy
239 | #This will come in handy when getting the billionaire's info
240 |
241 | #Get the billionaires info
242 |
243 |
244 | #Setting up the columns for the billionaires' details
245 | full_list$detail1 =""
246 | full_list$detail2 =""
247 |
248 | #I opted for a for loop as opposed to the sexy apply function as this gives me the ability
249 | #to store intermediate results
250 | for (i in 1:nrow(full_list)){
251 | dat=get_billionaire_info(full_list$Name[i])
252 | #dat = list(dat)
253 | full_list$detail1[i]=dat[1]
254 | full_list$detail2[i]=dat[2]
255 | print(paste0(round((i/nrow(full_list))*100,2),"% done at ",i))
256 | }
257 |
258 | #Create columns for educational information
259 |
260 | full_list = education_columns_creator(full_list)
261 | full_list$detail1 = unlist(full_list$detail1)
262 | full_list$detail2 = unlist(full_list$detail2)
263 |
264 | #Data Munging complete! :)
265 |
266 | #Clusters were made in Tableau using Sets and imported back into the csv file
267 | #================================Machine Learning======================================#
268 | library(h2o)
269 | library(plyr)
270 | library(dplyr)
271 | library(purrr)
272 | library(scales)
273 | library(lime)
274 | #Initialize H2o cluster
275 | h2o.init()
276 |
277 | dat1 = full_list %>%
278 | filter(Cluster !="The Newbie")
279 |
280 | data = dat1 %>%
281 | select(-Detail1,-Detail2) %>%
282 | as.h2o()
283 |
284 | splits = h2o.splitFrame(data,ratios = c(0.6,0.2),destination_frames = c("train", "valid", "test"), seed = 1234)
285 | train = h2o.getFrame("train")
286 | val = h2o.getFrame("valid")
287 | test = h2o.getFrame("test")
288 |
289 |
290 |
291 | #Column index numbers
292 | features=c(5,7,9,18,25,26,28)
293 | #c(4:11,18:19,25:26,28)
294 |
295 | #
296 | #2,4:11,18:19,25:26,29
297 | response=27
298 |
299 | #Models we would like to train and test the accuracy
300 | models = c("h2o.randomForest", "h2o.deeplearning" ,"h2o.gbm")
301 | names_of_models = c("Random Forest", "Deep Learning" ,"GBM")
302 |
303 | #The map function will invoke the functions in the "models" vector on the parameters below
304 | list_of_models =invoke_map(models, x=features,y=response,training_frame =train,validation_frame = val)
305 |
306 |
307 | #Store IDs for retrieval later
308 | IDs = list_of_models %>%
309 | map_chr(getID)
310 |
311 |
312 | #check accuracy on validation set
313 | list_of_models %>%
314 | map_dbl(test_accuracy) %>%
315 | set_names(.,names_of_models)
316 | #GBM performed best so let's compute that variable importance
317 | model_gbm = h2o.getModel(IDs[3])
318 | h2o.varimp_plot(h2o.getModel(IDs[3]))
319 | #Store variable importance in csv for visualization
320 | var_imp_rf=h2o.varimp(model_gbm)
321 | write.csv(var_imp_rf,"Variable Importance GBM.csv",row.names = F)
322 | model_gbm=h2o.loadModel("C:/Users/rose.anwuri/Documents/TheArtandScienceofData/Consitent Billionaire Guide/app/GBM_Model")
323 | features_lime=model_gbm@parameters$x
324 | train_lime = train[,c(features_lime,"Cluster")] %>% h2o.na_omit()
325 | test_lime = val[,c(features_lime,"Cluster")]%>% h2o.na_omit()
326 |
327 | predict_model(x = model_gbm, newdata =test_lime[,-9], type = 'raw') %>%
328 | tibble::as_tibble()
329 | train_lime_df=as.data.frame(train_lime[,-9])
330 | train_lime_df=train_lime_df[complete.cases(train_lime_df),]
331 |
332 | explainer <- lime::lime(
333 | train_lime_df,
334 | model = model_gbm,
335 | bin_continuous = FALSE)
336 | test_lime_df = as.data.frame(test_lime) %>% filter(Cluster=="The Hustler") %>%as_tibble()
337 | test_lime_df=test_lime_df[complete.cases(test_lime_df),]
338 | explanation <- lime::explain(
339 | as.data.frame(test_lime_df[,-9]),
340 | explainer = explainer,
341 | n_labels = 1,
342 | n_features = 4,
343 | kernel_width = 0.5)
344 | plot_features(explanation)
345 |
--------------------------------------------------------------------------------
/Twitter Sentiment Analysis/scripts/Sentiment Analysis - Telcos.R:
--------------------------------------------------------------------------------
1 | library(twitteR)
2 | library(base64enc)
3 | library(RCurl)
4 | library(httr)
5 | library(RJSONIO)
6 | library(stringr)
7 | library(tm)
8 | library(SnowballC)
9 | library(wordcloud)
10 | library(dplyr)
11 | library(tidytext)
12 |
13 | ####This has been removed for privacy####
14 | api_key <- "xxx" # From dev.twitter.com
15 | api_secret <- "xxx" # From dev.twitter.com
16 | token <- "xxx" # From dev.twitter.com
17 | token_secret <- "xxx" # From dev.twitter.com
18 | ####This has been removed for privacy####
19 | # Create Twitter Connection
20 | setup_twitter_oauth(api_key, api_secret, token, token_secret)
21 | set_config(config (ssl_verifypeer= 0L))
22 | date_time = Sys.Date()-10
23 | tweetData = NULL
24 |
25 | #The search words change for each of the telcos; I also exclude tweets from the companies' own accounts
26 | #I had to search for keywords like data, call and network alongside the telco names to avoid getting stuff like "mtn bike", but some still slipped through!
27 | while (date_time <= Sys.Date()+1){
28 | date_time = as.character.Date(date_time)
29 | tweetdetails <- searchTwitter("etisalat network -from:etisalat_9ja", n=1500, lang="en", since = "2016-05-03",until= date_time, geocode = '10,9, 200km')# Transform tweets list into a data frame
30 | tweetData = append(tweetData, tweetdetails)
31 | date_time = as.Date(date_time) +1
32 | print(date_time)
33 | flush.console()
34 | }
35 |
36 | tweetData_network = twListToDF(tweetData)
37 |
38 | date_time = Sys.Date()-10
39 | tweetData = NULL
40 | while (date_time <= Sys.Date()+1){
41 | date_time = as.character.Date(date_time)
42 | tweetdetails <- searchTwitter("etisalat data -from:etisalat_9ja", n=1500, lang="en", since = "2016-05-03",until= date_time, geocode = '10,9, 200km')# Transform tweets list into a data frame
43 | tweetData = append(tweetData, tweetdetails)
44 | date_time = as.Date(date_time) +1
45 | print(date_time)
46 | flush.console()
47 | }
48 | tweetData = twListToDF(tweetData)
49 | tweetDataFinal = rbind(tweetData, tweetData_network)
50 |
51 | date_time = Sys.Date()-10
52 | tweetData = NULL
53 | while (date_time <= Sys.Date()+1){
54 | date_time = as.character.Date(date_time)
55 | tweetdetails <- searchTwitter("etisalat call -from:etisalat_9ja", n=1500, lang="en", since = "2016-05-03",until= date_time, geocode = '10,9, 200km')# Transform tweets list into a data frame
56 | tweetData = append(tweetData, tweetdetails)
57 | date_time = as.Date(date_time) +1
58 | print(date_time)
59 | flush.console()
60 | }
61 | tweetData = twListToDF(tweetData)
62 | tweetDataFinal = rbind(tweetData, tweetDataFinal)
63 | View(tweetDataFinal)
64 |
65 | tweetDataFinal$text = as.character(tweetDataFinal$text)
66 | #Just to make sure the twitter search only picked tweets that are actually about the telco (etisalat in this run)
67 | tweetDataFinal = subset(tweetDataFinal,grepl('etisalat',text,ignore.case = T))
68 |
69 | #We shouldn't be analyzing tweets that the telco itself tweeted, right?
70 | tweetDataFinal = subset(tweetDataFinal,!grepl('etisalat',screenName,ignore.case = T))
71 |
72 | #So we have emojis to deal with, and because they are beyond basic UTF-8 they appear...well...weird
73 | #One option is to remove them completely
74 | #But I'd be losing a lot of information, right? Because emojis show sentiment
75 | #So I'd rather do some housekeeping and deal with this
76 | #An awesome human compiled a list of emoji bytes, UTF-8 characters and their descriptions on GitHub. Shout-out to Jessica Peterka-Bonetta!
77 |
78 | emoticons = read.csv("https://raw.githubusercontent.com/today-is-a-good-day/Emoticons/master/emDict.csv",sep = ";")
79 | emoticons$Native = NULL
80 |
81 | #Ensuring the text is in the native R encoding
82 | tweetDataFinal$text = enc2native(tweetDataFinal$text)
83 |
84 | #Remove all the U+00 and leave the R encoding only
85 | tweetDataFinal$text = tolower(str_replace_all(tweetDataFinal$text,"U\\+00",''))
86 |
87 | #The function below finds the first emoji in a tweet and puts it in a new column.
88 | #The idea behind picking only one emoji comes from the assumption that one emoji out of the many that people put in a tweet is enough to find the sentiment of that tweet
89 | #In fact, a lot of the time emojis are just repetitions or are conveying the same emotion
90 | extract_emojis = function(df){
91 | df$emoticon = ''
92 | pt = txtProgressBar(min = 0, max=nrow(df),initial = 0)
93 | for (i in 1:nrow(df)){
94 |     if (str_count(df$text[i],"<")>0){
95 | emoji_placeholder = " nchar(df$text[i])){
101 | break
102 | }
103 | }
104 |
105 | df$emoticon[i] = emoji_placeholder
106 | }
107 |
108 | setTxtProgressBar(pt,i)
109 | }
110 |
111 | return (df)
112 | }
113 |
114 | tweetDataFinal = extract_emojis(tweetDataFinal)
115 |
116 | #Removing trailing and leading white space
117 | tweetDataFinal$emoticon = gsub("^\\s+|\\s+$", "",tweetDataFinal$emoticon)
118 |
119 | #By spot checking (really trial and error), I see that two emoticons do not exist in the emoticon csv, hence the loop searches to the end of the tweet
120 | #and because it checks till the end of the tweet, it picks up some stuff like hashtags, links and the like. This function removes everything after
121 | #the last occurrence of ">" in the emoticon column
122 | tweetDataFinal$emoticon=str_replace(tweetDataFinal$emoticon,"(?<=>)[^>]*$",'')
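#e.g. (illustrative) an emoticon value like "<...><...>#somehashtag" is trimmed
#back to "<...><...>", keeping only the encoded emoji and dropping the trailing junk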
123 |
124 | #Get the description of the emoticons by merging the emoticons table with the tweets dataframe, tweetDataFinal
125 | tweetDataFinal = merge(tweetDataFinal,emoticons,by.x = "emoticon",by.y = "R.encoding",all.x = T)
126 |
127 |
128 | #We see that some emoticons (two or three if we look carefully) have no description for them.
129 | #Let's take a closer look at them
130 |
131 | exploreUnknown = subset(tweetDataFinal,is.na(Description) & emoticon!='')
132 |
133 | #Nothing particularly strange aside from the fact that they are repetitions of the same tweet by the same person at the same time
134 | #These will be removed anyway.
135 |
136 | #I'm going to remove mentions and @s in the tweet because they add no value to our analysis
137 | tweetDataFinal$text = str_replace_all(tweetDataFinal$text,'@\\S+',"")
138 |
139 | #Also removing RT because that information is already captured in the column isRetweet
140 | tweetDataFinal$text=str_replace_all(tweetDataFinal$text,'rt ','')
141 |
142 | #Removing links from the tweets because again, there is no feasible way that this will make our analysis better
143 | tweetDataFinal$text = gsub('http\\S+\\s*', "", tweetDataFinal$text)
144 |
145 | #Let's not forget to remove the R encoding since we have extracted the sentiment we needed from it
146 | tweetDataFinal$text = gsub("<.*>",'',tweetDataFinal$text)
147 |
148 | #Now, it's safe to remove all other non-alphanumeric characters
149 | tweetDataFinal$text = gsub("[^0-9A-Za-z/// ]", "", tweetDataFinal$text, T)
150 |
151 | #If you look carefully, there are some duplicate tweets from the exact same person at the same time. This happens sometimes on twitter when a person tweets once and
152 | #for some network reasons, it is duplicated (but 6 times though? Weird.)
153 | #Furthermore, a lot of duplicate tweets are probably promotional tweets
154 | #The first step is to pull out all the tweets that are not retweets and then remove duplicates
155 | notRetweets = subset(tweetDataFinal,isRetweet == F)
156 | notRetweets = notRetweets[!duplicated(notRetweets["text"]),]
157 |
158 | #I noticed something pretty weird with the retweeted tweets. I found duplicates of the same person retweeting the same thing within the same second
159 | #Pretty dodgy, eh? So I factored that in and removed the duplicates
160 | Retweet = subset(tweetDataFinal,isRetweet == T)
161 | Retweets = Retweet[!duplicated(Retweet[c("text","screenName")]),]
162 |
163 | #Let's just put these two datasets together
164 | tweetDataFinal = rbind(Retweets,notRetweets)
165 |
166 |
167 | #######SPECIFIC TO MTN#########
168 | #Are we done cleaning now? Not quite but we are almost there. Apparently people spell mountain as mtn.
169 | #Natural language. Sigh. So just in case any of them passed through the network, call, data filter, let's remove them
170 | tweetDataFinal = subset(tweetDataFinal,!grepl("dew",text))
171 | tweetDataFinal = subset(tweetDataFinal,!grepl("bike",text))
172 | #######SPECIFIC TO MTN#########
173 |
174 | #######SPECIFIC TO GLO#########
175 | tweetDataFinal = subset(tweetDataFinal,!grepl("glo up",text))
176 | tweetDataFinal = subset(tweetDataFinal,!grepl("glo'd",text))
177 | tweetDataFinal = subset(tweetDataFinal,!grepl("glo gang",text))
178 | tweetDataFinal = subset(tweetDataFinal,!grepl("glo ing",text))
179 | tweetDataFinal = subset(tweetDataFinal,!grepl("gloing",text))
180 | tweetDataFinal = subset(tweetDataFinal,!grepl("glogang",text))
181 | tweetDataFinal = subset(tweetDataFinal,!grepl("lil glo",text))
182 | #######SPECIFIC TO GLO#########
183 |
184 | #Now let's add the description of our emoticons to the tweets
185 | #First, let's remove those pesky NAs in the description column and convert them to empty strings
186 | tweetDataFinal$Description = apply(data.frame(tweetDataFinal$Description),1,function(x) ifelse(is.na(x),"",x))
187 |
188 | #Now we are going to concatenate the tweets and emoji description
189 | tweetDataFinal$text = paste(tweetDataFinal$text,tolower(tweetDataFinal$Description))
190 |
191 | # I assume that because some descriptions were empty we would have some trailing spaces.
192 | #Let's remove trailing and leading spaces one last time, just in case.
193 | tweetDataFinal$text = gsub("^\\s+|\\s+$", "",tweetDataFinal$text)
194 |
195 | #And we are done! Now for some sentiment analysis!
196 |
197 | #From the tidytext package, we will use the AFINN lexicon which scores words between -5 and 5 based on
198 | #negative or positive sentiment
199 | lexicon = sentiments %>% filter(lexicon == "AFINN")
200 |
201 | #This function uses the AFINN lexicon: we loop through the words of a tweet and score them accordingly
202 | #It also takes into account when the word 'not' is in a sentence. If the word 'not' is there,
203 | #I need to look at the next word and check if that "not + word" pair exists in the lexicon
204 | #If it does not, I check if the second word alone exists in the lexicon and simply
205 | #reverse the sign of its score e.g. "not great": "great" exists with a score of 3, so with this function
206 | #"not great" gets a score of -3. Pretty awesome, eh?
207 | sentiment_score = function(sentence) {
208 | score = 0
209 | words = str_split(sentence, " ")
210 | words = unlist(words)
211 | for (i in 1:length(words)) {
212 | if (words[i] == "not") {
213 | word = paste("not", words[i + 1])
214 | word.split = unlist(str_split(word, " "))
215 | if (word %in% lexicon$word == T) {
216 | score = score + lexicon$score[lexicon$word == word]
217 |
218 | }
219 | else if (word.split[2] %in% lexicon$word == T) {
220 | score = score - lexicon$score[lexicon$word == word.split[2]]
221 |
222 | }
223 |
224 |
225 | }
226 | else if (i > 1 && words[i - 1] == "not")
227 | next
228 |
229 | else if (words[i] %in% lexicon$word == T) {
230 | score = score + lexicon$score[lexicon$word == words[i]]
231 |
232 | }
233 |
234 |
235 | }
236 | return (score)
237 |
238 | }
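#Quick illustrative check of the negation rule described above (not part of the
#original script; assumes the AFINN lexicon is loaded into `lexicon` as before)
sentiment_score("the network is great")     # "great" is +3 in AFINN, so returns 3
sentiment_score("the network is not great") # sign flipped by the "not" rule, returns -3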
239 |
240 | #Let's apply this function to every row in the data. So much faster than a for loop!
241 | tweetDataFinal$score=apply(data.frame(tweetDataFinal$text),1,sentiment_score)
242 |
243 | #A Histogram of the scores
244 | hist(tweetDataFinal$score[tweetDataFinal$score !=0])
245 |
246 | table(tweetDataFinal$score[tweetDataFinal$score !=0]>0)
247 | median(tweetDataFinal$score[tweetDataFinal$score !=0])
248 | tweetDataFinal$Telecoms = rep("Etisalat",nrow(tweetDataFinal))
249 | write.csv(tweetDataFinal,'etisalat.csv',row.names = F)
250 |
251 |
252 |
253 | #Let's create a term-frequency document using the tm package by creating a corpus of all the words in our data
254 |
255 | corpus = Corpus(VectorSource(tweetDataFinal$text[tweetDataFinal$score !=0]))
256 | corpus = tm_map(corpus, PlainTextDocument)
257 | corpus = tm_map(corpus, removeNumbers)
258 | corpus = tm_map(corpus, content_transformer(stripWhitespace))
259 | corpus = tm_map(corpus, content_transformer(tolower))
260 | corpus = tm_map(corpus, removeWords, c("etisalat","9ja","etisalat_9ja","vodacom","nigeria","home","sleep","start","tweet","phone","aaaaay","face","datamustfall","datamustfal","network","data","call","amp"))
261 | corpus = tm_map(corpus, stemDocument)
262 | corpus = tm_map(corpus, removeWords, stop_words$word)
263 | corpus = tm_map(corpus, removeWords, removePunctuation(stop_words$word))
264 | corpus = tm_map(corpus, removePunctuation)
265 |
266 | frequencies = DocumentTermMatrix(corpus)
267 |
268 | frequencies.common = removeSparseTerms(frequencies,0.9996)
269 |
270 | etisalatTweets = as.data.frame(as.matrix(frequencies.common))
271 |
272 | termFreq = data.frame(word = colnames(etisalatTweets),frequency = colSums(etisalatTweets))
273 |
274 | wordcloud(colnames(etisalatTweets), colSums(etisalatTweets), scale = c(4, 0.5),colors = 'darkgreen')
275 |
276 |
--------------------------------------------------------------------------------
/Friends Analysis Laughter Prediction/canned_laughter_detection.R:
--------------------------------------------------------------------------------
1 | library(parsnip)
2 | library(rsample)
3 | library(tidyverse)
4 | library(recipes)
5 | library(ggthemr)
6 | library(DALEX)
7 | library(subtools) #devtools::install_github("fkeck/subtools")
8 | library(httr)
9 | library(jsonlite)
10 | library(rvest)
11 | library(stringdist)
12 | laughter_transcribed <- read_csv('Laughter_Detection.csv')
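#Each row of the transcription marks a laughter interval (Start-End, in seconds);
#the step below expands it into one row per second so it can be joined onto the
#per-second audio features later.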
13 | laughter_transcribed=laughter_transcribed%>%
14 | rowwise()%>%
15 | mutate(seconds=paste0(seq(Start,End),collapse = ','))%>%
16 | separate_rows(seconds,sep = ',')%>%
17 | mutate(seconds=as.numeric(seconds))
18 |
19 | season_one_audio=read_csv('season_one_audio.csv')
20 | season_one_audio=season_one_audio%>%
21 | select(-X1,-`Unnamed: 0`)%>%
22 | mutate(seconds=seconds-1)
23 |
24 | audio_training_data <- season_one_audio%>%
25 | filter(Episode=='E01')%>%
26 | mutate(seconds=seconds-1)%>%
27 | left_join(laughter_transcribed)%>%
28 | filter(seconds %
29 | mutate(label=ifelse(is.na(File),0,1))%>%
30 | select(-File,-Start,-End,-Length,-`Talk Over`,-seconds)
31 |
32 | audio_training_data <- select_if(audio_training_data,is.numeric)
33 |
34 |
35 | audio_training_data=audio_training_data%>%
36 | mutate(label=as.factor(label))
37 |
38 | data_split <- initial_split(audio_training_data, prop = 0.7)
39 |
40 | train=data_split %>%
41 | training()
42 |
43 | test=data_split %>%
44 | testing()
45 |
46 |
47 | model1=svm_poly( mode = "classification")%>%
48 | set_engine("kernlab") %>%
49 | parsnip::fit(label~., data = train)
50 |
51 | model2=mlp( mode = "classification")%>%
52 | set_engine("nnet") %>%
53 | parsnip::fit(label~., data = train)
54 |
55 | model3=rand_forest( mode = "classification")%>%
56 | set_engine("randomForest") %>%
57 | parsnip::fit(label~., data = train)
58 |
59 |
60 | model4=boost_tree( mode = "classification")%>%
61 | set_engine("xgboost") %>%
62 | parsnip::fit(label~., data = train)
63 |
64 | model5=logistic_reg( mode = "classification")%>%
65 | set_engine("glm") %>%
66 | parsnip::fit(label~., data = train)
67 |
68 | model6=rand_forest( mode = "classification")%>%
69 | set_engine("ranger") %>%
70 | parsnip::fit(label~., data = train)
71 |
72 |
73 | custom_predict_classif <- function(objectPred, set){
74 | as.data.frame(predict(objectPred, set, type = "prob"))[,2]
75 | }
76 |
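#DALEX::explain() wants a numeric 0/1 response and a predict function that
#returns the positive-class probability, which is why custom_predict_classif
#above takes the second probability column (.pred_1 for these parsnip fits) and
#build_explainer below coerces the label column to numeric.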
77 | build_explainer <- function(model,test,label){
78 | explainer <- DALEX::explain(model, data=test, y=pull(test,label)%>%as.character()%>%as.numeric(), label =label,
79 | predict_function = custom_predict_classif, colorize = FALSE,verbose=FALSE)
80 | return(explainer)
81 |
82 | }
83 |
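#Despite the name, the helper below computes plain accuracy, precision, recall
#and F1 from the 2x2 confusion matrix of actual label vs. predicted class.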
84 | get_weighted_precision <- function(model,data){
85 | data$prediction=predict(model,data)%>%pull()
86 | #data$prediction=ifelse(data$prediction>=0.5,'Uplift','Drop')
87 | model_name=model$spec$engine
88 | #f1_score=data%>%mutate_if(is.character,as.factor)%>%f_meas(revenue_success,prediction)%>%pull(.estimate)
89 | conf=table(data$label,data$prediction)
90 | res=conf[2,2]/(conf[2,2]+conf[1,2])
91 | recall=conf[2,2]/(conf[2,2]+conf[2,1])
92 | res1=(conf[2,2]+conf[1,1])/(conf[2,2]+conf[1,2]+conf[1,1]+conf[2,1])
93 | f1_score=2*((res*recall)/(res+recall))
94 | df=tibble(name=model_name,accuracy=100*res1,precision=100*res,f1_score=100*f1_score,recall=100*recall)
95 | return(df)
96 | }
97 |
98 |
99 | compare_explainers=pmap(list(
100 | model=list(model1,model2,model3,model4,model5,model6),
101 | test=list(test),
102 | label=c('svm_poly','nnet','rf','xgboost','glm','ranger')
103 | ),build_explainer)
104 |
105 | do.call('plot',c(map(compare_explainers,DALEX::model_performance),list(geom='roc')))+
106 | ggtitle('AUC for Test')
107 |
108 |
109 | test_results=map2_dfr(list(model1,model2,model3,model4,model5,model6),list(test),get_weighted_precision)
110 |
111 | test_results%>%
112 | arrange(-f1_score)
113 |
114 | canned_laughter_detection_model=model3
115 | saveRDS(canned_laughter_detection_model,'canned_laughter_detection_model')
116 |
117 |
118 | subtitles=list.files('S1')
119 | subtitles=subtitles[str_detect(subtitles,'SAiNTS')]
120 | subtitles=subtitles[c(1:15,17:23)]
121 | subtitles[23:24]=c('Friends - 1x16 - The One With Two Parts (1).en.srt','Friends - 1x17 - The One With Two Parts (2).en.srt')
122 | subtitles=sort(subtitles)
123 | read_subtitles_custom<- function(file_path){
124 | path <- paste0('S1/',file_path)
125 |
126 | season_ep=str_extract(file_path,'\\dx\\d{2}')%>%str_split('x',simplify = T)%>%as.vector()
127 | season=as.numeric(season_ep[1])
128 | episode=as.numeric(season_ep[2])
129 | subs=read_subtitles(path)
130 | subs <- subs%>%
131 | mutate(season=season,
132 | epsiode=episode)
133 | return(subs)
134 |
135 | }
136 |
137 | subtitles_df=map_dfr(subtitles,read_subtitles_custom)
138 |
139 | scripts_links=paste0('https://fangj.github.io/friends/season/01',sprintf('%02d', seq_len(24)),'.html')
140 | read_scripts <- function(url){
141 | script_text=url%>%
142 | read_html%>%
143 | html_nodes('p[align="left"],p')%>%
144 | html_text()
145 | script_df=tibble(script_text)%>%
146 | filter(!str_detect(script_text,'Written by:') & !script_text %in% c('End','Opening Credits','Closing Credits','Commercial Break'))%>%
147 | mutate(scene=ifelse(str_detect(script_text,'^\\['),script_text,NA))%>%
148 | fill(scene,.direction = 'down')%>%
149 | filter(!str_detect(script_text,'^\\[|^\\(|^\\{'))%>%
150 | mutate(speaker=str_extract(script_text,'.*?:'))%>%
151 | mutate(script_text=str_remove_all(script_text,speaker)%>%str_trim())%>%
152 | mutate(addressee=str_extract(script_text,'Monica|Joey|Chandler|Phoebe|Ross|Rachel'),
153 | addressee=ifelse(is.na(addressee),'All',addressee))%>%
154 | filter(!is.na(script_text))%>%
155 | mutate(speaker=str_remove_all(speaker,':'))%>%
156 | mutate(script_text=str_replace_all(script_text,'\n',' '))
157 | season_ep=str_extract(url,'\\d{4}')
158 | season=str_sub(season_ep,1,2)%>%as.numeric()
159 | episode=str_sub(season_ep,3,4)%>%as.numeric()
160 | script_df=script_df%>%
161 | mutate(season=season,
162 | episode=episode)
163 | return(script_df)
164 | }
165 | scripts_df=map_dfr(scripts_links,read_scripts)
166 |
167 | scripts_df=scripts_df%>%
168 | mutate(script_text=str_remove_all(script_text,'[:punct:]'))
169 |
170 | subtitles_df=subtitles_df%>%
171 | mutate(Text_content=str_remove_all(Text_content,'[:punct:]'))
172 | library(fuzzyjoin)
173 |
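#For each row index, the function below looks at the subtitle lines within a
#+/-5-row window, keeps the ones whose text appears verbatim in the script line
#at that index, and otherwise falls back to the closest subtitle by
#Damerau-Levenshtein distance (stringdist method 'dl').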
174 | script_to_subtitle_match <- function(sub_df,script_df,row){
175 | window=seq(row-5,row+5)
176 | window=window[window>0]
177 | matched=sub_df%>%
178 | slice(window)%>%
179 | mutate(check=str_detect(script_df$script_text[row],Text_content))%>%
180 | filter(check)%>%
181 | mutate(
182 | scene=script_df$scene[row],
183 | speaker=script_df$speaker[row],
184 | addressee=script_df$addressee[row],
185 | script_text=script_df$script_text[row]
186 | )
187 | if(nrow(matched)==0){
188 | matched=sub_df%>%
189 | slice(window)%>%
190 | mutate(check=stringdist(script_df$script_text[row],Text_content,method = 'dl'))%>%
191 | top_n(1,desc(check))%>%
192 | mutate(
193 | scene=script_df$scene[row],
194 | speaker=script_df$speaker[row],
195 | addressee=script_df$addressee[row],
196 | script_text=script_df$script_text[row]
197 | )
198 | }
199 | return(matched)
200 |
201 |
202 | }
203 |
204 | final=tibble()
205 | for(i in 1:24){
206 | print(paste('Running Episode',i))
207 | sub_df=subtitles_df%>%
208 | filter(epsiode==i)
209 | script_df=scripts_df%>%
210 | filter(episode==i)
211 | matched=map_dfr(seq_len(nrow(sub_df)),~script_to_subtitle_match(sub_df,script_df,.))
212 | final=final%>%bind_rows(matched)
213 | }
214 | season_one_text=final
215 | season_one_text=season_one_text%>%
216 | separate(Timecode_in,c('hour','minute','second'),sep = ':',remove = F)%>%
217 | mutate_at(3:5,as.numeric)%>%
218 | rowwise()%>%
219 |   mutate(Timecode_in=sum(hour*3600,minute*60,second))%>%
220 | select(-hour,-minute,-second)%>%
221 | separate(Timecode_out,c('hour','minute','second'),sep = ':',remove = F)%>%
222 | mutate_at(4:6,as.numeric)%>%
223 | rowwise()%>%
224 |   mutate(Timecode_out=sum(hour*3600,minute*60,second))%>%
225 | select(-check,-hour,-minute,-second)
226 |
227 | write_csv(season_one_audio,'season_one_audio.csv')
228 | write_csv(season_one_text,'season_one_text.csv')
229 | season_one_text=read_csv('season_one_text.csv')
230 | season_one_audio=read_csv('season_one_audio.csv')
231 | canned_laughter_detection_model=readRDS('canned_laughter_detection_model')
232 | season_one_audio$laughter=predict(canned_laughter_detection_model,season_one_audio)%>%pull(.pred_class)
233 | season_one_text=season_one_text%>%
234 | distinct(Timecode_in,Timecode_out,season,epsiode,scene,speaker,addressee,script_text)%>%
235 | group_by(epsiode,season,scene,speaker,addressee,script_text)%>%
236 | summarise(Timecode_in=min(Timecode_in),Timecode_out=max(Timecode_out))%>%
237 | ungroup()%>%
238 | arrange(epsiode,Timecode_in)
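# The distinct()/group_by()/summarise() step above collapses the several subtitle
# cues that matched the same script line into a single row spanning the earliest
# cue start (Timecode_in) to the latest cue end (Timecode_out).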
239 |
240 | audio_text_join <- function(episode,seconds){
241 | audio=season_one_text %>%
242 | filter(epsiode==episode)%>%
243 | mutate(diff=abs(seconds-Timecode_out))%>%
244 | top_n(1,desc(diff))%>%
245 | mutate(seconds=seconds)
246 | return(audio)
247 |
248 | }
249 | audio_text_dataset = tibble()
250 |
251 | for(i in 1:24){
252 | ep_df=season_one_text%>%
253 | filter(epsiode==i)
254 | one_ep_test=map_dfr(seq_len(nrow(ep_df)),~audio_text_join(i,.))
255 | audio_text_dataset=audio_text_dataset%>%bind_rows(one_ep_test)
256 |
257 | }
258 |
259 |
260 |
261 |
262 |
263 | library(ggformula)
264 | library(hrbrthemes)
265 | library(magick)
266 | library(grid)
267 | library(extrafont)
library(ggthemr) # provides define_palette() and ggthemr() used below
268 | font_import(pattern="GABRWFFR",paths = '/Users/rosebudanwuri/Downloads')
269 | color_palette=c('#F7483E','#e7d509','#93AAC9','#F7483D','#D6D2CE','#C59A80','#FFEAD5','#E78D86')
270 | theme_friends <- define_palette(
271 | swatch = color_palette,
272 | background = 'white',
273 | text = 'black',
274 | gradient = c(lower = '#f27789', upper = '#31c2a4'),
275 | line = '#f3f4f8'
276 | )
277 | ggthemr(theme_friends)
278 | chandler=image_read('chandler.png')%>%rasterGrob(interpolate=TRUE)
279 | joey=image_read('joey.png')%>%rasterGrob(interpolate=TRUE)
280 | monica=image_read('monica.png')%>%rasterGrob(interpolate=TRUE)
281 | phoebe=image_read('phoebe.png')%>%rasterGrob(interpolate=TRUE)
282 | rachel=image_read('rachel.png')%>%rasterGrob(interpolate=TRUE)
283 | ross=image_read('ross.png')%>%rasterGrob(interpolate=TRUE)
284 | p1=audio_text_dataset%>%
285 | left_join(season_one_audio%>%
286 | select(Season,Episode,seconds,laughter)%>%
287 | mutate(epsiode=str_extract(Episode,'\\d{2}')%>%as.numeric,
288 | season=str_extract(Season,'\\d{2}')%>%as.numeric))%>%
289 | distinct(season,epsiode,script_text,laughter,speaker,addressee)%>%
290 | filter(laughter==1)%>%
291 | count(epsiode,speaker)%>%
292 | arrange(-n)%>%
293 | filter(speaker %in% c('Ross','Joey','Chandler','Monica','Phoebe','Rachel'))%>%
294 | ggplot(aes(epsiode,n,color=speaker))+
295 | geom_spline(size=1,spar = 0.5)+
296 | ggtitle('Who Was the Funniest Friend in Each Episode?')+
297 | theme_ipsum()+
298 | theme( panel.grid.major = element_blank(),plot.title = element_text(size=16, family="Gabriel Weiss' Friends Font"),legend.position="top",
299 | text=element_text( family="Gabriel Weiss' Friends Font"), legend.key.width = unit(1, "cm"))+
300 | guides(color=guide_legend(ncol=6))+
301 | scale_color_discrete(name = " ")
302 |
303 |
304 | p2=audio_text_dataset%>%
305 | left_join(season_one_audio%>%
306 | select(Season,Episode,seconds,laughter)%>%
307 | mutate(epsiode=str_extract(Episode,'\\d{2}')%>%as.numeric,
308 | season=str_extract(Season,'\\d{2}')%>%as.numeric))%>%
309 | distinct(season,epsiode,script_text,laughter,speaker,addressee)%>%
310 | filter(laughter==1)%>%
311 | count(speaker)%>%
312 | arrange(-n)%>%
313 | filter(speaker %in% c('Ross','Joey','Chandler','Monica','Phoebe','Rachel'))%>%
314 | ggplot(aes(reorder(speaker,n),n,fill=speaker))+
315 | geom_col(color='black')+
316 | coord_flip()+
317 | theme_ipsum()+
318 | theme( panel.grid.major = element_blank(),plot.title = element_text(size=16, family="Gabriel Weiss' Friends Font"),legend.position="top",
319 | text=element_text( family="Gabriel Weiss' Friends Font"), legend.key.width = unit(1, "cm"))+
320 | guides(fill=guide_legend(ncol=6))+
321 | scale_fill_discrete(name = " ")
322 |
323 | library(patchwork)
324 | p2/p1
325 | audio_text_dataset%>%
326 | left_join(season_one_audio%>%
327 | select(Season,Episode,seconds,laughter)%>%
328 | mutate(epsiode=str_extract(Episode,'\\d{2}')%>%as.numeric,
329 | season=str_extract(Season,'\\d{2}')%>%as.numeric))%>%
330 | distinct(season,epsiode,script_text,laughter,speaker,addressee)%>%
331 | filter(laughter==1)%>%
332 | count(speaker,addressee)%>%
333 | filter(addressee!='All')%>%
334 | arrange(-n)
335 |
336 | audio_text_dataset%>%
337 | left_join(season_one_audio%>%
338 | select(Season,Episode,seconds,laughter)%>%
339 | mutate(epsiode=str_extract(Episode,'\\d{2}')%>%as.numeric,
340 | season=str_extract(Season,'\\d{2}')%>%as.numeric))%>%
341 | distinct(season,epsiode,script_text,speaker,addressee)%>%
342 | count(speaker,name='total_dialogue')%>%
343 | arrange(-total_dialogue)%>%
344 | filter(speaker %in% c('Ross','Joey','Chandler','Monica','Phoebe','Rachel'))%>%
345 | left_join(
346 | audio_text_dataset%>%
347 | left_join(season_one_audio%>%
348 | select(Season,Episode,seconds,laughter)%>%
349 | mutate(epsiode=str_extract(Episode,'\\d{2}')%>%as.numeric,
350 | season=str_extract(Season,'\\d{2}')%>%as.numeric))%>%
351 | distinct(season,epsiode,script_text,laughter,speaker,addressee)%>%
352 | filter(laughter==1)%>%
353 | count(speaker,name='funny_dialogue')%>%
354 | arrange(-funny_dialogue)%>%
355 | filter(speaker %in% c('Ross','Joey','Chandler','Monica','Phoebe','Rachel')))%>%
356 | mutate(funny_ratio=100*funny_dialogue/total_dialogue)%>%
357 | arrange(-funny_ratio)
358 | audio_text_dataset <- audio_text_dataset%>%
359 | left_join(season_one_audio%>%
360 | select(Season,Episode,seconds,laughter)%>%
361 | mutate(epsiode=str_extract(Episode,'\\d{2}')%>%as.numeric,
362 | season=str_extract(Season,'\\d{2}')%>%as.numeric,
363 | laughter=as.character(laughter)%>%as.numeric))%>%
364 | group_by(season,epsiode,script_text,speaker,addressee)%>%
365 | summarise(laughter=max(laughter),seconds=max(seconds))%>%
366 | arrange(epsiode,seconds)
367 | write_csv(audio_text_dataset,'season_one_final.csv')
368 | season_one_final=read_csv('season_one_final.csv')
369 | season_one_final=season_one_final%>%
370 | mutate(word_count=str_count(script_text," ")+1)%>%
371 | filter(word_count>2)
372 | write_csv(season_one_final,'season_one_final.csv')
373 |
--------------------------------------------------------------------------------
/The Making of Great Music/scripts/music_sentiment.R:
--------------------------------------------------------------------------------
1 | library(jsonlite)
2 | library(plyr)
3 | library(dplyr)
4 | library(tidyr)
5 | library(devtools)
6 | library(readr)
7 | library(stringr)
8 | library(tm)
9 | library(SnowballC)
10 | library(tidytext)
11 | library(wordcloud)
12 | library(httr)
13 | #List of Years with Billboard data available
14 | year_list=1950:2015
15 |
16 | set_config(config(ssl_verifypeer = 0L))
17 | get_billboard_data = function(year){
18 | df =fromJSON(paste0("https://raw.githubusercontent.com/kevinschaich/billboard-top-100-lyrics/master/data/years/",year,".json"))
19 | df[,c("neg","neu","pos","compound")]=df$sentiment
20 | df$sentiment=NULL
21 | return(df)
22 | }
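# Optional sanity check on a single year before mapping over all of them
# (assumes network access; the year chosen is arbitrary):
# str(get_billboard_data(1965))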
23 |
24 | music_df=ldply(year_list,get_billboard_data)
25 | music_df=music_df %>%
26 | select(-tags)
27 | #install_github("tiagomendesdantas/Rspotify")
28 | library(Rspotify)
29 |
30 | keys <- spotifyOAuth(app_id="ASD","769ef3519e8444238fde9c8981c6371c","b17e4a7ca0b4426f9962645ba5c74a63")
31 |
32 | #name of Spotify Features
33 | features_name=c("id","danceability","energy","key","loudness","mode","speechiness",
34 | "acousticness","instrumentalness","liveness","valence","tempo",
35 | "duration_ms","time_signature","uri","analysis_url")
36 |
37 | #Initializing the feature columns in the data
38 | music_df[,c("id","danceability","energy","key","loudness","mode","speechiness",
39 | "acousticness","instrumentalness","liveness","valence","tempo",
40 | "duration_ms","time_signature","uri","analysis_url")]=0
41 |
42 |
43 | get_spotify_features=function(track, artist){
44 | songs= try(searchTrack(paste0("track:",track ," artist:",artist),keys),silent = T)
45 | if (class(songs)=="try-error"){
46 | return(rep(0,length(features_name)))
47 | }
48 | else{
49 | song_id=songs[,"id"][1]
50 | features=getFeatures(song_id,keys)
51 | return(features)
52 | }
53 | }
54 |
55 | #Two options for this
56 | #Option 1 Mapply: cleaner code but no intermediate data is saved
57 | #so if your internet suddenly goes off, you would have to start from scratch
58 |
59 | #music_df[,17:32]=mapply(get_spotify_features,music_df$title,music_df$artist)
60 |
61 | #Option 2 for loop: slower and less elegant but intermediate data is saved
62 | #so if your internet suddenly goes off, you can start from where it was aborted
63 | for(i in 1:nrow(music_df)){
64 | music_df[i,17:32]= get_spotify_features(track = music_df$title[i],music_df$artist[i])
65 | print(i)
66 |
67 | }
68 |
69 | #To keep the code from breaking when Spotify's API cannot find an artist,
70 | #we initialized all song features above. Now we retry the failed IDs with partial matches
71 | #on the artist's first name only, since misses are usually due to spelling differences
72 | failed_ids = which(music_df$id =="0")
73 | get_spotify_features1=function(track, artist){
74 | artist=str_split(artist," ",simplify = T)[[1]]
75 | songs= try(searchTrack(paste0("track:",track ," artist:",artist,"*"),keys),silent = T)
76 | if (class(songs)=="try-error"){
77 | return(rep(0,length(features_name)))
78 | }
79 | else{
80 | song_id=songs[,"id"][1]
81 | features=getFeatures(song_id,keys)
82 | return(features)
83 | }
84 | }
85 |
86 | #Update the data for the failed IDs
87 | for(i in failed_ids){
88 | music_df[i,17:32]= get_spotify_features1(track = music_df$title[i],music_df$artist[i])
89 | print(i)
90 |
91 | }
92 |
93 | #Function to get spotify's featured and main artists in a song
94 | get_featured_artists=function(track, artist){
95 | songs= try(searchTrack(paste0("track:",track ," artist:",artist),keys),silent = T)
96 | if (class(songs)=="try-error"){
97 | return(NA)
98 | }
99 | else{
100 | song_id=songs[,"id"][1]
101 | aboutSong=getTrack(song_id,keys)
102 | artists=aboutSong$artists
103 | artists=as.character(artists)
104 | return(artists)
105 | }
106 | }
107 |
108 | ###Get the featured artists in each song###
109 |
110 | #Initialize column
111 | music_df$artist_with_features=""
112 |
113 | #Update new column with function output using Option 2: for loop
114 | for (i in 1:nrow(music_df)){
115 | music_df$artist_with_features[i]=get_featured_artists(music_df$title[i],(music_df$artist[i]))
116 | }
117 |
118 | #Take the main artist out of the "artists with features" column
119 | remove_main_artist=function(main_artist,artists){
120 | regex_pattern=paste0("(?<=;).*",main_artist,";|;.*",main_artist,"|",main_artist,".?;")
121 | new_list=str_replace(artists,regex_pattern,"")
122 | return(new_list)
123 | }
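# Illustrative call with made-up values: for main_artist = "drake" and
# artists = "drake;rihanna", the pattern strips the main artist and returns "rihanna".
# remove_main_artist("drake", "drake;rihanna")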
124 | music_df$artist_with_features=remove_main_artist(music_df$artist,music_df$artist_with_features)
125 |
126 | music_df$artist_with_features=ifelse(music_df$artist_with_features==music_df$artist,"",music_df$artist_with_features)
127 |
128 | #Get Images of Artist
129 | get_artist_image=function(artist){
130 | base_lang=Encoding(artist)
131 | base_lang=ifelse(base_lang=="unknown","UTF-8",base_lang)
132 | artist=iconv(artist,from=base_lang,to="ASCII//TRANSLIT")
133 | artists_tbl= try(searchArtist(artist,keys),silent = T)
134 | if (class(artists_tbl)=="try-error" ){
135 | #Picture not available image
136 | return("https://www.tabithaknowel.com/integrated/uploads/2017/05/noPhotoFound.png")
137 | }
138 | if (nrow(artists_tbl)==0){
139 | return("https://www.tabithaknowel.com/integrated/uploads/2017/05/noPhotoFound.png")
140 | }
141 | id=artists_tbl$id[1]
142 | req <- httr::GET(paste0("https://api.spotify.com/v1/artists/",
143 | id), httr::config(token = keys))
144 | json1 <- httr::content(req)
145 | no_of_images=length(json1$images)
146 | if(no_of_images <=0){
147 | return("https://www.tabithaknowel.com/integrated/uploads/2017/05/noPhotoFound.png")
148 | }
149 | image=json1$images[[1]]$url
150 | return(image)
151 | }
152 |
153 |
154 | artistImage=data.frame(artist=character(),image=character(), stringsAsFactors=FALSE)
155 | unique_artists=unique(music_df$artist)
156 | for (i in unique_artists[1:length(unique_artists)]){
157 | url=get_artist_image(i)
158 | artist_and_image=c(i,url)
159 | idx=which(unique_artists %in% i)
160 | artistImage[idx,]=artist_and_image
161 |
162 | }
163 |
164 | music_df=music_df %>%
165 | left_join(artistImage,by="artist")
166 |
167 | #Create the decades in the data
168 | music_df = music_df %>% mutate(year_bin= case_when(
169 | year<1960 ~"50s",
170 | year<1970 ~"60s",
171 | year<1980 ~"70s",
172 | year<1990 ~"80s",
173 | year<2000 ~"90s",
174 | year<2010 ~"00s",
175 | year>=2010 ~"10s"
176 | ))
177 | #Using the semi-colon separator, we split each featured artist into its own column, from
178 | #Features1 to Features6. Features6 because six is the most featured artists on any
179 | #song in this dataset
180 | music_df_with_features=music_df %>%
181 | select(artist,title,artist_with_features)%>%
182 | separate(artist_with_features,paste0("Features",1:6),";")
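# Illustrative result: an artist_with_features value such as "jay z;beyonce" becomes
# Features1 = "jay z", Features2 = "beyonce" and Features3 to Features6 = NA.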
183 |
184 | #We will then gather this into an "artist, featured artist" key-value pair
185 | music_df_with_features=music_df_with_features%>%
186 | gather(ArtistFeatures,FeaturedArtists,Features1:Features6) %>%
187 | select(-ArtistFeatures)
188 |
189 |
190 | #List of genres as defined by Kevin Schaich
191 | genres = list("rock"= c("symphonic rock", "jazz-rock", "heartland rock", "rap rock", "garage rock", "folk-rock", "roots rock", "adult alternative pop rock", "rock roll", "punk rock", "arena rock", "pop-rock", "glam rock", "southern rock", "indie rock", "funk rock", "country rock", "piano rock", "art rock", "rockabilly", "acoustic rock", "progressive rock", "folk rock", "psychedelic rock", "rock & roll", "blues rock", "alternative rock", "rock and roll", "soft rock", "rock and indie", "hard rock", "pop/rock", "pop rock", "rock", "classic pop and rock", "psychedelic", "british psychedelia", "punk", "metal", "heavy metal"),
192 | "alternative/indie"= c("adult alternative pop rock", "alternative rock", "alternative metal", "alternative", "lo-fi indie", "indie", "indie folk", "indietronica", "indie pop", "indie rock", "rock and indie"),
193 | "electronic/dance"= c("dance and electronica", "electro house", "electronic", "electropop", "progressive house", "hip house", "house", "eurodance", "dancehall", "dance", "trap"),
194 | "soul"= c("psychedelic soul", "deep soul", "neo-soul", "neo soul", "southern soul", "smooth soul", "blue-eyed soul", "soul and reggae", "soul"),
195 | "classical/soundtrack"= c("classical", "orchestral", "film soundtrack", "composer"),
196 | "pop"= c("country-pop", "latin pop", "classical pop", "pop-metal", "orchestral pop", "instrumental pop", "indie pop", "sophisti-pop", "pop punk", "pop reggae", "britpop", "traditional pop", "power pop", "sunshine pop", "baroque pop", "synthpop", "art pop", "teen pop", "psychedelic pop", "folk pop", "country pop", "pop rap", "pop soul", "pop and chart", "dance-pop", "pop", "top 40"),
197 | "hip-hop"= c("conscious hip hop", "east coast hip hop", "hardcore hip hop", "west coast hip hop", "hiphop", "southern hip hop", "hip-hop", "hip hop", "hip hop rnb and dance hall", "gangsta rap", "rapper", "rap"),
198 | "rnb"=c("contemporary r b","rhythm and blues", "contemporary rnb", "contemporary r&b", "rnb", "rhythm & blues","r&b", "blues"),
199 | "disco"= c("disco"),
200 | "swing"= c("swing"),
201 | "folk"= c("contemporary folk", "folk"),
202 | "country"= c("country rock", "country-pop", "country pop", "contemporary country", "country"),
203 | "jazz"= c("vocal jazz", "jazz", "jazz-rock"),
204 | "religious"= c("christian", "christmas music", "gospel"),
205 | "blues"= c("delta blues", "rock blues", "urban blues", "electric blues", "acoustic blues", "soul blues", "country blues", "jump blues", "classic rock. blues rock", "jazz and blues", "piano blues", "british blues", "british rhythm & blues", "rhythm and blues", "blues", "blues rock", "rhythm & blues"),
206 | "reggae"= c("reggae fusion", "roots reggae", "reggaeton", "pop reggae", "reggae", "soul and reggae"))
207 |
208 |
209 | #Get Genre of an artist from Spotify
210 | get_artist_genre=function(artist){
211 | base_lang=Encoding(artist)
212 | #Takes care of non-UTF characters
213 | base_lang=ifelse(base_lang=="unknown","UTF-8",base_lang)
214 | artist=iconv(artist,from=base_lang,to="ASCII//TRANSLIT")
215 | artists_tbl= try(searchArtist(artist,keys),silent = T)
216 | if (class(artists_tbl)=="try-error" ){
217 | return("")
218 | }
219 | if (nrow(artists_tbl)==0){
220 | return("")
221 | }
222 | lst1=artists_tbl$genre[1]
223 | genre_list=str_split(lst1,",")[[1]]
224 | num_list=NULL
225 | for (i in 1:length(genres)){
226 | chk= genre_list%in% genres[i][[1]]
227 | totchk=sum(chk)
228 | num_list=append(num_list,totchk)
229 |
230 | }
231 | if (sum(num_list)==0){
232 | return("")
233 | }
234 | idx=which(num_list == max(num_list))
235 |
236 | final_genre=names(genres)[idx]
237 | final_genre=paste0(final_genre,collapse = "&")
238 | return(final_genre)
239 | }
240 |
241 |
242 | #Get genre of main artist i.e. artist that owns the song
243 | unique_main_artists=unique(music_df_with_features$artist)
244 |
245 | main_genre_df=data.frame(artist=character(),genre=character())
246 | for (i in unique_main_artists){
247 | gn=get_artist_genre(i)
248 | lt=list(i,gn)
249 | lt=data.frame(lt)
250 | names(lt)=c("artist","genre")
251 | main_genre_df=rbind(main_genre_df,lt)
252 | }
253 |
254 | #Like the features, separate the genres into artist, genre key-value pairs, from genre1
255 | #to genre5. genre5 because five is the highest number of genres any one artist is affiliated with
256 | main_genre_df=main_genre_df %>%
257 | separate(genre,paste0("genre",1:5),"&")
258 |
259 | #Update the music features data with the genre for the main artists
260 | music_df_with_features=music_df_with_features%>%
261 | left_join(main_genre_df,by = "artist")%>%
262 | gather(xx,main_genre,genre1:genre5)
263 |
264 | #Filter out rows with no genre information and take out the dummy key column xx created above
265 | music_df_with_features=music_df_with_features%>%
266 | filter(!is.na(main_genre)) %>%
267 | select(-xx)
268 |
269 | #Get Genres for featured artists
270 | unique_featured_artists=unique(music_df_with_features$FeaturedArtists)
271 |
272 | feat_genre_df=data.frame(artist=character(),genre=character(), stringsAsFactors=FALSE)
273 |
274 | for (i in unique_featured_artists){
275 | i= str_replace_all(i,pattern = '[:punct:]',"")
276 | gn=get_artist_genre(i)
277 | lt=list(i,gn)
278 | lt=data.frame(lt)
279 | names(lt)=c("artist","genre")
280 | feat_genre_df=rbind(feat_genre_df,lt)
281 | }
282 |
283 | feat_genre_df=feat_genre_df %>%
284 | separate(genre,paste0("genre",1:5),"&")
285 |
286 | #Also make a key-value pair of main artist to featured artist genre
287 | music_df_with_features=music_df_with_features%>%
288 | left_join(feat_genre_df,by=c("FeaturedArtists"="artist")) %>%
289 | gather(xx,featured_genre,genre1:genre5)%>%
290 | filter(!is.na(featured_genre)) %>%
291 | select(-xx)
292 |
293 | ################################Topic Modelling######################################
294 | library(topicmodels)
295 | library(tidytext)
296 | lyrics_words=music_df%>%
297 | unnest_tokens(word,lyrics) %>%
298 | select(word,year_bin,artist)
299 |
300 | #Take out stop words
301 | lyrics_words =lyrics_words %>%
302 | anti_join(stop_words)
303 |
304 | lyrics_words=lyrics_words %>%
305 | group_by(word,year_bin) %>%
306 | summarise(count =n())
307 |
308 | lyrics_dtm = lyrics_words %>%
309 | cast_dtm(year_bin,word,count)
310 |
311 | #Topic Modeling
312 | yearbin_lda <- LDA(lyrics_dtm, k = 2, control = list(seed = 1234))
313 |
314 | yearbin_lda_tidy = tidy(yearbin_lda)
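# tidy() above returns the per-topic word probabilities (the "beta" matrix),
# one row per (topic, term) pair; top_n(5, beta) below keeps the five most
# probable terms for each of the two topics.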
315 |
316 | #select Top 5 terms for each topic
317 | top_terms=yearbin_lda_tidy %>%
318 | group_by(topic)%>%
319 | top_n(5,beta)
320 |
321 |
322 |
323 | library(ggplot2)
324 | library(hrbrthemes)
325 |
326 | #Plot Terms
327 | theme_set(theme_bw())
328 | top_terms$topic=paste("Topic", top_terms$topic)
329 | top_terms %>%
330 | ggplot(aes(reorder(term,-beta), beta)) +
331 | geom_bar(stat = "identity",aes(fill=as.character(topic)))+
332 | scale_fill_manual(values = c('Topic 1' = "skyblue3","Topic 2" = "seagreen3"))+
333 | facet_wrap(~ topic, scales = "free")+
334 | theme(axis.text.x = element_text(size = 15, angle = 90, hjust = 1))
335 |
336 | #Get presence of each topic in each decade
337 | year_gamma = tidy(yearbin_lda,matrix="gamma")
338 | topics_df=year_gamma %>%
339 | arrange(-gamma)
340 |
341 |
342 | write_csv(topics_df,"topic_dataset.csv")
343 |
344 | ##################CLUSTERING#############################
345 | #Prepare the data
346 |
347 | #Group artists by the median of all their song features
348 | cols=c(1:2,4:8,9,10:14,16:28,31)
349 |
350 | firstDf=music_df %>%
351 | mutate(year_bin=as.character(year_bin))%>%
352 | select(-lyrics,-title) %>%
353 | select(cols)%>%
354 | group_by(artist) %>%
355 | summarise_if(is.numeric,median)
356 |
357 | #Function to return mode of a vector
358 | Mode <- function(x) {
359 | ux <- unique(x)
360 | ux[which.max(tabulate(match(x, ux)))]
361 | }
362 |
363 | #Group artists by the decade they appear in most
364 | secondDf=music_df %>%
365 | mutate(year_bin=as.character(year_bin))%>%
366 | select(-lyrics,-title) %>%
367 | group_by(artist) %>%
368 | summarise(year_bin=Mode(year_bin)) %>%
369 | as_tibble()
370 |
371 | #Join both tables
372 | artist_df=firstDf %>%
373 | inner_join(secondDf,by="artist")
374 |
375 | library(h2o)
376 | h2o.init()
377 |
378 |
379 | data = artist_df %>% as.h2o()
380 |
381 | #Split frame into train and validation sets
382 | splits = h2o.splitFrame(data,ratios = 0.7,destination_frames = c("train", "valid"), seed = 1234)
383 | train = h2o.getFrame("train")
384 | val = h2o.getFrame("valid")
385 |
386 | #Column indices for all song features
387 | song_features=2:24
388 |
389 | #create k-means model
390 | knn_model=h2o.kmeans(train,song_features,validation_frame = val,k=2)
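# Note: despite the name, knn_model is a k-means clustering model (k = 2) fit on the
# song-feature columns, with the hold-out frame used only to report validation metrics.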
391 |
392 | #Update the full dataset with the k-means cluster prediction
393 | data$cluster=predict(knn_model,data)
394 | data$year_bin=h2o.asfactor(data$year_bin)
395 |
396 | #View tabular results
397 | h2o.table(data$year_bin,data$cluster)
398 |
399 | #Make H2O frame a dataframe and update cluster names
400 | cluster_df=as.data.frame(data)
401 | cluster_df=cluster_df %>%
402 | mutate(cluster=ifelse(cluster==0,"String Lover","Poetic"))
403 |
404 | #Update music dataset
405 | music_df=music_df%>%
406 | left_join((cluster_df%>%
407 | select(artist,cluster)),by="artist")
408 |
409 | #Find the centers of both clusters
410 | ctrs = h2o.centers(knn_model)
411 | ctrs = as.data.frame(ctrs)
412 |
413 | #Calculate the relative difference in each song feature between the two cluster centers
414 | ctrs[3,]=abs((ctrs[2,]-ctrs[1,])/ctrs[1,])
415 |
416 |
417 |
418 |
419 |
420 |
421 | ######################
422 | data = music_df %>%
423 | select(-lyrics,-title) %>%
424 | as.h2o()
425 | splits = h2o.splitFrame(data,ratios = c(0.6,0.2),destination_frames = c("train", "valid", "test"), seed = 1234)
426 | train = h2o.getFrame("train")
427 | features=c(1:2,4:8,10:14,16:28)
428 | test = h2o.getFrame("test")
429 | val = h2o.getFrame("valid")
431 | knn_model=h2o.kmeans(train,features,validation_frame = val,k=7)
432 | predict(knn_model,train)
433 | data$cluster=predict(knn_model,data)
434 |
435 | write_csv(music_df,"music_df.csv")
436 | write_csv(music_df_with_features,"features_dataset.csv")
437 |
--------------------------------------------------------------------------------
/Consitent Billionaire Guide/app/app.R:
--------------------------------------------------------------------------------
1 | library(h2o)
2 | library(shiny)
3 | library(shinydashboard)
4 | library(shinyjs)
5 | library(shinyBS)
6 | library(shinythemes)
7 | library(plyr)
8 | library(dplyr)
9 | library(scales)
10 | library(lubridate)
11 | library(shinysky)
12 | library(stringr)
13 | library(lime)
14 | library(ggplot2)
15 |
16 |
17 | dat = read.csv("billionaire_data_for_ml.csv")
18 |
19 | model_type.H2OMultinomialModel <<- function(x, ...)
20 | "classification"
21 |
22 | predict_model.H2OMultinomialModel <<-
23 | function(x, newdata, type, ...) {
24 | # Function performs prediction and returns dataframe with Response
25 | #
26 | # x is h2o model
27 | # newdata is data frame
28 | # type is only setup for data frame
29 |
30 | pred <- h2o.predict(x, as.h2o(newdata))
31 |
32 | # return classification probabilities only
33 | return(as.data.frame(pred[, -1]))
34 |
35 | }
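# lime discovers custom model support through the model_type() and predict_model()
# generics, so defining them here (globally, via <<-) is what allows lime explanations
# for the H2O multinomial model this app uses.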
36 |
37 |
38 | Countries = levels(dat$Country)
39 | Sectors = levels(dat$Sector)
40 | Relations = levels(dat$relation)
41 | busyIndicators <-
42 | function(text = "Calculation in progress..",
43 | img = "shinysky/busyIndicator/ajaxloaderq.gif",
44 | wait = 1000) {
45 | tagList(
46 | singleton(tags$head(
47 | tags$link(rel = "stylesheet", type = "text/css", href = "busyIndicator.css")
48 | ))
49 | ,
50 | div(class = "shinysky-busy-indicator", p(text), img(src = img))
51 | ,
52 | tags$script(
53 | sprintf(
54 | " setInterval(function(){
55 | if ($('html').hasClass('shiny-busy')) {
56 | setTimeout(function() {
57 | if ($('html').hasClass('shiny-busy')) {
58 | $('div.shinysky-busy-indicator').show()
59 | }
60 | }, %d)
61 | } else {
62 | $('div.shinysky-busy-indicator').hide()
63 | }
64 | },100)
65 | ",
66 | wait
67 | )
68 | )
69 | )
70 | }
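# A sketch of how this helper is meant to be used (assumption: called once inside
# the UI definition so the indicator div and its polling script are rendered):
# busyIndicators(text = "Predicting...", wait = 500)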
71 | ui = shinyUI(
72 | navbarPage(
73 | "Billion Dollar Questions",
74 | inverse = F,
75 | collapsible = T,
76 | fluid = T,
77 | theme = shinytheme("flatly"),
78 | tabPanel(
79 | "What Type of Billionaire Are You?",
80 | icon = icon("money"),
81 | sidebarLayout(
82 | position = 'left',
83 |
84 | sidebarPanel(
85 | id = "sidebar",
86 | selectizeInput(
87 | "Country",
88 | label = h4("What country are you from?", style = "font-size: 10pt;font-family: 'Raleway';text-transform: uppercase;"),
89 | choices = Countries
90 | ),
91 | numericInput(
92 | 'Age',
93 | h4("How Old Are You?", style = "font-size: 10pt;font-family: 'Raleway';text-transform: uppercase;"),
94 | value = 20,
95 | min = 12,
96 | max = 100
97 | ),
98 | selectizeInput(
99 | "selfMade",
100 | h4("Do you have your own company (or plan to have one)?", style = "font-size: 10pt;font-family: 'Raleway';text-transform: uppercase;"),
101 | choices = c("Yes", "No")
102 | ),
103 | selectizeInput(
104 | "relation",
105 | h4(
106 | "Choose one below that best describes your role in the business:",
107 | style = "font-size: 10pt;font-family: 'Raleway';text-transform: uppercase;"
108 | ),
109 | choices = Relations
110 | ),
111 |
112 | numericInput(
113 | "founding_year",
114 | h4("When was/will this business (be) established?", style = "font-size: 10pt;font-family: 'Raleway';text-transform: uppercase;"),
115 | value = 1999,
116 | min = 1600,
117 | max = year(Sys.Date()) + 10
118 | ),
119 | selectizeInput(
120 | "Sector",
121 | h4("What Sector is this business?", style = "font-size: 10pt;font-family: 'Raleway';text-transform: uppercase;"),
122 | choices = Sectors
123 | ),
124 | selectizeInput(
125 | "Bsc",
126 | h4("Do you have a Bachelor's Degree?", style = "font-size: 10pt;font-family: 'Raleway';text-transform: uppercase;"),
127 | choices = c("Yes", "No")
128 | ),
129 | selectizeInput(
130 | "MBA",
131 | h4("Do you have an MBA?", style = "font-size: 10pt;font-family: 'Raleway';text-transform: uppercase;"),
132 | choices = c("Yes", "No")
133 | ),
134 | div(
135 | conditionalPanel(
136 | "!$('html').hasClass('shiny-busy')",
137 | actionButton(
138 | "run",
139 | div("PREDICT", icon("flask"), style = "text-align:center;font-size:10pt;width: 150px;"),
140 | styleclass = "success",
141 | size = "mini",
142 | css.class = "animated infinite rubberBand"
143 | )
144 | ),
145 | style = "text-align:center;"
146 | )
147 |
148 |
149 | ),
150 | mainPanel(
151 | position = "left",
152 |
153 |
154 | tags$head(
155 | tags$link(rel = "stylesheet", type = "text/css", href = "animate.min.css")
156 | ),tags$head(
157 | tags$link(rel = "stylesheet", type = "text/css", href = "style.css")
158 | ),
159 | bsModal(
160 | id = "startupMessage",
161 | trigger = '',
162 | size = 'large',
163 | HTML(
164 | '
165 |
166 |
167 |
168 |
169 |
170 |
171 |
172 |
Have you ever wondered what sort of billionaire you would end up as? A Consistent one, A Hustler or a Ghost?
173 | Learn a bit more about it on the link below and predict the one you could be with this simple app!