├── Python
    ├── CollaborativeFiltering
    │   ├── Goodreads_surprise.py
    │   └── TrainingModule.py
    ├── fireworks.py
    ├── flights_networkx.py
    ├── football_visual.ipynb
    ├── image.jpg
    ├── images
    │   └── networkx_basemap
    │   │   ├── map_0.png
    │   │   ├── map_1.png
    │   │   ├── map_2.png
    │   │   ├── map_3.png
    │   │   ├── table_1.PNG
    │   │   ├── table_2.PNG
    │   │   ├── table_3.PNG
    │   │   └── table_5.PNG
    ├── lincoln_estimate.py
    ├── mbappe.jpg
    ├── n_dimensionalNormal.py
    └── optimal_dating.py
├── R
    ├── EPL
    │   ├── Agg.png
    │   ├── Last.png
    │   ├── Misc
    │   │   └── TeamEvaluate2015.R
    │   ├── betting
    │   │   ├── Portfolio-xkcd.png
    │   │   ├── bet_strategy.R
    │   │   ├── clean_data.R
    │   │   └── prediction.R
    │   ├── penalty
    │   │   ├── Scraping.ipynb
    │   │   └── penalty.R
    │   ├── prediction
    │   │   ├── clean_data.R
    │   │   ├── match_simulate.R
    │   │   ├── sim.R
    │   │   └── visualize.R
    │   └── xkcd.ttf
    ├── Paul_hypothesis_test.R
    ├── RuleOfThree.R
    ├── bayes_god.R
    ├── bayesian_gym.R
    ├── dating_sim.R
    ├── end_to_end_projects.R
    └── lindy
    │   ├── Inverse_Random_Sampling.pdf
    │   └── lindy_simulation.R
├── README.md
├── data
    ├── Team2015season.csv
    ├── Vietnamese_2016.csv
    ├── all_games.csv
    ├── all_penalties.csv
    ├── fixtures.csv
    ├── history.csv
    ├── housing.csv
    └── national_longitudinal_survey.csv
├── dog_home.png
├── images
    ├── bayesian.png
    ├── dog_home.png
    ├── epl.png
    ├── fireworks.gif
    ├── messi-scribble.png
    ├── network.png
    ├── paul.png
    ├── scrible-test.png
    ├── selfie.png
    ├── selfie2.png
    └── test.R
└── llm_bots
    ├── animeyourself
        ├── anime_yourself.py
        ├── main.py
        ├── readme.md
        └── requirements.txt
    └── scribble2img
        ├── README.md
        ├── main.py
        ├── requirements.txt
        └── scribble2image.py


/Python/CollaborativeFiltering/Goodreads_surprise.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Code for Collaborative Filtering project
 3 | 
 4 | Instead of using the module from TrainingModule.py, this script
 5 | uses the surprise package, which is a lot more efficient
 6 | in terms of sparse matrix handling.
 7 | """
 8 | 
 9 | import numpy as np
10 | import pandas as pd
11 | from surprise import SVD
12 | from surprise import Dataset
13 | from surprise import Reader
14 | from surprise import accuracy
15 | from surprise import BaselineOnly
16 | from surprise.model_selection import train_test_split
17 | 
# Ratings table (about 6 million rows); user ids are stored as strings
df = pd.read_csv('/ratings.csv')
df['user_id'] = df['user_id'].astype(str)

# Book metadata (about 10k books)
books = pd.read_csv('/books.csv')

# Create a mapping between book_id and name.
# NOTE(review): the original iterated over an undefined `new_books`; the frame
# loaded just above is presumably what was meant -- confirm.
# Columns are addressed by position (itertuples puts the index at 0):
# row[1] is the first column, row[11] the eleventh -- presumably the id and
# the title; verify against the CSV header.
id_to_name = {row[1]: row[11] for row in books.itertuples()}

# add my own book ratings
my_book_ids = [13, 119, 240, 283, 479, 1100, 2205, 2246, 3227, 7210, 5]
my_rating = {'user_id': [str(53425) for _ in range(11)],
             'book_id': my_book_ids,
             'rating': [5, 3, 4, 4, 4, 5, 2, 4, 4, 3, 3]}
print([id_to_name[book_id] for book_id in my_book_ids])

# DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row
# order (my ratings first) and the same index handling.
# NOTE(review): the original appended an undefined `new_rating`; the ratings
# frame `df` is presumably what was meant -- confirm.
full_rating = pd.concat([pd.DataFrame(my_rating), df])
37 | 
38 | 
# Load our data into DataSet class of surprise package
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(full_rating[['user_id', 'book_id', 'rating']], reader)

# split into trainset and testset
trainset, testset = train_test_split(data, test_size=.10)
# the training ratings in testset form, so the fitted model can be scored on them
train_eval = trainset.build_testset()

# Train an SVD model from scratch for each epoch budget and record the
# train/test MSE, giving the points of a learning curve.
epochs = [1, 5, 10, 20, 40, 80, 100, 120, 150]
train_mse = []
test_mse = []
for n_epoch in epochs:
    print("Number of epochs trained", n_epoch)
    algo = SVD(n_factors=40, lr_all=0.001, n_epochs=n_epoch)
    algo.fit(trainset)
    # Score each prediction set once and reuse the value; the original called
    # accuracy.mse twice per set, repeating the computation.
    epoch_train_mse = accuracy.mse(algo.test(train_eval))
    epoch_test_mse = accuracy.mse(algo.test(testset))
    train_mse.append(epoch_train_mse)
    test_mse.append(epoch_test_mse)
    print(epoch_train_mse, epoch_test_mse)
60 | 
61 | # function to plot the learning curve through epochs
def plot_learning_curve(iter_array, train_accuracy, test_accuracy, xlabel = 'iterations'):
    """Plot train and test MSE against the number of training iterations.

    Params
    ======
    iter_array : (list)
        x values, e.g. the epoch counts that were trained.
    train_accuracy : (list)
        Training MSE for each entry of iter_array.
    test_accuracy : (list)
        Test MSE for each entry of iter_array.
    xlabel : (str)
        Label of the x axis.
    """
    # Imported here because this script never imports matplotlib at module
    # level, so the original body failed with NameError on `plt`.
    import matplotlib.pyplot as plt

    plt.plot(iter_array, train_accuracy, label='Train mse', linewidth=5)
    plt.plot(iter_array, test_accuracy, label='Test mse', linewidth=5)

    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.xlabel(xlabel, fontsize=30)
    plt.ylabel('MSE', fontsize=30)
    plt.legend(loc='best', fontsize=20)
74 | 
plot_learning_curve(epochs, train_mse, test_mse)

# train on the full dataset and make predictions
full_trainset = data.build_full_trainset()
algo = SVD(n_factors=40, lr_all=0.001, verbose=True, n_epochs=100)
algo.fit(full_trainset)

# Predict my rating for every book id present in the ratings, then list the
# names of the ten books with the highest estimate.
all_book_id = full_rating.book_id.unique()
top_n = [algo.predict(uid=str(53425), iid=book_id) for book_id in all_book_id]
top_n.sort(key=lambda pred: pred.est, reverse=True)
# the original line was missing its closing parenthesis (SyntaxError)
print([id_to_name[pred.iid] for pred in top_n[:10]])
89 | 


--------------------------------------------------------------------------------
/Python/CollaborativeFiltering/TrainingModule.py:
--------------------------------------------------------------------------------
  1 | %matplotlib inline
  2 | import matplotlib.pyplot as plt
  3 | import seaborn as sns
  4 | sns.set()
  5 | import numpy as np
  6 | import pandas as pd
  7 | import io
  8 | from collections import defaultdict
  9 | from sklearn.metrics import mean_squared_error
 10 | from numpy.linalg import solve
 11 | np.random.seed(0)
 12 | 
 13 | # input is a dataframe with 3 columns
 14 | # user_id, item_id, rating
 15 | def create_rating_matrix(df):
 16 |     n_users = df.user_id.unique().shape[0]
 17 |     n_items = df.item_id.unique().shape[0]
 18 |     ratings = np.zeros((n_users, n_items))
 19 |     for row in df.itertuples():
 20 |         # row[1] - 1 is the user id readjusted to start by index 0
 21 |         # row[2] - 1 is the item id readjusted to start by index 0
 22 |         ratings[row[1]-1, row[2]-1] = row[3]
 23 |     ratings
 24 | 
 25 | 
 26 | # calculate sparsity of rating matrix
 27 | def calculate_sparsity(rating_matrix)
 28 |     sparsity = float(len(rating_matrixnonzero()[0])) * 100 / (rating_matrix.shape[0] * rating_matrix.shape[1])
 29 |     return sparsity
 30 | 
 31 | 
 32 | # function to split train, test data
 33 | def train_test_split(ratings, pct):
 34 |     test = np.zeros(ratings.shape)
 35 |     train = ratings.copy()
 36 |     for user in range(ratings.shape[0]):
 37 |         user_rating_idx = ratings[user, :].nonzero()[0]
 38 |         test_ratings = np.random.choice(user_rating_idx,
 39 |                                      size=int(len(user_rating_idx)*pct),
 40 |                                      replace=False)
 41 |         train[user, test_ratings] = 0.
 42 |         test[user, test_ratings] = ratings[user, test_ratings]
 43 |         
 44 |     # Test and training are truly disjoint
 45 |     assert(np.all((train * test) == 0)) 
 46 |     return train, test
 47 | 
 48 | 
 49 | # function to calculate MSE error
 50 | def get_mse(pred, actual):
 51 |     pred = pred[actual.nonzero()].flatten()
 52 |     actual = actual[actual.nonzero()].flatten()
 53 |     return mean_squared_error(pred, actual)
 54 | 
 55 | 
 56 | class AlternatingLeastSquareMF():
 57 |     def __init__(self, 
 58 |                  ratings, 
 59 |                  n_factors=40, 
 60 |                  item_reg=0.0, 
 61 |                  user_reg=0.0
 62 |                  ):
 63 |         """
 64 |         Train a matrix factorization model to predict empty 
 65 |         entries in a matrix.
 66 |         
 67 |         Params
 68 |         ======
 69 |         ratings : (ndarray)
 70 |             User x Item matrix with corresponding ratings
 71 |         
 72 |         n_factors : (int)
 73 |             Number of latent factors (k) to use in model
 74 |         
 75 |         item_reg : (float)
 76 |             Regularization term for item latent factors
 77 |         
 78 |         user_reg : (float)
 79 |             Regularization term for user latent factors
 80 |         """
 81 |         
 82 |         self.ratings = ratings
 83 |         self.n_users, self.n_items = ratings.shape
 84 |         self.n_factors = n_factors
 85 |         self.item_reg = item_reg
 86 |         self.user_reg = user_reg
 87 | 
 88 |     def alternating_step(self,
 89 |                          latent_vectors,
 90 |                          fixed_vecs,
 91 |                          ratings,
 92 |                          _lambda,
 93 |                          type='user'):
 94 |         """
 95 |         One of the two ALS steps. Solve for the latent vectors
 96 |         specified by type.
 97 |         """
 98 |         if type == 'user':
 99 |             # Precompute
100 |             YTY = fixed_vecs.T.dot(fixed_vecs)
101 |             lambdaI = np.eye(YTY.shape[0]) * _lambda
102 | 
103 |             for u in range(latent_vectors.shape[0]):
104 |                 latent_vectors[u, :] = solve((YTY + lambdaI), 
105 |                                              ratings[u, :].dot(fixed_vecs))
106 |         elif type == 'item':
107 |             # Precompute
108 |             XTX = fixed_vecs.T.dot(fixed_vecs)
109 |             lambdaI = np.eye(XTX.shape[0]) * _lambda
110 |             
111 |             for i in range(latent_vectors.shape[0]):
112 |                 latent_vectors[i, :] = solve((XTX + lambdaI), 
113 |                                              ratings[:, i].T.dot(fixed_vecs))
114 |         return latent_vectors
115 | 
116 |         
117 | 
118 |     def train(self, n_iter=10):
119 |         """ Train model for n_iter iterations from scratch."""
120 |         # initialize latent vectors
121 |         self.user_vecs = np.random.random((self.n_users, self.n_factors))
122 |         self.item_vecs = np.random.random((self.n_items, self.n_factors))
123 |         
124 |         ctr = 1
125 |         while ctr <= n_iter:
126 |             self.user_vecs = self.alternating_step(self.user_vecs, 
127 |                                                    self.item_vecs, 
128 |                                                    self.ratings, 
129 |                                                    self.user_reg, 
130 |                                                    type='user')
131 |             self.item_vecs = self.alternating_step(self.item_vecs, 
132 |                                                    self.user_vecs, 
133 |                                                    self.ratings, 
134 |                                                    self.item_reg, 
135 |                                                    type='item')
136 |             ctr += 1
137 |     
138 |     def predict_all(self):
139 |         """ Predict ratings for every user and item. """
140 |         predictions = np.zeros((self.user_vecs.shape[0], 
141 |                                 self.item_vecs.shape[0]))
142 |         for u in range(self.user_vecs.shape[0]):
143 |             for i in range(self.item_vecs.shape[0]):
144 |                 predictions[u, i] = self.predict(u, i)
145 |                 
146 |         return predictions
147 | 
148 |     def predict(self, u, i):
149 |         """ Single user and item prediction. """
150 |         return self.user_vecs[u, :].dot(self.item_vecs[i, :].T)
151 |     
152 |     def calculate_learning_curve(self, iter_array, test):
153 |         """
154 |         Keep track of MSE as a function of training iterations.
155 |         
156 |         Params
157 |         ======
158 |         iter_array : (list)
159 |             List of numbers of iterations to train for each step of 
160 |             the learning curve. e.g. [1, 5, 10, 20]
161 |         test : (2D ndarray)
162 |             Testing dataset (assumed to be user x item).
163 |         
164 |         The function creates two new class attributes:
165 |         
166 |         train_mse : (list)
167 |             Training data MSE values for each value of iter_array
168 |         test_mse : (list)
169 |             Test data MSE values for each value of iter_array
170 |         """
171 |         iter_array.sort()
172 |         self.train_mse =[]
173 |         self.test_mse = []
174 |         for (i, n_iter) in enumerate(iter_array):
175 |             self.train(n_iter)
176 |             predictions = self.predict_all()
177 | 
178 |             self.train_mse += [get_mse(predictions, self.ratings)]
179 |             self.test_mse += [get_mse(predictions, test)]
180 |             print('Train mse: ' + str(self.train_mse[-1]))
181 |             print('Test mse: ' + str(self.test_mse[-1]))
182 | 
class SGDMF():
    """Biased matrix factorization fitted with stochastic gradient descent."""

    def __init__(self,
                 ratings,
                 n_factors=40,
                 item_fact_reg=0.0,
                 user_fact_reg=0.0,
                 item_bias_reg=0.0,
                 user_bias_reg=0.0,
                 verbose=False
                ):
        """
        Train an SGD matrix factorization model to predict empty
        entries in a matrix.

        Params
        ======
        ratings : (ndarray)
            User x Item matrix; non-zero entries are the training samples.
        n_factors : (int)
            Number of latent factors.
        item_fact_reg, user_fact_reg : (float)
            Regularization of the item / user latent factors.
        item_bias_reg, user_bias_reg : (float)
            Regularization of the item / user biases.
        verbose : (bool)
            When True, train() prints the number of each iteration.
        """

        self.ratings = ratings
        self.n_users, self.n_items = ratings.shape
        self.n_factors = n_factors
        self.item_fact_reg = item_fact_reg
        self.user_fact_reg = user_fact_reg
        self.item_bias_reg = item_bias_reg
        self.user_bias_reg = user_bias_reg
        # the original accepted `verbose` but silently dropped it
        self.verbose = verbose
        # coordinates of every non-zero rating
        self.sample_row, self.sample_col = self.ratings.nonzero()
        self.n_samples = len(self.sample_row)

    def sgd(self):
        """Run one pass over the samples in the order of training_indices."""
        lr = self.learning_rate
        for idx in self.training_indices:
            u = self.sample_row[idx]
            i = self.sample_col[idx]
            prediction = self.predict(u, i)
            e = (self.ratings[u, i] - prediction)  # error

            # Update biases
            self.user_bias[u] += lr * (e - self.user_bias_reg * self.user_bias[u])
            self.item_bias[i] += lr * (e - self.item_bias_reg * self.item_bias[i])

            # Update latent factors; the item update reads the user vector
            # after it has just been updated
            self.user_vecs[u, :] += lr * (e * self.item_vecs[i, :] -
                                          self.user_fact_reg * self.user_vecs[u, :])
            self.item_vecs[i, :] += lr * (e * self.user_vecs[u, :] -
                                          self.item_fact_reg * self.item_vecs[i, :])

    def train(self, n_iter=10, learning_rate=0.1):
        """ Train model for n_iter iterations from scratch."""
        # initialize latent vectors
        self.user_vecs = np.random.random(size=(self.n_users, self.n_factors))
        self.item_vecs = np.random.random(size=(self.n_items, self.n_factors))

        self.learning_rate = learning_rate
        self.user_bias = np.zeros(self.n_users)
        self.item_bias = np.zeros(self.n_items)
        # mean of the non-zero ratings
        self.global_bias = np.mean(self.ratings[np.where(self.ratings != 0)])

        ctr = 1
        while ctr <= n_iter:
            if self.verbose:
                print('\tcurrent iteration: {}'.format(ctr))
            # visit the samples in a new random order on every pass
            self.training_indices = np.arange(self.n_samples)
            np.random.shuffle(self.training_indices)
            self.sgd()
            ctr += 1

    def predict(self, u, i):
        """ Single user and item prediction: biases plus factor dot product."""
        prediction = self.global_bias + self.user_bias[u] + self.item_bias[i]
        prediction += self.user_vecs[u, :].dot(self.item_vecs[i, :].T)
        return prediction

    def predict_all(self):
        """ Predict ratings for every user and item.

        Returns an (n_users, n_items) array; entry [u, i] is the value
        predict(u, i) computes, obtained with broadcasting and one matrix
        product.
        """
        biases = (self.global_bias
                  + self.user_bias[:, np.newaxis]
                  + self.item_bias[np.newaxis, :])
        return biases + self.user_vecs.dot(self.item_vecs.T)

    def calculate_learning_curve(self, iter_array, test, learning_rate=0.1):
        """
        Keep track of MSE as a function of training iterations.

        iter_array is sorted in place; for each of its values the model is
        retrained from scratch and the train/test MSE are appended to the
        train_mse and test_mse attributes.
        """
        iter_array.sort()
        self.train_mse = []
        self.test_mse = []
        for n_iter in iter_array:
            self.train(n_iter, learning_rate)

            predictions = self.predict_all()

            self.train_mse += [get_mse(predictions, self.ratings)]
            self.test_mse += [get_mse(predictions, test)]
            print('Train mse: ' + str(self.train_mse[-1]))
            print('Test mse: ' + str(self.test_mse[-1]))
275 |             
276 | 
277 |             
def plot_learning_curve(iter_array, model):
    """Draw the train and test MSE stored on `model` (its train_mse and
    test_mse attributes) against the iteration counts in `iter_array`."""
    curves = (('Training', model.train_mse), ('Test', model.test_mse))
    for curve_label, mse_values in curves:
        plt.plot(iter_array, mse_values, label=curve_label, linewidth=5)

    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.xlabel('iterations', fontsize=30)
    plt.ylabel('MSE', fontsize=30)
    plt.legend(loc='best', fontsize=20)
290 | 
291 |     
292 |     
293 |     
294 | 


--------------------------------------------------------------------------------
/Python/fireworks.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | FIREWORKS SIMULATION WITH TKINTER
  3 | 
  4 | *self-contained code
  5 | *to run: simply type python fireworks.py in your console
  6 | *compatible with both Python 2 and Python 3
  7 | *Dependencies: tkinter, Pillow (only for background image)
  8 | *The design is based on high school physics, with some small twists only for aesthetics purpose
  9 | 
 10 | '''
 11 | import tkinter as tk
 12 | #from tkinter import messagebox
 13 | #from tkinter import PhotoImage
 14 | from PIL import Image, ImageTk
 15 | from time import time, sleep
 16 | from random import choice, uniform, randint
 17 | from math import sin, cos, radians
 18 | 
 19 | # gravity, act as our constant g, you can experiment by changing it
 20 | GRAVITY = 0.05
 21 | # list of color, can choose randomly or use as a queue (FIFO)
 22 | colors = ['red', 'blue', 'yellow', 'white', 'green', 'orange', 'purple', 'seagreen','indigo', 'cornflowerblue']
 23 | 
 24 | '''
 25 | Generic class for particles
 26 | 
 27 | particles are emitted almost randomly on the sky, forming a round of circle (a star) before falling and getting removed
 28 | from canvas
 29 | 
 30 | Attributes:
 31 |     - id: identifier of a particular particle in a star
 32 |     - x, y: x,y-coordinate of a star (point of explosion)
 33 |     - vx, vy: speed of particle in x, y coordinate
 34 |     - total: total number of particle in a star
 35 |     - age: how long has the particle last on canvas
 36 |     - color: self-explantory
 37 |     - cv: canvas
 38 |     - lifespan: how long a particle will last on canvas
 39 | 
 40 | '''
 41 | class part:
 42 |     def __init__(self, cv, idx, total, explosion_speed, x=0., y=0., vx = 0., vy = 0., size=2., color = 'red', lifespan = 2, **kwargs):
 43 |         self.id = idx
 44 |         self.x = x
 45 |         self.y = y
 46 |         self.initial_speed = explosion_speed
 47 |         self.vx = vx
 48 |         self.vy = vy
 49 |         self.total = total
 50 |         self.age = 0
 51 |         self.color = color
 52 |         self.cv = cv
 53 |         self.cid = self.cv.create_oval(
 54 |             x - size, y - size, x + size,
 55 |             y + size, fill=self.color)
 56 |         self.lifespan = lifespan
 57 | 
 58 |     def update(self, dt):
 59 |         self.age += dt
 60 | 
 61 |         # particle expansions
 62 |         if self.alive() and self.expand():
 63 |             move_x = cos(radians(self.id*360/self.total))*self.initial_speed
 64 |             move_y = sin(radians(self.id*360/self.total))*self.initial_speed
 65 |             self.cv.move(self.cid, move_x, move_y)
 66 |             self.vx = move_x/(float(dt)*1000)
 67 | 
 68 |         # falling down in projectile motion
 69 |         elif self.alive():
 70 |             move_x = cos(radians(self.id*360/self.total))
 71 |             # we technically don't need to update x, y because move will do the job
 72 |             self.cv.move(self.cid, self.vx + move_x, self.vy+GRAVITY*dt)
 73 |             self.vy += GRAVITY*dt
 74 | 
 75 |         # remove article if it is over the lifespan
 76 |         elif self.cid is not None:
 77 |             cv.delete(self.cid)
 78 |             self.cid = None
 79 | 
 80 |     # define time frame for expansion
 81 |     def expand (self):
 82 |         return self.age <= 1.2
 83 | 
 84 |     # check if particle is still alive in lifespan
 85 |     def alive(self):
 86 |         return self.age <= self.lifespan
 87 | 
 88 | '''
 89 | Firework simulation loop:
 90 | Recursively call to repeatedly emit new fireworks on canvas
 91 | 
 92 | a list of list (list of stars, each of which is a list of particles)
 93 | is created and drawn on canvas at every call, 
 94 | via update protocol inside each 'part' object 
 95 | '''
 96 | def simulate(cv):
 97 |     t = time()
 98 |     explode_points = []
 99 |     wait_time = randint(10,100)
100 |     numb_explode = randint(6,10)
101 |     # create list of list of all particles in all simultaneous explosion
102 |     for point in range(numb_explode):
103 |         objects = []
104 |         x_cordi = randint(50,550)
105 |         y_cordi = randint(50, 150)
106 |         speed = uniform (0.5, 1.5)          
107 |         size = uniform (0.5,3)
108 |         color = choice(colors)
109 |         explosion_speed = uniform(0.2, 1)
110 |         total_particles = randint(10,50)
111 |         for i in range(1,total_particles):
112 |             r = part(cv, idx = i, total = total_particles, explosion_speed = explosion_speed, x = x_cordi, y = y_cordi, 
113 |                 vx = speed, vy = speed, color=color, size = size, lifespan = uniform(0.6,1.75))
114 |             objects.append(r)
115 |         explode_points.append(objects)
116 | 
117 |     total_time = .0
118 |     # keeps undate within a timeframe of 1.8 second
119 |     while total_time < 1.8:
120 |         sleep(0.01)
121 |         tnew = time()
122 |         t, dt = tnew, tnew - t
123 |         for point in explode_points:
124 |             for item in point:
125 |                 item.update(dt)
126 |         cv.update()
127 |         total_time += dt
128 |     # recursive call to continue adding new explosion on canvas
129 |     root.after(wait_time, simulate, cv)
130 | 
def close(*ignore):
    """Stops simulation loop and closes the window."""
    # `root` is only read here, so no `global` declaration is required
    root.quit()
135 |     
if __name__ == '__main__':
    root = tk.Tk()
    cv = tk.Canvas(root, width=600, height=600)

    # use a nice background image, anchored at the top-left corner
    image = Image.open("image.jpg")
    photo = ImageTk.PhotoImage(image)
    cv.create_image(0, 0, anchor='nw', image=photo)
    cv.pack()

    # closing the window goes through close()
    root.protocol("WM_DELETE_WINDOW", close)

    # start the first round of fireworks 100 ms after the event loop starts
    root.after(100, simulate, cv)
    root.mainloop()
150 | 


--------------------------------------------------------------------------------
/Python/flights_networkx.py:
--------------------------------------------------------------------------------
  1 | # import libaries
  2 | import pandas as pd
  3 | import numpy as np
  4 | import networkx as nx
  5 | import matplotlib.pyplot as plt
  6 | from mpl_toolkits.basemap import Basemap as Basemap
  7 | import matplotlib.lines as mlines
  8 | 
  9 | def main():
 10 | 	# download airport info data
 11 | 	airport_col = ['ID', 'Name', 'City', 'Country','IATA', 'ICAO', 'Lat', 'Long', 'Alt', 
 12 | 	               'Timezone', 'DST', 'Tz database time zone', 'type', 'source']
 13 | 	airport_df = pd.read_csv("https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat",
 14 | 	                        names = airport_col, index_col = 0)
 15 | 
 16 | 	# download flight routes data
 17 | 	route_cols = ['Airline', 'Airline ID', 'Source Airport', 'Source Airport ID',
 18 | 	              'Dest Airport', 'Dest Airport ID', 'Codeshare', 'Stops', 'equipment']
 19 | 	routes_df = pd.read_csv("https://raw.githubusercontent.com/jpatokal/openflights/master/data/routes.dat",
 20 | 	                        names = route_cols)
 21 | 	#clean up data, change 'object' type to numeric and drops NaNs
 22 | 	routes_df['Source Airport ID'] = pd.to_numeric(routes_df['Source Airport ID'].astype(str), 'coerce')
 23 | 	routes_df['Dest Airport ID'] = pd.to_numeric(routes_df['Dest Airport ID'].astype(str), 'coerce')
 24 | 	routes_df = routes_df.dropna(subset=["Source Airport ID", "Dest Airport ID"]) 
 25 | 
 26 | 
 27 | 	simple_visualization(airport_df, routes_df)
 28 | 	advanced_visualization(airport_df, routes_df)
 29 | 
 30 | ##### Part 1: simple network visualization, Alaska and other non-mainland territories included ####
 31 | ###################################################################################################
 32 | # extract country and then extra columns
 33 | def simple_visualization (airport_df, routes_df):
 34 | 	if (airport_df is None) or (routes_df is None):
 35 | 		print "Data cannot be retrieved and read"
 36 | 	else:
 37 | 		airport_us = airport_df[(airport_df.Country == "United States")][['Name','Lat', 'Long', 'IATA', 'ICAO']]
 38 | 		us_airport_ix = airport_us.index.values
 39 | 		routes_us = routes_df[(routes_df['Source Airport ID'].isin(us_airport_ix)) &
 40 | 		                     (routes_df['Dest Airport ID'].isin(us_airport_ix))] #extract routes that flyies from AND to USA
 41 | 		routes_us =  pd.DataFrame(routes_us.groupby(['Source Airport', 'Dest Airport']).size().reset_index(name='counts'))
 42 | 		# to find number of flights in and out of an airport
 43 | 		# it is similar to find number of rows in which each airport occur in either one of the 2 columns
 44 | 		counts = routes_us['Source Airport'].append(routes_us.loc[routes_us['Source Airport'] != routes_us['Dest Airport'], 'Dest Airport']).value_counts()
 45 | 		# create a data frame of position based on names in count
 46 | 		counts = pd.DataFrame({'IATA': counts.index, 'total_flight': counts})
 47 | 		pos_data = counts.merge(airport_us, on = 'IATA')
 48 | 
 49 | 		# Create graph
 50 | 		graph = nx.from_pandas_edgelist(routes_us, source = 'Source Airport', target = 'Dest Airport',
 51 | 		                        edge_attr = 'counts',create_using = nx.DiGraph())
 52 | 
 53 | 		# default graph using Networkx inbuilt graph tools
 54 | 		plt.figure(figsize = (10,9))
 55 | 		nx.draw_networkx(graph)
 56 | 		plt.savefig("./images/networkx_basemap/map_0.png", format = "png", dpi = 300)
 57 | 		plt.show()
 58 | 
 59 | 		# Set up base map
 60 | 		plt.figure(figsize=(15,20))
 61 | 		m = Basemap(
 62 | 		        projection='merc',
 63 | 		        llcrnrlon=-180,
 64 | 		        llcrnrlat=10,
 65 | 		        urcrnrlon=-50,
 66 | 		        urcrnrlat=70,
 67 | 		        lat_ts=0,
 68 | 		        resolution='l',
 69 | 		        suppress_ticks=True)
 70 | 
 71 | 		# import long lat as m attribute
 72 | 		mx, my = m(pos_data['Long'].values, pos_data['Lat'].values)
 73 | 		pos = {}
 74 | 		for count, elem in enumerate (pos_data['IATA']):
 75 | 		    pos[elem] = (mx[count], my[count])
 76 | 
 77 | 		# draw nodes and edges and over aly on basemap
 78 | 		nx.draw_networkx_nodes(G = graph, pos = pos, node_list = graph.nodes(), node_color = 'r', alpha = 0.8,
 79 | 		                       node_size = [counts['total_flight'][s]*3 for s in graph.nodes()])
 80 | 		nx.draw_networkx_edges(G = graph, pos = pos, edge_color='g', width = routes_us['counts']*0.75, 
 81 | 		                       alpha=0.2, arrows = False)
 82 | 
 83 | 		m.drawcountries(linewidth = 3)
 84 | 		m.drawstates(linewidth = 0.2)
 85 | 		m.drawcoastlines(linewidth=3)
 86 | 		plt.tight_layout()
 87 | 		plt.savefig("./images/networkx_basemap/map_2.png", format = "png", dpi = 300)
 88 | 		plt.show()
 89 | 		print ("successful visualization")
 90 | 		return 0
 91 | 
 92 | ##### Part 2: more on visualization, only mainland territories with more features ####
 93 | ######################################################################################
 94 | # extract country and then extra columns
 95 | def advanced_visualization (airport_df, routes_df):
 96 | 	if (airport_df is None) or (routes_df is None):
 97 | 		print ("Data cannot be retrieved and read")
 98 | 	else:
 99 | 		airport_us = airport_df[(airport_df.Country == "United States") & (airport_df.Lat > 25) 
100 | 								& (airport_df.Lat < 50) & (airport_df.Long > -130) & (airport_df.Long < -60)]
101 | 		us_airport_ix = airport_us.index.values
102 | 		routes_us = routes_df[(routes_df['Source Airport ID'].isin(us_airport_ix)) &
103 | 		                     (routes_df['Dest Airport ID'].isin(us_airport_ix))] #extract routes that flyies from AND to USA
104 | 		routes_us =  pd.DataFrame(routes_us.groupby(['Source Airport', 'Dest Airport']).size().reset_index(name='counts'))
105 | 		# to find number of flights in and out of an airport
106 | 		# it is similar to find number of rows in which each airport occur in either one of the 2 columns
107 | 		counts = routes_us['Source Airport'].append(routes_us.loc[routes_us['Source Airport'] != routes_us['Dest Airport'], 'Dest Airport']).value_counts()
108 | 		# create a data frame of position based on names in count
109 | 		counts = pd.DataFrame({'IATA': counts.index, 'total_flight': counts})
110 | 		pos_data = counts.merge(airport_us, on = 'IATA')
111 | 
112 | 		# Create graph
113 | 		graph = nx.from_pandas_edgelist(routes_us, source = 'Source Airport', target = 'Dest Airport',
114 | 		                        edge_attr = 'counts',create_using = nx.DiGraph())
115 | 
116 | 		# Set up base map
117 | 		plt.figure(figsize=(15,20))
118 | 		m = Basemap(
119 | 		        projection='merc',
120 | 		        llcrnrlon=-180,
121 | 		        llcrnrlat=10,
122 | 		        urcrnrlon=-50,
123 | 		        urcrnrlat=70,
124 | 		        lat_ts=0,
125 | 		        resolution='l',
126 | 		        suppress_ticks=True)
127 | 
128 | 		# import long lat as m attribute
129 | 		mx, my = m(pos_data['Long'].values, pos_data['Lat'].values)
130 | 		pos = {}
131 | 		for count, elem in enumerate (pos_data['IATA']):
132 | 		    pos[elem] = (mx[count], my[count])
133 | 
134 | 		# draw nodes and edges and overly on basemap
135 | 		nx.draw_networkx_nodes(G = graph, pos = pos, nodelist = [x for x in graph.nodes() if counts['total_flight'][x] >= 100],
136 | 		                       node_color = 'r', alpha = 0.8,
137 | 		                       node_size = [counts['total_flight'][x]*4  for x in graph.nodes() if counts['total_flight'][x] >= 100])
138 | 
139 | 		nx.draw_networkx_labels(G = graph, pos = pos, font_size=10,
140 | 		                        labels = {x:x for x in graph.nodes() if counts['total_flight'][x] >= 100})
141 | 
142 | 		nx.draw_networkx_nodes(G = graph, pos = pos, nodelist = [x for x in graph.nodes() if counts['total_flight'][x] < 100],
143 | 		                       node_color = 'b', alpha = 0.6,
144 | 		                       node_size = [counts['total_flight'][x]*4  for x in graph.nodes() if counts['total_flight'][x] < 100])
145 | 
146 | 		nx.draw_networkx_edges(G = graph, pos = pos, edge_color = 'g', width = routes_us['counts']*0.75, 
147 | 		                       alpha=0.06, arrows = False)
148 | 
149 | 		m.drawcountries(linewidth = 3)
150 | 		m.drawstates(linewidth = 0.2)
151 | 		m.drawcoastlines(linewidth=1)
152 | 		m.fillcontinents(alpha = 0.3)
153 | 		line1 = mlines.Line2D(range(1), range(1), color="white", marker='o', markerfacecolor="red")
154 | 		line2 = mlines.Line2D(range(1), range(1), color="white", marker='o',markerfacecolor="blue")
155 | 		line3 = mlines.Line2D(range(1), range(1), color="green", marker='',markerfacecolor="green")
156 | 		plt.legend((line1, line2, line3), ('Large Airport > 100 routes', 'Smaller airports', 'routes'),
157 | 		           loc=4, fontsize = 'xx-large')
158 | 		plt.title("Network graph of flight routes in the USA", fontsize = 30)
159 | 		#m.bluemarble()
160 | 		plt.tight_layout()
161 | 		plt.savefig("./images/networkx_basemap/map_3.png", format = "png", dpi = 300)
162 | 		plt.show()
163 | 		print ("successful visualization")
164 | 		return 0
165 | 
166 | if __name__ == "__main__":
167 |     main()  
168 | 


--------------------------------------------------------------------------------
/Python/image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/image.jpg


--------------------------------------------------------------------------------
/Python/images/networkx_basemap/map_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/map_0.png


--------------------------------------------------------------------------------
/Python/images/networkx_basemap/map_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/map_1.png


--------------------------------------------------------------------------------
/Python/images/networkx_basemap/map_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/map_2.png


--------------------------------------------------------------------------------
/Python/images/networkx_basemap/map_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/map_3.png


--------------------------------------------------------------------------------
/Python/images/networkx_basemap/table_1.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/table_1.PNG


--------------------------------------------------------------------------------
/Python/images/networkx_basemap/table_2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/table_2.PNG


--------------------------------------------------------------------------------
/Python/images/networkx_basemap/table_3.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/table_3.PNG


--------------------------------------------------------------------------------
/Python/images/networkx_basemap/table_5.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/table_5.PNG


--------------------------------------------------------------------------------
/Python/lincoln_estimate.py:
--------------------------------------------------------------------------------
import random

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
 2 | 
 3 | def like_insta_post(p):
 4 |     "Find an error with probability p"
 5 |     return 1 if random.random() < p else 0
 6 | 
 7 | def simulate(true_audience, p1, p2, reps=10000):
 8 |     """Simulate Lincoln's method for estimating errors
 9 |     given the true number of errors, each person's probability
10 |     of finding an error, and the number of simulations to run."""
11 |     naive_estimates = []
12 |     lincoln_estimates = []
13 |     
14 |     for rep in range(reps):
15 |         like_post_1 = np.array([like_insta_post(p1) for _ in range(true_audience)])
16 |         like_post_2 = np.array([like_insta_post(p2) for _ in range(true_audience)])
17 |         like_post1_count = sum(like_post_1)
18 |         like_post2_count = sum(like_post_2)
19 |         overlap = np.sum(like_post_1 & like_post_2)
20 |         
21 |         naive_estimates.append(like_post1_count + like_post2_count - overlap)
22 |         if overlap > 0:
23 |             lincoln_estimates.append(like_post1_count*like_post2_count / float(overlap))
24 |     
25 |     return naive_estimates, lincoln_estimates
26 | 
def calc_stats(arr):
    """Summarise a sample of estimates.

    Returns a 4-tuple: (mean, sample standard deviation with ddof=1,
    mean - 1.96 * std, mean + 1.96 * std).
    """
    center = np.mean(arr)
    spread = np.std(arr, ddof=1)
    margin = 1.96 * spread
    return (center, spread, center - margin, center + margin)
34 | 
# Each entry is a (p1, p2) pair of like-probabilities for the two posts.
sims = [[0.3, 0.5], [0.6, 0.4], [0.7, 0.8], [0.9, 0.9]]
# Parallel lists: one summary data frame and one subplot title per pair.
res_arr = []
title_arr = []

for p in sims:
    naive_estimates, lincoln_estimates = simulate(100, p[0], p[1], reps=100000)
    # Only the mean and standard deviation of each method are plotted.
    naive_mean, naive_std = calc_stats(naive_estimates)[:2]
    lincoln_mean, lincoln_std = calc_stats(lincoln_estimates)[:2]

    summary = {
        "method": ["naive", "Lincoln"],
        "estimate": [naive_mean, lincoln_mean],
        "std": [naive_std, lincoln_std],
    }
    res_arr.append(pd.DataFrame(summary))
    title_arr.append(f" p1={p[0]}\n p2={p[1]}")
55 | 
colors = ['blue', 'orange']
fig, axes = plt.subplots(1, 4, figsize=(18, 6), sharey=True)
ax1, ax2, ax3, ax4 = axes

# One bar chart (mean estimate with std error bars) per probability pair.
for dat_df, ax, title in zip(res_arr, axes, title_arr):
    bar_ax = dat_df.plot(x='method', y='estimate', yerr='std', kind='bar',
                         color=colors, ax=ax, legend=False,
                         xlabel='', ylabel='mean of estimates')
    bar_ax.set_title(title)

for ax in axes:
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    # Only the first panel keeps its left spine; the y axis is shared.
    hidden_sides = ('right', 'top') if ax == ax1 else ('right', 'top', 'left')
    for side in hidden_sides:
        ax.spines[side].set_visible(False)
75 | 


--------------------------------------------------------------------------------
/Python/mbappe.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/mbappe.jpg


--------------------------------------------------------------------------------
/Python/n_dimensionalNormal.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import matplotlib as plt
  3 | 
  4 | def get_3_std_estimates(x):
  5 |   return (x.mean() - 3 * x.std(), x.mean() + 3 * x.std())
  6 | 
  7 | def get_2_std_estimates(x):
  8 |   return (x.mean() - 2 * x.std(), x.mean() + 2 * x.std())
  9 | 
 10 | def get_1_std_estimates(x):
 11 |   return (x.mean() - x.std(), x.mean() + x.std())
 12 | 
 13 | 
 14 | N = 10000
 15 | 
 16 | def get_graph(n, title):
 17 |   """
 18 |   Draw a distribution histogram for a sample of N data from 
 19 |   n-dimensional Normal distribution
 20 |   """
 21 |   
 22 |   sample = np.random.normal(size=(N, n))
 23 |   dist = np.square(np.linalg.norm(sample, axis = 1))
 24 |   lower_bound, upper_bound = get_2_std_estimates(dist)
 25 |   n, bins, patches = plt.hist(dist, bins = 'auto', density = "true")
 26 |   plt.axvline(x = lower_bound, color = 'red')
 27 |   plt.axvline(x = upper_bound, color = 'red')
 28 |   plt.title(title, fontdict = {'fontsize': 20})
 29 |   plt.show()
 30 |   
 31 | get_graph(100, "Distribution of distance from origin for n = 100")
 32 | 
 33 | 
 34 | def get_boundary(n):
 35 |   """
 36 |   For a dimension value n, sample N data points from a n-dimensional
 37 |   Normal distribution and find the 2 standard deviation boundary
 38 |   for the squared Euclidan norms.
 39 |   """
 40 |   
 41 |   sample = np.random.normal(size=(N, n))
 42 |   dist = np.square(np.linalg.norm(sample, axis = 1))
 43 |   lower_bound, upper_bound = get_2_std_estimates(dist)
 44 |   return (lower_bound, upper_bound)
 45 | 
 46 | ### simulation
 47 | n_range = range(1, 5001)
 48 | lower_bounds = []
 49 | upper_bounds = []
 50 | 
 51 | for n in n_range:
 52 |   lower_bound, upper_bound = get_boundary(n)
 53 |   lower_bounds.append(lower_bound/n)
 54 |   upper_bounds.append(upper_bound/n)
 55 | 
 56 | plt.style.use('seaborn-notebook')
 57 | plt.plot(n_range, lower_bounds, label = 'lower_bounds\ndivided by n')
 58 | plt.plot(n_range, upper_bounds, label = 'upper_bounds\ndivided by n')
 59 | #plt.axvline(x=1000, color = 'red', linestyle = '--')
 60 | plt.legend(prop={'size': 13})
 61 | plt.xlim(1, 5000)
 62 | plt.xlabel("dimensions")
 63 | plt.title("Ratio between 2-standard devation boundaries and n as n increases", fontdict = {'fontsize': 16})
 64 | plt.show()
 65 | 
 66 | 
 67 | 
 68 | #### how many points lie in the 10% period or outside
 69 | def get_pct_for_interval(n):
 70 |   sample = np.random.normal(size=(N, n))
 71 |   dist = np.square(np.linalg.norm(sample, axis = 1))
 72 |   
 73 |   lower_interval = np.count_nonzero(dist < n*0.95)
 74 |   middle_interval = np.count_nonzero((dist >= n*0.95) & (dist <= n*1.05))
 75 |   large_interval = np.count_nonzero(dist > n*1.05)
 76 |   
 77 |   return lower_interval/N, middle_interval/N, large_interval/N
 78 |   
 79 | lower_intervals = []
 80 | middle_intervals = []
 81 | large_intervals = []
 82 | 
 83 | for n in n_range:
 84 |   lower_interval, middle_interval, large_interval = get_pct_for_interval(n)
 85 |   lower_intervals.append(lower_interval)
 86 |   middle_intervals.append(middle_interval)
 87 |   large_intervals.append(large_interval)
 88 | 
 89 | plt.stackplot(n_range,
 90 |               lower_intervals,
 91 |               middle_intervals,
 92 |               large_intervals,
 93 |               labels=['d^2 < 0.95n',
 94 |                       '0.95n <= d^2 <= 1.05n',
 95 |                       'd^2 > 1.05n'])
 96 | plt.legend()
 97 | plt.xlabel("dimensions")
 98 | plt.title("Probability that a sample point will be at some distance from the origin", fontdict = {'fontsize': 16})
 99 | plt.ylim(0, 1)
100 | plt.xlim(1, 5000)
101 | 


--------------------------------------------------------------------------------
/Python/optimal_dating.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | import matplotlib as plt
  3 | import seaborn as sns
  4 | 
  5 | #################
  6 | # Top k algorithm
  7 | #################
  8 | 
  9 | 
 10 | def perm_rank(n):
 11 |   """create a ranked order list of n items"""
 12 |   return random.sample(range(1, n+1), n)
 13 | 
 14 | 
 15 | def top_k_selection_algo(array, m, k):
 16 |   """for any list of order, apply top-k algorithm
 17 |   
 18 |   Return whether we succeed (1) or failure (0) to
 19 |   identify top-k value
 20 |   """
 21 |   top_first_m = min(array[:(m-1)])
 22 |   # then for array[n:]
 23 |   # we pick first k values that is greater than max_first_m 
 24 |   inspect_array = np.array(array[m-1:])
 25 |   qualified_cand = inspect_array[inspect_array < top_first_m][:k]
 26 | 
 27 |   if len(qualified_cand) == k and max(qualified_cand) == k:
 28 |       return 1
 29 |   return 0
 30 | 
 31 | 
 32 | def simulation_top_k(n, k, iters):
 33 |   """
 34 |   for any value of k and n
 35 |   simulate all exploration cutoff from 2-> n
 36 |   and return a list of success probability at different cutoff
 37 |   """
 38 |   result = []
 39 |   for m in range(2, n+1):
 40 |       result_m = []
 41 |       for i in range(iters):
 42 |           order = perm_rank(n)
 43 |           success = selection_algo(order, m, k)
 44 |           result_m.append(success)
 45 |       result.append(np.mean(result_m))
 46 |   return result
 47 | 
 48 | 
 49 | result = simulation_top_k(100, 1, iters)
 50 | result_3 = simulation_top_k(100, 3, iters)
 51 | result_5 = simulation_top_k(100, 5, iters)
 52 | result_10 = simulation_top_k(100, 10, iters)
 53 | 
 54 | 
 55 | plt.style.use('fivethirtyeight')
 56 | plt.figure(figsize=(13,6))
 57 | sns.scatterplot(np.arange(2, 101),y=result, label = "k = 1")
 58 | sns.scatterplot(np.arange(2, 101),y=result_3, label = "k = 3")
 59 | sns.scatterplot(np.arange(2, 101),y=result_5, label = "k = 5")
 60 | sns.scatterplot(np.arange(2, 101),y=result_10, label = "k = 10")
 61 | plt.grid(False)
 62 | plt.title("Probability of finding top k partners\n by exploring first r values")
 63 | plt.xlabel("r values")
 64 | plt.ylabel("Probability")
 65 | 
 66 | 
 67 | ##############################
 68 | # Top candidate with p success
 69 | ##############################
 70 | 
 71 | def selection_algo_with_success_rate(array, m, p):
 72 |   top_first_m = min(array[:(m-1)])
 73 |   available_array = np.random.binomial(1, p, len(array))
 74 |   #print(available_array)
 75 |   # then for array[n:]
 76 |   # we pick first k values that is greater than max_first_m 
 77 |   #print("top first m", top_first_m)
 78 |   #print(array[:(m-1)], array[m-1:])
 79 |   inspect_array = array[m-1:]
 80 |   inspect_available = available_array[m-1:]
 81 | 
 82 |   if top_first_m == 1:
 83 |       return 0
 84 |   available_idx = np.where(inspect_available == 1)[0]
 85 |   available_person = np.array(inspect_array)[available_idx]
 86 |   pass_cand =  available_person[available_person < top_first_m]
 87 |   #print(pass_cand)
 88 |   if len(pass_cand) == 0:
 89 |       return 0
 90 |   accept = pass_cand[0]
 91 |   if accept == 1:
 92 |       return 1
 93 |   return 0
 94 |  
 95 | def simulate_with_success_rate(n, p, iters):
 96 |   result = []
 97 |   for m in range(2, n+1):
 98 |       result_m = []
 99 |       for i in range(iters):
100 |           order = perm_rank(n)
101 |           success = selection_algo_with_success_rate(order, m, p)
102 |           result_m.append(success)
103 |       result.append(np.mean(result_m))
104 |   return result
105 | 
# `iters` (simulations per cutoff) is read from module scope.
result_avail_1 = simulate_with_success_rate(100, 1, iters)
# Fixed misspelled call `ssimulate_with_success_rate`, which raised NameError.
result_avail_2 = simulate_with_success_rate(100, 0.25, iters)
result_avail_5 = simulate_with_success_rate(100, 0.5, iters)
result_avail_7 = simulate_with_success_rate(100, 0.75, iters)

# Cutoffs 2..100, matching the order of the simulation results.
cutoffs = list(range(2, 101))

plt.style.use('fivethirtyeight')
plt.figure(figsize=(13,6))
# x is passed by keyword: recent seaborn releases treat the first positional
# argument of scatterplot as `data`.
sns.scatterplot(x=cutoffs, y=result_avail_1, label = "p = 1")
sns.scatterplot(x=cutoffs, y=result_avail_2, label = "p = 0.25")
sns.scatterplot(x=cutoffs, y=result_avail_5, label = "p = 0.5")
sns.scatterplot(x=cutoffs, y=result_avail_7, label = "p = 0.75")
plt.title("Probability of finding top partner at different success rate\n by exploring first r values")
plt.grid(False)
plt.xlabel("r values")
plt.ylabel("Probability")
121 | 


--------------------------------------------------------------------------------
/R/EPL/Agg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/R/EPL/Agg.png


--------------------------------------------------------------------------------
/R/EPL/Last.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/R/EPL/Last.png


--------------------------------------------------------------------------------
/R/EPL/Misc/TeamEvaluate2015.R:
--------------------------------------------------------------------------------
  1 | # Load libraries and read files
  2 | packages <- c("dplyr", "fpc", "cluster", 
  3 |               "factoextra", "dendextend", 
  4 |               "psych", "qgraph")
  5 | lapply(packages, library, character.only = TRUE)
  6 | 
  7 | raw_df <- read.csv("./Team2015season.csv", header=T)
  8 | # scale data 
  9 | 
 10 | scaled_data <- raw_df %>%
 11 |   remove_rownames() %>%
 12 |   column_to_rownames("Team") %>%
 13 |   scale()
 14 | 
 15 | 
#######################################
#  Hierarchical Cluster Analysis
#  Useful tutorial: 
#  https://uc-r.github.io/hc_clustering
#######################################

# Euclidean distances between the rows of scaled_data, clustered with
# hclust's "ward.D" agglomeration method
d_1 <- dist(scaled_data, method="euclidean")
clust_1 <- hclust(d_1, method="ward.D")
# draw the dendrogram and outline the clusters obtained by cutting it into 4
plot(clust_1,
     cex=0.7,
     xlab="",
     ylab="Distance",
     main="Clusterings of 60 European teams")
rect.hclust(clust_1, k = 4, border = 2:5)

# get the membership vector for the 4-cluster cut and preview it next to the data
cuts <- cutree(clust_1,k=4)
scaled_data %>%
  as.data.frame() %>%
  mutate(cluster = cuts) %>%
  head

# Compute distance matrix
res.dist <- dist(scaled_data, method = "euclidean")

# Compute 2 hierarchical clusterings on the same distances:
# complete linkage and "ward.D2"
hc1 <- hclust(res.dist, method = "complete")
hc2 <- hclust(res.dist, method = "ward.D2")

# Create two dendrograms and compare group partition
dend1 <- as.dendrogram (hc1)
dend2 <- as.dendrogram (hc2)

dend_list <- dendlist(dend1, dend2)

# Side-by-side comparison of the two trees; the title reports their
# entanglement rounded to 2 decimals
tanglegram(dend1, dend2,
           lwd = 1,
           edge.lwd = 1,
           lab.cex = 0.5,
           columns_width = c(8, 3, 8),
           highlight_distinct_edges = FALSE, # Turn-off dashed lines
           common_subtrees_color_lines = FALSE, # Turn-off line colors
           common_subtrees_color_branches = TRUE, # Color common branches 
           main = paste("entanglement =", round(entanglement(dend_list), 2))
)
 63 | 
###########################################
#  K-means clustering
#  Useful tutorial: 
#  https://uc-r.github.io/kmeans_clustering
###########################################

# use the 4 centers that the hierarchical clustering suggests
# nstart: attempts multiple initial configurations
# and reports on the best one.
km_results <- kmeans(scaled_data, centers = 4, nstart = 100)
km_results

# fviz_cluster does PCA and plots the data points 
# according to the first two PCs that explain the majority of the variance
fviz_cluster(km_results, data = scaled_data)

# Evaluating clustering
# Best number of clusters using a scree plot (elbow method)
# optimal total within-cluster sum of squares
# (seed fixed so the random k-means starts are reproducible)
set.seed(123)
fviz_nbclust(scaled_data, kmeans, method = "wss")

# Average Silhouette method
# measuring the quality of the clusters
# by how well each object lies within its cluster
# try to maximize average silhouette
fviz_nbclust(scaled_data, kmeans, method = "silhouette")

# GAP statistics method
# can apply to both kmeans and HC
# compares the total intracluster variation
# with their expected values 
# under null reference distribution of the data
# at various values of k (here k = 1..10, with B = 50 reference data sets)
set.seed(123)
gap_stat <- clusGap(scaled_data,
                    FUN = kmeans,
                    nstart = 100,
                    K.max = 10,
                    B = 50)
# Print the result
print(gap_stat, method = "firstmax")
fviz_gap_stat(gap_stat)
107 | 
###################################################################
#  Factor analysis
#  Useful tutorial: 
#  http://www.di.fc.ul.pt/~jpn/r/factoranalysis/factoranalysis.html
#  https://rpubs.com/aaronsc32/factor-analysis-introduction
###################################################################
# determine the number of factors to use with a scree plot (parallel analysis)
parallel <- fa.parallel(scaled_data,
                        fm = 'minres',
                        fa = 'fa')

# factor analysis -- fa1 uses no rotation, fa2 uses an oblique (oblimin) rotation
# Varimax: assume factors completely uncorrelated
# Oblique: correlations in factors

# Method: factanal only supports maximum likelihood
# In fa (psych), we can use "PAF (pa)" or "minres", 
# the latter provides results similar to maximum likelihood 
# without assuming multivariate normal distribution 
# and derives solutions through iterative eigen decomposition like principal axis.

# 2-factor maximum-likelihood fit with regression scores (used by the biplot below)
fa1 <- factanal(scaled_data,
                factors=2, 
                rotation="none",
                scores="regression")

# 3-factor minres fit with oblimin rotation; it is not used further in this section
fa2 <- fa(scaled_data,
          nfactors = 3,
          rotate = "oblimin",
          fm="minres")
fa1

# biplot of the first two factor scores against the loadings
biplot(fa1$scores[,1:2],
       loadings(fa1),
       cex=c(0.7,0.8))
# qgraph
# a different visualization of biplot
qg.fa1 <- qgraph(fa1)

# NOTE:
# - after Exploratory Factor Analysis (EFA), 
# - the next step could be Confirmatory Factor Analysis
# - which is part of a larger subset: Structural Equation Modelling 
# - https://socialsciences.mcmaster.ca/jfox/Misc/sem/SEM-paper.pdf
153 | 
154 | 
# we can get some flexibility from the "psych" package
#
# Run a factor analysis with psych::fa() and report how well it reproduces
# the correlation matrix.
#
# Args:
#   data_set: numeric data frame or matrix, one column per variable.
#   factor:   number of factors, passed to fa() as nfactors.
#   rotate:   rotation passed to fa(); defaults to "varimax".
#   fm:       factoring method passed to fa(); defaults to "pa".
#
# Side effects: prints the fit, the rounded residual correlation matrix, the
# root-mean-squared residual and the share of residuals above 0.05, and draws
# a loading plot.
#
# Returns (invisibly) a list with elements fit, residual, RMSR and prop.
fa_analysis <- function(data_set, factor,
                        rotate = "varimax", fm = "pa"){
  res <- fa(data_set, nfactors = factor,
            rotate = rotate, fm = fm)
  print("Factor Analysis results:")
  print(res)
  
  # get loading plot for the first two factors
  plot(res$loadings, pch=18, col='red')
  abline(h=0)
  abline(v=0)
  # colnames() works for data frames and matrices; names() is NULL for a matrix
  text(res$loadings, labels=colnames(data_set), cex=0.8)
  
  # get reproduced correlation matrix
  repro <- res$loadings%*%t(res$loadings)
  # residual correlation matrix
  residual <- cor(data_set)-repro
  print("Residual correlation matrx")
  # explicit print(): a bare expression inside a function is not displayed
  print(round(residual, 2))
  
  # only the off-diagonal (upper-triangle) residuals enter the summaries
  off_diag <- residual[upper.tri(residual)]
  
  # get root-mean squared residuals
  RMSR <- sqrt(mean(off_diag^2))
  # cat(): print("label", value) would pass the value as print's `digits` argument
  cat("Root-mean squared residuals:", RMSR, "\n")
  
  # get proportion of residuals greater than 0.05 in absolute value
  prop <- mean(abs(off_diag) > 0.05)
  cat("Proportion of residuals greater than 0.05 in absolute value:", prop, "\n")
  
  invisible(list(fit = res, residual = residual, RMSR = RMSR, prop = prop))
}
185 | 
# `soccer` is never defined in this script; scaled_data is the data set
# prepared above.
# NOTE(review): confirm scaled_data is the intended input.

# varimax - paf
fa_analysis(scaled_data, 3)

# quartimax - paf
fa_analysis(scaled_data, 3, "quartimax", "pa")
191 | 


--------------------------------------------------------------------------------
/R/EPL/betting/Portfolio-xkcd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/R/EPL/betting/Portfolio-xkcd.png


--------------------------------------------------------------------------------
/R/EPL/betting/bet_strategy.R:
--------------------------------------------------------------------------------
  1 | library(tidyverse)
  2 | source("prediction.R")
  3 | # in MAC, may have to go to font book to activate xkcd.ttf
  4 | #library(extrafont)
  5 | #font_import(path = ".", pattern="xkcd")
  6 | #fonts()
  7 | #loadfonts()
  8 | 
  9 | betting_house <- c("B365", "BW", "IW", "PS", "WH", "VC")
 10 | 
 11 | # easy computation of max odd or mean probability
 12 | # find max_odd if find_max is TRUE, else return Consensus Probability of event
 13 | row_max_prob <- function(df, row_idx, find_max){
 14 |   predict_outcome = df[row_idx, "predict_outcome"]
 15 |   if (is.na(predict_outcome)) return (NA)
 16 |   col_names <- paste0(betting_house, predict_outcome)
 17 |   val = ifelse(find_max, max(df[row_idx,col_names]), 1/mean(as.numeric(df[row_idx,col_names])))
 18 |   return (val)
 19 | }
 20 | 
 21 | 
 22 | ##### find total return at every round
 23 | # based on prediction, max_odd, Consensus Probability and amount of capital to bet
 24 | # input in Round (Matchweek), method ("poisson", "merson", "random") and Amount of available capital
 25 | betting_round <- function (round, method, capital){
 26 |   total_return = 0
 27 |   
 28 |   round_data <- df_prediction %>%
 29 |     filter(Round == round) %>%
 30 |     mutate(method = method,
 31 |            predict_outcome = ifelse(method == "random", sample(c("H", "D", "A"),n(), replace = TRUE),
 32 |                             ifelse(method == 'poisson', poisson_predict, Merson_predict)))
 33 |   no_matches = dim(round_data)[1]
 34 |   round_data$max_odd <- sapply(1:no_matches, function(x) row_max_prob(round_data, x, TRUE))
 35 |   round_data$prob <- sapply(1:no_matches, function(x) row_max_prob(round_data, x, FALSE))
 36 |   
 37 |   round_data <- round_data %>%
 38 |     mutate (fraction = ((prob*max_odd - (1-prob))/max_odd),
 39 |             f_normalize = fraction/sum(fraction, na.rm = TRUE),
 40 |             bet_amount = f_normalize * capital,
 41 |             payoff = ifelse(FTR == predict_outcome, bet_amount*max_odd, 0),
 42 |             profit = payoff-bet_amount)
 43 |   
 44 |   return (sum(round_data$profit, na.rm = TRUE))
 45 | }
 46 | 
# initiate a table to store the return of every round
# (row 1 is round 0, i.e. the starting point before any bet)
# Paul Merson's bets are left out
return_table <- data.frame(round = 0:30,
                           Poisson = rep(0,31),
                           random_bet = rep(0,31))

# both strategies start with 1000
return_table[1,c("Poisson", "random_bet")] <- rep(1000,2)

# bet 1000/30 in each of the 30 rounds and store the profit of that round
for (i in 1:30){
  Poisson_return <- betting_round(i, "poisson",1000/30)
  random_return  <- betting_round(i, "random", 1000/30)
  #Merson_return  <- betting_round(i, "Merson",1000/30)

  return_table[i+1,"Poisson"]    <- Poisson_return
  return_table[i+1,"random_bet"] <- random_return
  #return_table[i+1,"Merson_bet"] <- Merson_return
}

# we are interested in the change of the portfolio over time:
# the cumulative sum turns per-round profits into a running portfolio value
return_table$Poisson    <- cumsum(return_table$Poisson)
return_table$random_bet <- cumsum(return_table$random_bet)
#return_table$Merson_bet <- cumsum(return_table$Merson_bet)

# reshape to long format (one row per round and method) and draw one line per method
return_table %>% 
  gather("method", "value", -round) %>%
  mutate(method = factor(method, levels = c('Poisson', 'random_bet'),
                         labels = c('Poisson prediction', 'random prediction'))) %>%
  ggplot(aes(x=round, y=value, group=method)) +
  geom_line(aes(color=method)) +
  scale_x_continuous(breaks = seq(0, 30, by = 5)) +
  ggtitle("Portfolio value at the end of every matchweek") +
  theme(axis.line = element_line(size=1, colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        plot.title=element_text(size = 18, family="xkcd"),
        text=element_text(size = 13, family="xkcd"),
        axis.text.x=element_text(colour="black", size = 12),
        axis.text.y=element_text(colour="black", size = 12)) +
  ylab('Portfolio total value in dollars') +
  xlab ('Matchweek')
 89 | 
 90 | 
 91 | #############
 92 | # Extra note
 93 | # If you can invest as much as you want with $1000 buffer
 94 | # meaning that you set out to invest 1000/30 per round
 95 | # but if Kelly criterion asks for more, you can till afford it
 96 | ############
 97 | 
 98 | ##### Method to combine Kelly criterion and odds
 99 | betting_round <- function (round, predict_method, capital){
100 |   total_return = 0
101 |   
102 |   round_data <- df_prediction %>%
103 |     filter(Round == round)
104 |   
105 |   for (i in 1:dim(round_data)[1]){
106 |     predict = ifelse(predict_method == "random", sample(c("H", "D", "A"),1), 
107 |                      round_data[i,predict_method])
108 |     if (is.na(predict)){
109 |       total_return = total_return
110 |     }
111 |     else{
112 |       # once I have the prediction, I find the one with the highest odd
113 |       odds = as.vector (round_data[i, paste0(betting_house, predict)])
114 |       odd = max(odds)
115 |       
116 |       predict_prob =  1/rowMeans(round_data[i,paste0(betting_house, predict)])
117 |       bet_amount = ((predict_prob*odd - (1-predict_prob))/odd)*capital
118 |       
119 |       total_return = ifelse(round_data[i, "FTR"] == predict, 
120 |                             total_return + bet_amount*(odd-1), 
121 |                             total_return - bet_amount)
122 |     }
123 |   }
124 |   return (total_return)
125 | }
126 | 


--------------------------------------------------------------------------------
/R/EPL/betting/clean_data.R:
--------------------------------------------------------------------------------
 1 | ########################################
 2 | # scripts to clean data to usable format
 3 | # source:
 4 | # - fixtures.csv: dedicatedexcel.com
 5 | # - Historical results: https://www.kaggle.com/thefc17/epl-results-19932018
 6 | #########################################
 7 | library (dplyr)
 8 | 
 9 | link_fixture = "https://raw.githubusercontent.com/tuangauss/DataScienceProjects/master/data/all_games.csv"
10 | link_history = "https://raw.githubusercontent.com/tuangauss/DataScienceProjects/master/data/history.csv"
11 | # fixture list; it is renamed into new_season at the end of this script
12 | fixtures <- read.csv(link_fixture, stringsAsFactors = FALSE)
13 | 
14 | # team names as they are spelled in the fixture list
15 | teams <- unique(fixtures$HOME.TEAM)
16 | 
17 | # load the historical results
18 | history <- read.csv(link_history, stringsAsFactors = FALSE)
19 | 
20 | # season labels "2010-11" through "2017-18"
21 | seasons <- sapply(10:17, function(x) paste0(2000+x,'-',x+1))
22 | # NOTE(review): 'E0' is presumably the Premier League division code -- confirm against history.csv
23 | recent.pl <- history %>%
24 |   filter(Season %in% seasons, div == 'E0')
25 | 
26 | # the two data sets come from different sources, so some team names do not match;
27 | teams[!teams %in% recent.pl$HomeTeam]  # fixture names missing from the history (for interactive inspection)
28 | unique(recent.pl$HomeTeam)  # names used by the history data (for interactive inspection)
29 | 
30 | # mapping used to fix the mismatches: c(<name in fixtures>, <name in history>)
31 | pair_fix <- list(c('Manchester United', 'Man United'),
32 |                  c('Newcastle United', 'Newcastle'),
33 |                  c('Huddersfield Town', 'Huddersfield'),
34 |                  c('Wolverhampton Wanderers', 'Wolves'),
35 |                  c('Cardiff City', 'Cardiff'),
36 |                  c('Leicester City', 'Leicester'),
37 |                  c('Tottenham Hotspur', 'Tottenham'),
38 |                  c('West Ham United', 'West Ham'),
39 |                  c('Manchester City', "Man City"),
40 |                  c('Brighton and Hove Albion', 'Brighton'))
41 | 
42 | # Bring recent.pl in line with the fixture spelling: every pair_fix entry is
43 | # c(<name in fixtures>, <name in history>), so the history name [2] becomes [1].
44 | for (name_pair in pair_fix){  # walk the pairs directly; a no-op if the list is empty
45 |   recent.pl <- recent.pl %>%
46 |     mutate(HomeTeam = replace(HomeTeam,
47 |                               HomeTeam == name_pair[2],
48 |                               name_pair[1]),
49 |            AwayTeam = replace(AwayTeam,
50 |                               AwayTeam == name_pair[2],
51 |                               name_pair[1]))
52 | }
53 | 
54 | 
55 | # plain per-team averages: goals scored and conceded per home game ...
56 | ave_home <- recent.pl %>%
57 |   group_by(HomeTeam) %>%
58 |   summarize (ave_scored_h = mean(FTHG), ave_conceded_h = mean(FTAG)) %>%
59 |   filter (HomeTeam %in% teams) %>% rename(Team = HomeTeam)
60 | # ... and per away game
61 | ave_away <- recent.pl %>%
62 |   group_by(AwayTeam) %>%
63 |   summarize (ave_scored_a = mean(FTAG), ave_conceded_a = mean(FTHG)) %>%
64 |   filter (AwayTeam %in% teams)  %>% rename(Team = AwayTeam)
65 | # one row per team with both the home and the away averages
66 | ave <- merge(ave_home, ave_away, by = 'Team')
67 | 
68 | 
69 | # more precise: averages per (home team, away team) pairing, with the number of games
70 | hist_pair.pl <- recent.pl %>%
71 |   group_by(HomeTeam, AwayTeam) %>%
72 |   filter (HomeTeam %in% teams, AwayTeam %in% teams) %>%
73 |   summarize (match = n(), ave_home_scored = mean(FTHG), ave_away_scored = mean(FTAG))
74 | 
75 | # data set for the new season:
76 | # only the team columns are renamed, to match the names used in the history data
77 | new_season <- fixtures %>%
78 |   rename(HomeTeam = HOME.TEAM,
79 |          AwayTeam = AWAY.TEAM)
80 | 
81 | # remove intermediate objects from memory; teams, ave, hist_pair.pl and new_season remain
82 | rm(history, seasons, recent.pl, pair_fix, ave_home, ave_away, fixtures)
83 | 


--------------------------------------------------------------------------------
/R/EPL/betting/prediction.R:
--------------------------------------------------------------------------------
 1 | library (dplyr)
 2 | source ('clean_data.R')
 3 | 
 4 | # Collapse a scoreline into the match outcome (vectorised over both arguments):
 5 | # 'H' = home win, 'D' = draw, 'A' = away win.
 6 | result_calc <- function (h_goal, a_goal){
 7 |   decided <- ifelse(h_goal > a_goal, 'H', 'A')  # outcome whenever the scores differ
 8 |   ifelse(h_goal == a_goal, 'D', decided)
 9 | }
10 | 
11 | # Turn simulated outcome probabilities into a predicted result (vectorised).
12 | # A near tie between home and away, |prob_h - prob_a| < 0.01
13 | # (e.g. 0.451 (H) vs 0.447 (A)), is called a draw; otherwise the most likely
14 | # outcome is returned, with equal maxima resolved in the order H, D, A.
15 | result_calibrate <- function(prob_h, prob_d, prob_a){
16 |   best <- pmax(prob_h, prob_d, prob_a)
17 |   favourite <- ifelse(prob_h == best, "H",
18 |                       ifelse(prob_d == best, "D", "A"))
19 |   ifelse(abs(prob_h - prob_a) < 0.01, "D", favourite)
20 | }
21 | 
22 | 
23 | # Estimate the outcome probabilities of one fixture from nsim Poisson simulations.
24 | #   home, away: team names, spelled as in hist_pair.pl and ave (see clean_data.R)
25 | #   nsim: number of simulated matches
26 | # Returns a named numeric vector c(H, D, A) with the share of each outcome.
27 | get_score <- function (home, away, nsim){
28 |   # head-to-head record of exactly this fixture: home AND away must both match
29 |   # (`|` would keep every game of either side, so the one-row test below would
30 |   # almost never pass and the head-to-head averages would go unused)
31 |   subset <- hist_pair.pl[ which( hist_pair.pl$HomeTeam == home & hist_pair.pl$AwayTeam == away), ]
32 | 
33 |   # the scoring rates do not change between simulations, so pick them once
34 |   if ((nrow(subset) == 1) && (subset$match[1] > 3)){
35 |     # more than 3 recorded meetings: use the fixture-specific averages
36 |     rate_h = subset$ave_home_scored[1]
37 |     rate_a = subset$ave_away_scored[1]
38 |   }
39 |   else{
40 |     # too little history: average the attack of one side with the defence of the other
41 |     stat_h = ave[ave$Team == home,]
42 |     stat_a = ave[ave$Team == away,]
43 |     rate_h = 1/2 * (stat_h$ave_scored_h + stat_a$ave_conceded_a)
44 |     rate_a = 1/2 * (stat_a$ave_scored_a + stat_h$ave_conceded_h)
45 |   }
46 |   # draw all nsim scorelines at once (result_calc is vectorised)
47 |   result = result_calc(rpois(nsim, rate_h), rpois(nsim, rate_a))
48 |   # per-outcome shares: an outcome that never occurred is 0 rather than NA,
49 |   # while unknown rates (team missing from ave) still propagate as NA
50 |   return (c(H = mean(result == 'H'), D = mean(result == 'D'), A = mean(result == 'A')))
51 | }
52 | 
53 | nsim = 10000  # simulated matches per fixture
54 | matches <- mapply(get_score, new_season$HomeTeam, new_season$AwayTeam, nsim, SIMPLIFY = FALSE)
55 | new_season$H <- sapply(matches, function(x) x[1])  # 1st element of get_score(): home-win share
56 | new_season$D <- sapply(matches, function(x) x[2])  # 2nd element: draw share
57 | new_season$A <- sapply(matches, function(x) x[3])  # 3rd element: away-win share
58 | # predicted result per fixture from the three simulated shares
59 | df_prediction <- new_season %>%
60 |   mutate(poisson_predict = result_calibrate(H,D,A))
61 |                        
62 | # The data with Paul Merson's predictions seems to have been lost, so the comparison below is disabled
63 | #df_prediction <- new_season %>%
64 | #  mutate(poisson_predict = result_calibrate(H,D,A),
65 | #         Merson_predict = result_calc(Merson.H, Merson.A))
66 | 


--------------------------------------------------------------------------------
/R/EPL/penalty/Scraping.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 34,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import requests \n",
 10 |     "from bs4 import BeautifulSoup\n",
 11 |     "import time\n",
 12 |     "import random\n",
 13 |     "import pandas as pd\n",
 14 |     "import numpy as np"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "code",
 19 |    "execution_count": 14,
 20 |    "metadata": {},
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "root = \"https://www.statbunker.com/competitions/Penalties?comp_id=\""
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 5,
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "PL = [200,243,279,323,373,415,449,481,515,556,586,614,639]\n",
 33 |     "year_list = [str(i)+\"/\" + str(i+1) for i in range (7,20)]\n",
 34 |     "year_europa = [str(i)+\"/\" + str(i+1) for i in range (9,20)]"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": 2,
 40 |    "metadata": {},
 41 |    "outputs": [],
 42 |    "source": [
 43 |     "Laliga = [212,259,293,336,378,413,461,485,518,564,600,622,648] #start 07/08\n",
 44 |     "Seria = [211,258,292,337,377,414,462,486,517,562,593,623,649]\n",
 45 |     "bundes = [204,250,285,330,374,416,447,483,516,561,591,620,646]\n",
 46 |     "france = [202,251,284,331,375,412,454,484,514,563,594,621,647]\n",
 47 |     "championship = [207,246,280,325,370,420,451,488,524,557,587,615,640]\n",
 48 |     "scottish = [205,249,283,329,369,419,455,491,521,566,590,618,643]\n",
 49 |     "CL = [203,261,295,332,366,429,468,500,540,571,601,628,655]#07/08\n",
 50 |     "europa = [296,335,362,430,470,501,541,572,602,629,656] #09/10"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 28,
 56 |    "metadata": {},
 57 |    "outputs": [],
 58 |    "source": [
 59 |     "cup_dict = {\"Premier League\": PL,\n",
 60 |     "           \"La Liga\": Laliga,\n",
 61 |     "           \"Bundesliga\": bundes,\n",
 62 |     "           \"Ligue One\": france,\n",
 63 |     "           \"English Championship\": championship,\n",
 64 |     "           \"Scottish Premiership\": scottish,\n",
 65 |     "           \"Champion League\": CL,\n",
 66 |     "           \"Europa Cup\": europa}"
 67 |    ]
 68 |   },
 69 |   {
 70 |    "cell_type": "code",
 71 |    "execution_count": 29,
 72 |    "metadata": {},
 73 |    "outputs": [],
 74 |    "source": [
 75 |     "# scrape for PL first then append the others later\n",
 76 |     "full_name, full_club, full_year, full_league, full_penalties, full_home, full_away, full_scored, full_missed, full_saved = ([] for i in range(10))"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 31,
 82 |    "metadata": {},
 83 |    "outputs": [
 84 |     {
 85 |      "name": "stdout",
 86 |      "output_type": "stream",
 87 |      "text": [
 88 |       "Done Premier League, season 7/8\n",
 89 |       "Done Premier League, season 8/9\n",
 90 |       "Done Premier League, season 9/10\n",
 91 |       "Done Premier League, season 10/11\n",
 92 |       "Done Premier League, season 11/12\n",
 93 |       "Done Premier League, season 12/13\n",
 94 |       "Done Premier League, season 13/14\n",
 95 |       "Done Premier League, season 14/15\n",
 96 |       "Done Premier League, season 15/16\n",
 97 |       "Done Premier League, season 16/17\n",
 98 |       "Done Premier League, season 17/18\n",
 99 |       "Done Premier League, season 18/19\n",
100 |       "Done Premier League, season 19/20\n",
101 |       "Done La Liga, season 7/8\n",
102 |       "Done La Liga, season 8/9\n",
103 |       "Done La Liga, season 9/10\n",
104 |       "Done La Liga, season 10/11\n",
105 |       "Done La Liga, season 11/12\n",
106 |       "Done La Liga, season 12/13\n",
107 |       "Done La Liga, season 13/14\n",
108 |       "Done La Liga, season 14/15\n",
109 |       "Done La Liga, season 15/16\n",
110 |       "Done La Liga, season 16/17\n",
111 |       "Done La Liga, season 17/18\n",
112 |       "Done La Liga, season 18/19\n",
113 |       "Done La Liga, season 19/20\n",
114 |       "Done Bundesliga, season 7/8\n",
115 |       "Done Bundesliga, season 8/9\n",
116 |       "Done Bundesliga, season 9/10\n",
117 |       "Done Bundesliga, season 10/11\n",
118 |       "Done Bundesliga, season 11/12\n",
119 |       "Done Bundesliga, season 12/13\n",
120 |       "Done Bundesliga, season 13/14\n",
121 |       "Done Bundesliga, season 14/15\n",
122 |       "Done Bundesliga, season 15/16\n",
123 |       "Done Bundesliga, season 16/17\n",
124 |       "Done Bundesliga, season 17/18\n",
125 |       "Done Bundesliga, season 18/19\n",
126 |       "Done Bundesliga, season 19/20\n",
127 |       "Done Ligue One, season 7/8\n",
128 |       "Done Ligue One, season 8/9\n",
129 |       "Done Ligue One, season 9/10\n",
130 |       "Done Ligue One, season 10/11\n",
131 |       "Done Ligue One, season 11/12\n",
132 |       "Done Ligue One, season 12/13\n",
133 |       "Done Ligue One, season 13/14\n",
134 |       "Done Ligue One, season 14/15\n",
135 |       "Done Ligue One, season 15/16\n",
136 |       "Done Ligue One, season 16/17\n",
137 |       "Done Ligue One, season 17/18\n",
138 |       "Done Ligue One, season 18/19\n",
139 |       "Done Ligue One, season 19/20\n",
140 |       "Done English Championship, season 7/8\n",
141 |       "Done English Championship, season 8/9\n",
142 |       "Done English Championship, season 9/10\n",
143 |       "Done English Championship, season 10/11\n",
144 |       "Done English Championship, season 11/12\n",
145 |       "Done English Championship, season 12/13\n",
146 |       "Done English Championship, season 13/14\n",
147 |       "Done English Championship, season 14/15\n",
148 |       "Done English Championship, season 15/16\n",
149 |       "Done English Championship, season 16/17\n",
150 |       "Done English Championship, season 17/18\n",
151 |       "Done English Championship, season 18/19\n",
152 |       "Done English Championship, season 19/20\n",
153 |       "Done Scottish Premiership, season 7/8\n",
154 |       "Done Scottish Premiership, season 8/9\n",
155 |       "Done Scottish Premiership, season 9/10\n",
156 |       "Done Scottish Premiership, season 10/11\n",
157 |       "Done Scottish Premiership, season 11/12\n",
158 |       "Done Scottish Premiership, season 12/13\n",
159 |       "Done Scottish Premiership, season 13/14\n",
160 |       "Done Scottish Premiership, season 14/15\n",
161 |       "Done Scottish Premiership, season 15/16\n",
162 |       "Done Scottish Premiership, season 16/17\n",
163 |       "Done Scottish Premiership, season 17/18\n",
164 |       "Done Scottish Premiership, season 18/19\n",
165 |       "Done Scottish Premiership, season 19/20\n",
166 |       "Done Champion League, season 7/8\n",
167 |       "Done Champion League, season 8/9\n",
168 |       "Done Champion League, season 9/10\n",
169 |       "Done Champion League, season 10/11\n",
170 |       "Done Champion League, season 11/12\n",
171 |       "Done Champion League, season 12/13\n",
172 |       "Done Champion League, season 13/14\n",
173 |       "Done Champion League, season 14/15\n",
174 |       "Done Champion League, season 15/16\n",
175 |       "Done Champion League, season 16/17\n",
176 |       "Done Champion League, season 17/18\n",
177 |       "Done Champion League, season 18/19\n",
178 |       "Done Champion League, season 19/20\n",
179 |       "Done Europa Cup, season 9/10\n",
180 |       "Done Europa Cup, season 10/11\n",
181 |       "Done Europa Cup, season 11/12\n",
182 |       "Done Europa Cup, season 12/13\n",
183 |       "Done Europa Cup, season 13/14\n",
184 |       "Done Europa Cup, season 14/15\n",
185 |       "Done Europa Cup, season 15/16\n",
186 |       "Done Europa Cup, season 16/17\n",
187 |       "Done Europa Cup, season 17/18\n",
188 |       "Done Europa Cup, season 18/19\n",
189 |       "Done Europa Cup, season 19/20\n"
190 |      ]
191 |     }
192 |    ],
193 |    "source": [
194 |     "for cup in cup_dict:\n",
195 |     "    if cup == \"Europa Cup\":\n",
196 |     "        season = year_europa\n",
197 |     "    else:\n",
198 |     "        season = year_list\n",
199 |     "        \n",
200 |     "    name, club, year, league, penalties, home, away, scored, missed, saved = ([] for i in range(10))\n",
201 |     "    code_enum = cup_dict[cup]\n",
202 |     "    for count, el in enumerate(code_enum):\n",
203 |     "        URL = root + str(el)\n",
204 |     "        r = requests.get(URL)\n",
205 |     "        soup = BeautifulSoup(r.content, 'html5lib')\n",
206 |     "        details = soup.findAll(True, {'class':['odd', 'even']})\n",
207 |     "        for row in details:\n",
208 |     "            el_list = list(row.strings)\n",
209 |     "            name.append(el_list[0])\n",
210 |     "            club.append(el_list[1])\n",
211 |     "            year.append(season[count])\n",
212 |     "            league.append(cup)\n",
213 |     "            penalties.append(el_list[2])\n",
214 |     "            home.append(el_list[3])\n",
215 |     "            away.append(el_list[4])\n",
216 |     "            scored.append(el_list[5])\n",
217 |     "            missed.append(el_list[6])\n",
218 |     "            saved.append(el_list[7])\n",
219 |     "        print (\"Done \" + cup + \", season \" + season[count])\n",
220 |     "        time.sleep(random.randint(1,5))\n",
221 |     "    full_name += name\n",
222 |     "    full_club += club\n",
223 |     "    full_year += year\n",
224 |     "    full_league += league\n",
225 |     "    full_penalties += penalties\n",
226 |     "    full_home += home\n",
227 |     "    full_away += away\n",
228 |     "    full_scored += scored\n",
229 |     "    full_missed += missed\n",
230 |     "    full_saved += saved       "
231 |    ]
232 |   },
233 |   {
234 |    "cell_type": "code",
235 |    "execution_count": 32,
236 |    "metadata": {},
237 |    "outputs": [],
238 |    "source": [
239 |     "table = pd.DataFrame({'name': full_name, 'club': full_club,\n",
240 |     "                      'year': full_year, 'league': full_league,\n",
241 |     "                      'penalties': full_penalties, 'home': full_home, 'away': full_away,\n",
242 |     "                      'scored': full_scored, 'missed': full_missed, 'saved': full_saved})"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "code",
247 |    "execution_count": 35,
248 |    "metadata": {},
249 |    "outputs": [
250 |     {
251 |      "data": {
252 |       "text/plain": [
253 |        "(3937, 10)"
254 |       ]
255 |      },
256 |      "execution_count": 35,
257 |      "metadata": {},
258 |      "output_type": "execute_result"
259 |     }
260 |    ],
261 |    "source": [
262 |     "np.shape(table)"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "code",
267 |    "execution_count": 38,
268 |    "metadata": {},
269 |    "outputs": [
270 |     {
271 |      "data": {
272 |       "text/html": [
273 |        "<div>\n",
274 |        "<style scoped>\n",
275 |        "    .dataframe tbody tr th:only-of-type {\n",
276 |        "        vertical-align: middle;\n",
277 |        "    }\n",
278 |        "\n",
279 |        "    .dataframe tbody tr th {\n",
280 |        "        vertical-align: top;\n",
281 |        "    }\n",
282 |        "\n",
283 |        "    .dataframe thead th {\n",
284 |        "        text-align: right;\n",
285 |        "    }\n",
286 |        "</style>\n",
287 |        "<table border=\"1\" class=\"dataframe\">\n",
288 |        "  <thead>\n",
289 |        "    <tr style=\"text-align: right;\">\n",
290 |        "      <th></th>\n",
291 |        "      <th>name</th>\n",
292 |        "      <th>club</th>\n",
293 |        "      <th>year</th>\n",
294 |        "      <th>league</th>\n",
295 |        "      <th>penalties</th>\n",
296 |        "      <th>home</th>\n",
297 |        "      <th>away</th>\n",
298 |        "      <th>scored</th>\n",
299 |        "      <th>missed</th>\n",
300 |        "      <th>saved</th>\n",
301 |        "    </tr>\n",
302 |        "  </thead>\n",
303 |        "  <tbody>\n",
304 |        "    <tr>\n",
305 |        "      <th>3927</th>\n",
306 |        "      <td>Claudiu Keseru</td>\n",
307 |        "      <td>Ludogorets Razgrad</td>\n",
308 |        "      <td>19/20</td>\n",
309 |        "      <td>Europa Cup</td>\n",
310 |        "      <td>1</td>\n",
311 |        "      <td>1</td>\n",
312 |        "      <td>-</td>\n",
313 |        "      <td>1</td>\n",
314 |        "      <td>-</td>\n",
315 |        "      <td>-</td>\n",
316 |        "    </tr>\n",
317 |        "    <tr>\n",
318 |        "      <th>3928</th>\n",
319 |        "      <td>Adem Ljajic</td>\n",
320 |        "      <td>Besiktas</td>\n",
321 |        "      <td>19/20</td>\n",
322 |        "      <td>Europa Cup</td>\n",
323 |        "      <td>1</td>\n",
324 |        "      <td>-</td>\n",
325 |        "      <td>1</td>\n",
326 |        "      <td>1</td>\n",
327 |        "      <td>-</td>\n",
328 |        "      <td>-</td>\n",
329 |        "    </tr>\n",
330 |        "    <tr>\n",
331 |        "      <th>3929</th>\n",
332 |        "      <td>Andraz Sporar</td>\n",
333 |        "      <td>Slovan Bratislava</td>\n",
334 |        "      <td>19/20</td>\n",
335 |        "      <td>Europa Cup</td>\n",
336 |        "      <td>1</td>\n",
337 |        "      <td>-</td>\n",
338 |        "      <td>1</td>\n",
339 |        "      <td>-</td>\n",
340 |        "      <td>-</td>\n",
341 |        "      <td>1</td>\n",
342 |        "    </tr>\n",
343 |        "    <tr>\n",
344 |        "      <th>3930</th>\n",
345 |        "      <td>James Tavernier</td>\n",
346 |        "      <td>Rangers</td>\n",
347 |        "      <td>19/20</td>\n",
348 |        "      <td>Europa Cup</td>\n",
349 |        "      <td>1</td>\n",
350 |        "      <td>1</td>\n",
351 |        "      <td>-</td>\n",
352 |        "      <td>-</td>\n",
353 |        "      <td>1</td>\n",
354 |        "      <td>-</td>\n",
355 |        "    </tr>\n",
356 |        "    <tr>\n",
357 |        "      <th>3931</th>\n",
358 |        "      <td>Ryan Christie</td>\n",
359 |        "      <td>Celtic</td>\n",
360 |        "      <td>19/20</td>\n",
361 |        "      <td>Europa Cup</td>\n",
362 |        "      <td>1</td>\n",
363 |        "      <td>-</td>\n",
364 |        "      <td>1</td>\n",
365 |        "      <td>1</td>\n",
366 |        "      <td>-</td>\n",
367 |        "      <td>-</td>\n",
368 |        "    </tr>\n",
369 |        "    <tr>\n",
370 |        "      <th>3932</th>\n",
371 |        "      <td>Tomas de Vincenti</td>\n",
372 |        "      <td>APOEL Nicosia</td>\n",
373 |        "      <td>19/20</td>\n",
374 |        "      <td>Europa Cup</td>\n",
375 |        "      <td>1</td>\n",
376 |        "      <td>1</td>\n",
377 |        "      <td>-</td>\n",
378 |        "      <td>1</td>\n",
379 |        "      <td>-</td>\n",
380 |        "      <td>-</td>\n",
381 |        "    </tr>\n",
382 |        "    <tr>\n",
383 |        "      <th>3933</th>\n",
384 |        "      <td>Bruno Fernandes</td>\n",
385 |        "      <td>Sporting Lisbon</td>\n",
386 |        "      <td>19/20</td>\n",
387 |        "      <td>Europa Cup</td>\n",
388 |        "      <td>1</td>\n",
389 |        "      <td>-</td>\n",
390 |        "      <td>1</td>\n",
391 |        "      <td>1</td>\n",
392 |        "      <td>-</td>\n",
393 |        "      <td>-</td>\n",
394 |        "    </tr>\n",
395 |        "    <tr>\n",
396 |        "      <th>3934</th>\n",
397 |        "      <td>Ciprian Deac</td>\n",
398 |        "      <td>CFR Cluj</td>\n",
399 |        "      <td>19/20</td>\n",
400 |        "      <td>Europa Cup</td>\n",
401 |        "      <td>1</td>\n",
402 |        "      <td>1</td>\n",
403 |        "      <td>-</td>\n",
404 |        "      <td>1</td>\n",
405 |        "      <td>-</td>\n",
406 |        "      <td>-</td>\n",
407 |        "    </tr>\n",
408 |        "    <tr>\n",
409 |        "      <th>3935</th>\n",
410 |        "      <td>M'Baye Niang</td>\n",
411 |        "      <td>Stade Rennes</td>\n",
412 |        "      <td>19/20</td>\n",
413 |        "      <td>Europa Cup</td>\n",
414 |        "      <td>1</td>\n",
415 |        "      <td>1</td>\n",
416 |        "      <td>-</td>\n",
417 |        "      <td>1</td>\n",
418 |        "      <td>-</td>\n",
419 |        "      <td>-</td>\n",
420 |        "    </tr>\n",
421 |        "    <tr>\n",
422 |        "      <th>3936</th>\n",
423 |        "      <td>Bibras Natcho</td>\n",
424 |        "      <td>Partizan Belgrade</td>\n",
425 |        "      <td>19/20</td>\n",
426 |        "      <td>Europa Cup</td>\n",
427 |        "      <td>1</td>\n",
428 |        "      <td>1</td>\n",
429 |        "      <td>-</td>\n",
430 |        "      <td>1</td>\n",
431 |        "      <td>-</td>\n",
432 |        "      <td>-</td>\n",
433 |        "    </tr>\n",
434 |        "  </tbody>\n",
435 |        "</table>\n",
436 |        "</div>"
437 |       ],
438 |       "text/plain": [
439 |        "                   name                 club   year      league penalties  \\\n",
440 |        "3927     Claudiu Keseru  Ludogorets Razgrad   19/20  Europa Cup         1   \n",
441 |        "3928        Adem Ljajic            Besiktas   19/20  Europa Cup         1   \n",
442 |        "3929      Andraz Sporar   Slovan Bratislava   19/20  Europa Cup         1   \n",
443 |        "3930    James Tavernier             Rangers   19/20  Europa Cup         1   \n",
444 |        "3931      Ryan Christie              Celtic   19/20  Europa Cup         1   \n",
445 |        "3932  Tomas de Vincenti       APOEL Nicosia   19/20  Europa Cup         1   \n",
446 |        "3933    Bruno Fernandes     Sporting Lisbon   19/20  Europa Cup         1   \n",
447 |        "3934       Ciprian Deac            CFR Cluj   19/20  Europa Cup         1   \n",
448 |        "3935       M'Baye Niang        Stade Rennes   19/20  Europa Cup         1   \n",
449 |        "3936      Bibras Natcho   Partizan Belgrade   19/20  Europa Cup         1   \n",
450 |        "\n",
451 |        "     home away scored missed saved  \n",
452 |        "3927    1    -      1      -     -  \n",
453 |        "3928    -    1      1      -     -  \n",
454 |        "3929    -    1      -      -     1  \n",
455 |        "3930    1    -      -      1     -  \n",
456 |        "3931    -    1      1      -     -  \n",
457 |        "3932    1    -      1      -     -  \n",
458 |        "3933    -    1      1      -     -  \n",
459 |        "3934    1    -      1      -     -  \n",
460 |        "3935    1    -      1      -     -  \n",
461 |        "3936    1    -      1      -     -  "
462 |       ]
463 |      },
464 |      "execution_count": 38,
465 |      "metadata": {},
466 |      "output_type": "execute_result"
467 |     }
468 |    ],
469 |    "source": [
470 |     "table.tail(10)"
471 |    ]
472 |   },
473 |   {
474 |    "cell_type": "code",
475 |    "execution_count": 37,
476 |    "metadata": {},
477 |    "outputs": [],
478 |    "source": [
479 |     "table.to_csv (r'./all_penalties.csv', index = None, header=True)"
480 |    ]
481 |   },
482 |   {
483 |    "cell_type": "code",
484 |    "execution_count": null,
485 |    "metadata": {},
486 |    "outputs": [],
487 |    "source": []
488 |   }
489 |  ],
490 |  "metadata": {
491 |   "kernelspec": {
492 |    "display_name": "Python 3",
493 |    "language": "python",
494 |    "name": "python3"
495 |   },
496 |   "language_info": {
497 |    "codemirror_mode": {
498 |     "name": "ipython",
499 |     "version": 3
500 |    },
501 |    "file_extension": ".py",
502 |    "mimetype": "text/x-python",
503 |    "name": "python",
504 |    "nbconvert_exporter": "python",
505 |    "pygments_lexer": "ipython3",
506 |    "version": "3.7.3"
507 |   }
508 |  },
509 |  "nbformat": 4,
510 |  "nbformat_minor": 2
511 | }
512 | 


--------------------------------------------------------------------------------
/R/EPL/penalty/penalty.R:
--------------------------------------------------------------------------------
  1 | ################################
  2 | #### Data Science Project  #####
  3 | # Article:                     #  
  4 | # https://tinyurl.com/y2ynruqo #
  5 | ################################
  6 | 
  7 | library(MASS)
  8 | library(tidyverse)
  9 | library(betareg)
 10 | library(xkcd)
 11 | 
 12 | # read the scraped penalty table
 13 | raw_data <- read.csv("./all_penalties.csv",
 14 |                      stringsAsFactors = FALSE)
 15 | # clean the counts ('-' stands for zero), then aggregate per player;
 16 | # both count columns are converted to numeric so sum() also works when '-' made them character
 17 | player_data <- raw_data %>%
 18 |   mutate(name = str_squish(name),
 19 |          penalties = as.numeric(ifelse(penalties == '-', 0, penalties)),
 20 |          scored = as.numeric(ifelse(scored =='-', 0, scored))) %>%
 21 |   group_by(name) %>%
 22 |   summarise(total = sum(penalties),
 23 |             total_score = sum(scored))%>%
 24 |   mutate(ratio = total_score/(total)) %>%
 25 |   filter(total >= 4 & ratio > 0 & ratio < 1) %>%  # at least 4 penalties, ratio strictly between 0 and 1
 26 |   na.omit()
 27 | 
 28 | #### stick figure ("dataman") setup, drawn with xkcdman() in the fitted-density plot
 29 | xrange = c(0.2,1.0)
 30 | yrange = c(0,4)
 31 | ratioxy <- diff(xrange) / diff(yrange)  # x-range length divided by y-range length
 32 | mapping <- aes(x=x,
 33 |                y=y,
 34 |                scale=scale,
 35 |                ratioxy=ratioxy,
 36 |                angleofspine = angleofspine,
 37 |                anglerighthumerus = anglerighthumerus,
 38 |                anglelefthumerus = anglelefthumerus,
 39 |                anglerightradius = anglerightradius,
 40 |                angleleftradius = angleleftradius,
 41 |                anglerightleg =  anglerightleg,
 42 |                angleleftleg = angleleftleg,
 43 |                angleofneck = angleofneck)
 44 | # one-row data frame supplying a value for every aesthetic named in mapping
 45 | dataman <- data.frame( x= 0.3, y=3,
 46 |                        scale = 0.5,
 47 |                        ratioxy = ratioxy,
 48 |                        angleofspine =  -pi/2,
 49 |                        anglerighthumerus = -pi/6,
 50 |                        anglelefthumerus = -pi/2 -pi/6,
 51 |                        anglerightradius = pi/5,
 52 |                        angleleftradius = pi/5,
 53 |                        angleleftleg = 3*pi/2  + pi / 12 ,
 54 |                        anglerightleg = 3*pi/2  - pi / 12,
 55 |                        angleofneck = runif(1, min = 3 * pi / 2 - pi/10 , max = 3 * pi / 2 + pi/10))  # random, within pi/10 of 3*pi/2
 56 | 
 57 | # draw histogram of conversion rates (bin edges 0.20, 0.24, ..., 1.00)
 58 | player_data %>%
 59 |   ggplot(aes(ratio)) +
 60 |   geom_histogram(breaks = 5:25/25,
 61 |                 fill = hcl(0, 50, 80)) +
 62 |   xkcdaxis(c(0.1,1), c(0,80)) +
 63 |   labs (x = "\nHistogram of penalties conversion rate", y = "Count") +
 64 |   theme_xkcd() 
 65 | 
 66 | # fit a single beta distribution to the conversion rates
 67 | m <- MASS::fitdistr(player_data$ratio, dbeta,
 68 |                     start = list(shape1 = 10, shape2 = 1),
 69 |                     lower=c(0.1,0.1))  # lower bounds for shape1 and shape2
 70 | alpha0 <- m$estimate[1]  # fitted shape1, used as the prior alpha below
 71 | beta0 <- m$estimate[2]  # fitted shape2, used as the prior beta below
 72 | 
 73 | # overlay the fitted beta density (red) on the density-scaled histogram, with xkcd decorations
 74 | ggplot(player_data) +
 75 |   geom_histogram(aes(ratio, y = ..density..),
 76 |                  breaks = 5:25/25,
 77 |                  fill = hcl(0, 50, 80)) +
 78 |   stat_function(fun = function(x) dbeta(x, alpha0, beta0), color = "red",
 79 |                 size = 1) +
 80 |   xlab("\n Penalty Conversion Rate") + 
 81 |   theme_xkcd() +
 82 |   xkcdaxis(xrange, yrange) +
 83 |   xkcdman(mapping, dataman) +  # stick figure described by mapping / dataman
 84 |   annotate("text", x=0.4, y = 4,
 85 |            label = "Does not look an amazing good fit\nBut it's okay",
 86 |            family="xkcd") +
 87 |   xkcdline(aes(x=xbegin,y=ybegin,xend=xend,yend=yend),  # short jittered line from (0.36, 3) to (0.42, 3.5)
 88 |            data.frame(xbegin=0.36,ybegin=3,xend=0.42,yend=3.5),
 89 |            xjitteramount = 0.01)
 90 | 
 91 | # adjusted ratio: raw counts combined with the fitted prior, (scored + alpha0) / (taken + alpha0 + beta0)
 92 | adjusted_ratio <- player_data %>%
 93 |   mutate(eb_estimate = (total_score + alpha0) / (total + alpha0 + beta0)) %>%
 94 |   arrange(desc(eb_estimate))
 95 | 
 96 | # posterior beta parameters for a few selected players:
 97 | specific_players <- adjusted_ratio %>%
 98 |   filter(name %in% c("Cristiano Ronaldo",
 99 |                      "Nicolas Pepe",
100 |                      "Alexis Sanchez",
101 |                      "Antoine Griezmann")) %>%
102 |   mutate(alpha = total_score + alpha0,  # prior alpha plus penalties scored
103 |          beta = total - total_score + beta0)  # prior beta plus penalties not scored
104 | 
105 | # draw the posterior beta density of each selected player; the dashed black curve is the prior
106 | specific_players %>%
107 |   crossing(x=seq(0.4,0.99,.002)) %>%
108 |   ungroup() %>%
109 |   mutate(density=dbeta(x,alpha,beta)) %>%
110 |   ggplot(aes(x, density, color = name)) +
111 |   geom_line() +
112 |   stat_function(fun=function(x) dbeta(x, alpha0, beta0), lty = 2, color = 'black') +
113 |   xlab("Conversion rate") +
114 |   theme_xkcd()
115 | 
116 | # draw actual vs adjusted ratio plot, coloured by the number of penalties taken
117 | ggplot(adjusted_ratio, aes(ratio, eb_estimate, color = total)) +
118 |   geom_hline(yintercept = alpha0 / (alpha0 + beta0), color = "red", lty = 2) +  # prior mean
119 |   geom_point() +
120 |   geom_abline(color = "red") +  # reference line (ggplot2 default: intercept 0, slope 1)
121 |   scale_colour_gradient(breaks = c(0,20,30,50,70)) +
122 |   xlim(0.5,1) +
123 |   ylim(0.5,1) +
124 |   xlab("Actual goal scoring average") +
125 |   ylab("Posterior goal scoring average")
126 | 
127 |                
#### When a unimodal beta distribution does not look like a good fit,
#### we can use the E-M algorithm (implemented in the betareg package)
#### to fit a mixture of 2 beta distributions.
# Everything below indexes components 1 and 2 explicitly (a[1:2], b[1:2],
# post[, 1:2]), so request exactly two components. Passing k = 1:3 lets the
# model-selection step return a 1- or 3-component fit, which would silently
# break the two-cluster code that follows.
m <- betamix(ratio ~ 1 | 1, data = player_data, k = 2)

# convert the fitted (mean, precision) parametrisation of each component
# back to the usual beta shape parameters
mu <- plogis(coef(m)[, 1])
phi <- exp(coef(m)[, 2])
a <- mu * phi
b <- (1 - mu) * phi
# get the cluster assignment of every player
cl <- clusters(m)

# plotting
## separate histograms for both clusters
## TODO: convert back to ggplot code
hist(subset(player_data, cl == 1)$ratio, breaks = 5:25/25, freq = FALSE,
     col = hcl(0, 50, 80), main = "", xlab = "Penalty Conversion Rate", ylim = c(0, 9))

hist(subset(player_data, cl == 2)$ratio, breaks = 5:25/25, freq = FALSE,
     col = hcl(240, 50, 80), main = "", xlab = "Penalty Conversion Rate", ylim = c(0, 9), add = TRUE)

## lines for fitted densities
ys <- seq(0, 1, by = 0.01)
lines(ys, dbeta(ys, shape1 = a[1], shape2 = b[1]),
      col = hcl(0, 80, 50), lwd = 2)
lines(ys, dbeta(ys, shape1 = a[2], shape2 = b[2]),
      col = hcl(240, 80, 50), lwd = 2)

## lines for corresponding means
abline(v = mu[1], col = hcl(0, 80, 50), lty = 2, lwd = 2)
abline(v = mu[2], col = hcl(240, 80, 50), lty = 2, lwd = 2)
159 | 
## Repeat the Bayesian updating, this time separately for each mixture
## component, then average the two estimates using the membership weights.
post <- posterior(m)
post[, 1]
# posterior probabilities of being assigned to each group
player_data[["post_1"]] <- post[, 1]
player_data[["post_2"]] <- post[, 2]

player_data <- player_data %>%
  mutate(
    # shrink towards the prior of component 1 / component 2 respectively
    shrunkage_1 = (total_score + a[1]) / (total + a[1] + b[1]),
    shrunkage_2 = (total_score + a[2]) / (total + a[2] + b[2]),
    # weight both estimates by the probability of belonging to each group
    shrunkage_ave = (post_1 * shrunkage_1 + post_2 * shrunkage_2)
  ) %>%
  arrange(desc(shrunkage_ave))

# plot raw ratio and averaged posterior estimate side by side
player_data %>%
  gather(type, value, ratio, shrunkage_ave) %>%
  mutate(type = ifelse(type == 'ratio',
                       'Raw scoring ratio',
                       'Average posterior'),
         # raw ratio goes in the first facet
         type = factor(type, levels = c('Raw scoring ratio',
                                        'Average posterior'))) %>%
  ggplot(mapping = aes(x = total_score, y = value)) +
  geom_point() +
  facet_wrap(~ type) +
  labs(y = "Estimate") +
  theme_bw()
186 | 


--------------------------------------------------------------------------------
/R/EPL/prediction/clean_data.R:
--------------------------------------------------------------------------------
 1 | ########################################
 2 | # scripts to clean data to usable format
 3 | # pipe directly to sim.R
 4 | # source:
 5 | # - fixtures.csv: dedicatedexcel.com
 6 | # - Historical results: https://www.kaggle.com/thefc17/epl-results-19932018
 7 | #########################################
 8 | library (dplyr)
 9 | 
fixtures <- read.csv("fixtures.csv", stringsAsFactors = FALSE)

# the teams appearing in the fixture list
teams <- unique(fixtures$HOME.TEAM)

# historic results
history <- read.csv("history.csv", stringsAsFactors = FALSE)

# season labels "2010-11" up to "2017-18"
seasons <- paste0(2010:2017, '-', 11:18)

recent.pl <- filter(history, Season %in% seasons, div == 'E0')

# the two datasets come from different sources, so team names do not match:
# list the fixture names missing from the history, and the history names
setdiff(teams, recent.pl$HomeTeam)
unique(recent.pl$HomeTeam)

# each pair is c(<name used in fixtures>, <name used in history>)
pair_fix <- list(c('Manchester United', 'Man United'), c('Newcastle United', 'Newcastle'),
                 c('Huddersfield Town', 'Huddersfield'), c('Wolverhampton Wanderers', 'Wolves'),
                 c('Cardiff City', 'Cardiff'), c('Leicester City', 'Leicester'),
                 c('Tottenham Hotspur', 'Tottenham'), c('West Ham United', 'West Ham'),
                 c('Manchester City', "Man City"), c('Brighton and Hove Albion', 'Brighton'))

# rewrite the history names so they match the fixture names
for (pair in pair_fix){
  recent.pl$HomeTeam[which(recent.pl$HomeTeam == pair[2])] <- pair[1]
  recent.pl$AwayTeam[which(recent.pl$AwayTeam == pair[2])] <- pair[1]
}


# plain per-team averages: goals scored / conceded at home ...
ave_home <- recent.pl %>%
  filter(HomeTeam %in% teams) %>%
  group_by(HomeTeam) %>%
  summarize(ave_scored_h = mean(FTHG), ave_conceded_h = mean(FTAG)) %>%
  rename(Team = HomeTeam)

# ... and away
ave_away <- recent.pl %>%
  filter(AwayTeam %in% teams) %>%
  group_by(AwayTeam) %>%
  summarize(ave_scored_a = mean(FTAG), ave_conceded_a = mean(FTHG)) %>%
  rename(Team = AwayTeam)

ave <- merge(ave_home, ave_away, by = 'Team')


# more precise figures per (home, away) pairing, with the number of meetings
hist_pair.pl <- recent.pl %>%
  filter(HomeTeam %in% teams, AwayTeam %in% teams) %>%
  group_by(HomeTeam, AwayTeam) %>%
  summarize(match = n(), ave_home_scored = mean(FTHG), ave_away_scored = mean(FTAG))

# drop intermediates so only fixtures, teams, recent.pl, ave and hist_pair.pl
# are handed on to the scripts that source this file
rm(history, seasons, pair_fix, ave_home, ave_away)
64 | 


--------------------------------------------------------------------------------
/R/EPL/prediction/match_simulate.R:
--------------------------------------------------------------------------------
 1 | library (dplyr)
 2 | source ('clean_data.R')
 3 | 
 4 | # get most frequent score line of a match after n, sim time
 5 | nsim = 100
 6 | get_score <- function (home, away, nsim){
 7 |   # try to get from history, pair
 8 |   subset <- hist_pair.pl[ which( hist_pair.pl$HomeTeam ==home | hist_pair.pl$AwayTeam ==away), ]
 9 |   # more efficient code, no need to retract back to dataframe many times
10 |   ave_h_s = subset$ave_home_scored[1]
11 |   ave_a_s = subset$ave_away_scored[1]
12 |   
13 |   t_ave_h_s = ave[ave$Team == home,]$ave_scored_h
14 |   t_ave_a_c = ave[ave$Team == away,]$ave_conceded_a
15 |   t_ave_h_c = ave[ave$Team == home,]$ave_conceded_h
16 |   t_ave_a_s = ave[ave$Team == away,]$ave_scored_a
17 |   score_line = character(length(nsim))
18 |   # simulation idea similar to that of sim.R
19 |   for (i in 1:nsim){
20 |     if ((dim(subset)[1] == 1) & (subset$match[1] > 3)){
21 |       h_scored = rpois(1, ave_h_s)
22 |       a_scored = rpois(1, ave_a_s)
23 |     }
24 |     # if we have no historical result of the match
25 |     else{
26 |       # take into account both attacking stat of home and defense stats of away
27 |       h_scored = rpois(1, 1/2 * (t_ave_h_s + t_ave_a_c))
28 |       a_scored = rpois(1, 1/2 * (t_ave_a_s + t_ave_h_c))
29 |     } 
30 |     score_line[i] = paste0(h_scored, '-', a_scored)
31 |   }
32 |   return (list(names(which.max(table(score_line))), max(table(score_line))))
33 | }
34 | 
# first round = first 10 fixtures
round_1 <- head(fixtures, 10)
matches <- mapply(get_score, round_1$HOME.TEAM, round_1$AWAY.TEAM, nsim, SIMPLIFY = FALSE)
# Extract with [[ ]] so the new columns are plain atomic vectors; x[1] / x[2]
# would keep the list wrapper and create list columns.
round_1$score_line <- vapply(matches, function(x) x[[1]], character(1), USE.NAMES = FALSE)
# number of simulations (out of nsim) that gave the most frequent score line
round_1$prob <- vapply(matches, function(x) as.numeric(x[[2]]), numeric(1), USE.NAMES = FALSE)
39 | 


--------------------------------------------------------------------------------
/R/EPL/prediction/sim.R:
--------------------------------------------------------------------------------
  1 | library (dplyr)
  2 | source ('clean_data.R')
  3 | 
  4 | # get score of a match
  5 | get_score <- function (home, away){
  6 |   # try to get from history, pair
  7 |   subset <- hist_pair.pl[ which( hist_pair.pl$HomeTeam ==home & hist_pair.pl$AwayTeam ==away), ]
  8 |   # only use this method if we have at least 4 matches
  9 |   if ((dim(subset)[1] == 1) & (subset$match[1] > 3)){
 10 |     h_scored = rpois(1, subset$ave_home_scored[1])
 11 |     a_scored = rpois(1, subset$ave_away_scored[1])
 12 |   }
 13 |   # if we have no historical result of the match
 14 |   else{
 15 |     # take into account both attacking stat of home and defense stats of away
 16 |     h_scored = rpois(1, 1/2 * (ave[ave$Team == home,]$ave_scored_h +
 17 |                                  ave[ave$Team == away,]$ave_conceded_a))
 18 |     a_scored = rpois(1, 1/2 * (ave[ave$Team == away,]$ave_scored_a +
 19 |                                  ave[ave$Team == home,]$ave_conceded_h))
 20 |   }
 21 |   return (list(h_scored, a_scored))
 22 | }
 23 | 
 24 | rank <- function (m_result){
 25 |   table <- data.frame(name = teams,
 26 |                       goal_score = rep(0,20),
 27 |                       goal_conceded = rep(0,20),
 28 |                       point = rep(0,20))
 29 |   # loop through all the results and then update
 30 |   for (i in 1:nrow(m_result)){
 31 |     home = m_result$HOME.TEAM[i]
 32 |     away = m_result$AWAY.TEAM[i]
 33 |     h_goal = m_result$h_scored[i]
 34 |     a_goal = m_result$a_scored[i]
 35 |     
 36 |     # add goal
 37 |     table[table$name == home,]$goal_score = table[table$name == home,]$goal_score + h_goal
 38 |     table[table$name == home,]$goal_conceded = table[table$name == home,]$goal_conceded + a_goal
 39 |     table[table$name == away,]$goal_score = table[table$name == away,]$goal_score + a_goal
 40 |     table[table$name == away,]$goal_conceded = table[table$name == away,]$goal_conceded + h_goal
 41 |     
 42 |     
 43 |     # calculate point
 44 |     if (h_goal > a_goal){
 45 |       table[table$name == home,]$point = table[table$name == home,]$point + 3
 46 |     }
 47 |     else if (h_goal < a_goal){
 48 |       table[table$name == away,]$point = table[table$name == away,]$point + 3
 49 |     }
 50 |     else{
 51 |       table[table$name == home,]$point = table[table$name == home,]$point + 1
 52 |       table[table$name == away,]$point = table[table$name == away,]$point + 1
 53 |     }
 54 |   }
 55 |   
 56 |   table$goal_dif <- table$goal_score - table$goal_conceded
 57 |   table <- table[order(-table$point, -table$goal_dif, -table$goal_score), ]
 58 |   
 59 |   return (table)
 60 | }
 61 | 
 62 | simulate <- function(fixtures){
 63 |   matches <- mapply(get_score, fixtures$HOME.TEAM, fixtures$AWAY.TEAM, SIMPLIFY = FALSE)
 64 |   fixtures$h_scored <- unlist(sapply(matches, function(x) x[1]))
 65 |   fixtures$a_scored <- unlist(sapply(matches, function(x) x[2]))
 66 |   table <- rank(fixtures)
 67 |   return (table)
 68 | }
 69 | 
 70 | 
 71 | nsim = 10000
 72 | tabulate_data <- data.frame(name = teams,
 73 |                             champion = rep(0,20),
 74 |                             runner_up = rep(0,20),
 75 |                             top_4 = rep(0,20),
 76 |                             top_6 = rep(0,20),
 77 |                             relegate = rep(0,20))
 78 | pb <- txtProgressBar(min = 0, max = nsim, style = 3)
 79 | 
 80 | for (sim in 1:nsim){
 81 |   table = simulate(fixtures)
 82 |   
 83 |   first = table$name[1]
 84 |   second = table$name[2]
 85 |   first_4 = table$name[1:4]
 86 |   first_6 = table$name[1:6]
 87 |   last_3 = table$name[18:20]
 88 |   
 89 |   tabulate_data <- tabulate_data %>%
 90 |     mutate(champion = ifelse(name == first, champion+1, champion),
 91 |            runner_up = ifelse(name == second, runner_up+1, runner_up),
 92 |            top_4 = ifelse(name %in% first_4, top_4+1, top_4),
 93 |            top_6 = ifelse(name %in% first_6, top_6+1, top_6),
 94 |            relegate = ifelse(name %in% last_3, relegate+1, relegate))
 95 |   setTxtProgressBar(pb, sim)
 96 | }
 97 | 
 98 | # convert to percentage
 99 | tabulate_data <- tabulate_data %>%
100 |   mutate (champion = champion/nsim,
101 |           runner_up = runner_up/nsim,
102 |           top_4 = top_4/nsim,
103 |           top_6 = top_6/nsim,
104 |           relegate = relegate/nsim)
105 |                                      
106 | # write result into csv
107 | write.csv(tabulate_data, "tabulate_data.csv", row.names = FALSE)
108 | 
109 |                                    
110 | 


--------------------------------------------------------------------------------
/R/EPL/prediction/visualize.R:
--------------------------------------------------------------------------------
 1 | library (dplyr)
 2 | library (ggplot2)
 3 | library (xkcd)
 4 | library (extrafont)
 5 | 
 6 | download.file("http://simonsoftware.se/other/xkcd.ttf",
 7 |               dest="xkcd.ttf", mode="wb")
 8 | system("cp xkcd.ttf ~/Library/Fonts")
 9 | font_import(path="~/Library/Fonts", pattern = "xkcd", prompt=FALSE)
10 | fonts()
11 | fonttable()
12 | if(.Platform$OS.type != "unix") {
13 |   ## Register fonts for Windows bitmap output
14 |     loadfonts(device="win")
15 |   } else {
16 |   loadfonts()
17 | }
18 | 
19 | # extract historic results
20 | history <- read.csv("https://raw.githubusercontent.com/tuangauss/Various-projects/master/data/history.csv", stringsAsFactors = FALSE)
21 | 
22 | # get info from the 2010 up to 2018
23 | seasons <- sapply(10:17, function(x) paste0(2000+x,'-',x+1))
24 | 
25 | 
# Plot the empirical distribution of total goals per match for the given
# season label(s), overlaid with the Poisson pmf whose mean equals the
# observed average number of goals. Returns the ggplot object.
# Reads the global `history`.
graph_func <- function(season){
  # a single "2017-18" argument gets its own title, anything else the generic one
  title <- if (season[1] == "2017-18") {
    "Last season: 2017-2018"
  } else {
    "From 2010-11 to 2017-18"
  }

  data <- history %>%
    filter (Season %in% season, div == 'E0') %>%
    mutate (total = FTAG + FTHG)

  ave_score <- mean(data$total)
  n_matches <- nrow(data)

  # observed share of matches per goal total, plus the matching Poisson value
  prob_data <- data %>%
    group_by(total) %>%
    summarize (prob = n()/n_matches) %>%
    mutate (pois = dpois(x = total, lambda = ave_score))

  ggplot(data = prob_data, aes(x = total, y = prob)) +
    geom_bar(stat = "identity", color = "blue", fill = "grey") +
    scale_x_continuous(breaks = seq(0, 10, 1)) +
    geom_line(aes(y = pois), col = "red", size = 0.5) +
    geom_point(aes(y = pois), col = "black", size = 3) +
    ggtitle(title) + labs (x = "Total Goal", y = "Probability") +
    theme_xkcd()
}

graph_func(seasons)
graph_func(c('2017-18'))
56 | 


--------------------------------------------------------------------------------
/R/EPL/xkcd.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/R/EPL/xkcd.ttf


--------------------------------------------------------------------------------
/R/Paul_hypothesis_test.R:
--------------------------------------------------------------------------------
 1 | #######################################
 2 | ## Hypothesis testing procedure 
 3 | ## The curious case of Paul the Octopus
 4 | ## Fisher vs N-p approach
 5 | #######################################
 6 | 
# The script is self-contained; no extra module or library is needed
 8 | 
 9 | 
10 | # graph binomial distribution, color extreme value and beyond
# Bar plot of the Binomial(n, p) pmf, with the bars for outcomes >= `value`
# coloured red and all others grey.
#
# Arguments:
#   n      number of trials
#   p      success probability
#   value  smallest outcome to highlight
# Returns (invisibly) whatever barplot() returns.
graph <- function(n,p, value){
  x <- seq(0,n)
  prob <- dbinom(x,size=n,prob=p)
  # index 1 = "grey", index 2 = "red"; compare the outcomes x, not an
  # undefined variable
  cols <- c("grey","red")[(x >= value) + 1]
  barplot(prob,names.arg=0:n, col = cols,
          main=paste('binomial distribution, size:',n, "prob:",p))
}

graph(14,0.5,12)
20 | 
21 | 
# calculate p value for binomial distribution, at x = 12
p_value = 1-pbinom(11,14,0.5)
#p_value = dbinom(12,14,0.5) + dbinom(13,14,0.5) + dbinom(14,14,0.5)

# Neyman-Pearson approach
# calculate current power
# a. Assuming type 1 error = 0.01
# p_value[k + 1] = P(X > k | p = 0.5) for k = 0..14
p_value = 1-pbinom(0:14, 14,0.5)
# largest outcome that is NOT rejected: we reject when X > critical_value
critical_value = which(p_value < 0.01)[1] - 1
# type 2 error = P(fail to reject | p = 0.75) = P(X <= critical_value).
# It must use the same cut-off as the type 1 error above.
type2 = pbinom(critical_value,14,0.75)

# b. More interesting problem
# You should try first before looking up the code
# if we want to achieve type 1 error < 1% and power > 90%, how many observation do we need to make?

# Smallest number of observations n (and cut-off k, rejecting when X > k) for
# which the test of p = p0 against p = p1 has type 1 error below `alpha` and
# type 2 error below `beta`.
# Returns c(n = , k = ), or NULL when no n up to `max_n` qualifies.
min_observations <- function(alpha, beta, p0 = 0.5, p1 = 0.75, max_n = 100){
  for (n in seq_len(max_n)){
    k <- 0:n
    type1 <- 1 - pbinom(k, n, p0)  # P(X > k | H0)
    type2 <- pbinom(k, n, p1)      # P(X <= k | H1), same rejection region
    ok <- which(type1 < alpha & type2 < beta)
    if (length(ok) > 0) return (c(n = n, k = k[ok[1]]))
  }
  return (NULL)
}

res <- min_observations(0.01, 0.1)
if (!is.null(res)){
  print (paste("n is ",toString(res[["n"]]),", k is", toString(res[["k"]])))
}
# NOTE: this script used to quote 42 observations here, but that figure came
# from a type 2 error computed as pbinom(k-1, ...), one outcome short of the
# acceptance region, so it was too optimistic. Re-run for the corrected value.


# if we cut it some slack
# type 1 of 5% and type 2 of 20%
res <- min_observations(0.05, 0.2)
if (!is.null(res)){
  print (paste("n is ",toString(res[["n"]]),", k is", toString(res[["k"]])))
}
# NOTE: the 16 observations previously quoted here suffered from the same
# off-by-one; re-run for the corrected value.
69 | 


--------------------------------------------------------------------------------
/R/RuleOfThree.R:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | true_p <- 0.001
 4 | iter <- function(size){
 5 |   samp <- sample(x= c(1,0),
 6 |                  size = size,
 7 |                  prob = c(true_p, 1 - true_p),
 8 |                  replace = TRUE)
 9 |   cut <- which.max(samp) - 1
10 |   upper_bound <- min(3/cut, 1)
11 |   return(upper_bound)
12 | }
13 | 
# 100000 replications, each based on 10000 trials
res <- replicate(n = 100000, expr = iter(size = 10000))
# number of replications whose upper bound lies above the true probability
sum(res > true_p)

# histogram of the simulated bounds; the red line marks the true probability
ggplot(data = data.frame(res = res), mapping = aes(x = res)) +
  geom_histogram(colour = "black", fill = "grey", bins = 100) +
  geom_vline(aes(xintercept = true_p), color = "red") +
  xlim(c(0, .05)) +
  theme_bw() +
  theme(legend.position = "none") +
  labs(x = "Upper Bound")
25 | 


--------------------------------------------------------------------------------
/R/bayes_god.R:
--------------------------------------------------------------------------------
 1 | ########################
 2 | # Illustrative calculation
 3 | # Article: A Bayesian quest to find God
 4 | # Published: July 19, 2019
 5 | ########################
 6 | 
 7 | library (tidyverse)
 8 | 
 9 | bayes <- function(x, y_x, y_nx){
10 |   num <- y_x * x
11 |   denom <- y_x * x + y_nx * (1-x)
12 |   return (num/denom)
13 | }
14 | 
bayes(0.01, 1, 1/7)

days <- seq(0, 10, 1)

# Day 0 holds the prior; every later day applies one more Bayes update with
# likelihoods 1 and 1/7 to the previous day's value.
update_once <- function(prev, day) bayes(prev, 1, 1/7)
posterior <- Reduce(update_once, x = days[-1], init = 0.01, accumulate = TRUE)

# same chain of updates, starting from a much smaller prior
posterior_2 <- Reduce(update_once, x = days[-1], init = 0.0001, accumulate = TRUE)

#https://www.datanovia.com/en/blog/ggplot-legend-title-position-and-labels/
df <- data.frame(days, posterior, posterior_2)

# posterior by day for the 0.01 prior only
vis1 <- ggplot(df, aes(x = days, y = posterior)) +
  geom_point() +
  scale_x_continuous(breaks = days) +
  labs (title = "           Posterior estimate") +
  theme_classic()

# both priors in one plot, one colour per prior
vis2 <- df %>%
  gather(prior, value, -days) %>%
  ggplot(aes(x = days, y = value, color = prior)) +
  geom_point() +
  scale_color_discrete(name = "Value of prior \n on Day-1",
                       labels = c(0.01, 0.0001)) +
  scale_x_continuous(breaks = days) +
  labs (title = "           Posterior estimate") +
  theme_classic()
50 | 


--------------------------------------------------------------------------------
/R/bayesian_gym.R:
--------------------------------------------------------------------------------
  1 | # load libraries
  2 | library(rjags)
  3 | library(dplyr)
  4 | library(MASS)
  5 | library(ggplot2)
  6 | 
  7 | # load data
  8 | raw_data <- read.csv("~/data/Vietnamese_2016.csv", 
  9 |                      head = TRUE, sep = ";")
 10 | head(raw_data)
 11 | summary(raw_data$Age_gr)
 12 | 
 13 | # clean data
 14 | data <- raw_data %>%
 15 |   filter(Age_gr == "18-29") %>%
 16 |   filter(Sex == "male") %>%
 17 |   dplyr::select(height, weight, BMI) %>%
 18 |   mutate(height = as.numeric(gsub(",", ".", height))) %>%
 19 |   mutate(weight = as.numeric(gsub(",", ".", weight)))
 20 | 
 21 | # my info
 22 | m_height = 168
 23 | m_weight = 58
 24 | m_BMI =  m_weight / (m_height/100)^2
 25 | 
 26 | # visualization:
 27 | truehist(data$weight,nbins = 50, 
 28 |          main = paste("Histogram of Vietnamese male weight"), xlab = "Weight in kg")
 29 | abline(v=m_weight,col="black", lwd = 4)
 30 | abline(v=median(data$weight), col = "red", lty = 4, lwd = 4)
 31 | abline(v=mean(data$weight), col ="orange", lty = 4, lwd = 4)
 32 | text(m_weight-2, 0.12, "Me!!!")
 33 | 
 34 | my_data <- data.frame(height = m_height, weight = m_weight)
 35 | ggplot(data, aes(height, weight)) +
 36 |   geom_point(shape = 16, size = 5, show.legend = FALSE, colour = "blue", alpha = 0.4) + theme_minimal() +
 37 |   geom_point(data = my_data, color ="red", size = 5) + 
 38 |   labs (title = "Weight versus Height plot of 383 Vietnamese male and Tuan", subtitle = "***Red point is author's own measurement") + 
 39 |   theme(plot.title = element_text(color="#666666", face="bold", size=20, hjust=0))
 40 | 
 41 | 
 42 | # add standard least square line
 43 | model <- lm(data$weight ~ data$height) #fit linear model
 44 | label_text <- paste('Fitted model: ', round(coef(model)[1], 3), ' + ', round(coef(model)[2], 3), ' x', sep = '')
 45 | ggplot(data, aes(height, weight)) +
 46 |   geom_point(shape = 16, size = 5, show.legend = FALSE, colour = "blue", alpha = 0.4) + theme_minimal() +
 47 |   geom_smooth(method = "lm", fullrange=TRUE, color = "red") +
 48 |   geom_text(aes(x = 143, y = 55, label = label_text),hjust = 0, size = 6) +
 49 |   geom_point(data = my_data, color ="red", size = 5) + 
 50 |   labs (title = "Weight versus Height plot of 383 Vietnamese male and Tuan") + 
 51 |   theme(plot.title = element_text(color="#666666", face="bold", size=20, hjust=0))
 52 | 
 53 | 
 54 | 
 55 | #### Running JAGS model ####
 56 | ############################
 57 | 
 58 | n <- nrow(data) #383 data points
 59 | 
 60 | mymodel <- "
 61 | model{
 62 | for(i in 1:n){
 63 | y[i] ~ dnorm(a + b*x[i], tau)
 64 | }
 65 | a ~ dnorm(0, 1e-6)
 66 | b ~ dnorm(0, 1e-6)
 67 | tau ~ dgamma(.01,.01)
 68 | sig <- 1/sqrt(tau)
 69 | }
 70 | "
 71 | 
 72 | jm <- jags.model(file = textConnection(mymodel), data=list(n=n, x=data$height, y=data$weight))
 73 | cs <- coda.samples(jm, c("a","b","sig"), 11000)
 74 | sample_data <- as.data.frame(cs[[1]][-(1:1000),])
 75 | 
 76 | cmean <- sample_data$a + sample_data$b*m_height  # "conditional mean"
 77 | 
 78 | m_perc <- pnorm(q = m_weight, mean = cmean, sd = sample_data$sig) 
 79 | truehist(m_perc, main = "Posterior distribution for my weight percentile", 
 80 |          xlab = "percentile", ylab = "Frequency")
 81 | mean(m_perc<=0.4)
 82 | mean(m_perc)
 83 | 
 84 | 
 85 | ### What happen if I compare myself to American men
 86 | nls_data <-read.csv("~/data/national_longitudinal_survey.csv", head = TRUE)
 87 | nls_data <- nls_data %>%
 88 |   filter(Gender == "Male") %>%
 89 |   mutate (height = Height..inches.*2.54) %>%
 90 |   mutate (weight = Weight..lbs./2.2046) %>%
 91 |   dplyr::select(height,weight)
 92 | 
 93 | #4150 data points
 94 | n <- nrow(nls_data)
 95 | 
 96 | mymodel <- "
 97 | model{
 98 | for(i in 1:n){
 99 | y[i] ~ dnorm(a + b*x[i], tau)
100 | }
101 | a ~ dnorm(0, 1e-6)
102 | b ~ dnorm(0, 1e-6)
103 | tau ~ dgamma(.01,.01)
104 | sig <- 1/sqrt(tau)
105 | }
106 | "
107 | 
108 | jm <- jags.model(file = textConnection(mymodel), data=list(n=n, x=nls_data$height, y=nls_data$weight))
109 | cs <- coda.samples(jm, c("a","b","sig"), 11000)
110 | sample_data <- as.data.frame(cs[[1]][-(1:1000),])
111 | 
112 | cmean <- sample_data$a + sample_data$b*m_height
113 | m_perc <- pnorm(q = m_weight, mean = cmean, sd = sample_data$sig) 
114 | truehist(m_perc)
115 | 


--------------------------------------------------------------------------------
/R/dating_sim.R:
--------------------------------------------------------------------------------
 1 | ############################################
 2 | ## The Optimal dating strategy
 3 | ## Why we should always reject the first 37%
 4 | ## An MC simulation
 5 | ############################################
 6 | 
 7 | # calculate the theoretical probability of P(S_n,k)
 8 | theo_prob <- function(x){
 9 |   if (x == 1) return (1/100)
10 |   else return ((x-1)/100 * (sum(1/((x:100)-1))))
11 | }
12 | 
13 | # a util function to simulate the 'best-partner rank'
perm_rank <- function(n){
  # a random permutation of the ranks 1..n
  sample.int(n)
}
17 | 
18 | # simulation(n) will run a MC simulation for the case of N=n
19 | # returning the optimal M and the corresponding optimal probability
# Monte Carlo estimate of the best cut-off M for n candidates.
#
# Arguments:
#   n      number of candidates
#   niter  number of simulated orderings per value of M
# Returns c(best success probability, best M divided by n), where the
# first M - 1 candidates are always rejected.
simulation <- function(n, niter = 1000){
  # one slot per possible M. For M = 1 the first candidate is accepted
  # straight away, which succeeds with probability 1/n, so no simulation
  # is needed for that slot.
  prob_result <- rep(1/n, n)

  # do a simulation for each remaining value of M (none when n == 1)
  for (M in seq_len(n)[-1]){
    result <- rep(0, niter)
    for (i in seq_len(niter)){
      ranks <- perm_rank(n) #simulate the order
      # find the best among the first M-1 that gets rejected
      highest_reject <- min(head(ranks, M-1))
      if (highest_reject != 1){
        # the first candidate better than everyone rejected so far
        accept <- ranks[ranks < highest_reject][1]
        # we consider ourselves successful if:
        # - rank 1 is not included in the first M-1 candidates
        # - rank 1 is the first person who is better than all we have seen
        if (accept == 1){
          result[i] <- 1
        }
      }
    }
    prob_result[M] <- mean(result)
  }
  return (c(max(prob_result), which.max(prob_result)/n))
}
49 | 
# applying simulation(n) to different values of n
# Run the simulation once per n and take both outputs from the same run, so
# the two plots describe the same experiment and the work is not done twice.
n_values <- 2:30
sim_results <- sapply(n_values, simulation)  # row 1: probability, row 2: ratio

opt_p <- sim_results[1, ]
plot(n_values, opt_p, ylim = c(0.2,1), main = 'Optimal probability \n P(S_n,k)',
     xlab = 'N', ylab = 'Probability')

opt_ratio <- sim_results[2, ]
plot(n_values, opt_ratio, ylim = c(0.2,1.1), main = 'Optimal ratio \n M/N',
     xlab = 'N', ylab = 'Ratio')
58 | 


--------------------------------------------------------------------------------
/R/end_to_end_projects.R:
--------------------------------------------------------------------------------
# import necessary libraries
  2 | library(MASS)
  3 | library(dplyr)
  4 | library(caret)
  5 | library (ggplot2)
  6 | library(rpart)
  7 | library(e1071)
  8 | library (leaps)
  9 | 
 10 | # download and extract dataset from source
 11 | link <- "http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz"
 12 | download.file(link, destfile = "~/data/cal_housing.tgz")
 13 | untar("cal_housing.gz")
 14 | cal_housing <- read.csv("~data/CaliforniaHousing/cal_housing.data")
 15 | 
 16 | # explore and visualize data
 17 | head(cal_housing)
 18 | dim(cal_housing)
 19 | str(cal_housing)
 20 | summary(cal_housing) #very useful
 21 | 
 22 | levels(cal_housing$ocean_proximity) #categorical var
 23 | 
 24 | # plot numerical vars
 25 | cal_housing_num <- subset(cal_housing, 
 26 |                           select = -c(ocean_proximity))
 27 | par(mfrow= c(3,3))
 28 | invisible(lapply(names(cal_housing_num), 
 29 |                  function(col_name) truehist(cal_housing_num[,col_name], 
 30 |                                              main = paste("Histogram of ", col_name), 
 31 |                                              xlab = NA)))
 32 | 
 33 | # scatter plot with ggplots
 34 | g <- ggplot(cal_housing, aes(x = longitude, y = latitude, colour = median_income))
 35 | g + geom_point() + scale_color_distiller(palette = "Spectral") +
 36 |   labs(title = "Plot of data points by location and median_income") + 
 37 |   theme(plot.title = element_text(color="black", size=14, face="bold.italic"))
 38 | 
 39 | # correlation
 40 | cor(subset(cal_housing, select = -c(ocean_proximity)),
 41 |     use = "pairwise.complete.obs")[,"median_house_value"]
 42 | 
 43 | #### Data Wrangling #####
 44 | #########################
 45 | 
 46 | # dealing wiht missing data: 3 options
 47 | cal_housing <- subset(cal_housing, 
 48 |                       select = -c(total_bedroom)) #delete column
 49 | 
 50 | cal_housing <- cal_housing[complete.cases(cal_housing),] #remove missing entries
 51 | 
 52 | cal_housing$total_bedrooms[is.na(cal_housing$total_bedrooms)] <- median(cal_housing$total_bedrooms, na.rm=TRUE)#impute NAs with a good statistics (eg: median)
 53 | 
 54 | # other cleaning tasks
 55 | cal_housing <- cal_housing %>%
 56 |   filter(median_house_value < 500000) %>%
 57 |   mutate(rooms_per_house = total_rooms / households) %>%
 58 |   mutate(population_per_house = population / households) %>%
 59 |   mutate(ocean_proximity = as.factor(ocean_proximity)) %>%
 60 |   mutate_at(vars(-ocean_proximity, -median_house_value, -median_income), funs(scale)) %>%
 61 |   data.matrix %>% data.frame
 62 | 
 63 | 
 64 | #### Split to training set and test set ####
 65 | ############################################
 66 | 
 67 | # random sampling
 68 | set.seed(365)
 69 | train_id <- sample(nrow(cal_housing), size = 0.8*nrow(cal_housing))
 70 | train_set <- cal_housing[train_id,]
 71 | test_set <- cal_housing[-train_id,]
 72 | print (paste(nrow(train_set), "train +", nrow(test_set), "test"))
 73 | 
 74 | # stratified sampling
 75 | par(mfrow = c(1,2))
 76 | truehist(cal_housing[,"median_income"], main = paste("Histogram of median income"), xlab = NA)
 77 | cal_housing <- cal_housing %>% #categorize median income
 78 |   mutate(income_level = ceiling(median_income/2)) %>%
 79 |   mutate(income_level = factor(ifelse(income_level >= 5, 5, income_level))) %>%
 80 |   select(-median_income)
 81 | plot(cal_housing$income_level, main = paste("Bar plot of income level"), xlab = NA)
 82 | 
 83 | train_str_id <- createDataPartition(cal_housing$income_level, p =.8,
 84 |                                     list = FALSE, times = 1)
 85 | train_str <- cal_housing[train_str_id,]
 86 | test_str <- cal_housing[-train_str_id,]
 87 | # test to see if we achieve stratified sampling
 88 | table(cal_housing$income_level) / nrow(cal_housing)
 89 | table(train_str$income_level) / nrow(train_str)
 90 | 
 91 | 
 92 | #compare performance of 2 sampling method
 93 | overall<- as.vector(table(cal_housing$income_level) / nrow(cal_housing))
 94 | normal_sampling <- factor(sapply(ceiling(test_set$median_income/2), 
 95 |                                  function(value) ifelse(value >=5, 5, value))) #sapply automatically returns a list
 96 | normal_sampling <- as.vector(table(normal_sampling) / length(normal_sampling))
 97 | str_sampling <- as.vector(table(test_str$income_level) / nrow(test_str))
 98 | compare <- data.frame(overall, str_sampling, normal_sampling) %>%
 99 |   mutate(rand_error = 100*normal_sampling/overall - 100) %>%
100 |   mutate(strat_error = 100*str_sampling/overall-100)
101 | 
102 | compare
103 | 
#### Fit models ####
####################
# Each model is fitted on the stratified training set; the value printed
# after each fit is its RMSE on that same training set.

# linear model
model_lm <- lm(formula = median_house_value ~ ., data = train_str)
summary(model_lm)
predict_lm_train <- predict(model_lm, train_str)
sqrt(mean((predict_lm_train - train_str$median_house_value)^2)) #RMSE


# Decision tree
tree_control <- rpart.control(minsplit = 2, cp = 0.001)
model_decision_tree <- rpart(formula = median_house_value ~ .,
                             data = train_str, method = "anova",
                             control = tree_control)
predict_decision_tree <- predict(model_decision_tree, train_str)
sqrt(mean((predict_decision_tree - train_str$median_house_value)^2)) #RMSE

#SVM
model_svm <- svm(formula = median_house_value ~ .,
                 data = train_str, cost = 10)
predict_svm <- predict(model_svm, train_str)
sqrt(mean((predict_svm - train_str$median_house_value)^2)) #RMSE
126 | 
#### 10-fold cross validation:
cal_housing_copy <- cal_housing[sample(nrow(cal_housing)), ] # randomly shuffle your data

# assign consecutive rows of the shuffled data to 10 folds
folds <- cut(seq(1, nrow(cal_housing_copy)),
             breaks = 10, labels = FALSE)

# running totals of the fold-size-weighted mean squared error per model
MSE_lm <- 0
MSE_tree <- 0
MSE_svm <- 0

for(i in 1:10){
  # rows of fold i are held out, all other rows are used for fitting
  testIndexes <- which(folds == i)
  testData <- cal_housing_copy[testIndexes, ]
  trainData <- cal_housing_copy[-testIndexes, ]

  # fit in the models
  lm_model <- lm(median_house_value ~ ., trainData)
  tree_model <- rpart(median_house_value ~ ., data = trainData, method = "anova",
                      control = rpart.control(minsplit = 2, cp = 0.001))
  svm_model <- svm(median_house_value ~ ., data = trainData, cost = 10)

  # make predictions
  predict1 <- predict(lm_model, testData)
  predict2 <- predict(tree_model, testData)
  predict3 <- predict(svm_model, testData)

  # update MSE: each fold counts in proportion to its share of the rows
  fold_weight <- sum(folds == i) / nrow(cal_housing_copy)
  MSE_lm <- MSE_lm + fold_weight * mean((predict1 - testData$median_house_value)^2)
  MSE_tree <- MSE_tree + fold_weight * mean((predict2 - testData$median_house_value)^2)
  MSE_svm <- MSE_svm + fold_weight * mean((predict3 - testData$median_house_value)^2)
}

# cross-validated RMSE of each model
sqrt(MSE_lm)
sqrt(MSE_tree)
sqrt(MSE_svm)
166 | 
#### Tuning parameters ####
###########################

# Decision tree: grid search over minsplit and cp (4 x 4 = 16 combinations).
tune_tree <- tune.rpart(median_house_value ~ .,
                        data = train_str,
                        minsplit = c(5, 10, 15, 20),
                        cp = c(0.1, 0.01, 0.001, 0.0001))
summary(tune_tree)
plot(tune_tree)

# Keep the best tree found by the search and report its RMSE on the
# training data.
best_tree <- tune_tree$best.model
predict_tree <- predict(best_tree, train_str)
sqrt(mean((train_str$median_house_value - predict_tree)^2)) #RMSE of best tree model

# SVM: grid search over cost and gamma.
# The grid used to be gamma = c(0.1, 0, 1). With svm()'s default radial kernel,
# exp(-gamma * |u - v|^2), gamma = 0 turns the kernel into a constant, so that
# candidate can never be a useful model and only costs a full round of
# cross-validation per cost value. The stray 0 (most likely a typo for
# c(0.1, 1)) is dropped; the remaining candidates are unchanged.
tune_svm <- tune.svm(median_house_value ~ .,
                     data = train_str,
                     cost = 10^(-1:2), gamma = c(0.1, 1))
summary(tune_svm)
plot(tune_svm)

# Keep the best SVM found by the search and report its RMSE on the
# training data.
best_svm <- tune_svm$best.model
predict_svm <- predict(best_svm, train_str)
sqrt(mean((train_str$median_house_value - predict_svm)^2)) #RMSE of best SVM model
190 | 
191 | 
#### Applying on test set ####
##############################

# RMSE of the tuned decision tree on the held-out test set.
predict_tree_final <- predict(best_tree, newdata = test_str)
tree_test_error <- test_str$median_house_value - predict_tree_final
sqrt(mean(tree_test_error^2))

# RMSE of the tuned SVM on the held-out test set.
predict_svm_final <- predict(best_svm, newdata = test_str)
svm_test_error <- test_str$median_house_value - predict_svm_final
sqrt(mean(svm_test_error^2))


--------------------------------------------------------------------------------
/R/lindy/Inverse_Random_Sampling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/R/lindy/Inverse_Random_Sampling.pdf


--------------------------------------------------------------------------------
/R/lindy/lindy_simulation.R:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | # conditional weibull
 4 | # subtract t_0 to get exepcted years left
 5 | sample_w <- function(u, lambda, kappa, t_0){
 6 |   (t_0^kappa - lambda^kappa*log(1-u))^(1/kappa) - t_0
 7 | }
 8 | # conditional pareto
 9 | sample_p <- function(u, t_0, alpha = 2){
10 |   t_0*(1-u)^(-1/alpha) - t_0
11 | }
12 | 
# For each number of years already survived (0, 10, ..., 80), estimate the
# mean remaining lifetime under both models from 10000 uniform draws.
# The same draws feed both samplers within an iteration.
year_range <- seq(0, 80, 10)
result_w <- numeric(length(year_range))
result_p <- numeric(length(year_range))
for (idx in seq_along(year_range)) {
  years_passed <- year_range[idx]
  u <- runif(10000)
  result_w[idx] <- mean(sample_w(u, 77.1, 5.05, years_passed))
  result_p[idx] <- mean(sample_p(u, years_passed))
}
23 | 
# Plot the simulated mean remaining years for both models against the years
# already passed: solid black for the Weibull results, dashed green for the
# Pareto results.
plot(x = year_range, y = result_w,
     type = "l",
     ylim = c(0, 100),
     bty = "n",
     main = "Expected remaining year",
     xlab = "Year passed",
     ylab = "Years remaining")
lines(x = year_range, y = result_p, lty = 2, col = "green")
# xpd = TRUE stops clipping at the plot region, so the legend placed at the
# top edge (y = 100) is drawn in full.
par(xpd = TRUE)
legend(x = 4.5, y = 100,
       legend = c("Human life time", "Lindy's good"),
       col = c("black", "green"),
       lty = c(1, 2),
       ncol = 2)
37 | 
# Effect of the Pareto shape parameter: density of the remaining years after
# the first 20 years for alpha = 1.5, 2 and 3, all driven by the same
# uniform draws.
n_draws <- 100000
u <- runif(n_draws)
samps_p2 <- sample_p(u, 20, alpha = 1.5)
samps_p1 <- sample_p(u, 20, alpha = 2)
samps_p3 <- sample_p(u, 20, alpha = 3)
# Long format: one row per draw, labelled with the alpha that produced it
# (labels are listed in the same order as the samples below).
shape_labels <- c("alpha = 1.5", "alpha = 2", "alpha = 3")
df <- data.frame(type = rep(shape_labels, each = n_draws),
                 value = c(samps_p2, samps_p1, samps_p3))
ggplot(df, aes(x = value, fill = type)) +
  geom_density(alpha = .3) +
  xlim(0, 50) +
  labs(title = "pdf of years remaining after the first 20 years",
       x = "Years remaining",
       y = "Probability")
53 | 
54 | 
55 | 
# After 20 years: compare the remaining-years densities of the two models,
# using the same uniform draws for both, and mark each sample mean with a
# dashed vertical line (red = Weibull mean, blue = Pareto mean).
n_draws <- 100000
u <- runif(n_draws)
samps_w <- sample_w(u, 77.1, 5.05, 20)
samps_p <- sample_p(u, 20)
mean_w <- mean(samps_w)
mean_p <- mean(samps_p)
# Long format: one row per draw, labelled with the model that produced it.
df <- data.frame(type = rep(c("Human life time", "Lindy's good"), each = n_draws),
                 value = c(samps_w, samps_p))
ggplot(df, aes(x = value, fill = type)) +
  geom_density(alpha = .3) +
  geom_vline(xintercept = mean_p, linetype = "dashed", color = "blue") +
  geom_vline(xintercept = mean_w, linetype = "dashed", color = "red") +
  xlim(0, 100) +
  labs(title = "pdf of years remaining after the first 20 years",
       x = "Years remaining",
       y = "Probability")


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | ## Overview
 2 | 
 3 | In this repository, you will find the source code for various projects that I have completed or that are still works in progress. The majority of the projects are accompanied by a Medium blog post at [tuannguyen-doan.medium.com](https://tuannguyen-doan.medium.com/). I publish almost exclusively in the Towards Data Science publication through Medium's Partnership program, so please check out these articles as a way to support me and my future projects. Alternatively, you can also find my blog posts on my personal website [here](https://tuangauss.github.io/).
 4 | 
 5 | My interests lie in the intersection of statistical techniques, data visualization and sports (especially football). All the code is written entirely in Python or R. I don't have a strong preference for, or make a concerted effort to code in, a specific language/platform. The decision is mostly based on how well the specific functionalities needed for a project are supported (scraping in Python and data processing with dplyr piping in R).
 6 | 
 7 | ### I. Statistical application:
 8 | 
 9 | #### The statistics of modern football:
10 | A collection of projects that explore the intricate statistical aspects of the Beautiful Game.
11 | 
12 | - [Empirical Bayes and penalty taking ability](https://towardsdatascience.com/men-of-steel-finding-the-best-penalty-takers-with-empirical-bayes-estimation-aa0e126fb08b) - Using Bayesian statistics to make meaningful comparison between players across Europe.
13 | - [Poisson process and match prediction](https://towardsdatascience.com/o-jogo-bonito-predicting-the-premier-league-with-a-random-model-1b02fa3a7e5a) - Here we learn about the Poisson process and how a random model outperforms football experts with its prediction.
14 | - [The mathematics of football betting strategies](https://towardsdatascience.com/making-big-bucks-with-a-data-driven-sports-betting-strategy-6c21a6869171) - With the Poisson model and some additional help from mathematical research, can we beat the bookies?
15 | - [Fisher vs Neyman-Pearson debate and Paul the Octopus](https://towardsdatascience.com/what-can-an-octopus-tell-us-about-the-biggest-debate-in-statistical-theory-f017295d781f) - We go over the theory (or many theories) of hypothesis testing and see how it applies to the psychic ability of Paul the Octopus.
16 | 
17 | #### Statistical theory and its application:
18 | 
19 | - [Bayes theorem and a probabilistic argument for God](https://towardsdatascience.com/a-bayesian-quest-to-find-god-b30934972473) - Bayes' theorem and how people have been using it to justify the necessary existence of God.
20 | - [Dating with probability theory](https://towardsdatascience.com/probability-theory-and-the-optimal-dating-strategy-for-2018-2b75b26fb0b) - Here we explore what probability theory has to say about the most optimal strategy to find the love of your life.
21 | - [Bayes theorem and why it matters to my workout routine](https://towardsdatascience.com/how-bayesian-statistics-convinced-me-to-hit-the-gym-fa737b0a7ac) - A lightweight introduction to Bayes' theorem and how it helps convince me to hit the gym.
22 | - [The Rule of Three and its application](https://towardsdatascience.com/the-rule-of-three-calculating-the-probability-of-events-that-have-not-yet-occurred-106144dc2c39) - A short introduction of the Rule of Three and how we can apply it to calculate the probability of events that have yet to happen. Application in voting, vaccine development, product quality monitoring, etc.
23 | - [Lindy's effect](https://towardsdatascience.com/a-statistical-rule-to-optimize-your-life-the-lindys-effect-96d2c75b080d) - A (slightly) mathematical description of the Lindy's effect and how one can use it as a guide for life.
24 | - [Normal Distribution with High Dimensionality](https://towardsdatascience.com/disney-movies-were-right-we-are-all-special-and-statistically-so-3bb56e79ab71) - A statistical investigation into the myth of the "average Joe."
25 | - [Mark-Recapture method](https://medium.com/towards-data-science/the-statistical-theory-behind-why-your-instagram-posts-have-so-few-likes-31f46d03448b) - An intro to the statistics behind sampling theory and how you can use it to count *almost* everything
26 | 
27 | ### II. External Collaborations:
28 | 
29 | #### Published papers:
30 | - [A robust and scalable method to compare Percentile metrics in online experiments (Quora Data Blog, 2022)](https://quoradata.quora.com/A-Robust-and-Scalable-method-to-compare-Percentile-Metrics-in-online-experiments) - Conducting statistical tests for Percentile metrics can be tricky, as they have less neat mathematical properties than other more common metrics, such as averages or ratios. I discuss Quora's method to A/B test these metrics in a statistically valid and scalable manner.
31 | - [How social learning amplifies moral outrage expression in online social networks (Science Advances, 2021)](https://www.science.org/doi/pdf/10.1126/sciadv.abe5641) - Moral outrage shapes fundamental aspects of social life and is now widespread in online social networks. Here, we show how social learning processes amplify online moral outrage expressions over time.
32 | - [Application of machine learning models in predicting length of stay among healthcare workers in underserved communities in South Africa (Human Resources for Health, 2018)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6293620/) - We aim to use machine learning methods to predict health professionals’ length of practice in the rural public healthcare sector based on their demographic information.
33 | 
34 | ### III. General tutorials with Python and R:
35 | 
36 | #### Data visualization:
37 | - [NetworkX and Basemap](https://towardsdatascience.com/catching-that-flight-visualizing-social-network-with-networkx-and-basemap-ce4a0d2eaea6) - Here is a comprehensive tutorial of how we can visualize geographical data with powerful tools that support Python.
38 | - [Tkinter and Python](https://towardsdatascience.com/having-your-own-fun-how-to-build-your-own-macys-firework-show-with-python-and-tkinter-79cc31631b44) - Building your own firework shows with Tkinter (and some math chops).
39 | - [Data visualization with Matplotlib and Seaborn](https://towardsdatascience.com/advanced-sports-visualization-with-pandas-matplotlib-and-seaborn-9c16df80a81b) - Learn how to construct publication-worthy visualizations with the Matplotlib and Seaborn packages.
40 | 
41 | #### Machine Learning practicals:
42 | - [End-to-end Machine Learning project with R](https://github.com/tuangauss/DataScienceProjects/blob/master/R/end_to_end_projects.R) - Here is a full data science project that covers data collection, cleaning, visualization, machine learning and validation.
43 | - [Unsupervised Learning - Clustering method with R](https://github.com/tuangauss/DataScienceProjects/blob/master/R/EPL/Misc/TeamEvaluate2015.R) - An introduction to an array of unsupervised learning algorithms: Hierarchical clustering, k-means, and Factor Analysis.
44 | - [Collaborative Filtering with Python](https://towardsdatascience.com/building-my-own-2021-book-recommendation-engine-903ea10d5021) - A comprehensive guide to the mathematical details and implementation of popular Matrix Factorization methods.
45 | 
46 | 
47 | 


--------------------------------------------------------------------------------
/data/Team2015season.csv:
--------------------------------------------------------------------------------
 1 | Team,DribbledPast,Interception,Fouled,Fouls,Yellow,Red,Clearance,Total block,Total goals,Successful Dribble,Total Aerial challenges,AccLB,InAccLB,AccSP,InAccSP,Possession per game,Average salary per week_1,Summer transfer_1,Net spend on transfer,Average attendance per game
 2 | Arsenal,376,767,418,376,68,2,1004,610,69,599,1390,877,871,16342,2446,55.7,166000,78000000,46400000,59992
 3 | Manchester City,482,571,297,445,77,2,872,670,82,434,1188,902,1031,17790,2381,57,205000,53000000,32000000,45365
 4 | Chelsea,387,370,432,382,73,4,1026,564,72,489,1299,1153,1118,15709,2289,54.1,192000,88500000,-800000,41546
 5 | Manchester United,302,680,395,453,64,5,1117,513,60,374,1188,1736,1233,16407,1933,58.8,215000,145500000,122000000,75335
 6 | Southampton,336,628,396,469,57,3,879,510,49,265,1308,1325,1397,13170,2289,51.9,63000,57500000,-30700000,30652
 7 | Liverpool,324,441,466,385,66,3,1177,590,48,438,1161,1052,1097,14988,2240,54.4,144000,117000000,35700000,44659
 8 | Tottenham,315,678,383,442,79,4,1045,527,56,413,1232,1113,1248,14575,2392,55.3,100000,27000000,-6500000,35728
 9 | Stoke,366,501,409,487,82,1,1170,502,47,357,1655,1166,1257,10747,2206,50.2,61000,3000000,400000,27081
10 | Swansea,372,692,463,398,48,5,1202,426,43,351,1205,1092,1201,13586,1998,50.7,63000,5500000,1300000,20555
11 | West Ham,330,507,419,420,64,2,1474,489,43,260,1608,1098,1448,9022,1998,46.7,64000,23800000,31000000,34871
12 | Leicester,362,743,348,456,50,4,1179,457,45,349,1960,1022,1683,8460,2177,44.8,36300,8000000,8000000,31693
13 | Crystal Palace,340,601,434,527,63,4,1269,442,46,357,1759,945,1665,7435,2024,42.7,46000,12900000,11000000,24421
14 | Everton,341,444,450,390,66,2,1285,483,46,392,1228,1270,1289,14236,2008,53.6,69000,32800000,32800000,38406
15 | Burnley,403,565,398,407,64,2,1473,430,27,243,1888,1296,1724,8292,2315,44,21000,1500000,10000000,19131
16 | West Bromwich Albion,352,588,406,423,64,3,1090,411,37,241,1303,1054,1674,9771,1932,44.9,65000,14000000,13600000,25064
17 | Queens Park Rangers,387,561,392,447,75,3,1445,535,39,307,1732,1307,1698,8362,2294,46.3,78000,28000000,21000000,17809
18 | Hull,378,606,401,451,73,6,1317,429,32,255,1619,1171,1554,10192,1942,45.6,43000,32900000,24400000,23557
19 | Sunderland,436,579,447,441,94,2,993,408,30,341,1176,1081,1404,9922,2043,46.2,70000,14000000,9500000,43157
20 | Newcastle United,321,643,441,434,65,7,1025,468,40,444,1549,952,1424,10593,2052,48.1,78000,29500000,24900000,50359
21 | Aston Villa,286,484,422,401,70,7,1129,417,31,287,1400,1323,1419,11724,2051,49.1,69000,4700000,5900000,34133
22 | Juventus,300,555,501,525,73,5,762,602,71,405,940,1288,843,16023,2249,55.7,122000,46000000,11000000,38553
23 | Lazio,355,751,456,678,104,9,934,551,70,375,1215,1229,1259,12717,2185,52.7,55100,12250000,12750000,35500
24 | Inter,336,632,485,560,95,5,833,615,58,433,1142,952,874,16494,2242,57.8,70200,15700000,-3150000,37372
25 | Roma,295,670,511,521,103,5,910,525,53,420,894,1547,1080,15893,1990,57.5,94500,98600000,9075000,40118
26 | Fiorentina,360,626,555,538,82,3,853,616,61,383,977,1261,1031,14957,1996,55.9,56000,9100000,-24000000,30266
27 | Napoli,380,641,484,524,78,6,842,624,65,303,1062,1187,1153,15406,2213,54.1,67000,34050000,29490000,32266
28 | Genoa,301,592,622,636,103,6,1034,489,58,394,1042,1029,1201,11326,2191,51.2,35500,4340000,10160000,20882
29 | Sampdoria,297,559,600,615,97,3,1043,503,46,346,949,1363,1125,10715,1901,49,38500,24050000,-1700000,22276
30 | Palermo,354,522,549,582,84,4,1048,479,52,403,1144,1101,1201,11345,2182,49,24300,19700000,2030000,17466
31 | Torino,291,617,529,545,97,5,806,546,48,339,975,1340,1092,11818,1845,48.8,22500,34200000,3500000,17234
32 | AC Milan,261,682,615,541,81,13,905,450,54,294,898,1392,1225,12564,1906,51.2,97000,20600000,-15600000,36730
33 | Sassuolo,382,732,668,586,101,9,1231,448,46,264,1121,1069,1369,9976,1927,47.2,28000,16000000,2200000,13086
34 | Chievo,338,665,579,625,82,3,926,458,27,258,1162,1234,1449,8069,2071,43.6,18300,8000000,2510000,10652
35 | Empoli,414,371,610,492,58,4,953,516,43,424,1015,799,1207,13920,2437,51.4,11000,0,3750000,9229
36 | Cagliari,353,585,621,595,109,6,934,495,45,445,948,826,1054,12505,2343,49.1,16000,20200000,1700000,10551
37 | Verona,357,565,511,523,97,7,1159,461,48,239,1005,1142,1159,9497,1774,43.9,20500,16700000,-6200000,19312
38 | Udinese,321,503,504,532,91,6,1151,426,41,277,1290,1093,1484,9529,2007,45.2,25000,29200000,14850000,9132
39 | Atalanta,336,603,604,630,103,9,1041,461,37,254,1298,1158,1312,9702,2183,46.2,19700,0,-5400000,15673
40 | Cesena,370,563,557,632,102,4,1167,360,34,232,1303,1325,1294,9231,2243,45.2,11500,825000,950000,16204
41 | Parma,357,607,484,662,100,8,853,418,31,270,904,1218,1316,10543,2060,45.2,26200,4320000,-6360000,11758
42 | Barcelona,260,496,557,371,67,3,494,625,108,527,739,1315,682,21439,2371,65.3,347900,166000000,85000000,77632
43 | Real Madrid,343,482,544,406,85,3,658,686,116,477,925,1285,803,16083,1952,56.2,328000,135500000,21500000,73081
44 | Atletico Madrid,372,606,539,544,110,5,871,437,65,270,1596,1034,1242,10810,2443,49.4,105000,117600000,29600000,46603
45 | Valencia,294,683,613,568,103,9,1026,441,65,313,1351,1242,1223,10432,2102,50.7,73000,53300000,850000,43205
46 | Sevilla,350,614,517,609,116,2,811,463,68,320,1283,1092,1347,10597,2306,49.9,78600,20950000,-29600000,30671
47 | Villarreal,248,721,443,526,94,2,993,507,48,278,1302,1047,1287,10524,2535,48.9,44200,18700000,-3800000,16040
48 | Celta Vigo,334,673,495,586,117,5,771,499,47,375,1452,1233,1392,13577,2409,55.2,18700,0,0,19156
49 | Athletic Club,320,751,511,532,96,4,848,443,41,270,1699,1100,1645,10929,2753,50.8,38900,1000000,-35000000,43454
50 | Real Sociedad,261,606,559,514,88,1,1040,429,41,265,1533,1246,1366,10110,2415,50.3,49900,12000000,-33200000,22103
51 | Espanyol,328,637,535,594,111,6,964,368,47,266,1405,1013,1496,8546,2208,46.4,32200,2200000,-8800000,18693
52 | Rayo Vallecano,320,658,525,554,106,6,760,492,46,301,1421,1396,1368,12218,2309,55.4,15800,500000,150000,10628
53 | Malaga,400,631,509,571,107,8,830,461,40,297,1248,1157,1324,9233,2285,48.9,27500,1800000,-17600000,24530
54 | Deportivo La Coruna,351,748,498,554,103,4,892,399,33,284,1348,1044,1432,9884,2216,48.3,15300,200000,200000,21271
55 | Getafe,332,757,474,542,96,4,901,421,32,364,1301,1069,1439,9340,2169,47.3,18700,0,-8000000,7753
56 | Almeria,270,660,518,560,107,11,1021,394,33,300,1296,1117,1490,7317,2019,46.1,11700,0,-3000000,10405
57 | Eibar,343,653,410,568,103,5,963,354,33,216,1515,1257,1682,7366,2335,44.6,12800,162000,-5162000,4780
58 | Cordoba,338,692,496,487,95,9,1029,389,21,337,1433,1014,1383,9839,2126,47.4,16300,500000,500000,16126
59 | Granada,252,668,488,572,112,5,976,401,29,337,1307,1091,1331,8511,1992,46.9,22800,5750000,-10000000,17248
60 | Levante,296,688,480,558,107,3,835,404,34,286,1495,1092,1555,8046,2153,44.1,17000,0,0,15267
61 | Elche,318,562,485,588,103,6,834,407,35,248,1287,1279,1564,9889,2138,47.9,12500,1350000,-4650000,21684
62 | 


--------------------------------------------------------------------------------
/data/fixtures.csv:
--------------------------------------------------------------------------------
  1 | DIVISION,DATE,TIME,FIXTURE,HOME TEAM,AWAY TEAM
  2 | EPL,8/10/2018,20:00,Manchester United V Leicester City,Manchester United,Leicester City
  3 | EPL,8/11/2018,12:30,Newcastle United V Tottenham Hotspur,Newcastle United,Tottenham Hotspur
  4 | EPL,8/11/2018,15:00,Bournemouth V Cardiff City,Bournemouth,Cardiff City
  5 | EPL,8/11/2018,15:00,Fulham V Crystal Palace,Fulham,Crystal Palace
  6 | EPL,8/11/2018,15:00,Huddersfield Town V Chelsea,Huddersfield Town,Chelsea
  7 | EPL,8/11/2018,15:00,Watford V Brighton and Hove Albion,Watford,Brighton and Hove Albion
  8 | EPL,8/11/2018,17:30,Wolverhampton Wanderers V Everton,Wolverhampton Wanderers,Everton
  9 | EPL,8/12/2018, 13:30,Liverpool V West Ham United,Liverpool,West Ham United
 10 | EPL,8/12/2018, 13:30,Southampton V Burnley,Southampton,Burnley
 11 | EPL,8/12/2018, 16:00,Arsenal V Manchester City,Arsenal,Manchester City
 12 | EPL,8/18/2018, 12:30,Cardiff City V Newcastle United,Cardiff City,Newcastle United
 13 | EPL,8/18/2018, 15:00,Everton V Southampton,Everton,Southampton
 14 | EPL,8/18/2018, 15:00,Leicester City V Wolverhampton Wanderers,Leicester City,Wolverhampton Wanderers
 15 | EPL,8/18/2018, 15:00,Tottenham Hotspur V Fulham,Tottenham Hotspur,Fulham
 16 | EPL,8/18/2018, 15:00,West Ham United V Bournemouth,West Ham United,Bournemouth
 17 | EPL,8/18/2018, 17:30,Chelsea V Arsenal,Chelsea,Arsenal
 18 | EPL,8/19/2018, 13:30,Burnley V Watford,Burnley,Watford
 19 | EPL,8/19/2018, 13:30,Manchester City V Huddersfield Town,Manchester City,Huddersfield Town
 20 | EPL,8/19/2018, 16:00,Brighton and Hove Albion V Manchester United,Brighton and Hove Albion,Manchester United
 21 | EPL,8/20/2018, 20:00,Crystal Palace V Liverpool,Crystal Palace,Liverpool
 22 | EPL,8/25/2018, 12:30,Wolverhampton Wanderers V Manchester City,Wolverhampton Wanderers,Manchester City
 23 | EPL,8/25/2018, 15:00,Arsenal V West Ham United,Arsenal,West Ham United
 24 | EPL,8/25/2018, 15:00,Bournemouth V Everton,Bournemouth,Everton
 25 | EPL,8/25/2018, 15:00,Fulham V Burnley,Fulham,Burnley
 26 | EPL,8/25/2018, 15:00,Huddersfield Town V Cardiff City,Huddersfield Town,Cardiff City
 27 | EPL,8/25/2018, 15:00,Southampton V Leicester City,Southampton,Leicester City
 28 | EPL,8/25/2018, 17:30,Liverpool V Brighton and Hove Albion,Liverpool,Brighton and Hove Albion
 29 | EPL,8/26/2018, 13:30,Watford V Crystal Palace,Watford,Crystal Palace
 30 | EPL,8/26/2018, 16:00,Newcastle United V Chelsea,Newcastle United,Chelsea
 31 | EPL,8/27/2018, 20:00,Manchester United V Tottenham Hotspur,Manchester United,Tottenham Hotspur
 32 | EPL,9/1/2018, 12:30,Leicester City V Liverpool,Leicester City,Liverpool
 33 | EPL,9/1/2018, 15:00,Brighton and Hove Albion V Fulham,Brighton and Hove Albion,Fulham
 34 | EPL,9/1/2018, 15:00,Burnley V Manchester United,Burnley,Manchester United
 35 | EPL,9/1/2018, 15:00,Chelsea V Bournemouth,Chelsea,Bournemouth
 36 | EPL,9/1/2018, 15:00,Crystal Palace V Southampton,Crystal Palace,Southampton
 37 | EPL,9/1/2018, 15:00,Everton V Huddersfield Town,Everton,Huddersfield Town
 38 | EPL,9/1/2018, 15:00,West Ham United V Wolverhampton Wanderers,West Ham United,Wolverhampton Wanderers
 39 | EPL,9/1/2018, 17:30,Manchester City V Newcastle United,Manchester City,Newcastle United
 40 | EPL,9/2/2018, 13:30,Cardiff City V Arsenal,Cardiff City,Arsenal
 41 | EPL,9/2/2018, 16:00,Watford V Tottenham Hotspur,Watford,Tottenham Hotspur
 42 | EPL,9/15/2018, 12:30,Tottenham Hotspur V Liverpool,Tottenham Hotspur,Liverpool
 43 | EPL,9/15/2018, 15:00,Bournemouth V Leicester City,Bournemouth,Leicester City
 44 | EPL,9/15/2018, 15:00,Chelsea V Cardiff City,Chelsea,Cardiff City
 45 | EPL,9/15/2018, 15:00,Huddersfield Town V Crystal Palace,Huddersfield Town,Crystal Palace
 46 | EPL,9/15/2018, 15:00,Manchester City V Fulham,Manchester City,Fulham
 47 | EPL,9/15/2018, 15:00,Newcastle United V Arsenal,Newcastle United,Arsenal
 48 | EPL,9/15/2018, 17:30,Watford V Manchester United,Watford,Manchester United
 49 | EPL,9/16/2018, 13:30,Wolverhampton Wanderers V Burnley,Wolverhampton Wanderers,Burnley
 50 | EPL,9/16/2018, 16:00,Everton V West Ham United,Everton,West Ham United
 51 | EPL,9/17/2018, 20:00,Southampton V Brighton and Hove Albion,Southampton,Brighton and Hove Albion
 52 | EPL,9/22/2018, 12:30,Fulham V Watford,Fulham,Watford
 53 | EPL,9/22/2018, 15:00,Burnley V Bournemouth,Burnley,Bournemouth
 54 | EPL,9/22/2018, 15:00,Cardiff City V Manchester City,Cardiff City,Manchester City
 55 | EPL,9/22/2018, 15:00,Crystal Palace V Newcastle United,Crystal Palace,Newcastle United
 56 | EPL,9/22/2018, 15:00,Leicester City V Huddersfield Town,Leicester City,Huddersfield Town
 57 | EPL,9/22/2018, 15:00,Liverpool V Southampton,Liverpool,Southampton
 58 | EPL,9/22/2018, 15:00,Manchester United V Wolverhampton Wanderers,Manchester United,Wolverhampton Wanderers
 59 | EPL,9/22/2018, 17:30,Brighton and Hove Albion V Tottenham Hotspur,Brighton and Hove Albion,Tottenham Hotspur
 60 | EPL,9/23/2018, 13:30,West Ham United V Chelsea,West Ham United,Chelsea
 61 | EPL,9/23/2018, 16:00,Arsenal V Everton,Arsenal,Everton
 62 | EPL,9/29/2018, 12:30,West Ham United V Manchester United,West Ham United,Manchester United
 63 | EPL,9/29/2018, 15:00,Arsenal V Watford,Arsenal,Watford
 64 | EPL,9/29/2018, 15:00,Everton V Fulham,Everton,Fulham
 65 | EPL,9/29/2018, 15:00,Huddersfield Town V Tottenham Hotspur,Huddersfield Town,Tottenham Hotspur
 66 | EPL,9/29/2018, 15:00,Manchester City V Brighton and Hove Albion,Manchester City,Brighton and Hove Albion
 67 | EPL,9/29/2018, 15:00,Newcastle United V Leicester City,Newcastle United,Leicester City
 68 | EPL,9/29/2018, 15:00,Wolverhampton Wanderers V Southampton,Wolverhampton Wanderers,Southampton
 69 | EPL,9/29/2018, 17:30,Chelsea V Liverpool,Chelsea,Liverpool
 70 | EPL,9/30/2018, 16:00,Cardiff City V Burnley,Cardiff City,Burnley
 71 | EPL,10/1/2018, 20:00,Bournemouth V Crystal Palace,Bournemouth,Crystal Palace
 72 | EPL,10/5/2018, 20:00,Brighton and Hove Albion V West Ham United,Brighton and Hove Albion,West Ham United
 73 | EPL,10/6/2018, 15:00,Burnley V Huddersfield Town,Burnley,Huddersfield Town
 74 | EPL,10/6/2018, 15:00,Crystal Palace V Wolverhampton Wanderers,Crystal Palace,Wolverhampton Wanderers
 75 | EPL,10/6/2018, 15:00,Leicester City V Everton,Leicester City,Everton
 76 | EPL,10/6/2018, 15:00,Tottenham Hotspur V Cardiff City,Tottenham Hotspur,Cardiff City
 77 | EPL,10/6/2018, 15:00,Watford V Bournemouth,Watford,Bournemouth
 78 | EPL,10/6/2018, 17:30,Manchester United V Newcastle United,Manchester United,Newcastle United
 79 | EPL,10/7/2018, 12:00,Fulham V Arsenal,Fulham,Arsenal
 80 | EPL,10/7/2018, 14:15,Southampton V Chelsea,Southampton,Chelsea
 81 | EPL,10/7/2018, 16:30,Liverpool V Manchester City,Liverpool,Manchester City
 82 | EPL,10/20/2018, 12:30,Chelsea V Manchester United,Chelsea,Manchester United
 83 | EPL,10/20/2018, 15:00,Bournemouth V Southampton,Bournemouth,Southampton
 84 | EPL,10/20/2018, 15:00,Cardiff City V Fulham,Cardiff City,Fulham
 85 | EPL,10/20/2018, 15:00,Manchester City V Burnley,Manchester City,Burnley
 86 | EPL,10/20/2018, 15:00,Newcastle United V Brighton and Hove Albion,Newcastle United,Brighton and Hove Albion
 87 | EPL,10/20/2018, 15:00,West Ham United V Tottenham Hotspur,West Ham United,Tottenham Hotspur
 88 | EPL,10/20/2018, 15:00,Wolverhampton Wanderers V Watford,Wolverhampton Wanderers,Watford
 89 | EPL,10/20/2018, 17:30,Huddersfield Town V Liverpool,Huddersfield Town,Liverpool
 90 | EPL,10/21/2018, 16:00,Everton V Crystal Palace,Everton,Crystal Palace
 91 | EPL,10/22/2018, 20:00,Arsenal V Leicester City,Arsenal,Leicester City
 92 | EPL,10/27/2018, 12:30,Manchester United V Everton,Manchester United,Everton
 93 | EPL,10/27/2018, 15:00,Brighton and Hove Albion V Wolverhampton Wanderers,Brighton and Hove Albion,Wolverhampton Wanderers
 94 | EPL,10/27/2018, 15:00,Fulham V Bournemouth,Fulham,Bournemouth
 95 | EPL,10/27/2018, 15:00,Liverpool V Cardiff City,Liverpool,Cardiff City
 96 | EPL,10/27/2018, 15:00,Southampton V Newcastle United,Southampton,Newcastle United
 97 | EPL,10/27/2018, 15:00,Watford V Huddersfield Town,Watford,Huddersfield Town
 98 | EPL,10/27/2018, 17:30,Leicester City V West Ham United,Leicester City,West Ham United
 99 | EPL,10/28/2018, 13:30,Burnley V Chelsea,Burnley,Chelsea
100 | EPL,10/28/2018, 13:30,Crystal Palace V Arsenal,Crystal Palace,Arsenal
101 | EPL,10/28/2018, 16:00,Tottenham Hotspur V Manchester City,Tottenham Hotspur,Manchester City
102 | EPL,11/3/2018, 12:30,Bournemouth V Manchester United,Bournemouth,Manchester United
103 | EPL,11/3/2018, 15:00,Cardiff City V Leicester City,Cardiff City,Leicester City
104 | EPL,11/3/2018, 15:00,Everton V Brighton and Hove Albion,Everton,Brighton and Hove Albion
105 | EPL,11/3/2018, 15:00,Manchester City V Southampton,Manchester City,Southampton
106 | EPL,11/3/2018, 15:00,Newcastle United V Watford,Newcastle United,Watford
107 | EPL,11/3/2018, 15:00,West Ham United V Burnley,West Ham United,Burnley
108 | EPL,11/3/2018, 17:30,Arsenal V Liverpool,Arsenal,Liverpool
109 | EPL,11/4/2018, 13:30,Wolverhampton Wanderers V Tottenham Hotspur,Wolverhampton Wanderers,Tottenham Hotspur
110 | EPL,11/4/2018, 16:00,Chelsea V Crystal Palace,Chelsea,Crystal Palace
111 | EPL,11/5/2018, 20:00,Huddersfield Town V Fulham,Huddersfield Town,Fulham
112 | EPL,11/10/2018, 12:30,Cardiff City V Brighton and Hove Albion,Cardiff City,Brighton and Hove Albion
113 | EPL,11/10/2018, 15:00,Huddersfield Town V West Ham United,Huddersfield Town,West Ham United
114 | EPL,11/10/2018, 15:00,Leicester City V Burnley,Leicester City,Burnley
115 | EPL,11/10/2018, 15:00,Newcastle United V Bournemouth,Newcastle United,Bournemouth
116 | EPL,11/10/2018, 15:00,Southampton V Watford,Southampton,Watford
117 | EPL,11/10/2018, 17:30,Crystal Palace V Tottenham Hotspur,Crystal Palace,Tottenham Hotspur
118 | EPL,11/11/2018, 12:00,Liverpool V Fulham,Liverpool,Fulham
119 | EPL,11/11/2018, 14:15,Chelsea V Everton,Chelsea,Everton
120 | EPL,11/11/2018, 16:30,Arsenal V Wolverhampton Wanderers,Arsenal,Wolverhampton Wanderers
121 | EPL,11/11/2018, 16:30,Manchester City V Manchester United,Manchester City,Manchester United
122 | EPL,11/24/2018, 15:00,Brighton and Hove Albion V Leicester City,Brighton and Hove Albion,Leicester City
123 | EPL,11/24/2018, 15:00,Everton V Cardiff City,Everton,Cardiff City
124 | EPL,11/24/2018, 15:00,Fulham V Southampton,Fulham,Southampton
125 | EPL,11/24/2018, 15:00,Manchester United V Crystal Palace,Manchester United,Crystal Palace
126 | EPL,11/24/2018, 15:00,Watford V Liverpool,Watford,Liverpool
127 | EPL,11/24/2018, 15:00,West Ham United V Manchester City,West Ham United,Manchester City
128 | EPL,11/24/2018, 17:30,Tottenham Hotspur V Chelsea,Tottenham Hotspur,Chelsea
129 | EPL,11/25/2018, 13:30,Bournemouth V Arsenal,Bournemouth,Arsenal
130 | EPL,11/25/2018, 16:00,Wolverhampton Wanderers V Huddersfield Town,Wolverhampton Wanderers,Huddersfield Town
131 | EPL,11/26/2018, 20:00,Burnley V Newcastle United,Burnley,Newcastle United
132 | EPL,12/1/2018, 15:00,Arsenal V Tottenham Hotspur,Arsenal,Tottenham Hotspur
133 | EPL,12/1/2018, 15:00,Cardiff City V Wolverhampton Wanderers,Cardiff City,Wolverhampton Wanderers
134 | EPL,12/1/2018, 15:00,Chelsea V Fulham,Chelsea,Fulham
135 | EPL,12/1/2018, 15:00,Crystal Palace V Burnley,Crystal Palace,Burnley
136 | EPL,12/1/2018, 15:00,Huddersfield Town V Brighton and Hove Albion,Huddersfield Town,Brighton and Hove Albion
137 | EPL,12/1/2018, 15:00,Leicester City V Watford,Leicester City,Watford
138 | EPL,12/1/2018, 15:00,Liverpool V Everton,Liverpool,Everton
139 | EPL,12/1/2018, 15:00,Manchester City V Bournemouth,Manchester City,Bournemouth
140 | EPL,12/1/2018, 15:00,Newcastle United V West Ham United,Newcastle United,West Ham United
141 | EPL,12/1/2018, 15:00,Southampton V Manchester United,Southampton,Manchester United
142 | EPL,12/4/2018, 19:45,Bournemouth V Huddersfield Town,Bournemouth,Huddersfield Town
143 | EPL,12/4/2018, 19:45,Brighton and Hove Albion V Crystal Palace,Brighton and Hove Albion,Crystal Palace
144 | EPL,12/4/2018, 19:45,Burnley V Liverpool,Burnley,Liverpool
145 | EPL,12/4/2018, 19:45,Fulham V Leicester City,Fulham,Leicester City
146 | EPL,12/4/2018, 19:45,Watford V Manchester City,Watford,Manchester City
147 | EPL,12/4/2018, 19:45,West Ham United V Cardiff City,West Ham United,Cardiff City
148 | EPL,12/4/2018, 19:45,Wolverhampton Wanderers V Chelsea,Wolverhampton Wanderers,Chelsea
149 | EPL,12/4/2018, 20:00,Manchester United V Arsenal,Manchester United,Arsenal
150 | EPL,12/5/2018, 19:45,Everton V Newcastle United,Everton,Newcastle United
151 | EPL,12/5/2018, 20:00,Tottenham Hotspur V Southampton,Tottenham Hotspur,Southampton
152 | EPL,12/8/2018, 15:00,Arsenal V Huddersfield Town,Arsenal,Huddersfield Town
153 | EPL,12/8/2018, 15:00,Bournemouth V Liverpool,Bournemouth,Liverpool
154 | EPL,12/8/2018, 15:00,Burnley V Brighton and Hove Albion,Burnley,Brighton and Hove Albion
155 | EPL,12/8/2018, 15:00,Cardiff City V Southampton,Cardiff City,Southampton
156 | EPL,12/8/2018, 15:00,Chelsea V Manchester City,Chelsea,Manchester City
157 | EPL,12/8/2018, 15:00,Everton V Watford,Everton,Watford
158 | EPL,12/8/2018, 15:00,Leicester City V Tottenham Hotspur,Leicester City,Tottenham Hotspur
159 | EPL,12/8/2018, 15:00,Manchester United V Fulham,Manchester United,Fulham
160 | EPL,12/8/2018, 15:00,Newcastle United V Wolverhampton Wanderers,Newcastle United,Wolverhampton Wanderers
161 | EPL,12/8/2018, 15:00,West Ham United V Crystal Palace,West Ham United,Crystal Palace
162 | EPL,12/15/2018, 15:00,Brighton and Hove Albion V Chelsea,Brighton and Hove Albion,Chelsea
163 | EPL,12/15/2018, 15:00,Crystal Palace V Leicester City,Crystal Palace,Leicester City
164 | EPL,12/15/2018, 15:00,Fulham V West Ham United,Fulham,West Ham United
165 | EPL,12/15/2018, 15:00,Huddersfield Town V Newcastle United,Huddersfield Town,Newcastle United
166 | EPL,12/15/2018, 15:00,Liverpool V Manchester United,Liverpool,Manchester United
167 | EPL,12/15/2018, 15:00,Manchester City V Everton,Manchester City,Everton
168 | EPL,12/15/2018, 15:00,Southampton V Arsenal,Southampton,Arsenal
169 | EPL,12/15/2018, 15:00,Tottenham Hotspur V Burnley,Tottenham Hotspur,Burnley
170 | EPL,12/15/2018, 15:00,Watford V Cardiff City,Watford,Cardiff City
171 | EPL,12/15/2018, 15:00,Wolverhampton Wanderers V Bournemouth,Wolverhampton Wanderers,Bournemouth
172 | EPL,12/22/2018, 15:00,Arsenal V Burnley,Arsenal,Burnley
173 | EPL,12/22/2018, 15:00,Bournemouth V Brighton and Hove Albion,Bournemouth,Brighton and Hove Albion
174 | EPL,12/22/2018, 15:00,Cardiff City V Manchester United,Cardiff City,Manchester United
175 | EPL,12/22/2018, 15:00,Chelsea V Leicester City,Chelsea,Leicester City
176 | EPL,12/22/2018, 15:00,Everton V Tottenham Hotspur,Everton,Tottenham Hotspur
177 | EPL,12/22/2018, 15:00,Huddersfield Town V Southampton,Huddersfield Town,Southampton
178 | EPL,12/22/2018, 15:00,Manchester City V Crystal Palace,Manchester City,Crystal Palace
179 | EPL,12/22/2018, 15:00,Newcastle United V Fulham,Newcastle United,Fulham
180 | EPL,12/22/2018, 15:00,West Ham United V Watford,West Ham United,Watford
181 | EPL,12/22/2018, 15:00,Wolverhampton Wanderers V Liverpool,Wolverhampton Wanderers,Liverpool
182 | EPL,12/26/2018, 15:00,Brighton and Hove Albion V Arsenal,Brighton and Hove Albion,Arsenal
183 | EPL,12/26/2018, 15:00,Burnley V Everton,Burnley,Everton
184 | EPL,12/26/2018, 15:00,Crystal Palace V Cardiff City,Crystal Palace,Cardiff City
185 | EPL,12/26/2018, 15:00,Fulham V Wolverhampton Wanderers,Fulham,Wolverhampton Wanderers
186 | EPL,12/26/2018, 15:00,Leicester City V Manchester City,Leicester City,Manchester City
187 | EPL,12/26/2018, 15:00,Liverpool V Newcastle United,Liverpool,Newcastle United
188 | EPL,12/26/2018, 15:00,Manchester United V Huddersfield Town,Manchester United,Huddersfield Town
189 | EPL,12/26/2018, 15:00,Southampton V West Ham United,Southampton,West Ham United
190 | EPL,12/26/2018, 15:00,Tottenham Hotspur V Bournemouth,Tottenham Hotspur,Bournemouth
191 | EPL,12/26/2018, 15:00,Watford V Chelsea,Watford,Chelsea
192 | EPL,12/29/2018, 15:00,Brighton and Hove Albion V Everton,Brighton and Hove Albion,Everton
193 | EPL,12/29/2018, 15:00,Burnley V West Ham United,Burnley,West Ham United
194 | EPL,12/29/2018, 15:00,Crystal Palace V Chelsea,Crystal Palace,Chelsea
195 | EPL,12/29/2018, 15:00,Fulham V Huddersfield Town,Fulham,Huddersfield Town
196 | EPL,12/29/2018, 15:00,Leicester City V Cardiff City,Leicester City,Cardiff City
197 | EPL,12/29/2018, 15:00,Liverpool V Arsenal,Liverpool,Arsenal
198 | EPL,12/29/2018, 15:00,Manchester United V Bournemouth,Manchester United,Bournemouth
199 | EPL,12/29/2018, 15:00,Southampton V Manchester City,Southampton,Manchester City
200 | EPL,12/29/2018, 15:00,Tottenham Hotspur V Wolverhampton Wanderers,Tottenham Hotspur,Wolverhampton Wanderers
201 | EPL,12/29/2018, 15:00,Watford V Newcastle United,Watford,Newcastle United
202 | EPL,1/1/2019, 15:00,Arsenal V Fulham,Arsenal,Fulham
203 | EPL,1/1/2019, 15:00,Bournemouth V Watford,Bournemouth,Watford
204 | EPL,1/1/2019, 15:00,Cardiff City V Tottenham Hotspur,Cardiff City,Tottenham Hotspur
205 | EPL,1/1/2019, 15:00,Chelsea V Southampton,Chelsea,Southampton
206 | EPL,1/1/2019, 15:00,Everton V Leicester City,Everton,Leicester City
207 | EPL,1/1/2019, 15:00,Huddersfield Town V Burnley,Huddersfield Town,Burnley
208 | EPL,1/1/2019, 15:00,Manchester City V Liverpool,Manchester City,Liverpool
209 | EPL,1/1/2019, 15:00,Newcastle United V Manchester United,Newcastle United,Manchester United
210 | EPL,1/1/2019, 15:00,West Ham United V Brighton and Hove Albion,West Ham United,Brighton and Hove Albion
211 | EPL,1/1/2019, 15:00,Wolverhampton Wanderers V Crystal Palace,Wolverhampton Wanderers,Crystal Palace
212 | EPL,1/12/2019, 15:00,Brighton and Hove Albion V Liverpool,Brighton and Hove Albion,Liverpool
213 | EPL,1/12/2019, 15:00,Burnley V Fulham,Burnley,Fulham
214 | EPL,1/12/2019, 15:00,Cardiff City V Huddersfield Town,Cardiff City,Huddersfield Town
215 | EPL,1/12/2019, 15:00,Chelsea V Newcastle United,Chelsea,Newcastle United
216 | EPL,1/12/2019, 15:00,Crystal Palace V Watford,Crystal Palace,Watford
217 | EPL,1/12/2019, 15:00,Everton V Bournemouth,Everton,Bournemouth
218 | EPL,1/12/2019, 15:00,Leicester City V Southampton,Leicester City,Southampton
219 | EPL,1/12/2019, 15:00,Manchester City V Wolverhampton Wanderers,Manchester City,Wolverhampton Wanderers
220 | EPL,1/12/2019, 15:00,Tottenham Hotspur V Manchester United,Tottenham Hotspur,Manchester United
221 | EPL,1/12/2019, 15:00,West Ham United V Arsenal,West Ham United,Arsenal
222 | EPL,1/19/2019, 15:00,Arsenal V Chelsea,Arsenal,Chelsea
223 | EPL,1/19/2019, 15:00,Bournemouth V West Ham United,Bournemouth,West Ham United
224 | EPL,1/19/2019, 15:00,Fulham V Tottenham Hotspur,Fulham,Tottenham Hotspur
225 | EPL,1/19/2019, 15:00,Huddersfield Town V Manchester City,Huddersfield Town,Manchester City
226 | EPL,1/19/2019, 15:00,Liverpool V Crystal Palace,Liverpool,Crystal Palace
227 | EPL,1/19/2019, 15:00,Manchester United V Brighton and Hove Albion,Manchester United,Brighton and Hove Albion
228 | EPL,1/19/2019, 15:00,Newcastle United V Cardiff City,Newcastle United,Cardiff City
229 | EPL,1/19/2019, 15:00,Southampton V Everton,Southampton,Everton
230 | EPL,1/19/2019, 15:00,Watford V Burnley,Watford,Burnley
231 | EPL,1/19/2019, 15:00,Wolverhampton Wanderers V Leicester City,Wolverhampton Wanderers,Leicester City
232 | EPL,1/29/2019, 19:45,Arsenal V Cardiff City,Arsenal,Cardiff City
233 | EPL,1/29/2019, 19:45,Bournemouth V Chelsea,Bournemouth,Chelsea
234 | EPL,1/29/2019, 19:45,Fulham V Brighton and Hove Albion,Fulham,Brighton and Hove Albion
235 | EPL,1/29/2019, 19:45,Huddersfield Town V Everton,Huddersfield Town,Everton
236 | EPL,1/29/2019, 19:45,Wolverhampton Wanderers V West Ham United,Wolverhampton Wanderers,West Ham United
237 | EPL,1/29/2019, 20:00,Manchester United V Burnley,Manchester United,Burnley
238 | EPL,1/30/2019, 19:45,Newcastle United V Manchester City,Newcastle United,Manchester City
239 | EPL,1/30/2019, 19:45,Southampton V Crystal Palace,Southampton,Crystal Palace
240 | EPL,1/30/2019, 20:00,Liverpool V Leicester City,Liverpool,Leicester City
241 | EPL,1/30/2019, 20:00,Tottenham Hotspur V Watford,Tottenham Hotspur,Watford
242 | EPL,2/2/2019, 15:00,Brighton and Hove Albion V Watford,Brighton and Hove Albion,Watford
243 | EPL,2/2/2019, 15:00,Burnley V Southampton,Burnley,Southampton
244 | EPL,2/2/2019, 15:00,Cardiff City V Bournemouth,Cardiff City,Bournemouth
245 | EPL,2/2/2019, 15:00,Chelsea V Huddersfield Town,Chelsea,Huddersfield Town
246 | EPL,2/2/2019, 15:00,Crystal Palace V Fulham,Crystal Palace,Fulham
247 | EPL,2/2/2019, 15:00,Everton V Wolverhampton Wanderers,Everton,Wolverhampton Wanderers
248 | EPL,2/2/2019, 15:00,Leicester City V Manchester United,Leicester City,Manchester United
249 | EPL,2/2/2019, 15:00,Manchester City V Arsenal,Manchester City,Arsenal
250 | EPL,2/2/2019, 15:00,Tottenham Hotspur V Newcastle United,Tottenham Hotspur,Newcastle United
251 | EPL,2/2/2019, 15:00,West Ham United V Liverpool,West Ham United,Liverpool
252 | EPL,2/9/2019, 15:00,Brighton and Hove Albion V Burnley,Brighton and Hove Albion,Burnley
253 | EPL,2/9/2019, 15:00,Crystal Palace V West Ham United,Crystal Palace,West Ham United
254 | EPL,2/9/2019, 15:00,Fulham V Manchester United,Fulham,Manchester United
255 | EPL,2/9/2019, 15:00,Huddersfield Town V Arsenal,Huddersfield Town,Arsenal
256 | EPL,2/9/2019, 15:00,Liverpool V Bournemouth,Liverpool,Bournemouth
257 | EPL,2/9/2019, 15:00,Manchester City V Chelsea,Manchester City,Chelsea
258 | EPL,2/9/2019, 15:00,Southampton V Cardiff City,Southampton,Cardiff City
259 | EPL,2/9/2019, 15:00,Tottenham Hotspur V Leicester City,Tottenham Hotspur,Leicester City
260 | EPL,2/9/2019, 15:00,Watford V Everton,Watford,Everton
261 | EPL,2/9/2019, 15:00,Wolverhampton Wanderers V Newcastle United,Wolverhampton Wanderers,Newcastle United
262 | EPL,2/23/2019, 15:00,Arsenal V Southampton,Arsenal,Southampton
263 | EPL,2/23/2019, 15:00,Bournemouth V Wolverhampton Wanderers,Bournemouth,Wolverhampton Wanderers
264 | EPL,2/23/2019, 15:00,Burnley V Tottenham Hotspur,Burnley,Tottenham Hotspur
265 | EPL,2/23/2019, 15:00,Cardiff City V Watford,Cardiff City,Watford
266 | EPL,2/23/2019, 15:00,Chelsea V Brighton and Hove Albion,Chelsea,Brighton and Hove Albion
267 | EPL,2/23/2019, 15:00,Everton V Manchester City,Everton,Manchester City
268 | EPL,2/23/2019, 15:00,Leicester City V Crystal Palace,Leicester City,Crystal Palace
269 | EPL,2/23/2019, 15:00,Manchester United V Liverpool,Manchester United,Liverpool
270 | EPL,2/23/2019, 15:00,Newcastle United V Huddersfield Town,Newcastle United,Huddersfield Town
271 | EPL,2/23/2019, 15:00,West Ham United V Fulham,West Ham United,Fulham
272 | EPL,2/26/2019, 19:45,Arsenal V Bournemouth,Arsenal,Bournemouth
273 | EPL,2/26/2019, 19:45,Cardiff City V Everton,Cardiff City,Everton
274 | EPL,2/26/2019, 19:45,Huddersfield Town V Wolverhampton Wanderers,Huddersfield Town,Wolverhampton Wanderers
275 | EPL,2/26/2019, 19:45,Leicester City V Brighton and Hove Albion,Leicester City,Brighton and Hove Albion
276 | EPL,2/26/2019, 20:00,Crystal Palace V Manchester United,Crystal Palace,Manchester United
277 | EPL,2/27/2019, 19:45,Chelsea V Tottenham Hotspur,Chelsea,Tottenham Hotspur
278 | EPL,2/27/2019, 19:45,Newcastle United V Burnley,Newcastle United,Burnley
279 | EPL,2/27/2019, 19:45,Southampton V Fulham,Southampton,Fulham
280 | EPL,2/27/2019, 20:00,Liverpool V Watford,Liverpool,Watford
281 | EPL,2/27/2019, 20:00,Manchester City V West Ham United,Manchester City,West Ham United
282 | EPL,3/2/2019, 15:00,Bournemouth V Manchester City,Bournemouth,Manchester City
283 | EPL,3/2/2019, 15:00,Brighton and Hove Albion V Huddersfield Town,Brighton and Hove Albion,Huddersfield Town
284 | EPL,3/2/2019, 15:00,Burnley V Crystal Palace,Burnley,Crystal Palace
285 | EPL,3/2/2019, 15:00,Everton V Liverpool,Everton,Liverpool
286 | EPL,3/2/2019, 15:00,Fulham V Chelsea,Fulham,Chelsea
287 | EPL,3/2/2019, 15:00,Manchester United V Southampton,Manchester United,Southampton
288 | EPL,3/2/2019, 15:00,Tottenham Hotspur V Arsenal,Tottenham Hotspur,Arsenal
289 | EPL,3/2/2019, 15:00,Watford V Leicester City,Watford,Leicester City
290 | EPL,3/2/2019, 15:00,West Ham United V Newcastle United,West Ham United,Newcastle United
291 | EPL,3/2/2019, 15:00,Wolverhampton Wanderers V Cardiff City,Wolverhampton Wanderers,Cardiff City
292 | EPL,3/9/2019, 15:00,Arsenal V Manchester United,Arsenal,Manchester United
293 | EPL,3/9/2019, 15:00,Cardiff City V West Ham United,Cardiff City,West Ham United
294 | EPL,3/9/2019, 15:00,Chelsea V Wolverhampton Wanderers,Chelsea,Wolverhampton Wanderers
295 | EPL,3/9/2019, 15:00,Crystal Palace V Brighton and Hove Albion,Crystal Palace,Brighton and Hove Albion
296 | EPL,3/9/2019, 15:00,Huddersfield Town V Bournemouth,Huddersfield Town,Bournemouth
297 | EPL,3/9/2019, 15:00,Leicester City V Fulham,Leicester City,Fulham
298 | EPL,3/9/2019, 15:00,Liverpool V Burnley,Liverpool,Burnley
299 | EPL,3/9/2019, 15:00,Manchester City V Watford,Manchester City,Watford
300 | EPL,3/9/2019, 15:00,Newcastle United V Everton,Newcastle United,Everton
301 | EPL,3/9/2019, 15:00,Southampton V Tottenham Hotspur,Southampton,Tottenham Hotspur
302 | EPL,3/16/2019, 15:00,Bournemouth V Newcastle United,Bournemouth,Newcastle United
303 | EPL,3/16/2019, 15:00,Brighton and Hove Albion V Cardiff City,Brighton and Hove Albion,Cardiff City
304 | EPL,3/16/2019, 15:00,Burnley V Leicester City,Burnley,Leicester City
305 | EPL,3/16/2019, 15:00,Everton V Chelsea,Everton,Chelsea
306 | EPL,3/16/2019, 15:00,Fulham V Liverpool,Fulham,Liverpool
307 | EPL,3/16/2019, 15:00,Manchester United V Manchester City,Manchester United,Manchester City
308 | EPL,3/16/2019, 15:00,Tottenham Hotspur V Crystal Palace,Tottenham Hotspur,Crystal Palace
309 | EPL,3/16/2019, 15:00,Watford V Southampton,Watford,Southampton
310 | EPL,3/16/2019, 15:00,West Ham United V Huddersfield Town,West Ham United,Huddersfield Town
311 | EPL,3/16/2019, 15:00,Wolverhampton Wanderers V Arsenal,Wolverhampton Wanderers,Arsenal
312 | EPL,3/30/2019, 15:00,Arsenal V Newcastle United,Arsenal,Newcastle United
313 | EPL,3/30/2019, 15:00,Brighton and Hove Albion V Southampton,Brighton and Hove Albion,Southampton
314 | EPL,3/30/2019, 15:00,Burnley V Wolverhampton Wanderers,Burnley,Wolverhampton Wanderers
315 | EPL,3/30/2019, 15:00,Cardiff City V Chelsea,Cardiff City,Chelsea
316 | EPL,3/30/2019, 15:00,Crystal Palace V Huddersfield Town,Crystal Palace,Huddersfield Town
317 | EPL,3/30/2019, 15:00,Fulham V Manchester City,Fulham,Manchester City
318 | EPL,3/30/2019, 15:00,Leicester City V Bournemouth,Leicester City,Bournemouth
319 | EPL,3/30/2019, 15:00,Liverpool V Tottenham Hotspur,Liverpool,Tottenham Hotspur
320 | EPL,3/30/2019, 15:00,Manchester United V Watford,Manchester United,Watford
321 | EPL,3/30/2019, 15:00,West Ham United V Everton,West Ham United,Everton
322 | EPL,4/6/2019, 15:00,Bournemouth V Burnley,Bournemouth,Burnley
323 | EPL,4/6/2019, 15:00,Chelsea V West Ham United,Chelsea,West Ham United
324 | EPL,4/6/2019, 15:00,Everton V Arsenal,Everton,Arsenal
325 | EPL,4/6/2019, 15:00,Huddersfield Town V Leicester City,Huddersfield Town,Leicester City
326 | EPL,4/6/2019, 15:00,Manchester City V Cardiff City,Manchester City,Cardiff City
327 | EPL,4/6/2019, 15:00,Newcastle United V Crystal Palace,Newcastle United,Crystal Palace
328 | EPL,4/6/2019, 15:00,Southampton V Liverpool,Southampton,Liverpool
329 | EPL,4/6/2019, 15:00,Tottenham Hotspur V Brighton and Hove Albion,Tottenham Hotspur,Brighton and Hove Albion
330 | EPL,4/6/2019, 15:00,Watford V Fulham,Watford,Fulham
331 | EPL,4/6/2019, 15:00,Wolverhampton Wanderers V Manchester United,Wolverhampton Wanderers,Manchester United
332 | EPL,4/13/2019, 15:00,Brighton and Hove Albion V Bournemouth,Brighton and Hove Albion,Bournemouth
333 | EPL,4/13/2019, 15:00,Burnley V Cardiff City,Burnley,Cardiff City
334 | EPL,4/13/2019, 15:00,Crystal Palace V Manchester City,Crystal Palace,Manchester City
335 | EPL,4/13/2019, 15:00,Fulham V Everton,Fulham,Everton
336 | EPL,4/13/2019, 15:00,Leicester City V Newcastle United,Leicester City,Newcastle United
337 | EPL,4/13/2019, 15:00,Liverpool V Chelsea,Liverpool,Chelsea
338 | EPL,4/13/2019, 15:00,Manchester United V West Ham United,Manchester United,West Ham United
339 | EPL,4/13/2019, 15:00,Southampton V Wolverhampton Wanderers,Southampton,Wolverhampton Wanderers
340 | EPL,4/13/2019, 15:00,Tottenham Hotspur V Huddersfield Town,Tottenham Hotspur,Huddersfield Town
341 | EPL,4/13/2019, 15:00,Watford V Arsenal,Watford,Arsenal
342 | EPL,4/20/2019, 15:00,Arsenal V Crystal Palace,Arsenal,Crystal Palace
343 | EPL,4/20/2019, 15:00,Bournemouth V Fulham,Bournemouth,Fulham
344 | EPL,4/20/2019, 15:00,Cardiff City V Liverpool,Cardiff City,Liverpool
345 | EPL,4/20/2019, 15:00,Chelsea V Burnley,Chelsea,Burnley
346 | EPL,4/20/2019, 15:00,Everton V Manchester United,Everton,Manchester United
347 | EPL,4/20/2019, 15:00,Huddersfield Town V Watford,Huddersfield Town,Watford
348 | EPL,4/20/2019, 15:00,Manchester City V Tottenham Hotspur,Manchester City,Tottenham Hotspur
349 | EPL,4/20/2019, 15:00,Newcastle United V Southampton,Newcastle United,Southampton
350 | EPL,4/20/2019, 15:00,West Ham United V Leicester City,West Ham United,Leicester City
351 | EPL,4/20/2019, 15:00,Wolverhampton Wanderers V Brighton and Hove Albion,Wolverhampton Wanderers,Brighton and Hove Albion
352 | EPL,4/27/2019, 15:00,Brighton and Hove Albion V Newcastle United,Brighton and Hove Albion,Newcastle United
353 | EPL,4/27/2019, 15:00,Burnley V Manchester City,Burnley,Manchester City
354 | EPL,4/27/2019, 15:00,Crystal Palace V Everton,Crystal Palace,Everton
355 | EPL,4/27/2019, 15:00,Fulham V Cardiff City,Fulham,Cardiff City
356 | EPL,4/27/2019, 15:00,Leicester City V Arsenal,Leicester City,Arsenal
357 | EPL,4/27/2019, 15:00,Liverpool V Huddersfield Town,Liverpool,Huddersfield Town
358 | EPL,4/27/2019, 15:00,Manchester United V Chelsea,Manchester United,Chelsea
359 | EPL,4/27/2019, 15:00,Southampton V Bournemouth,Southampton,Bournemouth
360 | EPL,4/27/2019, 15:00,Tottenham Hotspur V West Ham United,Tottenham Hotspur,West Ham United
361 | EPL,4/27/2019, 15:00,Watford V Wolverhampton Wanderers,Watford,Wolverhampton Wanderers
362 | EPL,5/4/2019, 15:00,Arsenal V Brighton and Hove Albion,Arsenal,Brighton and Hove Albion
363 | EPL,5/4/2019, 15:00,Bournemouth V Tottenham Hotspur,Bournemouth,Tottenham Hotspur
364 | EPL,5/4/2019, 15:00,Cardiff City V Crystal Palace,Cardiff City,Crystal Palace
365 | EPL,5/4/2019, 15:00,Chelsea V Watford,Chelsea,Watford
366 | EPL,5/4/2019, 15:00,Everton V Burnley,Everton,Burnley
367 | EPL,5/4/2019, 15:00,Huddersfield Town V Manchester United,Huddersfield Town,Manchester United
368 | EPL,5/4/2019, 15:00,Manchester City V Leicester City,Manchester City,Leicester City
369 | EPL,5/4/2019, 15:00,Newcastle United V Liverpool,Newcastle United,Liverpool
370 | EPL,5/4/2019, 15:00,West Ham United V Southampton,West Ham United,Southampton
371 | EPL,5/4/2019, 15:00,Wolverhampton Wanderers V Fulham,Wolverhampton Wanderers,Fulham
372 | EPL,5/12/2019, 15:00,Brighton and Hove Albion V Manchester City,Brighton and Hove Albion,Manchester City
373 | EPL,5/12/2019, 15:00,Burnley V Arsenal,Burnley,Arsenal
374 | EPL,5/12/2019, 15:00,Crystal Palace V Bournemouth,Crystal Palace,Bournemouth
375 | EPL,5/12/2019, 15:00,Fulham V Newcastle United,Fulham,Newcastle United
376 | EPL,5/12/2019, 15:00,Leicester City V Chelsea,Leicester City,Chelsea
377 | EPL,5/12/2019, 15:00,Liverpool V Wolverhampton Wanderers,Liverpool,Wolverhampton Wanderers
378 | EPL,5/12/2019, 15:00,Manchester United V Cardiff City,Manchester United,Cardiff City
379 | EPL,5/12/2019, 15:00,Southampton V Huddersfield Town,Southampton,Huddersfield Town
380 | EPL,5/12/2019, 15:00,Tottenham Hotspur V Everton,Tottenham Hotspur,Everton
381 | EPL,5/12/2019, 15:00,Watford V West Ham United,Watford,West Ham United
382 | 


--------------------------------------------------------------------------------
/dog_home.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/dog_home.png


--------------------------------------------------------------------------------
/images/bayesian.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/bayesian.png


--------------------------------------------------------------------------------
/images/dog_home.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/dog_home.png


--------------------------------------------------------------------------------
/images/epl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/epl.png


--------------------------------------------------------------------------------
/images/fireworks.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/fireworks.gif


--------------------------------------------------------------------------------
/images/messi-scribble.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/messi-scribble.png


--------------------------------------------------------------------------------
/images/network.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/network.png


--------------------------------------------------------------------------------
/images/paul.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/paul.png


--------------------------------------------------------------------------------
/images/scrible-test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/scrible-test.png


--------------------------------------------------------------------------------
/images/selfie.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/selfie.png


--------------------------------------------------------------------------------
/images/selfie2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/selfie2.png


--------------------------------------------------------------------------------
/images/test.R:
--------------------------------------------------------------------------------
1 | # test file
2 | 


--------------------------------------------------------------------------------
/llm_bots/animeyourself/anime_yourself.py:
--------------------------------------------------------------------------------
  1 | """
  2 | 
  3 | This bot uses all options provided by the Poe protocol. 
  4 | 
  5 | """
  6 | from __future__ import annotations
  7 | 
  8 | import asyncio
  9 | import json
 10 | from typing import AsyncIterable
 11 | 
 12 | from fastapi_poe import PoeBot, run
 13 | from fastapi_poe.types import (
 14 |     ContentType,
 15 |     QueryRequest,
 16 |     ReportFeedbackRequest,
 17 |     SettingsRequest,
 18 |     SettingsResponse,
 19 | )
 20 | import os
 21 | import replicate
 22 | import re
 23 | import random
 24 | import textwrap
 25 | import asyncio
 26 | from sse_starlette.sse import ServerSentEvent
 27 | 
# Settings object returned unchanged by AnimeYourself.get_settings().
# The context-clear window is 60 * 60 seconds, i.e. one hour.
SETTINGS = SettingsResponse(
    context_clear_window_secs=60 * 60, allow_user_context_clear=True
)

# Model reference passed as the first argument to replicate.run().
MODEL_URL = "mcai/dreamshaper-v6-img2img:c7959eb3a86c09b449dacc11ce8bba295fda466fc6935ab8709e35f4f48c980c"

# Timeout (seconds) of each asyncio.wait() poll on the image task in
# get_response(); one progress message is emitted per expired poll.
_WAIT_TIMEOUT_S = 1
 35 | 
 36 | 
 37 | def parse_text(txt):
 38 |     # Define a regular expression to match the fields and their values
 39 |     pattern = re.compile(r'(image|prompt):\s*"([^"]+)"')
 40 |     matches = pattern.findall(txt)
 41 |     result = {}
 42 | 
 43 |     for field, value in matches:
 44 |         result[field] = value
 45 |     return result
 46 | 
 47 | core_positive_prompt = """
 48 | portrait closeup, best quality, intricately detailed, 
 49 | moe manga style, finely detailed features perfect art,
 50 | professional majestic impressionism oil painting by Waterhouse, 
 51 | John Constable, Ed Blinkey, Atey Ghailan, Studio Ghibli, 
 52 | by Jeremy Mann, Greg Manchess, Antonio Moro, trending on ArtStation, 
 53 | trending on CGSociety, cinematic lighting, hand drawn, hand colored.
 54 | """
 55 | 
 56 | alternative_prompt = """
 57 | portrait closeup, best quality,
 58 | moe manga style, finely detailed features perfect art,
 59 | anime style, 8k, artwork in the style of guweiz, 
 60 | cinematic lighting, hand drawn, hand colored.
 61 | """
 62 | 
 63 | negative_prompt = """
 64 | disfigured, kitsch, ugly, oversaturated, greain, low-res, deformed, blurry, bad anatomy, 
 65 | poorly drawn face, mutation, mutated, extra limb, missing limb, 
 66 | floating limbs, disconnected limbs, malformed hands, extra fingers, poorly drawn hands,
 67 | """
 68 | 
 69 | def error_message():
 70 |     msg = textwrap.dedent(f"""
 71 |     Sorry, I cannot parse your input. Please try again and make sure your input has the format:
 72 | 
 73 |     ```python
 74 |     image: "<image_public_url>"
 75 |     prompt: (Optional) "<your prompt here>" #no worry, we will generate an anime of you to start
 76 |     ```
 77 | 
 78 |     """
 79 |     )
 80 |     return msg
 81 | 
 82 | def _get_complete_message(second, input_url, output_url):
 83 |     _COMPLETE_MESSAGE = f"""
 84 |     Completed! (took {second}s)
 85 | 
 86 |     This is you:
 87 | 
 88 |     ![]({input_url})
 89 | 
 90 |     This is the anime version of yourself.
 91 | 
 92 |     ![]({output_url})
 93 | 
 94 |     """
 95 |     return textwrap.dedent(_COMPLETE_MESSAGE)
 96 | 
 97 | class AnimeYourself(PoeBot):
 98 |     async def get_response(self, query: QueryRequest) -> AsyncIterable[ServerSentEvent]:
 99 |         """Return an async iterator of events to send to the user."""
100 |         last_message = query.query[-1].content.lower()
101 |         response_content_type: ContentType = ("text/markdown")
102 |         yield self.meta_event(
103 |             content_type=response_content_type,
104 |             linkify=False,
105 |             refetch_settings=False,
106 |             suggested_replies=False,
107 |         )
108 | 
109 |         input_dict = parse_text(last_message)
110 |         if "image" not in input_dict:
111 |             yield self.text_event(error_message())
112 |         else:
113 |             ### call the model to get results:
114 |             input_prompt = "" if 'prompt' not in input_dict else input_dict['prompt']
115 | 
116 |             generated_image_task = asyncio.create_task(
117 |                 self._generate_image(
118 |                     image_url = input_dict['image'], 
119 |                     prompt = "mksks style," + input_prompt + "," + alternative_prompt
120 |                 )
121 |             )
122 | 
123 |             i = 0
124 |             while True:
125 |                 done, _ = await asyncio.wait(
126 |                     [generated_image_task], timeout=_WAIT_TIMEOUT_S
127 |                 )
128 |                 if done:
129 |                     output = done.pop().result()
130 |                     break
131 |                 yield self.replace_response_event(f"Generating your image: {i}s elapsed...")
132 |                 i += 1
133 | 
134 |             if len(output) != 1:
135 |                 yield self.replace_response_event(
136 |                     textwrap.dedent(
137 |                         f"""
138 | 
139 |                         Sorry, something seems to go wrong.
140 | 
141 |                         Please don't blame the developer. He's trying ᕕ( ᐛ )ᕗ.
142 | 
143 |                         But he does want you to know that you look amazing who you are.
144 | 
145 |                         ![]({input_dict['image']})
146 |                         """
147 |                     )
148 |                 )
149 |             else:
150 |                 yield self.replace_response_event(
151 |                     textwrap.dedent(
152 |                         _get_complete_message(
153 |                             second = i, 
154 |                             input_url = input_dict['image'],
155 |                             output_url = output[0])
156 |                     )
157 |                 )
158 | 
159 |     async def _generate_image(self, image_url: str, prompt: str):
160 |         loop = asyncio.get_running_loop()
161 |         output = await loop.run_in_executor(
162 |             None,
163 |             lambda: replicate.run(
164 |                 MODEL_URL,
165 |                 input={
166 |                     "image": image_url, 
167 |                     "prompt": prompt,
168 |                     "negative_prompt": negative_prompt,
169 |                     "num_inference_steps": 50,
170 |                 }
171 |             )
172 |         )
173 |         return output
174 | 
175 |     async def on_feedback(self, feedback: ReportFeedbackRequest) -> None:
176 |         """Called when we receive user feedback such as likes."""
177 |         print(
178 |             f"User {feedback.user_id} gave feedback on {feedback.conversation_id}"
179 |             f"message {feedback.message_id}: {feedback.feedback_type}"
180 |         )
181 | 
182 |     async def get_settings(self, settings: SettingsRequest) -> SettingsResponse:
183 |         """Return the settings for this bot."""
184 |         return SETTINGS
185 | 
186 | 
# Script entry point: hand a bot instance to fastapi_poe's run().
if __name__ == "__main__":
    run(AnimeYourself())


--------------------------------------------------------------------------------
/llm_bots/animeyourself/main.py:
--------------------------------------------------------------------------------
 1 | from fastapi_poe import make_app
 2 | import modal
 3 | from modal import Image, Stub, asgi_app
 4 | from anime_yourself import AnimeYourself
 5 | import os
 6 | 
# specific to hosting with modal.com
# Container image: Debian slim base plus the packages listed in
# requirements.txt.
image = Image.debian_slim().pip_install_from_requirements(
    "requirements.txt"
)
# Modal stub named "animeyourself"; fastapi_app below is registered on it.
stub = Stub("animeyourself")
12 | 
13 | 
@stub.function(image=image, secret=modal.Secret.from_name("my-replicate-key"))
@asgi_app()
def fastapi_app():
    """Build and return the ASGI app that serves the AnimeYourself bot.

    NOTE(review): ``allow_without_key=True`` presumably lets the app start
    without a Poe access key -- confirm against the fastapi_poe docs.
    """
    return make_app(AnimeYourself(), allow_without_key=True)
20 | 
21 | 


--------------------------------------------------------------------------------
/llm_bots/animeyourself/readme.md:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/llm_bots/animeyourself/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi-poe==0.0.14
2 | replicate
3 | 


--------------------------------------------------------------------------------
/llm_bots/scribble2img/README.md:
--------------------------------------------------------------------------------
1 | # Poe API Bot tutorial
2 | 
3 | This is the companion repository to the Poe API bot
4 | [quick start](https://developer.poe.com/api-bots/quick-start). Please follow that guide
5 | for instructions on how to use this repository.
6 | 


--------------------------------------------------------------------------------
/llm_bots/scribble2img/main.py:
--------------------------------------------------------------------------------
 1 | from fastapi_poe import make_app
 2 | import modal
 3 | from modal import Image, Stub, asgi_app
 4 | from scribble2image import Scribble2ImageBot
 5 | import os
 6 | 
# specific to hosting with modal.com
# Container image: Debian slim base with the packages listed in
# requirements.txt installed through pip.
image = Image.debian_slim().pip_install_from_requirements(
    "requirements.txt"
)
# Modal stub named "scribble2image"; the ASGI function below is registered on it.
stub = Stub("scribble2image")
12 | 
13 | 
@stub.function(image=image, secret=modal.Secret.from_name("my-replicate-key"))
@asgi_app()
def fastapi_app():
    """Build and return the ASGI application serving the Scribble2Image bot.

    The function is registered on the Modal stub with the container image
    defined above and the Modal secret named "my-replicate-key".
    """
    # NOTE(review): allow_without_key=True is kept as-is; it presumably lets the
    # app start without a Poe API key configured - confirm this is intended
    # outside of development.
    return make_app(Scribble2ImageBot(), allow_without_key=True)
20 | 


--------------------------------------------------------------------------------
/llm_bots/scribble2img/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi-poe==0.0.14
2 | replicate==0.8.3
3 | 


--------------------------------------------------------------------------------
/llm_bots/scribble2img/scribble2image.py:
--------------------------------------------------------------------------------
  1 | """
  2 | 
  3 | This bot uses all options provided by the Poe protocol. You can use it to get examples
  4 | of all the protocol has to offer.
  5 | 
  6 | """
  7 | from __future__ import annotations
  8 | 
  9 | import asyncio
 10 | import json
 11 | from typing import AsyncIterable
 12 | 
 13 | from fastapi_poe import PoeBot, run
 14 | from fastapi_poe.types import (
 15 |     ContentType,
 16 |     QueryRequest,
 17 |     ReportFeedbackRequest,
 18 |     SettingsRequest,
 19 |     SettingsResponse,
 20 | )
 21 | import os
 22 | import replicate
 23 | import re
 24 | import random
 25 | import textwrap
 26 | import asyncio
 27 | from sse_starlette.sse import ServerSentEvent
 28 | 
# Settings object returned unchanged by Scribble2ImageBot.get_settings.
# The context-clear window is 60 * 60 seconds, i.e. one hour.
SETTINGS = SettingsResponse(
    context_clear_window_secs=60 * 60, allow_user_context_clear=True
)

# Model reference passed as the first argument to replicate.run; the text after
# the colon looks like a pinned version hash.
MODEL_URL = "jagilley/controlnet-scribble:435061a1b5a4c1e26740464bf786efdfa9cb3a3ac488595a2de23e143fdb0117"

# Timeout (seconds) for each asyncio.wait on the generation task; a progress
# update is sent to the user every time this timeout expires.
_WAIT_TIMEOUT_S = 1

# One of these is picked with random.choice and embedded in the completion
# message next to the user's original scribble.
encouraging_msgs = [
    "The developer loves your scribble 😍",
    "With the drawing skill like this, do you even need this bot?",
    "Wow! Your scribbling skill is hella strong",
    "Interesting interesting! I gotcha",
    "Your doodle is simply world class. Let me see what else I can add."
]
 44 | 
 45 | def _get_complete_message(second, encouraging_msg, input_url, output_url):
 46 |     _COMPLETE_MESSAGE = f"""
 47 |     Completed! (took {second}s)
 48 | 
 49 |     This is the original. {encouraging_msg}:
 50 | 
 51 |     ![]({input_url})
 52 | 
 53 |     This is your scribble brought to life:
 54 | 
 55 |     ![]({output_url})
 56 | 
 57 |     """
 58 |     return _COMPLETE_MESSAGE
 59 | 
 60 | 
 61 | def parse_text(txt):
 62 |     # Define a regular expression to match the fields and their values
 63 |     pattern = re.compile(r'(image|prompt):\s*"([^"]+)"')
 64 |     matches = pattern.findall(txt)
 65 |     result = {}
 66 | 
 67 |     for field, value in matches:
 68 |         result[field] = value
 69 |     return result
 70 | 
 71 | def error_message(missing_image=True, image_url=None):
 72 |     missing_piece = "image" if missing_image else "prompt"
 73 |     if image_url:
 74 |         additional_txt = f"""
 75 | 
 76 |     But I just wanna say that I love your scribble.
 77 | 
 78 |     ![]({image_url})
 79 |         """
 80 |     else:
 81 |         additional_txt = ""
 82 |     msg = textwrap.dedent(f"""
 83 |     Sorry, I cannot parse your {missing_piece}. Please try again and make sure your input has the format:
 84 | 
 85 |     ```python
 86 |     image: "<image_public_url>"
 87 |     prompt: "<your prompt here>"
 88 |     ```
 89 | 
 90 |     {additional_txt}
 91 |     """
 92 |     )
 93 |     return msg
 94 | 
 95 | class Scribble2ImageBot(PoeBot):
 96 | 
 97 |     async def get_response(self, query: QueryRequest) -> AsyncIterable[ServerSentEvent]:
 98 |         """Return an async iterator of events to send to the user."""
 99 |         last_message = query.query[-1].content.lower()
100 |         response_content_type: ContentType = ("text/markdown")
101 |         yield self.meta_event(
102 |             content_type=response_content_type,
103 |             linkify=False,
104 |             refetch_settings=False,
105 |             suggested_replies=False,
106 |         )
107 | 
108 |         input_dict = parse_text(last_message)
109 |         if "image" not in input_dict:
110 |             yield self.text_event(error_message())
111 |         elif "prompt" not in input_dict:
112 |             yield self.text_event(error_message(
113 |                 missing_image=False,
114 |                 image_url = input_dict['image']
115 |                 )
116 |             )
117 |         else:
118 |             ### call the model to get results:
119 |             generated_image_task = asyncio.create_task(
120 |                 self._generate_image(input_dict['image'], input_dict['prompt'])
121 |             )
122 | 
123 |             i = 0
124 |             while True:
125 |                 done, _ = await asyncio.wait(
126 |                     [generated_image_task], timeout=_WAIT_TIMEOUT_S
127 |                 )
128 |                 if done:
129 |                     output = done.pop().result()
130 |                     break
131 |                 yield self.replace_response_event(f"Generating your image: {i}s elapsed...")
132 |                 i += 1
133 | 
134 |             if len(output) != 2:
135 |                 yield self.replace_response_event(
136 |                     textwrap.dedent(
137 |                         f"""
138 | 
139 |                         Sorry, something seems to go wrong.
140 | 
141 |                         Please don't blame the developer. He's trying ᕕ( ᐛ )ᕗ.
142 | 
143 |                         But he does want you to know that he loves your scribble.
144 | 
145 |                         ![]({input_dict['image']})
146 |                         """
147 |                     )
148 |                 )
149 |             else:
150 |                 yield self.replace_response_event(
151 |                     textwrap.dedent(
152 |                         _get_complete_message(
153 |                             second = i, 
154 |                             encouraging_msg = random.choice(encouraging_msgs), 
155 |                             input_url = input_dict['image'], 
156 |                             output_url = output[1])
157 |                     )
158 |                 )
159 | 
160 |     async def _generate_image(self, image_url: str, prompt: str):
161 |         loop = asyncio.get_running_loop()
162 |         output = await loop.run_in_executor(
163 |             None,
164 |             lambda: replicate.run(
165 |                         MODEL_URL,
166 |                         input={
167 |                             "image": image_url, 
168 |                             "prompt": prompt,
169 |                         }
170 |                     )
171 |         )
172 |         return output
173 | 
174 | 
175 |     async def on_feedback(self, feedback: ReportFeedbackRequest) -> None:
176 |         """Called when we receive user feedback such as likes."""
177 |         print(
178 |             f"User {feedback.user_id} gave feedback on {feedback.conversation_id}"
179 |             f"message {feedback.message_id}: {feedback.feedback_type}"
180 |         )
181 | 
182 |     async def get_settings(self, settings: SettingsRequest) -> SettingsResponse:
183 |         """Return the settings for this bot."""
184 |         return SETTINGS
185 | 
186 | 
# Script entry point: when this module is executed directly (not imported),
# build the bot and hand it to fastapi_poe's ``run``.
if __name__ == "__main__":
    run(Scribble2ImageBot())


--------------------------------------------------------------------------------