├── Python ├── CollaborativeFiltering │ ├── Goodreads_surprise.py │ └── TrainingModule.py ├── fireworks.py ├── flights_networkx.py ├── football_visual.ipynb ├── image.jpg ├── images │ └── networkx_basemap │ │ ├── map_0.png │ │ ├── map_1.png │ │ ├── map_2.png │ │ ├── map_3.png │ │ ├── table_1.PNG │ │ ├── table_2.PNG │ │ ├── table_3.PNG │ │ └── table_5.PNG ├── lincoln_estimate.py ├── mbappe.jpg ├── n_dimensionalNormal.py └── optimal_dating.py ├── R ├── EPL │ ├── Agg.png │ ├── Last.png │ ├── Misc │ │ └── TeamEvaluate2015.R │ ├── betting │ │ ├── Portfolio-xkcd.png │ │ ├── bet_strategy.R │ │ ├── clean_data.R │ │ └── prediction.R │ ├── penalty │ │ ├── Scraping.ipynb │ │ └── penalty.R │ ├── prediction │ │ ├── clean_data.R │ │ ├── match_simulate.R │ │ ├── sim.R │ │ └── visualize.R │ └── xkcd.ttf ├── Paul_hypothesis_test.R ├── RuleOfThree.R ├── bayes_god.R ├── bayesian_gym.R ├── dating_sim.R ├── end_to_end_projects.R └── lindy │ ├── Inverse_Random_Sampling.pdf │ └── lindy_simulation.R ├── README.md ├── data ├── Team2015season.csv ├── Vietnamese_2016.csv ├── all_games.csv ├── all_penalties.csv ├── fixtures.csv ├── history.csv ├── housing.csv └── national_longitudinal_survey.csv ├── dog_home.png ├── images ├── bayesian.png ├── dog_home.png ├── epl.png ├── fireworks.gif ├── messi-scribble.png ├── network.png ├── paul.png ├── scrible-test.png ├── selfie.png ├── selfie2.png └── test.R └── llm_bots ├── animeyourself ├── anime_yourself.py ├── main.py ├── readme.md └── requirements.txt └── scribble2img ├── README.md ├── main.py ├── requirements.txt └── scribble2image.py /Python/CollaborativeFiltering/Goodreads_surprise.py: -------------------------------------------------------------------------------- 1 | """ 2 | Code for Collaborative Filtering project 3 | 4 | Instead of using module from Training module.py, 5 | using surprise package which is a lot more efficient 6 | in terms of sparse matrix handling. 7 | """ 8 | 9 | import numpy as np 10 | import pandas as pd 11 | from surprise import SVD 12 | from surprise import Dataset 13 | from surprise import Reader 14 | from surprise import accuracy 15 | from surprise import BaselineOnly 16 | from surprise.model_selection import train_test_split 17 | 18 | # 6 millions rows 19 | df = pd.read_csv('/ratings.csv') 20 | df['user_id'] = df['user_id'].astype(str) 21 | df.head() 22 | 23 | # 10k books 24 | books = pd.read_csv('/books.csv') 25 | 26 | # create a mapping between book_id and name 27 | id_to_name = {} 28 | for row in new_books.itertuples(): 29 | id_to_name[row[1]] = row[11] 30 | 31 | # add my onw book ratings 32 | my_rating = {'user_id': [str(53425) for i in range(11)], 33 | 'book_id': [13, 119, 240, 283, 479, 1100, 2205, 2246, 3227, 7210, 5], 34 | 'rating': [5, 3, 4, 4, 4, 5, 2, 4, 4, 3, 3]} 35 | print([id_to_name[id] for id in [13, 119, 240, 283, 479, 1100, 2205, 2246, 3227, 7210, 5]]) 36 | full_rating = pd.DataFrame(my_rating).append(new_rating) 37 | 38 | 39 | # Load our data into DataSet class of surprise package 40 | reader = Reader(rating_scale=(1, 5)) 41 | data = Dataset.load_from_df(full_rating[['user_id', 'book_id', 'rating']], reader) 42 | 43 | # split into trainset and testset 44 | trainset, testset = train_test_split(data, test_size=.10) 45 | train_eval = trainset.build_testset() 46 | 47 | # train a Funk SGD-SVD algorithms: 48 | epochs = [1, 5, 10, 20, 40, 80, 100, 120, 150] 49 | train_mse = [] 50 | test_mse = [] 51 | for n_epoch in epochs: 52 | print("Number of epochs trained", n_epoch) 53 | algo = SVD(n_factors = 40, lr_all = 0.001, n_epochs = n_epoch) 54 | algo.fit(trainset) 55 | train_predictions = algo.test(train_eval) 56 | test_predictions = algo.test(testset) 57 | train_mse.append(accuracy.mse(train_predictions)) 58 | test_mse.append(accuracy.mse(test_predictions)) 59 | print(accuracy.mse(train_predictions), accuracy.mse(test_predictions)) 60 | 61 | # function to plot the learning curve through epochs 62 | def plot_learning_curve(iter_array, train_accuracy, test_accuracy, xlabel = 'iterations'): 63 | plt.plot(iter_array, train_accuracy, 64 | label='Train mse', linewidth=5) 65 | plt.plot(iter_array, test_accuracy, 66 | label='Test mse', linewidth=5) 67 | 68 | 69 | plt.xticks(fontsize=16); 70 | plt.yticks(fontsize=16); 71 | plt.xlabel(xlabel, fontsize=30); 72 | plt.ylabel('MSE', fontsize=30); 73 | plt.legend(loc='best', fontsize=20); 74 | 75 | plot_learning_curve(epochs, train_mse, test_mse) 76 | 77 | #train on a full dataset and make prediction 78 | full_trainset = data.build_full_trainset() 79 | algo = SVD(n_factors = 40, lr_all = 0.001, verbose=True, n_epochs = 100) 80 | algo.fit(full_trainset) 81 | 82 | # make prediction: 83 | all_book_id = full_rating.book_id.unique() 84 | top_n = [] 85 | for book_id in all_book_id: 86 | top_n.append(algo.predict(uid = str(53425), iid = book_id)) 87 | top_n.sort(key=lambda x: x.est, reverse=True) 88 | print([id_to_name[pred.iid] for pred in top_n[:10]] 89 | -------------------------------------------------------------------------------- /Python/CollaborativeFiltering/TrainingModule.py: -------------------------------------------------------------------------------- 1 | %matplotlib inline 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | sns.set() 5 | import numpy as np 6 | import pandas as pd 7 | import io 8 | from collections import defaultdict 9 | from sklearn.metrics import mean_squared_error 10 | from numpy.linalg import solve 11 | np.random.seed(0) 12 | 13 | # input is a dataframe with 3 columns 14 | # user_id, item_id, rating 15 | def create_rating_matrix(df): 16 | n_users = df.user_id.unique().shape[0] 17 | n_items = df.item_id.unique().shape[0] 18 | ratings = np.zeros((n_users, n_items)) 19 | for row in df.itertuples(): 20 | # row[1] - 1 is the user id readjusted to start by index 0 21 | # row[2] - 1 is the item id readjusted to start by index 0 22 | ratings[row[1]-1, row[2]-1] = row[3] 23 | ratings 24 | 25 | 26 | # calculate sparsity of rating matrix 27 | def calculate_sparsity(rating_matrix) 28 | sparsity = float(len(rating_matrixnonzero()[0])) * 100 / (rating_matrix.shape[0] * rating_matrix.shape[1]) 29 | return sparsity 30 | 31 | 32 | # function to split train, test data 33 | def train_test_split(ratings, pct): 34 | test = np.zeros(ratings.shape) 35 | train = ratings.copy() 36 | for user in range(ratings.shape[0]): 37 | user_rating_idx = ratings[user, :].nonzero()[0] 38 | test_ratings = np.random.choice(user_rating_idx, 39 | size=int(len(user_rating_idx)*pct), 40 | replace=False) 41 | train[user, test_ratings] = 0. 42 | test[user, test_ratings] = ratings[user, test_ratings] 43 | 44 | # Test and training are truly disjoint 45 | assert(np.all((train * test) == 0)) 46 | return train, test 47 | 48 | 49 | # function to calculate MSE error 50 | def get_mse(pred, actual): 51 | pred = pred[actual.nonzero()].flatten() 52 | actual = actual[actual.nonzero()].flatten() 53 | return mean_squared_error(pred, actual) 54 | 55 | 56 | class AlternatingLeastSquareMF(): 57 | def __init__(self, 58 | ratings, 59 | n_factors=40, 60 | item_reg=0.0, 61 | user_reg=0.0 62 | ): 63 | """ 64 | Train a matrix factorization model to predict empty 65 | entries in a matrix. 66 | 67 | Params 68 | ====== 69 | ratings : (ndarray) 70 | User x Item matrix with corresponding ratings 71 | 72 | n_factors : (int) 73 | Number of latent factors (k) to use in model 74 | 75 | item_reg : (float) 76 | Regularization term for item latent factors 77 | 78 | user_reg : (float) 79 | Regularization term for user latent factors 80 | """ 81 | 82 | self.ratings = ratings 83 | self.n_users, self.n_items = ratings.shape 84 | self.n_factors = n_factors 85 | self.item_reg = item_reg 86 | self.user_reg = user_reg 87 | 88 | def alternating_step(self, 89 | latent_vectors, 90 | fixed_vecs, 91 | ratings, 92 | _lambda, 93 | type='user'): 94 | """ 95 | One of the two ALS steps. Solve for the latent vectors 96 | specified by type. 97 | """ 98 | if type == 'user': 99 | # Precompute 100 | YTY = fixed_vecs.T.dot(fixed_vecs) 101 | lambdaI = np.eye(YTY.shape[0]) * _lambda 102 | 103 | for u in range(latent_vectors.shape[0]): 104 | latent_vectors[u, :] = solve((YTY + lambdaI), 105 | ratings[u, :].dot(fixed_vecs)) 106 | elif type == 'item': 107 | # Precompute 108 | XTX = fixed_vecs.T.dot(fixed_vecs) 109 | lambdaI = np.eye(XTX.shape[0]) * _lambda 110 | 111 | for i in range(latent_vectors.shape[0]): 112 | latent_vectors[i, :] = solve((XTX + lambdaI), 113 | ratings[:, i].T.dot(fixed_vecs)) 114 | return latent_vectors 115 | 116 | 117 | 118 | def train(self, n_iter=10): 119 | """ Train model for n_iter iterations from scratch.""" 120 | # initialize latent vectors 121 | self.user_vecs = np.random.random((self.n_users, self.n_factors)) 122 | self.item_vecs = np.random.random((self.n_items, self.n_factors)) 123 | 124 | ctr = 1 125 | while ctr <= n_iter: 126 | self.user_vecs = self.alternating_step(self.user_vecs, 127 | self.item_vecs, 128 | self.ratings, 129 | self.user_reg, 130 | type='user') 131 | self.item_vecs = self.alternating_step(self.item_vecs, 132 | self.user_vecs, 133 | self.ratings, 134 | self.item_reg, 135 | type='item') 136 | ctr += 1 137 | 138 | def predict_all(self): 139 | """ Predict ratings for every user and item. """ 140 | predictions = np.zeros((self.user_vecs.shape[0], 141 | self.item_vecs.shape[0])) 142 | for u in range(self.user_vecs.shape[0]): 143 | for i in range(self.item_vecs.shape[0]): 144 | predictions[u, i] = self.predict(u, i) 145 | 146 | return predictions 147 | 148 | def predict(self, u, i): 149 | """ Single user and item prediction. """ 150 | return self.user_vecs[u, :].dot(self.item_vecs[i, :].T) 151 | 152 | def calculate_learning_curve(self, iter_array, test): 153 | """ 154 | Keep track of MSE as a function of training iterations. 155 | 156 | Params 157 | ====== 158 | iter_array : (list) 159 | List of numbers of iterations to train for each step of 160 | the learning curve. e.g. [1, 5, 10, 20] 161 | test : (2D ndarray) 162 | Testing dataset (assumed to be user x item). 163 | 164 | The function creates two new class attributes: 165 | 166 | train_mse : (list) 167 | Training data MSE values for each value of iter_array 168 | test_mse : (list) 169 | Test data MSE values for each value of iter_array 170 | """ 171 | iter_array.sort() 172 | self.train_mse =[] 173 | self.test_mse = [] 174 | for (i, n_iter) in enumerate(iter_array): 175 | self.train(n_iter) 176 | predictions = self.predict_all() 177 | 178 | self.train_mse += [get_mse(predictions, self.ratings)] 179 | self.test_mse += [get_mse(predictions, test)] 180 | print('Train mse: ' + str(self.train_mse[-1])) 181 | print('Test mse: ' + str(self.test_mse[-1])) 182 | 183 | class SGDMF(): 184 | def __init__(self, 185 | ratings, 186 | n_factors=40, 187 | item_fact_reg=0.0, 188 | user_fact_reg=0.0, 189 | item_bias_reg=0.0, 190 | user_bias_reg=0.0, 191 | verbose=False 192 | ): 193 | """ 194 | Train an SGD matrix factorization model to predict empty 195 | entries in a matrix. 196 | """ 197 | 198 | self.ratings = ratings 199 | self.n_users, self.n_items = ratings.shape 200 | self.n_factors = n_factors 201 | self.item_fact_reg = item_fact_reg 202 | self.user_fact_reg = user_fact_reg 203 | self.item_bias_reg = item_bias_reg 204 | self.user_bias_reg = user_bias_reg 205 | self.sample_row, self.sample_col = self.ratings.nonzero() 206 | self.n_samples = len(self.sample_row) 207 | 208 | def sgd(self): 209 | for idx in self.training_indices: 210 | u = self.sample_row[idx] 211 | i = self.sample_col[idx] 212 | prediction = self.predict(u, i) 213 | e = (self.ratings[u,i] - prediction) # error 214 | 215 | # Update biases 216 | self.user_bias[u] += self.learning_rate * \ 217 | (e - self.user_bias_reg * self.user_bias[u]) 218 | self.item_bias[i] += self.learning_rate * \ 219 | (e - self.item_bias_reg * self.item_bias[i]) 220 | 221 | #Update latent factors 222 | self.user_vecs[u, :] += self.learning_rate * \ 223 | (e * self.item_vecs[i, :] - \ 224 | self.user_fact_reg * self.user_vecs[u,:]) 225 | self.item_vecs[i, :] += self.learning_rate * \ 226 | (e * self.user_vecs[u, :] - \ 227 | self.item_fact_reg * self.item_vecs[i,:]) 228 | 229 | def train(self, n_iter=10, learning_rate=0.1): 230 | """ Train model for n_iter iterations from scratch.""" 231 | # initialize latent vectors 232 | self.user_vecs = np.random.random(size=(self.n_users, self.n_factors)) 233 | self.item_vecs = np.random.random(size=(self.n_items, self.n_factors)) 234 | 235 | self.learning_rate = learning_rate 236 | self.user_bias = np.zeros(self.n_users) 237 | self.item_bias = np.zeros(self.n_items) 238 | self.global_bias = np.mean(self.ratings[np.where(self.ratings != 0)]) 239 | 240 | ctr = 1 241 | while ctr <= n_iter: 242 | self.training_indices = np.arange(self.n_samples) 243 | np.random.shuffle(self.training_indices) 244 | self.sgd() 245 | ctr += 1 246 | 247 | def predict(self, u, i): 248 | prediction = self.global_bias + self.user_bias[u] + self.item_bias[i] 249 | prediction += self.user_vecs[u, :].dot(self.item_vecs[i, :].T) 250 | return prediction 251 | 252 | def predict_all(self): 253 | """ Predict ratings for every user and item.""" 254 | predictions = np.zeros((self.user_vecs.shape[0], 255 | self.item_vecs.shape[0])) 256 | for u in range(self.user_vecs.shape[0]): 257 | for i in range(self.item_vecs.shape[0]): 258 | predictions[u, i] = self.predict(u, i) 259 | 260 | return predictions 261 | 262 | def calculate_learning_curve(self, iter_array, test, learning_rate=0.1): 263 | iter_array.sort() 264 | self.train_mse =[] 265 | self.test_mse = [] 266 | for (i, n_iter) in enumerate(iter_array): 267 | self.train(n_iter, learning_rate) 268 | 269 | predictions = self.predict_all() 270 | 271 | self.train_mse += [get_mse(predictions, self.ratings)] 272 | self.test_mse += [get_mse(predictions, test)] 273 | print('Train mse: ' + str(self.train_mse[-1])) 274 | print('Test mse: ' + str(self.test_mse[-1])) 275 | 276 | 277 | 278 | def plot_learning_curve(iter_array, model): 279 | plt.plot(iter_array, model.train_mse, \ 280 | label='Training', linewidth=5) 281 | plt.plot(iter_array, model.test_mse, \ 282 | label='Test', linewidth=5) 283 | 284 | 285 | plt.xticks(fontsize=16); 286 | plt.yticks(fontsize=16); 287 | plt.xlabel('iterations', fontsize=30); 288 | plt.ylabel('MSE', fontsize=30); 289 | plt.legend(loc='best', fontsize=20); 290 | 291 | 292 | 293 | 294 | -------------------------------------------------------------------------------- /Python/fireworks.py: -------------------------------------------------------------------------------- 1 | ''' 2 | FIREWORKS SIMULATION WITH TKINTER 3 | 4 | *self-containing code 5 | *to run: simply type python simple.py in your console 6 | *compatible with both Python 2 and Python 3 7 | *Dependencies: tkinter, Pillow (only for background image) 8 | *The design is based on high school physics, with some small twists only for aesthetics purpose 9 | 10 | ''' 11 | import tkinter as tk 12 | #from tkinter import messagebox 13 | #from tkinter import PhotoImage 14 | from PIL import Image, ImageTk 15 | from time import time, sleep 16 | from random import choice, uniform, randint 17 | from math import sin, cos, radians 18 | 19 | # gravity, act as our constant g, you can experiment by changing it 20 | GRAVITY = 0.05 21 | # list of color, can choose randomly or use as a queue (FIFO) 22 | colors = ['red', 'blue', 'yellow', 'white', 'green', 'orange', 'purple', 'seagreen','indigo', 'cornflowerblue'] 23 | 24 | ''' 25 | Generic class for particles 26 | 27 | particles are emitted almost randomly on the sky, forming a round of circle (a star) before falling and getting removed 28 | from canvas 29 | 30 | Attributes: 31 | - id: identifier of a particular particle in a star 32 | - x, y: x,y-coordinate of a star (point of explosion) 33 | - vx, vy: speed of particle in x, y coordinate 34 | - total: total number of particle in a star 35 | - age: how long has the particle last on canvas 36 | - color: self-explantory 37 | - cv: canvas 38 | - lifespan: how long a particle will last on canvas 39 | 40 | ''' 41 | class part: 42 | def __init__(self, cv, idx, total, explosion_speed, x=0., y=0., vx = 0., vy = 0., size=2., color = 'red', lifespan = 2, **kwargs): 43 | self.id = idx 44 | self.x = x 45 | self.y = y 46 | self.initial_speed = explosion_speed 47 | self.vx = vx 48 | self.vy = vy 49 | self.total = total 50 | self.age = 0 51 | self.color = color 52 | self.cv = cv 53 | self.cid = self.cv.create_oval( 54 | x - size, y - size, x + size, 55 | y + size, fill=self.color) 56 | self.lifespan = lifespan 57 | 58 | def update(self, dt): 59 | self.age += dt 60 | 61 | # particle expansions 62 | if self.alive() and self.expand(): 63 | move_x = cos(radians(self.id*360/self.total))*self.initial_speed 64 | move_y = sin(radians(self.id*360/self.total))*self.initial_speed 65 | self.cv.move(self.cid, move_x, move_y) 66 | self.vx = move_x/(float(dt)*1000) 67 | 68 | # falling down in projectile motion 69 | elif self.alive(): 70 | move_x = cos(radians(self.id*360/self.total)) 71 | # we technically don't need to update x, y because move will do the job 72 | self.cv.move(self.cid, self.vx + move_x, self.vy+GRAVITY*dt) 73 | self.vy += GRAVITY*dt 74 | 75 | # remove article if it is over the lifespan 76 | elif self.cid is not None: 77 | cv.delete(self.cid) 78 | self.cid = None 79 | 80 | # define time frame for expansion 81 | def expand (self): 82 | return self.age <= 1.2 83 | 84 | # check if particle is still alive in lifespan 85 | def alive(self): 86 | return self.age <= self.lifespan 87 | 88 | ''' 89 | Firework simulation loop: 90 | Recursively call to repeatedly emit new fireworks on canvas 91 | 92 | a list of list (list of stars, each of which is a list of particles) 93 | is created and drawn on canvas at every call, 94 | via update protocol inside each 'part' object 95 | ''' 96 | def simulate(cv): 97 | t = time() 98 | explode_points = [] 99 | wait_time = randint(10,100) 100 | numb_explode = randint(6,10) 101 | # create list of list of all particles in all simultaneous explosion 102 | for point in range(numb_explode): 103 | objects = [] 104 | x_cordi = randint(50,550) 105 | y_cordi = randint(50, 150) 106 | speed = uniform (0.5, 1.5) 107 | size = uniform (0.5,3) 108 | color = choice(colors) 109 | explosion_speed = uniform(0.2, 1) 110 | total_particles = randint(10,50) 111 | for i in range(1,total_particles): 112 | r = part(cv, idx = i, total = total_particles, explosion_speed = explosion_speed, x = x_cordi, y = y_cordi, 113 | vx = speed, vy = speed, color=color, size = size, lifespan = uniform(0.6,1.75)) 114 | objects.append(r) 115 | explode_points.append(objects) 116 | 117 | total_time = .0 118 | # keeps undate within a timeframe of 1.8 second 119 | while total_time < 1.8: 120 | sleep(0.01) 121 | tnew = time() 122 | t, dt = tnew, tnew - t 123 | for point in explode_points: 124 | for item in point: 125 | item.update(dt) 126 | cv.update() 127 | total_time += dt 128 | # recursive call to continue adding new explosion on canvas 129 | root.after(wait_time, simulate, cv) 130 | 131 | def close(*ignore): 132 | """Stops simulation loop and closes the window.""" 133 | global root 134 | root.quit() 135 | 136 | if __name__ == '__main__': 137 | root = tk.Tk() 138 | cv = tk.Canvas(root, height=600, width=600) 139 | # use a nice background image 140 | image = Image.open("image.jpg") 141 | photo = ImageTk.PhotoImage(image) 142 | cv.create_image(0, 0, image=photo, anchor='nw') 143 | 144 | cv.pack() 145 | root.protocol("WM_DELETE_WINDOW", close) 146 | 147 | root.after(100, simulate, cv) 148 | 149 | root.mainloop() 150 | -------------------------------------------------------------------------------- /Python/flights_networkx.py: -------------------------------------------------------------------------------- 1 | # import libaries 2 | import pandas as pd 3 | import numpy as np 4 | import networkx as nx 5 | import matplotlib.pyplot as plt 6 | from mpl_toolkits.basemap import Basemap as Basemap 7 | import matplotlib.lines as mlines 8 | 9 | def main(): 10 | # download airport info data 11 | airport_col = ['ID', 'Name', 'City', 'Country','IATA', 'ICAO', 'Lat', 'Long', 'Alt', 12 | 'Timezone', 'DST', 'Tz database time zone', 'type', 'source'] 13 | airport_df = pd.read_csv("https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat", 14 | names = airport_col, index_col = 0) 15 | 16 | # download flight routes data 17 | route_cols = ['Airline', 'Airline ID', 'Source Airport', 'Source Airport ID', 18 | 'Dest Airport', 'Dest Airport ID', 'Codeshare', 'Stops', 'equipment'] 19 | routes_df = pd.read_csv("https://raw.githubusercontent.com/jpatokal/openflights/master/data/routes.dat", 20 | names = route_cols) 21 | #clean up data, change 'object' type to numeric and drops NaNs 22 | routes_df['Source Airport ID'] = pd.to_numeric(routes_df['Source Airport ID'].astype(str), 'coerce') 23 | routes_df['Dest Airport ID'] = pd.to_numeric(routes_df['Dest Airport ID'].astype(str), 'coerce') 24 | routes_df = routes_df.dropna(subset=["Source Airport ID", "Dest Airport ID"]) 25 | 26 | 27 | simple_visualization(airport_df, routes_df) 28 | advanced_visualization(airport_df, routes_df) 29 | 30 | ##### Part 1: simple network visualization, Alaska and other non-mainlain territories included #### 31 | ################################################################################################### 32 | # extract country and then extra columns 33 | def simple_visualization (airport_df, routes_df): 34 | if (airport_df is None) or (routes_df is None): 35 | print "Data cannot be retrieved and read" 36 | else: 37 | airport_us = airport_df[(airport_df.Country == "United States")][['Name','Lat', 'Long', 'IATA', 'ICAO']] 38 | us_airport_ix = airport_us.index.values 39 | routes_us = routes_df[(routes_df['Source Airport ID'].isin(us_airport_ix)) & 40 | (routes_df['Dest Airport ID'].isin(us_airport_ix))] #extract routes that flyies from AND to USA 41 | routes_us = pd.DataFrame(routes_us.groupby(['Source Airport', 'Dest Airport']).size().reset_index(name='counts')) 42 | # to find number of flights in and out of an airport 43 | # it is similar to find number of rows in which each airport occur in either one of the 2 columns 44 | counts = routes_us['Source Airport'].append(routes_us.loc[routes_us['Source Airport'] != routes_us['Dest Airport'], 'Dest Airport']).value_counts() 45 | # create a data frame of position based on names in count 46 | counts = pd.DataFrame({'IATA': counts.index, 'total_flight': counts}) 47 | pos_data = counts.merge(airport_us, on = 'IATA') 48 | 49 | # Create graph 50 | graph = nx.from_pandas_edgelist(routes_us, source = 'Source Airport', target = 'Dest Airport', 51 | edge_attr = 'counts',create_using = nx.DiGraph()) 52 | 53 | # default graph using Networkx inbuilt graph tools 54 | plt.figure(figsize = (10,9)) 55 | nx.draw_networkx(graph) 56 | plt.savefig("./images/networkx_basemap/map_0.png", format = "png", dpi = 300) 57 | plt.show() 58 | 59 | # Set up base map 60 | plt.figure(figsize=(15,20)) 61 | m = Basemap( 62 | projection='merc', 63 | llcrnrlon=-180, 64 | llcrnrlat=10, 65 | urcrnrlon=-50, 66 | urcrnrlat=70, 67 | lat_ts=0, 68 | resolution='l', 69 | suppress_ticks=True) 70 | 71 | # import long lat as m attribute 72 | mx, my = m(pos_data['Long'].values, pos_data['Lat'].values) 73 | pos = {} 74 | for count, elem in enumerate (pos_data['IATA']): 75 | pos[elem] = (mx[count], my[count]) 76 | 77 | # draw nodes and edges and over aly on basemap 78 | nx.draw_networkx_nodes(G = graph, pos = pos, node_list = graph.nodes(), node_color = 'r', alpha = 0.8, 79 | node_size = [counts['total_flight'][s]*3 for s in graph.nodes()]) 80 | nx.draw_networkx_edges(G = graph, pos = pos, edge_color='g', width = routes_us['counts']*0.75, 81 | alpha=0.2, arrows = False) 82 | 83 | m.drawcountries(linewidth = 3) 84 | m.drawstates(linewidth = 0.2) 85 | m.drawcoastlines(linewidth=3) 86 | plt.tight_layout() 87 | plt.savefig("./images/networkx_basemap/map_2.png", format = "png", dpi = 300) 88 | plt.show() 89 | print ("successful visualization") 90 | return 0 91 | 92 | ##### Part 2: more on visualization, only mainlain territories with more features #### 93 | ###################################################################################### 94 | # extract country and then extra columns 95 | def advanced_visualization (airport_df, routes_df): 96 | if (airport_df is None) or (routes_df is None): 97 | print ("Data cannot be retrieved and read") 98 | else: 99 | airport_us = airport_df[(airport_df.Country == "United States") & (airport_df.Lat > 25) 100 | & (airport_df.Lat < 50) & (airport_df.Long > -130) & (airport_df.Long < -60)] 101 | us_airport_ix = airport_us.index.values 102 | routes_us = routes_df[(routes_df['Source Airport ID'].isin(us_airport_ix)) & 103 | (routes_df['Dest Airport ID'].isin(us_airport_ix))] #extract routes that flyies from AND to USA 104 | routes_us = pd.DataFrame(routes_us.groupby(['Source Airport', 'Dest Airport']).size().reset_index(name='counts')) 105 | # to find number of flights in and out of an airport 106 | # it is similar to find number of rows in which each airport occur in either one of the 2 columns 107 | counts = routes_us['Source Airport'].append(routes_us.loc[routes_us['Source Airport'] != routes_us['Dest Airport'], 'Dest Airport']).value_counts() 108 | # create a data frame of position based on names in count 109 | counts = pd.DataFrame({'IATA': counts.index, 'total_flight': counts}) 110 | pos_data = counts.merge(airport_us, on = 'IATA') 111 | 112 | # Create graph 113 | graph = nx.from_pandas_edgelist(routes_us, source = 'Source Airport', target = 'Dest Airport', 114 | edge_attr = 'counts',create_using = nx.DiGraph()) 115 | 116 | # Set up base map 117 | plt.figure(figsize=(15,20)) 118 | m = Basemap( 119 | projection='merc', 120 | llcrnrlon=-180, 121 | llcrnrlat=10, 122 | urcrnrlon=-50, 123 | urcrnrlat=70, 124 | lat_ts=0, 125 | resolution='l', 126 | suppress_ticks=True) 127 | 128 | # import long lat as m attribute 129 | mx, my = m(pos_data['Long'].values, pos_data['Lat'].values) 130 | pos = {} 131 | for count, elem in enumerate (pos_data['IATA']): 132 | pos[elem] = (mx[count], my[count]) 133 | 134 | # draw nodes and edges and overly on basemap 135 | nx.draw_networkx_nodes(G = graph, pos = pos, nodelist = [x for x in graph.nodes() if counts['total_flight'][x] >= 100], 136 | node_color = 'r', alpha = 0.8, 137 | node_size = [counts['total_flight'][x]*4 for x in graph.nodes() if counts['total_flight'][x] >= 100]) 138 | 139 | nx.draw_networkx_labels(G = graph, pos = pos, font_size=10, 140 | labels = {x:x for x in graph.nodes() if counts['total_flight'][x] >= 100}) 141 | 142 | nx.draw_networkx_nodes(G = graph, pos = pos, nodelist = [x for x in graph.nodes() if counts['total_flight'][x] < 100], 143 | node_color = 'b', alpha = 0.6, 144 | node_size = [counts['total_flight'][x]*4 for x in graph.nodes() if counts['total_flight'][x] < 100]) 145 | 146 | nx.draw_networkx_edges(G = graph, pos = pos, edge_color = 'g', width = routes_us['counts']*0.75, 147 | alpha=0.06, arrows = False) 148 | 149 | m.drawcountries(linewidth = 3) 150 | m.drawstates(linewidth = 0.2) 151 | m.drawcoastlines(linewidth=1) 152 | m.fillcontinents(alpha = 0.3) 153 | line1 = mlines.Line2D(range(1), range(1), color="white", marker='o', markerfacecolor="red") 154 | line2 = mlines.Line2D(range(1), range(1), color="white", marker='o',markerfacecolor="blue") 155 | line3 = mlines.Line2D(range(1), range(1), color="green", marker='',markerfacecolor="green") 156 | plt.legend((line1, line2, line3), ('Large Airport > 100 routes', 'Smaller airports', 'routes'), 157 | loc=4, fontsize = 'xx-large') 158 | plt.title("Network graph of flight routes in the USA", fontsize = 30) 159 | #m.bluemarble() 160 | plt.tight_layout() 161 | plt.savefig("./images/networkx_basemap/map_3.png", format = "png", dpi = 300) 162 | plt.show() 163 | print ("successful visualization") 164 | return 0 165 | 166 | if __name__ == "__main__": 167 | main() 168 | -------------------------------------------------------------------------------- /Python/image.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/image.jpg -------------------------------------------------------------------------------- /Python/images/networkx_basemap/map_0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/map_0.png -------------------------------------------------------------------------------- /Python/images/networkx_basemap/map_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/map_1.png -------------------------------------------------------------------------------- /Python/images/networkx_basemap/map_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/map_2.png -------------------------------------------------------------------------------- /Python/images/networkx_basemap/map_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/map_3.png -------------------------------------------------------------------------------- /Python/images/networkx_basemap/table_1.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/table_1.PNG -------------------------------------------------------------------------------- /Python/images/networkx_basemap/table_2.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/table_2.PNG -------------------------------------------------------------------------------- /Python/images/networkx_basemap/table_3.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/table_3.PNG -------------------------------------------------------------------------------- /Python/images/networkx_basemap/table_5.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/table_5.PNG -------------------------------------------------------------------------------- /Python/lincoln_estimate.py: -------------------------------------------------------------------------------- 1 | from matplotlib import pyplot as plt 2 | 3 | def like_insta_post(p): 4 | "Find an error with probability p" 5 | return 1 if random.random() < p else 0 6 | 7 | def simulate(true_audience, p1, p2, reps=10000): 8 | """Simulate Lincoln's method for estimating errors 9 | given the true number of errors, each person's probability 10 | of finding an error, and the number of simulations to run.""" 11 | naive_estimates = [] 12 | lincoln_estimates = [] 13 | 14 | for rep in range(reps): 15 | like_post_1 = np.array([like_insta_post(p1) for _ in range(true_audience)]) 16 | like_post_2 = np.array([like_insta_post(p2) for _ in range(true_audience)]) 17 | like_post1_count = sum(like_post_1) 18 | like_post2_count = sum(like_post_2) 19 | overlap = np.sum(like_post_1 & like_post_2) 20 | 21 | naive_estimates.append(like_post1_count + like_post2_count - overlap) 22 | if overlap > 0: 23 | lincoln_estimates.append(like_post1_count*like_post2_count / float(overlap)) 24 | 25 | return naive_estimates, lincoln_estimates 26 | 27 | def calc_stats(arr): 28 | return ( 29 | np.mean(arr), 30 | np.std(arr, ddof=1), 31 | np.mean(arr) - 1.96*np.std(arr, ddof=1), 32 | np.mean(arr) + 1.96*np.std(arr, ddof=1) 33 | ) 34 | 35 | sims = [[0.3, 0.5], [0.6, 0.4], [0.7, 0.8], [0.9, 0.9]] 36 | # create 2 lists, 1 of data frame of values, 1 of titles 37 | res_arr = [] 38 | title_arr = [] 39 | 40 | for p in sims: 41 | naive_estimates, lincoln_estimates = simulate(100, p[0], p[1], reps=100000) 42 | naive_stats = calc_stats(naive_estimates) 43 | lincoln_stats = calc_stats(lincoln_estimates) 44 | naive_mean, naive_std = naive_stats[0], naive_stats[1] 45 | lincoln_mean, lincoln_std = lincoln_stats[0], lincoln_stats[1] 46 | 47 | pd_res = pd.DataFrame( 48 | { 49 | "method":["naive", "Lincoln"], 50 | "estimate":[naive_mean, lincoln_mean], 51 | "std": [naive_std, lincoln_std]} 52 | ) 53 | res_arr.append(pd_res) 54 | title_arr.append(f" p1={str(p[0])}\n p2={str(p[1])}") 55 | 56 | colors = ['blue', 'orange'] 57 | fig, axes = plt.subplots(1, 4, figsize=(18, 6), sharey=True) 58 | ax1, ax2, ax3, ax4 = axes 59 | for dat_df, ax, title in zip( 60 | res_arr, 61 | [ax1, ax2, ax3, ax4], 62 | title_arr 63 | ): 64 | dat_df.plot(x='method', y='estimate', yerr = 'std', kind='bar', color = colors, 65 | ax=ax, legend=False, xlabel='', ylabel = 'mean of estimates').set_title(title) 66 | 67 | for ax in axes: 68 | ax.set_xticklabels(ax.get_xticklabels(), rotation = 90) 69 | for side in ('right', 'top', 'left'): 70 | if (ax == ax1) and (side == 'left'): 71 | continue 72 | else: 73 | sp = ax.spines[side] 74 | sp.set_visible(False) 75 | -------------------------------------------------------------------------------- /Python/mbappe.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/mbappe.jpg -------------------------------------------------------------------------------- /Python/n_dimensionalNormal.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib as plt 3 | 4 | def get_3_std_estimates(x): 5 | return (x.mean() - 3 * x.std(), x.mean() + 3 * x.std()) 6 | 7 | def get_2_std_estimates(x): 8 | return (x.mean() - 2 * x.std(), x.mean() + 2 * x.std()) 9 | 10 | def get_1_std_estimates(x): 11 | return (x.mean() - x.std(), x.mean() + x.std()) 12 | 13 | 14 | N = 10000 15 | 16 | def get_graph(n, title): 17 | """ 18 | Draw a distribution histogram for a sample of N data from 19 | n-dimensional Normal distribution 20 | """ 21 | 22 | sample = np.random.normal(size=(N, n)) 23 | dist = np.square(np.linalg.norm(sample, axis = 1)) 24 | lower_bound, upper_bound = get_2_std_estimates(dist) 25 | n, bins, patches = plt.hist(dist, bins = 'auto', density = "true") 26 | plt.axvline(x = lower_bound, color = 'red') 27 | plt.axvline(x = upper_bound, color = 'red') 28 | plt.title(title, fontdict = {'fontsize': 20}) 29 | plt.show() 30 | 31 | get_graph(100, "Distribution of distance from origin for n = 100") 32 | 33 | 34 | def get_boundary(n): 35 | """ 36 | For a dimension value n, sample N data points from a n-dimensional 37 | Normal distribution and find the 2 standard deviation boundary 38 | for the squared Euclidan norms. 39 | """ 40 | 41 | sample = np.random.normal(size=(N, n)) 42 | dist = np.square(np.linalg.norm(sample, axis = 1)) 43 | lower_bound, upper_bound = get_2_std_estimates(dist) 44 | return (lower_bound, upper_bound) 45 | 46 | ### simulation 47 | n_range = range(1, 5001) 48 | lower_bounds = [] 49 | upper_bounds = [] 50 | 51 | for n in n_range: 52 | lower_bound, upper_bound = get_boundary(n) 53 | lower_bounds.append(lower_bound/n) 54 | upper_bounds.append(upper_bound/n) 55 | 56 | plt.style.use('seaborn-notebook') 57 | plt.plot(n_range, lower_bounds, label = 'lower_bounds\ndivided by n') 58 | plt.plot(n_range, upper_bounds, label = 'upper_bounds\ndivided by n') 59 | #plt.axvline(x=1000, color = 'red', linestyle = '--') 60 | plt.legend(prop={'size': 13}) 61 | plt.xlim(1, 5000) 62 | plt.xlabel("dimensions") 63 | plt.title("Ratio between 2-standard devation boundaries and n as n increases", fontdict = {'fontsize': 16}) 64 | plt.show() 65 | 66 | 67 | 68 | #### how many points lie in the 10% period or outside 69 | def get_pct_for_interval(n): 70 | sample = np.random.normal(size=(N, n)) 71 | dist = np.square(np.linalg.norm(sample, axis = 1)) 72 | 73 | lower_interval = np.count_nonzero(dist < n*0.95) 74 | middle_interval = np.count_nonzero((dist >= n*0.95) & (dist <= n*1.05)) 75 | large_interval = np.count_nonzero(dist > n*1.05) 76 | 77 | return lower_interval/N, middle_interval/N, large_interval/N 78 | 79 | lower_intervals = [] 80 | middle_intervals = [] 81 | large_intervals = [] 82 | 83 | for n in n_range: 84 | lower_interval, middle_interval, large_interval = get_pct_for_interval(n) 85 | lower_intervals.append(lower_interval) 86 | middle_intervals.append(middle_interval) 87 | large_intervals.append(large_interval) 88 | 89 | plt.stackplot(n_range, 90 | lower_intervals, 91 | middle_intervals, 92 | large_intervals, 93 | labels=['d^2 < 0.95n', 94 | '0.95n <= d^2 <= 1.05n', 95 | 'd^2 > 1.05n']) 96 | plt.legend() 97 | plt.xlabel("dimensions") 98 | plt.title("Probability that a sample point will be at some distance from the origin", fontdict = {'fontsize': 16}) 99 | plt.ylim(0, 1) 100 | plt.xlim(1, 5000) 101 | -------------------------------------------------------------------------------- /Python/optimal_dating.py: -------------------------------------------------------------------------------- 1 | import random 2 | import matplotlib as plt 3 | import seaborn as sns 4 | 5 | ################# 6 | # Top k algorithm 7 | ################# 8 | 9 | 10 | def perm_rank(n): 11 | """create a ranked order list of n items""" 12 | return random.sample(range(1, n+1), n) 13 | 14 | 15 | def top_k_selection_algo(array, m, k): 16 | """for any list of order, apply top-k algorithm 17 | 18 | Return whether we succeed (1) or failure (0) to 19 | identify top-k value 20 | """ 21 | top_first_m = min(array[:(m-1)]) 22 | # then for array[n:] 23 | # we pick first k values that is greater than max_first_m 24 | inspect_array = np.array(array[m-1:]) 25 | qualified_cand = inspect_array[inspect_array < top_first_m][:k] 26 | 27 | if len(qualified_cand) == k and max(qualified_cand) == k: 28 | return 1 29 | return 0 30 | 31 | 32 | def simulation_top_k(n, k, iters): 33 | """ 34 | for any value of k and n 35 | simulate all exploration cutoff from 2-> n 36 | and return a list of success probability at different cutoff 37 | """ 38 | result = [] 39 | for m in range(2, n+1): 40 | result_m = [] 41 | for i in range(iters): 42 | order = perm_rank(n) 43 | success = selection_algo(order, m, k) 44 | result_m.append(success) 45 | result.append(np.mean(result_m)) 46 | return result 47 | 48 | 49 | result = simulation_top_k(100, 1, iters) 50 | result_3 = simulation_top_k(100, 3, iters) 51 | result_5 = simulation_top_k(100, 5, iters) 52 | result_10 = simulation_top_k(100, 10, iters) 53 | 54 | 55 | plt.style.use('fivethirtyeight') 56 | plt.figure(figsize=(13,6)) 57 | sns.scatterplot(np.arange(2, 101),y=result, label = "k = 1") 58 | sns.scatterplot(np.arange(2, 101),y=result_3, label = "k = 3") 59 | sns.scatterplot(np.arange(2, 101),y=result_5, label = "k = 5") 60 | sns.scatterplot(np.arange(2, 101),y=result_10, label = "k = 10") 61 | plt.grid(False) 62 | plt.title("Probability of finding top k partners\n by exploring first r values") 63 | plt.xlabel("r values") 64 | plt.ylabel("Probability") 65 | 66 | 67 | ############################## 68 | # Top candidate with p success 69 | ############################## 70 | 71 | def selection_algo_with_success_rate(array, m, p): 72 | top_first_m = min(array[:(m-1)]) 73 | available_array = np.random.binomial(1, p, len(array)) 74 | #print(available_array) 75 | # then for array[n:] 76 | # we pick first k values that is greater than max_first_m 77 | #print("top first m", top_first_m) 78 | #print(array[:(m-1)], array[m-1:]) 79 | inspect_array = array[m-1:] 80 | inspect_available = available_array[m-1:] 81 | 82 | if top_first_m == 1: 83 | return 0 84 | available_idx = np.where(inspect_available == 1)[0] 85 | available_person = np.array(inspect_array)[available_idx] 86 | pass_cand = available_person[available_person < top_first_m] 87 | #print(pass_cand) 88 | if len(pass_cand) == 0: 89 | return 0 90 | accept = pass_cand[0] 91 | if accept == 1: 92 | return 1 93 | return 0 94 | 95 | def simulate_with_success_rate(n, p, iters): 96 | result = [] 97 | for m in range(2, n+1): 98 | result_m = [] 99 | for i in range(iters): 100 | order = perm_rank(n) 101 | success = selection_algo_with_success_rate(order, m, p) 102 | result_m.append(success) 103 | result.append(np.mean(result_m)) 104 | return result 105 | 106 | result_avail_1 = simulate_with_success_rate(100, 1, iters) 107 | result_avail_2 = ssimulate_with_success_rate(100, 0.25, iters) 108 | result_avail_5 = simulate_with_success_rate(100, 0.5, iters) 109 | result_avail_7 = simulate_with_success_rate(100, 0.75, iters) 110 | 111 | plt.style.use('fivethirtyeight') 112 | plt.figure(figsize=(13,6)) 113 | sns.scatterplot(np.arange(2, 101),y=result_avail_1, label = "p = 1") 114 | sns.scatterplot(np.arange(2, 101),y=result_avail_2, label = "p = 0.25") 115 | sns.scatterplot(np.arange(2, 101),y=result_avail_5, label = "p = 0.5") 116 | sns.scatterplot(np.arange(2, 101),y=result_avail_7, label = "p = 0.75") 117 | plt.title("Probability of finding top partner at different success rate\n by exploring first r values") 118 | plt.grid(False) 119 | plt.xlabel("r values") 120 | plt.ylabel("Probability") 121 | -------------------------------------------------------------------------------- /R/EPL/Agg.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/R/EPL/Agg.png -------------------------------------------------------------------------------- /R/EPL/Last.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/R/EPL/Last.png -------------------------------------------------------------------------------- /R/EPL/Misc/TeamEvaluate2015.R: -------------------------------------------------------------------------------- 1 | # Load libraries and read files 2 | packages <- c("dplyr", "fpc", "cluster", 3 | "factoextra", "dendextend", 4 | "psych", "qgraph") 5 | lapply(packages, library, character.only = TRUE) 6 | 7 | raw_df <- read.csv("./Team2015season.csv", header=T) 8 | # scale data 9 | 10 | scaled_data <- raw_df %>% 11 | remove_rownames() %>% 12 | column_to_rownames("Team") %>% 13 | scale() 14 | 15 | 16 | ####################################### 17 | # Hierarchical Cluster Analysis 18 | # Useful tutorial: 19 | # https://uc-r.github.io/hc_clustering 20 | ####################################### 21 | 22 | #Eucledian, Ward's method 23 | d_1 <- dist(scaled_data, method="euclidean") 24 | clust_1 <- hclust(d_1, method="ward.D") 25 | #draw the dendrogram 26 | plot(clust_1, 27 | cex=0.7, 28 | xlab="", 29 | ylab="Distance", 30 | main="Clusterings of 60 European teams") 31 | rect.hclust(clust_1, k = 4, border = 2:5) 32 | 33 | #get membership vector 34 | cuts <- cutree(clust_1,k=4) 35 | scaled_data %>% 36 | as.data.frame() %>% 37 | mutate(cluster = cuts) %>% 38 | head 39 | 40 | # Compute distance matrix 41 | res.dist <- dist(scaled_data, method = "euclidean") 42 | 43 | # Compute 2 hierarchical clusterings 44 | hc1 <- hclust(res.dist, method = "complete") 45 | hc2 <- hclust(res.dist, method = "ward.D2") 46 | 47 | # Create two dendrograms and compare group partition 48 | dend1 <- as.dendrogram (hc1) 49 | dend2 <- as.dendrogram (hc2) 50 | 51 | dend_list <- dendlist(dend1, dend2) 52 | 53 | tanglegram(dend1, dend2, 54 | lwd = 1, 55 | edge.lwd = 1, 56 | lab.cex = 0.5, 57 | columns_width = c(8, 3, 8), 58 | highlight_distinct_edges = FALSE, # Turn-off dashed lines 59 | common_subtrees_color_lines = FALSE, # Turn-off line colors 60 | common_subtrees_color_branches = TRUE, # Color common branches 61 | main = paste("entanglement =", round(entanglement(dend_list), 2)) 62 | ) 63 | 64 | ########################################### 65 | # K-means clustering 66 | # Useful tutorial: 67 | # https://uc-r.github.io/kmeans_clustering 68 | ########################################### 69 | 70 | # use 4 centers that Hc clustering suggests 71 | # nstart: attempts multiple initial configurations 72 | # and reports on the best one. 73 | km_results <- kmeans(scaled_data, centers = 4, nstart = 100) 74 | km_results 75 | 76 | # fviz_cluster does PCA and plot the data points 77 | # according to the first two PCs that explain the majority of the variance 78 | fviz_cluster(km_results, data = scaled_data) 79 | 80 | # Evaluating clustering 81 | # Best number of cluster using scree-plot (elbow method) 82 | # optimal total-wihtin cluster sum of square 83 | set.seed(123) 84 | fviz_nbclust(scaled_data, kmeans, method = "wss") 85 | 86 | # Average Silhouette method 87 | # measuring the quality of the clusters 88 | # by how well object lies within a cluster 89 | # try to maximize average silhouette 90 | fviz_nbclust(scaled_data, kmeans, method = "silhouette") 91 | 92 | # GAP statistics method 93 | # can apply to both kmeans and HC 94 | # compares the total intracluster variation 95 | # with their expected values 96 | # under null reference distribution of the data 97 | # at various value of k 98 | set.seed(123) 99 | gap_stat <- clusGap(scaled_data, 100 | FUN = kmeans, 101 | nstart = 100, 102 | K.max = 10, 103 | B = 50) 104 | # Print the result 105 | print(gap_stat, method = "firstmax") 106 | fviz_gap_stat(gap_stat) 107 | 108 | ################################################################### 109 | # Factor analysis 110 | # Useful tutorial: 111 | # http://www.di.fc.ul.pt/~jpn/r/factoranalysis/factoranalysis.html 112 | # https://rpubs.com/aaronsc32/factor-analysis-introduction 113 | ################################################################### 114 | # determined the number of factors to use with scree plot 115 | parallel <- fa.parallel(scaled_data, 116 | fm = 'minres', 117 | fa = 'fa') 118 | 119 | # factor analysis -- no rotation 120 | # Varimax: assume factors completely uncorrelated 121 | # Oblique: correlations in factors 122 | 123 | # Method: factanal only support MaxLikelihood 124 | # In fa (psych), we can use "PAF (pa)" or "mingres", 125 | # the later provide results similar to `MaxLikelihood` 126 | # without assuming multivariate normal distribution 127 | # and derives solutions through iterative eigen decomposition like principal axis. 128 | 129 | fa1 <- factanal(scaled_data, 130 | factors=2, 131 | rotation="none", 132 | scores="regression") 133 | 134 | fa2 <- fa(scaled_data, 135 | nfactors = 3, 136 | rotate = "oblimin", 137 | fm="minres") 138 | fa1 139 | 140 | # biplot 141 | biplot(fa1$scores[,1:2], 142 | loadings(fa1), 143 | cex=c(0.7,0.8)) 144 | # qgraph 145 | # a different visualization of biplot 146 | qg.fa1 <- qgraph(fa1) 147 | 148 | # NOTE: 149 | # - after Exploratory Factor Analysis (EFA), 150 | # - the next step could be Confirmatory Factor Analysis 151 | # - which is part of a larger subset: Structual Equation Modelling 152 | # - https://socialsciences.mcmaster.ca/jfox/Misc/sem/SEM-paper.pdf 153 | 154 | 155 | # we can get some flexibility from the "psych" package 156 | fa_analysis <- function(data_set, factor, 157 | rotate = "varimax", fm = "pa"){ 158 | res <- fa(data_set, nfactors = factor, 159 | rotate = rotate, fm = fm) 160 | print("Factor Analysis results:") 161 | print(res) 162 | 163 | # get loading plot for the first two factors 164 | plot(res$loadings, pch=18, col='red') 165 | abline(h=0) 166 | abline(v=0) 167 | text(res$loadings, labels=names(data_set),cex=0.8) 168 | 169 | #get reproduced correlation matrix 170 | repro <- res$loadings%*%t(res$loadings) 171 | #residual correlation matrix 172 | residual <- cor(data_set)-repro 173 | print("Residual correlation matrx") 174 | round(resid2,2) 175 | 176 | #get root-mean squared residuals 177 | len <- length(residual[upper.tri(residual)]) 178 | RMSR <- sqrt(sum(residual[upper.tri(residual)]^2)/len) 179 | print("Root-mean squared residuals:", RMSR) 180 | 181 | #get proportion of residuals greater than 0.05 in absolute value 182 | prop <- sum(rep(1,len)[abs(residual[upper.tri(residual)])>0.05])/len 183 | print("Proportion of residuals greater than 0.05 in absolute value:", prop) 184 | } 185 | 186 | # varimax - paf 187 | fa_analysis(soccer, 3) 188 | 189 | # quartimax - pag 190 | fa_analysis(soccer, 3, "quartimax", "pa") 191 | -------------------------------------------------------------------------------- /R/EPL/betting/Portfolio-xkcd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/R/EPL/betting/Portfolio-xkcd.png -------------------------------------------------------------------------------- /R/EPL/betting/bet_strategy.R: -------------------------------------------------------------------------------- 1 | library(tidyverse) 2 | source("prediction.R") 3 | # in MAC, may have to go to font book to activate xkcd.ttf 4 | #library(extrafont) 5 | #font_import(path = ".", pattern="xkcd") 6 | #fonts() 7 | #loadfonts() 8 | 9 | betting_house <- c("B365", "BW", "IW", "PS", "WH", "VC") 10 | 11 | # easy computation of max odd or mean probability 12 | # find max_odd if find_max is TRUE, else return Consensus Probability of event 13 | row_max_prob <- function(df, row_idx, find_max){ 14 | predict_outcome = df[row_idx, "predict_outcome"] 15 | if (is.na(predict_outcome)) return (NA) 16 | col_names <- paste0(betting_house, predict_outcome) 17 | val = ifelse(find_max, max(df[row_idx,col_names]), 1/mean(as.numeric(df[row_idx,col_names]))) 18 | return (val) 19 | } 20 | 21 | 22 | ##### find total return at every round 23 | # based on prediction, max_odd, Consensus Probability and amount of capital to bet 24 | # input in Round (Matchweek), method ("poisson", "merson", "random") and Amount of available capital 25 | betting_round <- function (round, method, capital){ 26 | total_return = 0 27 | 28 | round_data <- df_prediction %>% 29 | filter(Round == round) %>% 30 | mutate(method = method, 31 | predict_outcome = ifelse(method == "random", sample(c("H", "D", "A"),n(), replace = TRUE), 32 | ifelse(method == 'poisson', poisson_predict, Merson_predict))) 33 | no_matches = dim(round_data)[1] 34 | round_data$max_odd <- sapply(1:no_matches, function(x) row_max_prob(round_data, x, TRUE)) 35 | round_data$prob <- sapply(1:no_matches, function(x) row_max_prob(round_data, x, FALSE)) 36 | 37 | round_data <- round_data %>% 38 | mutate (fraction = ((prob*max_odd - (1-prob))/max_odd), 39 | f_normalize = fraction/sum(fraction, na.rm = TRUE), 40 | bet_amount = f_normalize * capital, 41 | payoff = ifelse(FTR == predict_outcome, bet_amount*max_odd, 0), 42 | profit = payoff-bet_amount) 43 | 44 | return (sum(round_data$profit, na.rm = TRUE)) 45 | } 46 | 47 | # inititate a table to store return result 48 | # remove Paul's Merson bet 49 | return_table <- data.frame(round = 0:30, 50 | Poisson = rep(0,31), 51 | random_bet = rep(0,31)) 52 | 53 | return_table[1,c("Poisson", "random_bet")] <- rep(1000,2) 54 | 55 | for (i in 1:30){ 56 | Poisson_return <- betting_round(i, "poisson",1000/30) 57 | random_return <- betting_round(i, "random", 1000/30) 58 | #Merson_return <- betting_round(i, "Merson",1000/30) 59 | 60 | return_table[i+1,"Poisson"] <- Poisson_return 61 | return_table[i+1,"random_bet"] <- random_return 62 | #return_table[i+1,"Merson_bet"] <- Merson_return 63 | } 64 | 65 | # we are interested the change in the portfolio overtime 66 | return_table$Poisson <- cumsum(return_table$Poisson) 67 | return_table$random_bet <- cumsum(return_table$random_bet) 68 | #return_table$Merson_bet <- cumsum(return_table$Merson_bet) 69 | 70 | return_table %>% 71 | gather("method", "value", -round) %>% 72 | mutate(method = factor(method, levels = c('Poisson', 'random_bet'), 73 | labels = c('Poisson prediction', 'random prediction'))) %>% 74 | ggplot(aes(x=round, y=value, group=method)) + 75 | geom_line(aes(color=method)) + 76 | scale_x_continuous(breaks = seq(0, 30, by = 5)) + 77 | ggtitle("Portfolio value at the end of every matchweek") + 78 | theme(axis.line = element_line(size=1, colour = "black"), 79 | panel.grid.major = element_blank(), 80 | panel.grid.minor = element_blank(), 81 | panel.border = element_blank(), 82 | panel.background = element_blank(), 83 | plot.title=element_text(size = 18, family="xkcd"), 84 | text=element_text(size = 13, family="xkcd"), 85 | axis.text.x=element_text(colour="black", size = 12), 86 | axis.text.y=element_text(colour="black", size = 12)) + 87 | ylab('Portfolio total value in dollars') + 88 | xlab ('Matchweek') 89 | 90 | 91 | ############# 92 | # Extra note 93 | # If you can invest as much as you want with $1000 buffer 94 | # meaning that you set out to invest 1000/30 per round 95 | # but if Kelly criterion asks for more, you can till afford it 96 | ############ 97 | 98 | ##### Method to combine Kelly criterion and odds 99 | betting_round <- function (round, predict_method, capital){ 100 | total_return = 0 101 | 102 | round_data <- df_prediction %>% 103 | filter(Round == round) 104 | 105 | for (i in 1:dim(round_data)[1]){ 106 | predict = ifelse(predict_method == "random", sample(c("H", "D", "A"),1), 107 | round_data[i,predict_method]) 108 | if (is.na(predict)){ 109 | total_return = total_return 110 | } 111 | else{ 112 | # once I have the prediction, I find the one with the highest odd 113 | odds = as.vector (round_data[i, paste0(betting_house, predict)]) 114 | odd = max(odds) 115 | 116 | predict_prob = 1/rowMeans(round_data[i,paste0(betting_house, predict)]) 117 | bet_amount = ((predict_prob*odd - (1-predict_prob))/odd)*capital 118 | 119 | total_return = ifelse(round_data[i, "FTR"] == predict, 120 | total_return + bet_amount*(odd-1), 121 | total_return - bet_amount) 122 | } 123 | } 124 | return (total_return) 125 | } 126 | -------------------------------------------------------------------------------- /R/EPL/betting/clean_data.R: -------------------------------------------------------------------------------- 1 | ######################################## 2 | # scripts to clean data to usable format 3 | # source: 4 | # - fixtures.csv: dedicatedexcel.com 5 | # - Historical results: https://www.kaggle.com/thefc17/epl-results-19932018 6 | ######################################### 7 | library (dplyr) 8 | 9 | link_fixture = "https://raw.githubusercontent.com/tuangauss/DataScienceProjects/master/data/all_games.csv" 10 | link_history = "https://raw.githubusercontent.com/tuangauss/DataScienceProjects/master/data/history.csv" 11 | 12 | fixtures <- read.csv(link_fixture, stringsAsFactors = FALSE) 13 | 14 | # get the team 15 | teams <- unique(fixtures$HOME.TEAM) 16 | 17 | # extract historic results 18 | history <- read.csv(link_history, stringsAsFactors = FALSE) 19 | 20 | # get info from the 2010 up to 2018 21 | seasons <- sapply(10:17, function(x) paste0(2000+x,'-',x+1)) 22 | 23 | recent.pl <- history %>% 24 | filter(Season %in% seasons, div == 'E0') 25 | 26 | # because the two data comes from different source, so the teams name don't match 27 | teams[!teams %in% recent.pl$HomeTeam] 28 | unique(recent.pl$HomeTeam) 29 | 30 | # now we need to fix it 31 | pair_fix <- list(c('Manchester United', 'Man United'), 32 | c('Newcastle United', 'Newcastle'), 33 | c('Huddersfield Town', 'Huddersfield'), 34 | c('Wolverhampton Wanderers', 'Wolves'), 35 | c('Cardiff City', 'Cardiff'), 36 | c('Leicester City', 'Leicester'), 37 | c('Tottenham Hotspur', 'Tottenham'), 38 | c('West Ham United', 'West Ham'), 39 | c('Manchester City', "Man City"), 40 | c('Brighton and Hove Albion', 'Brighton')) 41 | 42 | # fix the recent.pl dataset 43 | # for name-conformity 44 | for (i in 1:length(pair_fix)){ 45 | recent.pl <- recent.pl %>% 46 | mutate(HomeTeam = replace(HomeTeam, 47 | HomeTeam == pair_fix[[i]][2], 48 | pair_fix[[i]][1]), 49 | AwayTeam = replace(AwayTeam, 50 | AwayTeam == pair_fix[[i]][2], 51 | pair_fix[[i]][1])) 52 | } 53 | 54 | 55 | # a bland average dataframe 56 | ave_home <- recent.pl %>% 57 | group_by(HomeTeam) %>% 58 | summarize (ave_scored_h = mean(FTHG), ave_conceded_h = mean(FTAG)) %>% 59 | filter (HomeTeam %in% teams) %>% rename(Team = HomeTeam) 60 | 61 | ave_away <- recent.pl %>% 62 | group_by(AwayTeam) %>% 63 | summarize (ave_scored_a = mean(FTAG), ave_conceded_a = mean(FTHG)) %>% 64 | filter (AwayTeam %in% teams) %>% rename(Team = AwayTeam) 65 | 66 | ave <- merge(ave_home, ave_away, by = 'Team') 67 | 68 | 69 | # more precise result with pairwise 70 | hist_pair.pl <- recent.pl %>% 71 | group_by(HomeTeam, AwayTeam) %>% 72 | filter (HomeTeam %in% teams, AwayTeam %in% teams) %>% 73 | summarize (match = n(), ave_home_scored = mean(FTHG), ave_away_scored = mean(FTAG)) 74 | 75 | # data set for new season 76 | # just clean the data name for readability 77 | new_season <- fixtures %>% 78 | rename(HomeTeam = HOME.TEAM, 79 | AwayTeam = AWAY.TEAM) 80 | 81 | # clean data form memory 82 | rm(history, seasons, recent.pl, pair_fix, ave_home, ave_away, fixtures) 83 | -------------------------------------------------------------------------------- /R/EPL/betting/prediction.R: -------------------------------------------------------------------------------- 1 | library (dplyr) 2 | source ('clean_data.R') 3 | 4 | # function to simplify result 5 | # from scoreline to who wins the match, H (Home), A(Away) or D(Draw) 6 | result_calc <- function (h_goal, a_goal){ 7 | result = ifelse(h_goal == a_goal, 'D', ifelse(h_goal > a_goal, 'H', 'A')) 8 | return (result) 9 | } 10 | 11 | # function to calibrate results 12 | # The idea is to make sure that if Probability of wining of Home and Away is tight 13 | # e.g: 0.451(H) vs 0.447 (A) 14 | # then it should be thought as a draw 15 | result_calibrate <- function(prob_h, prob_d, prob_a){ 16 | result = ifelse(abs(prob_h - prob_a) < 0.01, "D", 17 | ifelse (prob_h == pmax(prob_d,prob_h,prob_a), "H", 18 | ifelse(prob_d == pmax(prob_h,prob_d,prob_a), "D", "A" ))) 19 | return (result) 20 | } 21 | 22 | 23 | # get most frequent score line of a match after n, sim time 24 | get_score <- function (home, away, nsim){ 25 | # try to get from history, pair 26 | subset <- hist_pair.pl[ which( hist_pair.pl$HomeTeam ==home | hist_pair.pl$AwayTeam ==away), ] 27 | # more efficient code, no need to retract back to dataframe many times 28 | ave_h_s = subset$ave_home_scored[1] 29 | ave_a_s = subset$ave_away_scored[1] 30 | 31 | t_ave_h_s = ave[ave$Team == home,]$ave_scored_h 32 | t_ave_a_c = ave[ave$Team == away,]$ave_conceded_a 33 | t_ave_h_c = ave[ave$Team == home,]$ave_conceded_h 34 | t_ave_a_s = ave[ave$Team == away,]$ave_scored_a 35 | result = character(length(nsim)) 36 | for (i in 1:nsim){ 37 | if ((dim(subset)[1] == 1) & (subset$match[1] > 3)){ 38 | h_scored = rpois(1, ave_h_s) 39 | a_scored = rpois(1, ave_a_s) 40 | } 41 | # if we have no historical result of the match 42 | else{ 43 | # take into account both attacking stat of home and defense stats of away 44 | h_scored = rpois(1, 1/2 * (t_ave_h_s + t_ave_a_c)) 45 | a_scored = rpois(1, 1/2 * (t_ave_a_s + t_ave_h_c)) 46 | } 47 | result[i] = result_calc(h_scored, a_scored) 48 | } 49 | result_tab = table(result)/nsim 50 | return (c(result_tab['H'], result_tab['D'], result_tab['A'])) 51 | } 52 | 53 | nsim = 10000 54 | matches <- mapply(get_score, new_season$HomeTeam, new_season$AwayTeam, nsim, SIMPLIFY = FALSE) 55 | new_season$H <- sapply(matches, function(x) x[1]) 56 | new_season$D <- sapply(matches, function(x) x[2]) 57 | new_season$A <- sapply(matches, function(x) x[3]) 58 | 59 | df_prediction <- new_season %>% 60 | mutate(poisson_predict = result_calibrate(H,D,A)) 61 | 62 | # The data about Paul Merson's prediction seems to get lost somehow 63 | #df_prediction <- new_season %>% 64 | # mutate(poisson_predict = result_calibrate(H,D,A), 65 | # Merson_predict = result_calc(Merson.H, Merson.A)) 66 | -------------------------------------------------------------------------------- /R/EPL/penalty/Scraping.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 34, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import requests \n", 10 | "from bs4 import BeautifulSoup\n", 11 | "import time\n", 12 | "import random\n", 13 | "import pandas as pd\n", 14 | "import numpy as np" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 14, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "root = \"https://www.statbunker.com/competitions/Penalties?comp_id=\"" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 5, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "PL = [200,243,279,323,373,415,449,481,515,556,586,614,639]\n", 33 | "year_list = [str(i)+\"/\" + str(i+1) for i in range (7,20)]\n", 34 | "year_europa = [str(i)+\"/\" + str(i+1) for i in range (9,20)]" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 2, 40 | "metadata": {}, 41 | "outputs": [], 42 | "source": [ 43 | "Laliga = [212,259,293,336,378,413,461,485,518,564,600,622,648] #start 07/08\n", 44 | "Seria = [211,258,292,337,377,414,462,486,517,562,593,623,649]\n", 45 | "bundes = [204,250,285,330,374,416,447,483,516,561,591,620,646]\n", 46 | "france = [202,251,284,331,375,412,454,484,514,563,594,621,647]\n", 47 | "championship = [207,246,280,325,370,420,451,488,524,557,587,615,640]\n", 48 | "scottish = [205,249,283,329,369,419,455,491,521,566,590,618,643]\n", 49 | "CL = [203,261,295,332,366,429,468,500,540,571,601,628,655]#07/08\n", 50 | "europa = [296,335,362,430,470,501,541,572,602,629,656] #09/10" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 28, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "cup_dict = {\"Premier League\": PL,\n", 60 | " \"La Liga\": Laliga,\n", 61 | " \"Bundesliga\": bundes,\n", 62 | " \"Ligue One\": france,\n", 63 | " \"English Championship\": championship,\n", 64 | " \"Scottish Premiership\": scottish,\n", 65 | " \"Champion League\": CL,\n", 66 | " \"Europa Cup\": europa}" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 29, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "# scrape for PL first then append the others later\n", 76 | "full_name, full_club, full_year, full_league, full_penalties, full_home, full_away, full_scored, full_missed, full_saved = ([] for i in range(10))" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 31, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "Done Premier League, season 7/8\n", 89 | "Done Premier League, season 8/9\n", 90 | "Done Premier League, season 9/10\n", 91 | "Done Premier League, season 10/11\n", 92 | "Done Premier League, season 11/12\n", 93 | "Done Premier League, season 12/13\n", 94 | "Done Premier League, season 13/14\n", 95 | "Done Premier League, season 14/15\n", 96 | "Done Premier League, season 15/16\n", 97 | "Done Premier League, season 16/17\n", 98 | "Done Premier League, season 17/18\n", 99 | "Done Premier League, season 18/19\n", 100 | "Done Premier League, season 19/20\n", 101 | "Done La Liga, season 7/8\n", 102 | "Done La Liga, season 8/9\n", 103 | "Done La Liga, season 9/10\n", 104 | "Done La Liga, season 10/11\n", 105 | "Done La Liga, season 11/12\n", 106 | "Done La Liga, season 12/13\n", 107 | "Done La Liga, season 13/14\n", 108 | "Done La Liga, season 14/15\n", 109 | "Done La Liga, season 15/16\n", 110 | "Done La Liga, season 16/17\n", 111 | "Done La Liga, season 17/18\n", 112 | "Done La Liga, season 18/19\n", 113 | "Done La Liga, season 19/20\n", 114 | "Done Bundesliga, season 7/8\n", 115 | "Done Bundesliga, season 8/9\n", 116 | "Done Bundesliga, season 9/10\n", 117 | "Done Bundesliga, season 10/11\n", 118 | "Done Bundesliga, season 11/12\n", 119 | "Done Bundesliga, season 12/13\n", 120 | "Done Bundesliga, season 13/14\n", 121 | "Done Bundesliga, season 14/15\n", 122 | "Done Bundesliga, season 15/16\n", 123 | "Done Bundesliga, season 16/17\n", 124 | "Done Bundesliga, season 17/18\n", 125 | "Done Bundesliga, season 18/19\n", 126 | "Done Bundesliga, season 19/20\n", 127 | "Done Ligue One, season 7/8\n", 128 | "Done Ligue One, season 8/9\n", 129 | "Done Ligue One, season 9/10\n", 130 | "Done Ligue One, season 10/11\n", 131 | "Done Ligue One, season 11/12\n", 132 | "Done Ligue One, season 12/13\n", 133 | "Done Ligue One, season 13/14\n", 134 | "Done Ligue One, season 14/15\n", 135 | "Done Ligue One, season 15/16\n", 136 | "Done Ligue One, season 16/17\n", 137 | "Done Ligue One, season 17/18\n", 138 | "Done Ligue One, season 18/19\n", 139 | "Done Ligue One, season 19/20\n", 140 | "Done English Championship, season 7/8\n", 141 | "Done English Championship, season 8/9\n", 142 | "Done English Championship, season 9/10\n", 143 | "Done English Championship, season 10/11\n", 144 | "Done English Championship, season 11/12\n", 145 | "Done English Championship, season 12/13\n", 146 | "Done English Championship, season 13/14\n", 147 | "Done English Championship, season 14/15\n", 148 | "Done English Championship, season 15/16\n", 149 | "Done English Championship, season 16/17\n", 150 | "Done English Championship, season 17/18\n", 151 | "Done English Championship, season 18/19\n", 152 | "Done English Championship, season 19/20\n", 153 | "Done Scottish Premiership, season 7/8\n", 154 | "Done Scottish Premiership, season 8/9\n", 155 | "Done Scottish Premiership, season 9/10\n", 156 | "Done Scottish Premiership, season 10/11\n", 157 | "Done Scottish Premiership, season 11/12\n", 158 | "Done Scottish Premiership, season 12/13\n", 159 | "Done Scottish Premiership, season 13/14\n", 160 | "Done Scottish Premiership, season 14/15\n", 161 | "Done Scottish Premiership, season 15/16\n", 162 | "Done Scottish Premiership, season 16/17\n", 163 | "Done Scottish Premiership, season 17/18\n", 164 | "Done Scottish Premiership, season 18/19\n", 165 | "Done Scottish Premiership, season 19/20\n", 166 | "Done Champion League, season 7/8\n", 167 | "Done Champion League, season 8/9\n", 168 | "Done Champion League, season 9/10\n", 169 | "Done Champion League, season 10/11\n", 170 | "Done Champion League, season 11/12\n", 171 | "Done Champion League, season 12/13\n", 172 | "Done Champion League, season 13/14\n", 173 | "Done Champion League, season 14/15\n", 174 | "Done Champion League, season 15/16\n", 175 | "Done Champion League, season 16/17\n", 176 | "Done Champion League, season 17/18\n", 177 | "Done Champion League, season 18/19\n", 178 | "Done Champion League, season 19/20\n", 179 | "Done Europa Cup, season 9/10\n", 180 | "Done Europa Cup, season 10/11\n", 181 | "Done Europa Cup, season 11/12\n", 182 | "Done Europa Cup, season 12/13\n", 183 | "Done Europa Cup, season 13/14\n", 184 | "Done Europa Cup, season 14/15\n", 185 | "Done Europa Cup, season 15/16\n", 186 | "Done Europa Cup, season 16/17\n", 187 | "Done Europa Cup, season 17/18\n", 188 | "Done Europa Cup, season 18/19\n", 189 | "Done Europa Cup, season 19/20\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "for cup in cup_dict:\n", 195 | " if cup == \"Europa Cup\":\n", 196 | " season = year_europa\n", 197 | " else:\n", 198 | " season = year_list\n", 199 | " \n", 200 | " name, club, year, league, penalties, home, away, scored, missed, saved = ([] for i in range(10))\n", 201 | " code_enum = cup_dict[cup]\n", 202 | " for count, el in enumerate(code_enum):\n", 203 | " URL = root + str(el)\n", 204 | " r = requests.get(URL)\n", 205 | " soup = BeautifulSoup(r.content, 'html5lib')\n", 206 | " details = soup.findAll(True, {'class':['odd', 'even']})\n", 207 | " for row in details:\n", 208 | " el_list = list(row.strings)\n", 209 | " name.append(el_list[0])\n", 210 | " club.append(el_list[1])\n", 211 | " year.append(season[count])\n", 212 | " league.append(cup)\n", 213 | " penalties.append(el_list[2])\n", 214 | " home.append(el_list[3])\n", 215 | " away.append(el_list[4])\n", 216 | " scored.append(el_list[5])\n", 217 | " missed.append(el_list[6])\n", 218 | " saved.append(el_list[7])\n", 219 | " print (\"Done \" + cup + \", season \" + season[count])\n", 220 | " time.sleep(random.randint(1,5))\n", 221 | " full_name += name\n", 222 | " full_club += club\n", 223 | " full_year += year\n", 224 | " full_league += league\n", 225 | " full_penalties += penalties\n", 226 | " full_home += home\n", 227 | " full_away += away\n", 228 | " full_scored += scored\n", 229 | " full_missed += missed\n", 230 | " full_saved += saved " 231 | ] 232 | }, 233 | { 234 | "cell_type": "code", 235 | "execution_count": 32, 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "table = pd.DataFrame({'name': full_name, 'club': full_club,\n", 240 | " 'year': full_year, 'league': full_league,\n", 241 | " 'penalties': full_penalties, 'home': full_home, 'away': full_away,\n", 242 | " 'scored': full_scored, 'missed': full_missed, 'saved': full_saved})" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 35, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "text/plain": [ 253 | "(3937, 10)" 254 | ] 255 | }, 256 | "execution_count": 35, 257 | "metadata": {}, 258 | "output_type": "execute_result" 259 | } 260 | ], 261 | "source": [ 262 | "np.shape(table)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 38, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "data": { 272 | "text/html": [ 273 | "
\n", 274 | "\n", 287 | "\n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | "
nameclubyearleaguepenaltieshomeawayscoredmissedsaved
3927Claudiu KeseruLudogorets Razgrad19/20Europa Cup11-1--
3928Adem LjajicBesiktas19/20Europa Cup1-11--
3929Andraz SporarSlovan Bratislava19/20Europa Cup1-1--1
3930James TavernierRangers19/20Europa Cup11--1-
3931Ryan ChristieCeltic19/20Europa Cup1-11--
3932Tomas de VincentiAPOEL Nicosia19/20Europa Cup11-1--
3933Bruno FernandesSporting Lisbon19/20Europa Cup1-11--
3934Ciprian DeacCFR Cluj19/20Europa Cup11-1--
3935M'Baye NiangStade Rennes19/20Europa Cup11-1--
3936Bibras NatchoPartizan Belgrade19/20Europa Cup11-1--
\n", 436 | "
" 437 | ], 438 | "text/plain": [ 439 | " name club year league penalties \\\n", 440 | "3927 Claudiu Keseru Ludogorets Razgrad 19/20 Europa Cup 1 \n", 441 | "3928 Adem Ljajic Besiktas 19/20 Europa Cup 1 \n", 442 | "3929 Andraz Sporar Slovan Bratislava 19/20 Europa Cup 1 \n", 443 | "3930 James Tavernier Rangers 19/20 Europa Cup 1 \n", 444 | "3931 Ryan Christie Celtic 19/20 Europa Cup 1 \n", 445 | "3932 Tomas de Vincenti APOEL Nicosia 19/20 Europa Cup 1 \n", 446 | "3933 Bruno Fernandes Sporting Lisbon 19/20 Europa Cup 1 \n", 447 | "3934 Ciprian Deac CFR Cluj 19/20 Europa Cup 1 \n", 448 | "3935 M'Baye Niang Stade Rennes 19/20 Europa Cup 1 \n", 449 | "3936 Bibras Natcho Partizan Belgrade 19/20 Europa Cup 1 \n", 450 | "\n", 451 | " home away scored missed saved \n", 452 | "3927 1 - 1 - - \n", 453 | "3928 - 1 1 - - \n", 454 | "3929 - 1 - - 1 \n", 455 | "3930 1 - - 1 - \n", 456 | "3931 - 1 1 - - \n", 457 | "3932 1 - 1 - - \n", 458 | "3933 - 1 1 - - \n", 459 | "3934 1 - 1 - - \n", 460 | "3935 1 - 1 - - \n", 461 | "3936 1 - 1 - - " 462 | ] 463 | }, 464 | "execution_count": 38, 465 | "metadata": {}, 466 | "output_type": "execute_result" 467 | } 468 | ], 469 | "source": [ 470 | "table.tail(10)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 37, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "table.to_csv (r'./all_penalties.csv', index = None, header=True)" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": null, 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [] 488 | } 489 | ], 490 | "metadata": { 491 | "kernelspec": { 492 | "display_name": "Python 3", 493 | "language": "python", 494 | "name": "python3" 495 | }, 496 | "language_info": { 497 | "codemirror_mode": { 498 | "name": "ipython", 499 | "version": 3 500 | }, 501 | "file_extension": ".py", 502 | "mimetype": "text/x-python", 503 | "name": "python", 504 | "nbconvert_exporter": "python", 505 | "pygments_lexer": "ipython3", 506 | "version": "3.7.3" 507 | } 508 | }, 509 | "nbformat": 4, 510 | "nbformat_minor": 2 511 | } 512 | -------------------------------------------------------------------------------- /R/EPL/penalty/penalty.R: -------------------------------------------------------------------------------- 1 | ################################ 2 | #### Data Science Project ##### 3 | # Article: # 4 | # https://tinyurl.com/y2ynruqo # 5 | ################################ 6 | 7 | library(MASS) 8 | library(tidyverse) 9 | library(betareg) 10 | library(xkcd) 11 | 12 | # read raw_data 13 | raw_data <- read.csv("./all_penalties.csv", 14 | stringsAsFactors = FALSE) 15 | 16 | # basic cleaning and group by player 17 | player_data <- raw_data %>% 18 | mutate(name = str_squish(name), 19 | penalties = ifelse(penalties == '-',0,penalties), 20 | scored = as.numeric(ifelse(scored =='-', 0, scored))) %>% 21 | group_by(name) %>% 22 | summarise(total = sum(penalties), 23 | total_score = sum(scored))%>% 24 | mutate(ratio = total_score/(total)) %>% 25 | filter(total >= 4 & ratio > 0 & ratio < 1) %>% 26 | na.omit() 27 | 28 | #### draw xkcd with dataman 29 | xrange = c(0.2,1.0) 30 | yrange = c(0,4) 31 | ratioxy <- diff(xrange) / diff(yrange) 32 | mapping <- aes(x=x, 33 | y=y, 34 | scale=scale, 35 | ratioxy=ratioxy, 36 | angleofspine = angleofspine, 37 | anglerighthumerus = anglerighthumerus, 38 | anglelefthumerus = anglelefthumerus, 39 | anglerightradius = anglerightradius, 40 | angleleftradius = angleleftradius, 41 | anglerightleg = anglerightleg, 42 | angleleftleg = angleleftleg, 43 | angleofneck = angleofneck) 44 | 45 | dataman <- data.frame( x= 0.3, y=3, 46 | scale = 0.5, 47 | ratioxy = ratioxy, 48 | angleofspine = -pi/2, 49 | anglerighthumerus = -pi/6, 50 | anglelefthumerus = -pi/2 -pi/6, 51 | anglerightradius = pi/5, 52 | angleleftradius = pi/5, 53 | angleleftleg = 3*pi/2 + pi / 12 , 54 | anglerightleg = 3*pi/2 - pi / 12, 55 | angleofneck = runif(1, min = 3 * pi / 2 - pi/10 , max = 3 * pi / 2 + pi/10)) 56 | 57 | # draw histogram of conversion rates 58 | player_data %>% 59 | ggplot(aes(ratio)) + 60 | geom_histogram(breaks = 5:25/25, 61 | fill = hcl(0, 50, 80)) + 62 | xkcdaxis(c(0.1,1), c(0,80)) + 63 | labs (x = "\nHistogram of penalties conversion rate", y = "Count") + 64 | theme_xkcd() 65 | 66 | # fit a beta distribution on the histogram 67 | m <- MASS::fitdistr(player_data$ratio, dbeta, 68 | start = list(shape1 = 10, shape2 = 1), 69 | lower=c(0.1,0.1)) 70 | alpha0 <- m$estimate[1] 71 | beta0 <- m$estimate[2] 72 | 73 | # plot the fit with some fun xkcd 74 | ggplot(player_data) + 75 | geom_histogram(aes(ratio, y = ..density..), 76 | breaks = 5:25/25, 77 | fill = hcl(0, 50, 80)) + 78 | stat_function(fun = function(x) dbeta(x, alpha0, beta0), color = "red", 79 | size = 1) + 80 | xlab("\n Penalty Coversion Rate") + 81 | theme_xkcd() + 82 | xkcdaxis(xrange, yrange) + 83 | xkcdman(mapping, dataman) + 84 | annotate("text", x=0.4, y = 4, 85 | label = "Does not look an amazing good fit\nBut it's okay", 86 | family="xkcd") + 87 | xkcdline(aes(x=xbegin,y=ybegin,xend=xend,yend=yend), 88 | data.frame(xbegin=0.36,ybegin=3,xend=0.42,yend=3.5), 89 | xjitteramount = 0.01) 90 | 91 | # adjusted ratio: 92 | adjusted_ratio <- player_data %>% 93 | mutate(eb_estimate = (total_score + alpha0) / (total + alpha0 + beta0)) %>% 94 | arrange(desc(eb_estimate)) 95 | 96 | # posterior plots for specific players: 97 | specific_players <- adjusted_ratio %>% 98 | filter(name %in% c("Cristiano Ronaldo", 99 | "Nicolas Pepe", 100 | "Alexis Sanchez", 101 | "Antoine Griezmann")) %>% 102 | mutate(alpha = total_score + alpha0, 103 | beta = total - total_score + beta0) 104 | 105 | # draw posterior beta distribution for these players 106 | specific_players %>% 107 | crossing(x=seq(0.4,0.99,.002)) %>% 108 | ungroup() %>% 109 | mutate(density=dbeta(x,alpha,beta)) %>% 110 | ggplot(aes(x, density, color = name)) + 111 | geom_line() + 112 | stat_function(fun=function(x) dbeta(x, alpha0, beta0), lty = 2, color = 'black') + 113 | xlab("Conversion rate") + 114 | theme_xkcd() 115 | 116 | # draw actual vs adjusted ratio plot 117 | ggplot(adjusted_ratio, aes(ratio, eb_estimate, color = total)) + 118 | geom_hline(yintercept = alpha0 / (alpha0 + beta0), color = "red", lty = 2) + 119 | geom_point() + 120 | geom_abline(color = "red") + 121 | scale_colour_gradient(breaks = c(0,20,30,50,70)) + 122 | xlim(0.5,1) + 123 | ylim(0.5,1) + 124 | xlab("Actual goal scoring average") + 125 | ylab("Posterior goal scoring average") 126 | 127 | 128 | #### When it seems that a unimodal beta distribution is not a good fit 129 | #### we can use E-M algorithm (implemented in the betareg package 130 | #### to fit 2 beta distributions 131 | m<- betamix(ratio ~ 1| 1, data = player_data, k = 1:3) 132 | 133 | mu <- plogis(coef(m)[,1]) 134 | phi <- exp(coef(m)[,2]) 135 | a <- mu*phi 136 | b <- (1-mu)*phi 137 | # get the cluser 138 | cl <- clusters(m) 139 | 140 | # plotting 141 | ## separate histograms for both clusters 142 | ## TODO: convert back to ggplot code 143 | hist(subset(player_data, cl == 1)$ratio, breaks = 5:25/25, freq = FALSE, 144 | col = hcl(0, 50, 80), main = "", xlab = "Penalty Conversion Rate", ylim = c(0, 9)) 145 | 146 | hist(subset(player_data, cl == 2)$ratio, breaks = 5:25/25, freq = FALSE, 147 | col = hcl(240, 50, 80), main = "", xlab = "Penalty Conversion Rate", ylim = c(0, 9), add = TRUE) 148 | 149 | ## lines for fitted densities 150 | ys <- seq(0, 1, by = 0.01) 151 | lines(ys, dbeta(ys, shape1 = a[1], shape2 = b[1]), 152 | col = hcl(0, 80, 50), lwd = 2) 153 | lines(ys, dbeta(ys, shape1 = a[2], shape2 = b[2]), 154 | col = hcl(240, 80, 50), lwd = 2) 155 | 156 | ## lines for corresponding means 157 | abline(v = mu[1], col = hcl(0, 80, 50), lty = 2, lwd = 2) 158 | abline(v = mu[2], col = hcl(240, 80, 50), lty = 2, lwd = 2) 159 | 160 | ## repeat Bayesian updating 161 | ## only group specific this time 162 | post <- posterior(m) 163 | post[,1] 164 | # posterior probabilies of being assigned to each group 165 | player_data$post_1 <- post[,1] 166 | player_data$post_2 <- post[,2] 167 | 168 | player_data <- player_data %>% 169 | mutate(shrunkage_1 = (total_score + a[1])/(total + a[1] + b[1]), 170 | shrunkage_2 = (total_score + a[2])/(total + a[2] + b[2]), 171 | shrunkage_ave = (post_1*shrunkage_1 + post_2*shrunkage_2)) %>% 172 | arrange(desc(shrunkage_ave)) 173 | 174 | # plot 175 | player_data %>% 176 | gather(type, value, ratio, shrunkage_ave)%>% 177 | mutate(type = ifelse(type == 'ratio', 178 | 'Raw scoring ratio', 179 | 'Average posterior'), 180 | type = relevel(factor(type), 'Raw scoring ratio')) %>% 181 | ggplot(aes(total_score, value)) + 182 | geom_point() + 183 | facet_wrap(~ type) + 184 | ylab("Estimate") + 185 | theme_bw() 186 | -------------------------------------------------------------------------------- /R/EPL/prediction/clean_data.R: -------------------------------------------------------------------------------- 1 | ######################################## 2 | # scripts to clean data to usable format 3 | # pipe directly to sim.R 4 | # source: 5 | # - fixtures.csv: dedicatedexcel.com 6 | # - Historical results: https://www.kaggle.com/thefc17/epl-results-19932018 7 | ######################################### 8 | library (dplyr) 9 | 10 | fixtures <- read.csv("fixtures.csv", stringsAsFactors = FALSE) 11 | 12 | # get the team 13 | teams <- unique(fixtures$HOME.TEAM) 14 | 15 | # extract historic results 16 | history <- read.csv("history.csv", stringsAsFactors = FALSE) 17 | 18 | # get info from the 2010 up to 2018 19 | seasons <- sapply(10:17, function(x) paste0(2000+x,'-',x+1)) 20 | 21 | recent.pl <- history %>% 22 | filter(Season %in% seasons, div == 'E0') 23 | 24 | # because the two data comes from different source, so the teams name don't match 25 | teams[!teams %in% recent.pl$HomeTeam] 26 | unique(recent.pl$HomeTeam) 27 | 28 | # now we need to fix it 29 | pair_fix <- list(c('Manchester United', 'Man United'), c('Newcastle United', 'Newcastle'), 30 | c('Huddersfield Town', 'Huddersfield'), c('Wolverhampton Wanderers', 'Wolves'), 31 | c('Cardiff City', 'Cardiff'), c('Leicester City', 'Leicester'), 32 | c('Tottenham Hotspur', 'Tottenham'), c('West Ham United', 'West Ham'), 33 | c('Manchester City', "Man City"), c('Brighton and Hove Albion', 'Brighton')) 34 | 35 | # fix the recent.pl dataset 36 | for (i in 1:length(pair_fix)){ 37 | recent.pl <- recent.pl %>% 38 | mutate(HomeTeam = replace(HomeTeam, HomeTeam == pair_fix[[i]][2], pair_fix[[i]][1]), 39 | AwayTeam = replace(AwayTeam, AwayTeam == pair_fix[[i]][2], pair_fix[[i]][1])) 40 | } 41 | 42 | 43 | # a bland average dataframe 44 | ave_home <- recent.pl %>% 45 | group_by(HomeTeam) %>% 46 | summarize (ave_scored_h = mean(FTHG), ave_conceded_h = mean(FTAG)) %>% 47 | filter (HomeTeam %in% teams) %>% rename(Team = HomeTeam) 48 | 49 | ave_away <- recent.pl %>% 50 | group_by(AwayTeam) %>% 51 | summarize (ave_scored_a = mean(FTAG), ave_conceded_a = mean(FTHG)) %>% 52 | filter (AwayTeam %in% teams) %>% rename(Team = AwayTeam) 53 | 54 | ave <- merge(ave_home, ave_away, by = 'Team') 55 | 56 | 57 | # more precise result with pairwise 58 | hist_pair.pl <- recent.pl %>% 59 | group_by(HomeTeam, AwayTeam) %>% 60 | filter (HomeTeam %in% teams, AwayTeam %in% teams) %>% 61 | summarize (match = n(), ave_home_scored = mean(FTHG), ave_away_scored = mean(FTAG)) 62 | 63 | rm(history, seasons, pair_fix, ave_home, ave_away) 64 | -------------------------------------------------------------------------------- /R/EPL/prediction/match_simulate.R: -------------------------------------------------------------------------------- 1 | library (dplyr) 2 | source ('clean_data.R') 3 | 4 | # get most frequent score line of a match after n, sim time 5 | nsim = 100 6 | get_score <- function (home, away, nsim){ 7 | # try to get from history, pair 8 | subset <- hist_pair.pl[ which( hist_pair.pl$HomeTeam ==home | hist_pair.pl$AwayTeam ==away), ] 9 | # more efficient code, no need to retract back to dataframe many times 10 | ave_h_s = subset$ave_home_scored[1] 11 | ave_a_s = subset$ave_away_scored[1] 12 | 13 | t_ave_h_s = ave[ave$Team == home,]$ave_scored_h 14 | t_ave_a_c = ave[ave$Team == away,]$ave_conceded_a 15 | t_ave_h_c = ave[ave$Team == home,]$ave_conceded_h 16 | t_ave_a_s = ave[ave$Team == away,]$ave_scored_a 17 | score_line = character(length(nsim)) 18 | # simulation idea similar to that of sim.R 19 | for (i in 1:nsim){ 20 | if ((dim(subset)[1] == 1) & (subset$match[1] > 3)){ 21 | h_scored = rpois(1, ave_h_s) 22 | a_scored = rpois(1, ave_a_s) 23 | } 24 | # if we have no historical result of the match 25 | else{ 26 | # take into account both attacking stat of home and defense stats of away 27 | h_scored = rpois(1, 1/2 * (t_ave_h_s + t_ave_a_c)) 28 | a_scored = rpois(1, 1/2 * (t_ave_a_s + t_ave_h_c)) 29 | } 30 | score_line[i] = paste0(h_scored, '-', a_scored) 31 | } 32 | return (list(names(which.max(table(score_line))), max(table(score_line)))) 33 | } 34 | 35 | round_1 <- head(fixtures,10) 36 | matches <- mapply(get_score, round_1$HOME.TEAM, round_1$AWAY.TEAM, nsim, SIMPLIFY = FALSE) 37 | round_1$score_line <- sapply(matches, function(x) x[1]) 38 | round_1$prob <- sapply(matches, function(x) x[2]) 39 | -------------------------------------------------------------------------------- /R/EPL/prediction/sim.R: -------------------------------------------------------------------------------- 1 | library (dplyr) 2 | source ('clean_data.R') 3 | 4 | # get score of a match 5 | get_score <- function (home, away){ 6 | # try to get from history, pair 7 | subset <- hist_pair.pl[ which( hist_pair.pl$HomeTeam ==home & hist_pair.pl$AwayTeam ==away), ] 8 | # only use this method if we have at least 4 matches 9 | if ((dim(subset)[1] == 1) & (subset$match[1] > 3)){ 10 | h_scored = rpois(1, subset$ave_home_scored[1]) 11 | a_scored = rpois(1, subset$ave_away_scored[1]) 12 | } 13 | # if we have no historical result of the match 14 | else{ 15 | # take into account both attacking stat of home and defense stats of away 16 | h_scored = rpois(1, 1/2 * (ave[ave$Team == home,]$ave_scored_h + 17 | ave[ave$Team == away,]$ave_conceded_a)) 18 | a_scored = rpois(1, 1/2 * (ave[ave$Team == away,]$ave_scored_a + 19 | ave[ave$Team == home,]$ave_conceded_h)) 20 | } 21 | return (list(h_scored, a_scored)) 22 | } 23 | 24 | rank <- function (m_result){ 25 | table <- data.frame(name = teams, 26 | goal_score = rep(0,20), 27 | goal_conceded = rep(0,20), 28 | point = rep(0,20)) 29 | # loop through all the results and then update 30 | for (i in 1:nrow(m_result)){ 31 | home = m_result$HOME.TEAM[i] 32 | away = m_result$AWAY.TEAM[i] 33 | h_goal = m_result$h_scored[i] 34 | a_goal = m_result$a_scored[i] 35 | 36 | # add goal 37 | table[table$name == home,]$goal_score = table[table$name == home,]$goal_score + h_goal 38 | table[table$name == home,]$goal_conceded = table[table$name == home,]$goal_conceded + a_goal 39 | table[table$name == away,]$goal_score = table[table$name == away,]$goal_score + a_goal 40 | table[table$name == away,]$goal_conceded = table[table$name == away,]$goal_conceded + h_goal 41 | 42 | 43 | # calculate point 44 | if (h_goal > a_goal){ 45 | table[table$name == home,]$point = table[table$name == home,]$point + 3 46 | } 47 | else if (h_goal < a_goal){ 48 | table[table$name == away,]$point = table[table$name == away,]$point + 3 49 | } 50 | else{ 51 | table[table$name == home,]$point = table[table$name == home,]$point + 1 52 | table[table$name == away,]$point = table[table$name == away,]$point + 1 53 | } 54 | } 55 | 56 | table$goal_dif <- table$goal_score - table$goal_conceded 57 | table <- table[order(-table$point, -table$goal_dif, -table$goal_score), ] 58 | 59 | return (table) 60 | } 61 | 62 | simulate <- function(fixtures){ 63 | matches <- mapply(get_score, fixtures$HOME.TEAM, fixtures$AWAY.TEAM, SIMPLIFY = FALSE) 64 | fixtures$h_scored <- unlist(sapply(matches, function(x) x[1])) 65 | fixtures$a_scored <- unlist(sapply(matches, function(x) x[2])) 66 | table <- rank(fixtures) 67 | return (table) 68 | } 69 | 70 | 71 | nsim = 10000 72 | tabulate_data <- data.frame(name = teams, 73 | champion = rep(0,20), 74 | runner_up = rep(0,20), 75 | top_4 = rep(0,20), 76 | top_6 = rep(0,20), 77 | relegate = rep(0,20)) 78 | pb <- txtProgressBar(min = 0, max = nsim, style = 3) 79 | 80 | for (sim in 1:nsim){ 81 | table = simulate(fixtures) 82 | 83 | first = table$name[1] 84 | second = table$name[2] 85 | first_4 = table$name[1:4] 86 | first_6 = table$name[1:6] 87 | last_3 = table$name[18:20] 88 | 89 | tabulate_data <- tabulate_data %>% 90 | mutate(champion = ifelse(name == first, champion+1, champion), 91 | runner_up = ifelse(name == second, runner_up+1, runner_up), 92 | top_4 = ifelse(name %in% first_4, top_4+1, top_4), 93 | top_6 = ifelse(name %in% first_6, top_6+1, top_6), 94 | relegate = ifelse(name %in% last_3, relegate+1, relegate)) 95 | setTxtProgressBar(pb, sim) 96 | } 97 | 98 | # convert to percentage 99 | tabulate_data <- tabulate_data %>% 100 | mutate (champion = champion/nsim, 101 | runner_up = runner_up/nsim, 102 | top_4 = top_4/nsim, 103 | top_6 = top_6/nsim, 104 | relegate = relegate/nsim) 105 | 106 | # write result into csv 107 | write.csv(tabulate_data, "tabulate_data.csv", row.names = FALSE) 108 | 109 | 110 | -------------------------------------------------------------------------------- /R/EPL/prediction/visualize.R: -------------------------------------------------------------------------------- 1 | library (dplyr) 2 | library (ggplot2) 3 | library (xkcd) 4 | library (extrafont) 5 | 6 | download.file("http://simonsoftware.se/other/xkcd.ttf", 7 | dest="xkcd.ttf", mode="wb") 8 | system("cp xkcd.ttf ~/Library/Fonts") 9 | font_import(path="~/Library/Fonts", pattern = "xkcd", prompt=FALSE) 10 | fonts() 11 | fonttable() 12 | if(.Platform$OS.type != "unix") { 13 | ## Register fonts for Windows bitmap output 14 | loadfonts(device="win") 15 | } else { 16 | loadfonts() 17 | } 18 | 19 | # extract historic results 20 | history <- read.csv("https://raw.githubusercontent.com/tuangauss/Various-projects/master/data/history.csv", stringsAsFactors = FALSE) 21 | 22 | # get info from the 2010 up to 2018 23 | seasons <- sapply(10:17, function(x) paste0(2000+x,'-',x+1)) 24 | 25 | 26 | graph_func <- function(season){ 27 | if (season[1] == "2017-18"){ 28 | title = "Last season: 2017-2018" 29 | } 30 | else{ 31 | title = "From 2010-11 to 2017-18" 32 | } 33 | data <- history %>% 34 | filter (Season %in% season, div == 'E0') %>% 35 | mutate (total = FTAG + FTHG) 36 | 37 | ave_score <- mean(data$total) 38 | 39 | prob_data <- data %>% 40 | group_by(total) %>% 41 | summarize (prob = n()/nrow(data)) 42 | 43 | ggplot(data=prob_data, aes(x=total, y=prob)) + 44 | geom_bar(stat="identity", color="blue", fill="grey") + 45 | scale_x_continuous(breaks=seq(0,10,1)) + 46 | geom_line(aes(x = total, y = dpois(x=total, lambda = ave_score)), 47 | col = "red", size = 0.5) + 48 | geom_point(aes(x = total, y = dpois(x=total, lambda = ave_score)), 49 | col = "black", size = 3) + 50 | ggtitle(title) + labs (x = "Total Goal", y = "Probability") + 51 | theme_xkcd() 52 | } 53 | 54 | graph_func(seasons) 55 | graph_func(c('2017-18')) 56 | -------------------------------------------------------------------------------- /R/EPL/xkcd.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/R/EPL/xkcd.ttf -------------------------------------------------------------------------------- /R/Paul_hypothesis_test.R: -------------------------------------------------------------------------------- 1 | ####################################### 2 | ## Hypothesis testing procedure 3 | ## The curious case of Paul the Octopus 4 | ## Fisher vs N-p approach 5 | ####################################### 6 | 7 | # The script is self-containing, no extra module or libary needed 8 | 9 | 10 | # graph binomial distribution, color extreme value and beyond 11 | graph <- function(n,p, value){ 12 | x <- seq(0,n) 13 | prob <- dbinom(x,size=n,prob=p) 14 | cols <- c("grey","red")[(h >= value) + 1] 15 | barplot(prob,names.arg=0:n, col = cols, 16 | main=sprintf(paste('binomial distribution, size:',n, "prob:",p))) 17 | } 18 | 19 | graph(14,0.5,12) 20 | 21 | 22 | # calculate p value for binomial distribtion, at x = 12 23 | p_value = 1-pbinom(11,14,0.5) 24 | #p_value = dbinom(12,14,0.5) + dbinom(13,14,0.5) + dbinom(14,14,0.5) 25 | 26 | # Neyman- Pearson approach 27 | # calculate current power 28 | # a. Assuming type 1 error = 0.01 29 | p_value = 1-pbinom(0:14, 14,0.5) 30 | critical_value = which(p_value == p_value[p_value < 0.01][1])-1 31 | type2 = pbinom(critical_value-1,14,0.75) 32 | 33 | # b. More interesting problem 34 | # You should try first before looking up the code 35 | # if we want to achieve type 1 error < 1% and power > 90%, how many observation do we need to make? 36 | 37 | stop = FALSE 38 | for (n in 1:50){ 39 | for (k in 0:n){ 40 | type1 <- 1- pbinom(k,n,0.5) 41 | type2 <- pbinom(k-1,n,0.75) 42 | if (type1 < 0.01 & type2 <0.1){ 43 | print (paste("n is ",toString(n),", k is", toString(k))) 44 | stop = TRUE 45 | break 46 | } 47 | } 48 | if (stop) break 49 | } 50 | # need 42 observations 51 | 52 | 53 | # if we cut it some slack 54 | # type 1 of 5% and type 2 of 20% 55 | stop = FALSE 56 | for (n in 1:50){ 57 | for (k in 0:n){ 58 | type1 <- 1- pbinom(k,n,0.5) 59 | type2 <- pbinom(k-1,n,0.75) 60 | if (type1 < 0.05 & type2 <0.2){ 61 | print (paste("n is ",toString(n),", k is", toString(k))) 62 | stop = TRUE 63 | break 64 | } 65 | } 66 | if (stop) break 67 | } 68 | # still need 16 observations 69 | -------------------------------------------------------------------------------- /R/RuleOfThree.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | true_p <- 0.001 4 | iter <- function(size){ 5 | samp <- sample(x= c(1,0), 6 | size = size, 7 | prob = c(true_p, 1 - true_p), 8 | replace = TRUE) 9 | cut <- which.max(samp) - 1 10 | upper_bound <- min(3/cut, 1) 11 | return(upper_bound) 12 | } 13 | 14 | res <- replicate(n = 100000, iter(size = 10000)) 15 | sum(res > true_p) 16 | 17 | ggplot() + 18 | aes(res) + 19 | geom_histogram(colour="black", fill="grey", bins = 100) + 20 | geom_vline(aes(xintercept = true_p), color = "red") + 21 | xlim(c(0, .05)) + 22 | theme_bw() + 23 | theme(legend.position = "none") + 24 | labs(x = "Upper Bound") 25 | -------------------------------------------------------------------------------- /R/bayes_god.R: -------------------------------------------------------------------------------- 1 | ######################## 2 | # Illustrative calculation 3 | # Article: A Bayesian quest to find God 4 | # Published: July 19, 2019 5 | ######################## 6 | 7 | library (tidyverse) 8 | 9 | bayes <- function(x, y_x, y_nx){ 10 | num <- y_x * x 11 | denom <- y_x * x + y_nx * (1-x) 12 | return (num/denom) 13 | } 14 | 15 | bayes(0.01, 1, 1/7) 16 | 17 | days <- seq(0,10,1) 18 | posterior <- rep(0.01,11) 19 | for (i in 2:11){ 20 | post <- bayes(posterior[i-1], 1, 1/7) 21 | posterior[i] <- post 22 | } 23 | 24 | 25 | posterior_2<- rep(0.0001,11) 26 | for (i in 2:11){ 27 | post <- bayes(posterior_2[i-1], 1, 1/7) 28 | posterior_2[i] <- post 29 | } 30 | 31 | #https://www.datanovia.com/en/blog/ggplot-legend-title-position-and-labels/ 32 | df <- data.frame(days, posterior, posterior_2) 33 | 34 | vis1 <- df %>% 35 | ggplot(aes(x=days, y = posterior)) + 36 | geom_point() + 37 | scale_x_continuous(breaks = days) + 38 | labs (title = " Posterior estimate") + 39 | theme_classic() 40 | 41 | vis2 <- df %>% 42 | gather(prior, value, -days) %>% 43 | ggplot(aes(x=days, y = value, color = prior)) + 44 | geom_point() + 45 | scale_color_discrete(name = "Value of prior \n on Day-1", 46 | labels = c(0.01, 0.0001)) + 47 | scale_x_continuous(breaks = days) + 48 | labs (title = " Posterior estimate") + 49 | theme_classic() 50 | -------------------------------------------------------------------------------- /R/bayesian_gym.R: -------------------------------------------------------------------------------- 1 | # load libraries 2 | library(rjags) 3 | library(dplyr) 4 | library(MASS) 5 | library(ggplot2) 6 | 7 | # load data 8 | raw_data <- read.csv("~/data/Vietnamese_2016.csv", 9 | head = TRUE, sep = ";") 10 | head(raw_data) 11 | summary(raw_data$Age_gr) 12 | 13 | # clean data 14 | data <- raw_data %>% 15 | filter(Age_gr == "18-29") %>% 16 | filter(Sex == "male") %>% 17 | dplyr::select(height, weight, BMI) %>% 18 | mutate(height = as.numeric(gsub(",", ".", height))) %>% 19 | mutate(weight = as.numeric(gsub(",", ".", weight))) 20 | 21 | # my info 22 | m_height = 168 23 | m_weight = 58 24 | m_BMI = m_weight / (m_height/100)^2 25 | 26 | # visualization: 27 | truehist(data$weight,nbins = 50, 28 | main = paste("Histogram of Vietnamese male weight"), xlab = "Weight in kg") 29 | abline(v=m_weight,col="black", lwd = 4) 30 | abline(v=median(data$weight), col = "red", lty = 4, lwd = 4) 31 | abline(v=mean(data$weight), col ="orange", lty = 4, lwd = 4) 32 | text(m_weight-2, 0.12, "Me!!!") 33 | 34 | my_data <- data.frame(height = m_height, weight = m_weight) 35 | ggplot(data, aes(height, weight)) + 36 | geom_point(shape = 16, size = 5, show.legend = FALSE, colour = "blue", alpha = 0.4) + theme_minimal() + 37 | geom_point(data = my_data, color ="red", size = 5) + 38 | labs (title = "Weight versus Height plot of 383 Vietnamese male and Tuan", subtitle = "***Red point is author's own measurement") + 39 | theme(plot.title = element_text(color="#666666", face="bold", size=20, hjust=0)) 40 | 41 | 42 | # add standard least square line 43 | model <- lm(data$weight ~ data$height) #fit linear model 44 | label_text <- paste('Fitted model: ', round(coef(model)[1], 3), ' + ', round(coef(model)[2], 3), ' x', sep = '') 45 | ggplot(data, aes(height, weight)) + 46 | geom_point(shape = 16, size = 5, show.legend = FALSE, colour = "blue", alpha = 0.4) + theme_minimal() + 47 | geom_smooth(method = "lm", fullrange=TRUE, color = "red") + 48 | geom_text(aes(x = 143, y = 55, label = label_text),hjust = 0, size = 6) + 49 | geom_point(data = my_data, color ="red", size = 5) + 50 | labs (title = "Weight versus Height plot of 383 Vietnamese male and Tuan") + 51 | theme(plot.title = element_text(color="#666666", face="bold", size=20, hjust=0)) 52 | 53 | 54 | 55 | #### Running JAGS model #### 56 | ############################ 57 | 58 | n <- nrow(data) #383 data points 59 | 60 | mymodel <- " 61 | model{ 62 | for(i in 1:n){ 63 | y[i] ~ dnorm(a + b*x[i], tau) 64 | } 65 | a ~ dnorm(0, 1e-6) 66 | b ~ dnorm(0, 1e-6) 67 | tau ~ dgamma(.01,.01) 68 | sig <- 1/sqrt(tau) 69 | } 70 | " 71 | 72 | jm <- jags.model(file = textConnection(mymodel), data=list(n=n, x=data$height, y=data$weight)) 73 | cs <- coda.samples(jm, c("a","b","sig"), 11000) 74 | sample_data <- as.data.frame(cs[[1]][-(1:1000),]) 75 | 76 | cmean <- sample_data$a + sample_data$b*m_height # "conditional mean" 77 | 78 | m_perc <- pnorm(q = m_weight, mean = cmean, sd = sample_data$sig) 79 | truehist(m_perc, main = "Posterior distribution for my weight percentile", 80 | xlab = "percentile", ylab = "Frequency") 81 | mean(m_perc<=0.4) 82 | mean(m_perc) 83 | 84 | 85 | ### What happen if I compare myself to American men 86 | nls_data <-read.csv("~/data/national_longitudinal_survey.csv", head = TRUE) 87 | nls_data <- nls_data %>% 88 | filter(Gender == "Male") %>% 89 | mutate (height = Height..inches.*2.54) %>% 90 | mutate (weight = Weight..lbs./2.2046) %>% 91 | dplyr::select(height,weight) 92 | 93 | #4150 data points 94 | n <- nrow(nls_data) 95 | 96 | mymodel <- " 97 | model{ 98 | for(i in 1:n){ 99 | y[i] ~ dnorm(a + b*x[i], tau) 100 | } 101 | a ~ dnorm(0, 1e-6) 102 | b ~ dnorm(0, 1e-6) 103 | tau ~ dgamma(.01,.01) 104 | sig <- 1/sqrt(tau) 105 | } 106 | " 107 | 108 | jm <- jags.model(file = textConnection(mymodel), data=list(n=n, x=nls_data$height, y=nls_data$weight)) 109 | cs <- coda.samples(jm, c("a","b","sig"), 11000) 110 | sample_data <- as.data.frame(cs[[1]][-(1:1000),]) 111 | 112 | cmean <- sample_data$a + sample_data$b*m_height 113 | m_perc <- pnorm(q = m_weight, mean = cmean, sd = sample_data$sig) 114 | truehist(m_perc) 115 | -------------------------------------------------------------------------------- /R/dating_sim.R: -------------------------------------------------------------------------------- 1 | ############################################ 2 | ## The Optimal dating strategy 3 | ## Why we should always reject the first 37% 4 | ## An MC simulation 5 | ############################################ 6 | 7 | # calculate the theoretical probability of P(S_n,k) 8 | theo_prob <- function(x){ 9 | if (x == 1) return (1/100) 10 | else return ((x-1)/100 * (sum(1/((x:100)-1)))) 11 | } 12 | 13 | # a util function to simulate the 'best-partner rank' 14 | perm_rank <- function(n){ 15 | return (sample(1:n, n)) 16 | } 17 | 18 | # simulation(n) will run a MC simulation for the case of N=n 19 | # returning the optimal M and the corresponding optimal probability 20 | simulation <- function(n){ 21 | M_range <- 2:n 22 | niter <- 1000 #for each value of M, we simulate 1000 times 23 | 24 | # declare a vector to store results, 25 | # notice that if M = 1, the probability is 1/100 26 | prob_result <- rep(1/100, 100) 27 | 28 | # do a simulation for each value of M 29 | for (M in M_range){ 30 | result <- rep(0, niter) 31 | for (i in 1:niter){ 32 | order <- perm_rank(n) #simulate the order 33 | # find the best among the first M-1 that gets rejected 34 | highest_reject <- min(head(order, M-1)) 35 | if (highest_reject != 1){ 36 | accept <- order[order < highest_reject][1] 37 | # we consider ourselves successful if: 38 | # - rank 1 is not included in the first M-1 candidates 39 | # - rank 1 is the first person who is better than all we have seen 40 | if (accept == 1){ 41 | result[i] <- 1 42 | } 43 | } 44 | } 45 | prob_result[M] <- mean(result) 46 | } 47 | return (c(max(prob_result), which.max(prob_result)/n)) 48 | } 49 | 50 | # applying simulation(n) to different values of n 51 | opt_p <- sapply(2:30, function(x) simulation(x)[1]) 52 | plot(2:30, opt_p, ylim = c(0.2,1), main = 'Optimal probability \n P(S_n,k)', 53 | xlab = 'N', ylab = 'Probability') 54 | 55 | opt_ratio <- sapply(2:30, function(x) simulation(x)[2]) 56 | plot(2:30, opt_ratio, ylim = c(0.2,1.1), main = 'Optimal ratio \n M/N', 57 | xlab = 'N', ylab = 'Ratio') 58 | -------------------------------------------------------------------------------- /R/end_to_end_projects.R: -------------------------------------------------------------------------------- 1 | # import necessary libary 2 | library(MASS) 3 | library(dplyr) 4 | library(caret) 5 | library (ggplot2) 6 | library(rpart) 7 | library(e1071) 8 | library (leaps) 9 | 10 | # download and extract dataset from source 11 | link <- "http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz" 12 | download.file(link, destfile = "~/data/cal_housing.tgz") 13 | untar("cal_housing.gz") 14 | cal_housing <- read.csv("~data/CaliforniaHousing/cal_housing.data") 15 | 16 | # explore and visualize data 17 | head(cal_housing) 18 | dim(cal_housing) 19 | str(cal_housing) 20 | summary(cal_housing) #very useful 21 | 22 | levels(cal_housing$ocean_proximity) #categorical var 23 | 24 | # plot numerical vars 25 | cal_housing_num <- subset(cal_housing, 26 | select = -c(ocean_proximity)) 27 | par(mfrow= c(3,3)) 28 | invisible(lapply(names(cal_housing_num), 29 | function(col_name) truehist(cal_housing_num[,col_name], 30 | main = paste("Histogram of ", col_name), 31 | xlab = NA))) 32 | 33 | # scatter plot with ggplots 34 | g <- ggplot(cal_housing, aes(x = longitude, y = latitude, colour = median_income)) 35 | g + geom_point() + scale_color_distiller(palette = "Spectral") + 36 | labs(title = "Plot of data points by location and median_income") + 37 | theme(plot.title = element_text(color="black", size=14, face="bold.italic")) 38 | 39 | # correlation 40 | cor(subset(cal_housing, select = -c(ocean_proximity)), 41 | use = "pairwise.complete.obs")[,"median_house_value"] 42 | 43 | #### Data Wrangling ##### 44 | ######################### 45 | 46 | # dealing wiht missing data: 3 options 47 | cal_housing <- subset(cal_housing, 48 | select = -c(total_bedroom)) #delete column 49 | 50 | cal_housing <- cal_housing[complete.cases(cal_housing),] #remove missing entries 51 | 52 | cal_housing$total_bedrooms[is.na(cal_housing$total_bedrooms)] <- median(cal_housing$total_bedrooms, na.rm=TRUE)#impute NAs with a good statistics (eg: median) 53 | 54 | # other cleaning tasks 55 | cal_housing <- cal_housing %>% 56 | filter(median_house_value < 500000) %>% 57 | mutate(rooms_per_house = total_rooms / households) %>% 58 | mutate(population_per_house = population / households) %>% 59 | mutate(ocean_proximity = as.factor(ocean_proximity)) %>% 60 | mutate_at(vars(-ocean_proximity, -median_house_value, -median_income), funs(scale)) %>% 61 | data.matrix %>% data.frame 62 | 63 | 64 | #### Split to training set and test set #### 65 | ############################################ 66 | 67 | # random sampling 68 | set.seed(365) 69 | train_id <- sample(nrow(cal_housing), size = 0.8*nrow(cal_housing)) 70 | train_set <- cal_housing[train_id,] 71 | test_set <- cal_housing[-train_id,] 72 | print (paste(nrow(train_set), "train +", nrow(test_set), "test")) 73 | 74 | # stratified sampling 75 | par(mfrow = c(1,2)) 76 | truehist(cal_housing[,"median_income"], main = paste("Histogram of median income"), xlab = NA) 77 | cal_housing <- cal_housing %>% #categorize median income 78 | mutate(income_level = ceiling(median_income/2)) %>% 79 | mutate(income_level = factor(ifelse(income_level >= 5, 5, income_level))) %>% 80 | select(-median_income) 81 | plot(cal_housing$income_level, main = paste("Bar plot of income level"), xlab = NA) 82 | 83 | train_str_id <- createDataPartition(cal_housing$income_level, p =.8, 84 | list = FALSE, times = 1) 85 | train_str <- cal_housing[train_str_id,] 86 | test_str <- cal_housing[-train_str_id,] 87 | # test to see if we achieve stratified sampling 88 | table(cal_housing$income_level) / nrow(cal_housing) 89 | table(train_str$income_level) / nrow(train_str) 90 | 91 | 92 | #compare performance of 2 sampling method 93 | overall<- as.vector(table(cal_housing$income_level) / nrow(cal_housing)) 94 | normal_sampling <- factor(sapply(ceiling(test_set$median_income/2), 95 | function(value) ifelse(value >=5, 5, value))) #sapply automatically returns a list 96 | normal_sampling <- as.vector(table(normal_sampling) / length(normal_sampling)) 97 | str_sampling <- as.vector(table(test_str$income_level) / nrow(test_str)) 98 | compare <- data.frame(overall, str_sampling, normal_sampling) %>% 99 | mutate(rand_error = 100*normal_sampling/overall - 100) %>% 100 | mutate(strat_error = 100*str_sampling/overall-100) 101 | 102 | compare 103 | 104 | #### Fit models #### 105 | #################### 106 | 107 | # linear model 108 | model_lm <- lm(median_house_value~., train_str) 109 | summary(model_lm) 110 | predict_lm_train <- predict(model_lm, train_str) 111 | sqrt(mean((train_str$median_house_value - predict_lm_train)^2)) #RMSE 112 | 113 | 114 | # Decision tree 115 | model_decision_tree <- rpart(median_house_value~., 116 | data = train_str, method = "anova", 117 | control = rpart.control(minsplit = 2, cp=0.001)) 118 | predict_decision_tree <- predict(model_decision_tree, train_str) 119 | sqrt(mean((train_str$median_house_value - predict_decision_tree)^2)) #RMSE 120 | 121 | #SVM 122 | model_svm <- svm(median_house_value~., 123 | data = train_str, cost = 10) 124 | predict_svm <- predict(model_svm, train_str) 125 | sqrt(mean((train_str$median_house_value - predict_svm)^2)) #RMSE 126 | 127 | #### 10-fold cross validation: 128 | cal_housing_copy <- cal_housing[sample(nrow(cal_housing)),] # randomly shuffle your data 129 | 130 | 131 | folds <- cut(seq(1,nrow(cal_housing_copy)), 132 | breaks=10,labels=FALSE) #Create 10 equally size folds 133 | 134 | #Perform 10 fold cross validation 135 | MSE_lm <- 0 136 | MSE_tree <- 0 137 | MSE_svm <- 0 138 | 139 | for(i in 1:10){ 140 | #Segement your data by fold using the which() function 141 | testIndexes <- which(folds==i,arr.ind=TRUE) 142 | testData <- cal_housing_copy[testIndexes, ] 143 | trainData <- cal_housing_copy[-testIndexes, ] 144 | 145 | # fit in the models 146 | lm_model <- lm(median_house_value~., trainData) 147 | tree_model <- rpart(median_house_value~.,data = trainData, method = "anova", 148 | control = rpart.control(minsplit = 2, cp = 0.001)) 149 | svm_model <- svm(median_house_value~.,data = trainData, cost = 10) 150 | 151 | # make predictions 152 | predict1 <- predict(lm_model, testData) 153 | predict2 <- predict (tree_model, testData) 154 | predict3 <- predict(svm_model, testData) 155 | 156 | #update MSE 157 | MSE_lm <- MSE_lm + sum(folds == i)/nrow(cal_housing_copy) * mean((predict1 - testData$median_house_value)^2) 158 | MSE_tree <- MSE_tree + sum(folds == i)/nrow(cal_housing_copy) * mean((predict2 - testData$median_house_value)^2) 159 | MSE_svm <- MSE_svm + sum(folds == i)/nrow(cal_housing_copy) * mean((predict3 - testData$median_house_value)^2) 160 | 161 | } 162 | 163 | sqrt(MSE_lm) 164 | sqrt(MSE_tree) 165 | sqrt(MSE_svm) 166 | 167 | #### Tuning parameters #### 168 | ########################### 169 | 170 | # Decision tree: 171 | tune_tree <- tune.rpart(median_house_value~., 172 | data = train_str, minsplit = c(5,10,15, 20), 173 | cp = c(0.1,0.01,0.001,0.0001)) 174 | summary(tune_tree) 175 | plot(tune_tree) 176 | 177 | best_tree <- tune_tree$best.model 178 | predict_tree <- predict(best_tree, train_str) 179 | sqrt(mean((train_str$median_house_value - predict_tree)^2)) #RMSE of best tree model 180 | 181 | # SVM: 182 | tune_svm <- tune.svm(median_house_value ~., 183 | data = train_str, 184 | cost=10^(-1:2), gamma=c(0.1,0,1)) 185 | summary(tune_svm) 186 | plot(tune_svm) 187 | best_svm <- tune_svm$best.model 188 | predict_svm <- predict (best_svm, train_str) 189 | sqrt(mean((train_str$median_house_value - predict_svm)^2)) 190 | 191 | 192 | #### Applying on test set #### 193 | ############################## 194 | 195 | predict_tree_final <- predict(best_tree, test_str) 196 | sqrt(mean((test_str$median_house_value - predict_tree_final)^2)) 197 | 198 | predict_svm_final <- predict(best_svm, test_str) 199 | sqrt(mean((test_str$median_house_value - predict_svm_final)^2)) -------------------------------------------------------------------------------- /R/lindy/Inverse_Random_Sampling.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/R/lindy/Inverse_Random_Sampling.pdf -------------------------------------------------------------------------------- /R/lindy/lindy_simulation.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | # conditional weibull 4 | # subtract t_0 to get exepcted years left 5 | sample_w <- function(u, lambda, kappa, t_0){ 6 | (t_0^kappa - lambda^kappa*log(1-u))^(1/kappa) - t_0 7 | } 8 | # conditional pareto 9 | sample_p <- function(u, t_0, alpha = 2){ 10 | t_0*(1-u)^(-1/alpha) - t_0 11 | } 12 | 13 | result_w = c() 14 | result_p = c() 15 | year_range <- seq(0, 80, 10) 16 | for(t in year_range){ 17 | u <- runif(10000) 18 | samps_w <- sample_w(u, 77.1, 5.05, t) 19 | samps_p <- sample_p(u, t) 20 | result_w <- c(result_w, mean(samps_w)) 21 | result_p <- c(result_p, mean(samps_p)) 22 | } 23 | 24 | plot(year_range, result_w, 25 | type = "l", ylim = c(0, 100), 26 | main="Expected remaining year", 27 | xlab = "Year passed", 28 | ylab = "Years remaining", 29 | bty = "n") 30 | lines(year_range, result_p, col = "green", lty = 2) 31 | par(xpd=TRUE) 32 | legend(x=4.5, y = 100, 33 | legend=c("Human life time", "Lindy's good"), 34 | lty=1:2, 35 | col = c("black", "green"), 36 | ncol=2) 37 | 38 | # changing shape parameter 39 | u <- runif(100000) 40 | samps_p1 <- sample_p(u, 20, alpha = 2) 41 | samps_p2 <- sample_p(u, 20, alpha = 1.5) 42 | samps_p3 <- sample_p(u, 20, alpha = 3) 43 | df <- data.frame("type"= c(rep("alpha = 1.5", 100000), 44 | rep("alpha = 2", 100000), 45 | rep("alpha = 3", 100000)), 46 | "value" = c(samps_p2, samps_p1, samps_p3)) 47 | ggplot(df, aes(x=value, fill = type)) + 48 | geom_density(alpha = .3) + 49 | xlim(0, 50) + 50 | ggtitle("pdf of years remaining after the first 20 years") + 51 | xlab("Years remaining") + 52 | ylab("Probability") 53 | 54 | 55 | 56 | #after 20 years 57 | u <- runif(100000) 58 | samps_w <- sample_w(u, 77.1, 5.05, 20) 59 | mean_w <- mean(samps_w) 60 | samps_p <- sample_p(u, 20) 61 | mean_p <- mean(samps_p) 62 | df <- data.frame("type"= c(rep("Human life time", 100000), 63 | rep("Lindy's good", 100000)), 64 | "value" = c(samps_w, samps_p)) 65 | ggplot(df, aes(x=value, fill = type)) + 66 | geom_density(alpha = .3) + 67 | xlim(0, 100) + 68 | ggtitle("pdf of years remaining after the first 20 years") + 69 | xlab("Years remaining") + 70 | ylab("Probability") + 71 | geom_vline(xintercept = mean_p, linetype = "dashed", color = "blue") + 72 | geom_vline(xintercept = mean_w, linetype = "dashed", color = "red") 73 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Overview 2 | 3 | In this repository, you will find the source code to various projects I have been working on or still work-in-progress. The majority of the projects are accompanied by a Medium blog posts at [tuannguyen-doan.medium.com](https://tuannguyen-doan.medium.com/). I published almost exclusively on Towards Data Science publication through Medium's Partnership program so please check out these articles as a way to support me and my future projects. Alternatively, you can also find my blog posts at my personal website [here](https://tuangauss.github.io/). 4 | 5 | My interests lie in the intersection of statistical techniques, data visualization and sports (especially football). All the codes are written entirely in Python or R. I don't have a strong preference or attempt to make a concerted effort to code in a specific language/platform. The decision is mostly based on how specific functionalities needed for a project are supported (scraping in Python and data processing with dplyr piping in R). 6 | 7 | ### I. Statistical application: 8 | 9 | #### The statistics of modern football: 10 | A collection of projects that explore the intricate statistical aspect of the Beautiful Game 11 | 12 | - [Empirical Bayes and penalty taking ability](https://towardsdatascience.com/men-of-steel-finding-the-best-penalty-takers-with-empirical-bayes-estimation-aa0e126fb08b) - Using Bayesian statistics to make meaningful comparison between players across Europe. 13 | - [Poisson process and match prediction](https://towardsdatascience.com/o-jogo-bonito-predicting-the-premier-league-with-a-random-model-1b02fa3a7e5a) - Here we learn about the Poisson process and how a random model outperforms football experts with its prediction. 14 | - [The mathematics of football betting strategies](https://towardsdatascience.com/making-big-bucks-with-a-data-driven-sports-betting-strategy-6c21a6869171) - With the Poisson model and some additional help from mathematical research, can we beat the bookies? 15 | - [Fisher vs Neyman-Person debate and Paul the Octopus](https://towardsdatascience.com/what-can-an-octopus-tell-us-about-the-biggest-debate-in-statistical-theory-f017295d781f) - We went over the theory (or many theories) of hypothesis testings and see how they apply to the psychic ability of Paul the Octopus. 16 | 17 | #### Statistical theory and its application: 18 | 19 | - [Bayes theorem and a probabilistic argument for God](https://towardsdatascience.com/a-bayesian-quest-to-find-god-b30934972473) - Bayes theory and how people have been using it to justify the necessary existence of God. 20 | - [Dating with probability theory](https://towardsdatascience.com/probability-theory-and-the-optimal-dating-strategy-for-2018-2b75b26fb0b) - Here we explore what probability theory has to say about the most optimal strategy to find the love of your life. 21 | - [Bayes theorem and why it matters to my workout routine](https://towardsdatascience.com/how-bayesian-statistics-convinced-me-to-hit-the-gym-fa737b0a7ac) - A lightweight introduction to Bayes' theorem and how it helps convince me to hit the gym. 22 | - [The Rule of Three and its application](https://towardsdatascience.com/the-rule-of-three-calculating-the-probability-of-events-that-have-not-yet-occurred-106144dc2c39) - A short introduction of the Rule of Three and how we can apply it to calculate the probability of events that have yet to happen. Application in voting, vaccine development, product quality monitoring, etc. 23 | - [Lindy's effect](https://towardsdatascience.com/a-statistical-rule-to-optimize-your-life-the-lindys-effect-96d2c75b080d) - A (slightly) mathematical description of the Lindy's effect and how one can use it as a guide for life. 24 | - [Normal Distribution with High Dimensionality](https://towardsdatascience.com/disney-movies-were-right-we-are-all-special-and-statistically-so-3bb56e79ab71) - A statistical investigation into the myth of the "average Joe." 25 | - [Mark-Recapture method](https://medium.com/towards-data-science/the-statistical-theory-behind-why-your-instagram-posts-have-so-few-likes-31f46d03448b) - An intro to the statistics behind sampling theory and how you can use it to count *almost* everything 26 | 27 | ### II. External Collaborations: 28 | 29 | #### Published papers: 30 | - [A robust and scalable method to compare Percentile metrics in online experiments (Quora Data Blog, 2022)](https://quoradata.quora.com/A-Robust-and-Scalable-method-to-compare-Percentile-Metrics-in-online-experiments) Conducting statistical tests for Percentile metrics can be tricky, as they have less neat mathematical properties than other more common metrics, such as the average or the ratios. I discuss Quora's method to A/B test these metrics in a statistically valid and scalable manner. 31 | - [How social learning amplifies moral outrage expression in online social networks (Science Advances, 2021)](https://www.science.org/doi/pdf/10.1126/sciadv.abe5641) - Moral outrage shapes fundamental aspects of social life and is now widespread in online social networks. Here, we show how social learning processes amplify online moral outrage expressions over time. 32 | - [Application of machine learning models in predicting length of stay among healthcare workers in underserved communities in South Africa (Human Resources for Health, 2018)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6293620/) - We aim to use machine learning methods to predict health professional’s length of practice in the rural public healthcare sector based on their demographic information. 33 | 34 | ### III. General tutorials with Python and R: 35 | 36 | #### Data visualization: 37 | - [NetworkX and Basemap](https://towardsdatascience.com/catching-that-flight-visualizing-social-network-with-networkx-and-basemap-ce4a0d2eaea6) - Here is a comprehensive tutorial of how we can visualize geographical data with powerful tools that support Python. 38 | - [Tkinter and Python](https://towardsdatascience.com/having-your-own-fun-how-to-build-your-own-macys-firework-show-with-python-and-tkinter-79cc31631b44) - Building your own firework shows with Tkinter (and some math chops). 39 | - [Data visualization with Matplotlib and Seaborn](https://towardsdatascience.com/advanced-sports-visualization-with-pandas-matplotlib-and-seaborn-9c16df80a81b) - Learn how to construct publish-worthy visualizations with Matplotlib and Seaborn packages. 40 | 41 | #### Machine Learning practicals: 42 | - [End-to-end Machine Learning project with R](https://github.com/tuangauss/DataScienceProjects/blob/master/R/end_to_end_projects.R) - Here is a full data science project that covers data collection, cleaning, visualization, machine learning and validation. 43 | - [Unsupervised Learning - Clustering method with R](https://github.com/tuangauss/DataScienceProjects/blob/master/R/EPL/Misc/TeamEvaluate2015.R) - An introduction to an array of unsupervised learning algorithms: Hierachical clustering, k-means, and Factor Analysis. 44 | - [Collaborative Filtering with Python](https://towardsdatascience.com/building-my-own-2021-book-recommendation-engine-903ea10d5021) - A comprehensive guide to the mathematical details and implementation of popular Matrix Factorization methods. 45 | 46 | 47 | -------------------------------------------------------------------------------- /data/Team2015season.csv: -------------------------------------------------------------------------------- 1 | Team,DribbledPast,Interception,Fouled,Fouls,Yellow,Red,Clearance,Total block,Total goals,Successful Dribble,Total Aerial challenges,AccLB,InAccLB,AccSP,InAccSP,Possession per game,Average salary per week_1,Summer transfer_1,Net spend on transfer,Average attendance per game 2 | Arsenal,376,767,418,376,68,2,1004,610,69,599,1390,877,871,16342,2446,55.7,166000,78000000,46400000,59992 3 | Manchester City,482,571,297,445,77,2,872,670,82,434,1188,902,1031,17790,2381,57,205000,53000000,32000000,45365 4 | Chelsea,387,370,432,382,73,4,1026,564,72,489,1299,1153,1118,15709,2289,54.1,192000,88500000,-800000,41546 5 | Manchester United,302,680,395,453,64,5,1117,513,60,374,1188,1736,1233,16407,1933,58.8,215000,145500000,122000000,75335 6 | Southampton,336,628,396,469,57,3,879,510,49,265,1308,1325,1397,13170,2289,51.9,63000,57500000,-30700000,30652 7 | Liverpool,324,441,466,385,66,3,1177,590,48,438,1161,1052,1097,14988,2240,54.4,144000,117000000,35700000,44659 8 | Tottenham,315,678,383,442,79,4,1045,527,56,413,1232,1113,1248,14575,2392,55.3,100000,27000000,-6500000,35728 9 | Stoke,366,501,409,487,82,1,1170,502,47,357,1655,1166,1257,10747,2206,50.2,61000,3000000,400000,27081 10 | Swansea,372,692,463,398,48,5,1202,426,43,351,1205,1092,1201,13586,1998,50.7,63000,5500000,1300000,20555 11 | West Ham,330,507,419,420,64,2,1474,489,43,260,1608,1098,1448,9022,1998,46.7,64000,23800000,31000000,34871 12 | Leicester,362,743,348,456,50,4,1179,457,45,349,1960,1022,1683,8460,2177,44.8,36300,8000000,8000000,31693 13 | Crystal Palace,340,601,434,527,63,4,1269,442,46,357,1759,945,1665,7435,2024,42.7,46000,12900000,11000000,24421 14 | Everton,341,444,450,390,66,2,1285,483,46,392,1228,1270,1289,14236,2008,53.6,69000,32800000,32800000,38406 15 | Burnley,403,565,398,407,64,2,1473,430,27,243,1888,1296,1724,8292,2315,44,21000,1500000,10000000,19131 16 | West Bromwich Albion,352,588,406,423,64,3,1090,411,37,241,1303,1054,1674,9771,1932,44.9,65000,14000000,13600000,25064 17 | Queens Park Rangers,387,561,392,447,75,3,1445,535,39,307,1732,1307,1698,8362,2294,46.3,78000,28000000,21000000,17809 18 | Hull,378,606,401,451,73,6,1317,429,32,255,1619,1171,1554,10192,1942,45.6,43000,32900000,24400000,23557 19 | Sunderland,436,579,447,441,94,2,993,408,30,341,1176,1081,1404,9922,2043,46.2,70000,14000000,9500000,43157 20 | Newcastle United,321,643,441,434,65,7,1025,468,40,444,1549,952,1424,10593,2052,48.1,78000,29500000,24900000,50359 21 | Aston Villa,286,484,422,401,70,7,1129,417,31,287,1400,1323,1419,11724,2051,49.1,69000,4700000,5900000,34133 22 | Juventus,300,555,501,525,73,5,762,602,71,405,940,1288,843,16023,2249,55.7,122000,46000000,11000000,38553 23 | Lazio,355,751,456,678,104,9,934,551,70,375,1215,1229,1259,12717,2185,52.7,55100,12250000,12750000,35500 24 | Inter,336,632,485,560,95,5,833,615,58,433,1142,952,874,16494,2242,57.8,70200,15700000,-3150000,37372 25 | Roma,295,670,511,521,103,5,910,525,53,420,894,1547,1080,15893,1990,57.5,94500,98600000,9075000,40118 26 | Fiorentina,360,626,555,538,82,3,853,616,61,383,977,1261,1031,14957,1996,55.9,56000,9100000,-24000000,30266 27 | Napoli,380,641,484,524,78,6,842,624,65,303,1062,1187,1153,15406,2213,54.1,67000,34050000,29490000,32266 28 | Genoa,301,592,622,636,103,6,1034,489,58,394,1042,1029,1201,11326,2191,51.2,35500,4340000,10160000,20882 29 | Sampdoria,297,559,600,615,97,3,1043,503,46,346,949,1363,1125,10715,1901,49,38500,24050000,-1700000,22276 30 | Palermo,354,522,549,582,84,4,1048,479,52,403,1144,1101,1201,11345,2182,49,24300,19700000,2030000,17466 31 | Torino,291,617,529,545,97,5,806,546,48,339,975,1340,1092,11818,1845,48.8,22500,34200000,3500000,17234 32 | AC Milan,261,682,615,541,81,13,905,450,54,294,898,1392,1225,12564,1906,51.2,97000,20600000,-15600000,36730 33 | Sassuolo,382,732,668,586,101,9,1231,448,46,264,1121,1069,1369,9976,1927,47.2,28000,16000000,2200000,13086 34 | Chievo,338,665,579,625,82,3,926,458,27,258,1162,1234,1449,8069,2071,43.6,18300,8000000,2510000,10652 35 | Empoli,414,371,610,492,58,4,953,516,43,424,1015,799,1207,13920,2437,51.4,11000,0,3750000,9229 36 | Cagliari,353,585,621,595,109,6,934,495,45,445,948,826,1054,12505,2343,49.1,16000,20200000,1700000,10551 37 | Verona,357,565,511,523,97,7,1159,461,48,239,1005,1142,1159,9497,1774,43.9,20500,16700000,-6200000,19312 38 | Udinese,321,503,504,532,91,6,1151,426,41,277,1290,1093,1484,9529,2007,45.2,25000,29200000,14850000,9132 39 | Atalanta,336,603,604,630,103,9,1041,461,37,254,1298,1158,1312,9702,2183,46.2,19700,0,-5400000,15673 40 | Cesena,370,563,557,632,102,4,1167,360,34,232,1303,1325,1294,9231,2243,45.2,11500,825000,950000,16204 41 | Parma,357,607,484,662,100,8,853,418,31,270,904,1218,1316,10543,2060,45.2,26200,4320000,-6360000,11758 42 | Barcelona,260,496,557,371,67,3,494,625,108,527,739,1315,682,21439,2371,65.3,347900,166000000,85000000,77632 43 | Real Madrid,343,482,544,406,85,3,658,686,116,477,925,1285,803,16083,1952,56.2,328000,135500000,21500000,73081 44 | Atletico Madrid,372,606,539,544,110,5,871,437,65,270,1596,1034,1242,10810,2443,49.4,105000,117600000,29600000,46603 45 | Valencia,294,683,613,568,103,9,1026,441,65,313,1351,1242,1223,10432,2102,50.7,73000,53300000,850000,43205 46 | Sevilla,350,614,517,609,116,2,811,463,68,320,1283,1092,1347,10597,2306,49.9,78600,20950000,-29600000,30671 47 | Villarreal,248,721,443,526,94,2,993,507,48,278,1302,1047,1287,10524,2535,48.9,44200,18700000,-3800000,16040 48 | Celta Vigo,334,673,495,586,117,5,771,499,47,375,1452,1233,1392,13577,2409,55.2,18700,0,0,19156 49 | Athletic Club,320,751,511,532,96,4,848,443,41,270,1699,1100,1645,10929,2753,50.8,38900,1000000,-35000000,43454 50 | Real Sociedad,261,606,559,514,88,1,1040,429,41,265,1533,1246,1366,10110,2415,50.3,49900,12000000,-33200000,22103 51 | Espanyol,328,637,535,594,111,6,964,368,47,266,1405,1013,1496,8546,2208,46.4,32200,2200000,-8800000,18693 52 | Rayo Vallecano,320,658,525,554,106,6,760,492,46,301,1421,1396,1368,12218,2309,55.4,15800,500000,150000,10628 53 | Malaga,400,631,509,571,107,8,830,461,40,297,1248,1157,1324,9233,2285,48.9,27500,1800000,-17600000,24530 54 | Deportivo La Coruna,351,748,498,554,103,4,892,399,33,284,1348,1044,1432,9884,2216,48.3,15300,200000,200000,21271 55 | Getafe,332,757,474,542,96,4,901,421,32,364,1301,1069,1439,9340,2169,47.3,18700,0,-8000000,7753 56 | Almeria,270,660,518,560,107,11,1021,394,33,300,1296,1117,1490,7317,2019,46.1,11700,0,-3000000,10405 57 | Eibar,343,653,410,568,103,5,963,354,33,216,1515,1257,1682,7366,2335,44.6,12800,162000,-5162000,4780 58 | Cordoba,338,692,496,487,95,9,1029,389,21,337,1433,1014,1383,9839,2126,47.4,16300,500000,500000,16126 59 | Granada,252,668,488,572,112,5,976,401,29,337,1307,1091,1331,8511,1992,46.9,22800,5750000,-10000000,17248 60 | Levante,296,688,480,558,107,3,835,404,34,286,1495,1092,1555,8046,2153,44.1,17000,0,0,15267 61 | Elche,318,562,485,588,103,6,834,407,35,248,1287,1279,1564,9889,2138,47.9,12500,1350000,-4650000,21684 62 | -------------------------------------------------------------------------------- /data/fixtures.csv: -------------------------------------------------------------------------------- 1 | DIVISION,DATE,TIME,FIXTURE,HOME TEAM,AWAY TEAM 2 | EPL,8/10/2018,20:00,Manchester United V Leicester City,Manchester United,Leicester City 3 | EPL,8/11/2018,12:30,Newcastle United V Tottenham Hotspur,Newcastle United,Tottenham Hotspur 4 | EPL,8/11/2018,15:00,Bournemouth V Cardiff City,Bournemouth,Cardiff City 5 | EPL,8/11/2018,15:00,Fulham V Crystal Palace,Fulham,Crystal Palace 6 | EPL,8/11/2018,15:00,Huddersfield Town V Chelsea,Huddersfield Town,Chelsea 7 | EPL,8/11/2018,15:00,Watford V Brighton and Hove Albion,Watford,Brighton and Hove Albion 8 | EPL,8/11/2018,17:30,Wolverhampton Wanderers V Everton,Wolverhampton Wanderers,Everton 9 | EPL,8/12/2018, 13:30,Liverpool V West Ham United,Liverpool,West Ham United 10 | EPL,8/12/2018, 13:30,Southampton V Burnley,Southampton,Burnley 11 | EPL,8/12/2018, 16:00,Arsenal V Manchester City,Arsenal,Manchester City 12 | EPL,8/18/2018, 12:30,Cardiff City V Newcastle United,Cardiff City,Newcastle United 13 | EPL,8/18/2018, 15:00,Everton V Southampton,Everton,Southampton 14 | EPL,8/18/2018, 15:00,Leicester City V Wolverhampton Wanderers,Leicester City,Wolverhampton Wanderers 15 | EPL,8/18/2018, 15:00,Tottenham Hotspur V Fulham,Tottenham Hotspur,Fulham 16 | EPL,8/18/2018, 15:00,West Ham United V Bournemouth,West Ham United,Bournemouth 17 | EPL,8/18/2018, 17:30,Chelsea V Arsenal,Chelsea,Arsenal 18 | EPL,8/19/2018, 13:30,Burnley V Watford,Burnley,Watford 19 | EPL,8/19/2018, 13:30,Manchester City V Huddersfield Town,Manchester City,Huddersfield Town 20 | EPL,8/19/2018, 16:00,Brighton and Hove Albion V Manchester United,Brighton and Hove Albion,Manchester United 21 | EPL,8/20/2018, 20:00,Crystal Palace V Liverpool,Crystal Palace,Liverpool 22 | EPL,8/25/2018, 12:30,Wolverhampton Wanderers V Manchester City,Wolverhampton Wanderers,Manchester City 23 | EPL,8/25/2018, 15:00,Arsenal V West Ham United,Arsenal,West Ham United 24 | EPL,8/25/2018, 15:00,Bournemouth V Everton,Bournemouth,Everton 25 | EPL,8/25/2018, 15:00,Fulham V Burnley,Fulham,Burnley 26 | EPL,8/25/2018, 15:00,Huddersfield Town V Cardiff City,Huddersfield Town,Cardiff City 27 | EPL,8/25/2018, 15:00,Southampton V Leicester City,Southampton,Leicester City 28 | EPL,8/25/2018, 17:30,Liverpool V Brighton and Hove Albion,Liverpool,Brighton and Hove Albion 29 | EPL,8/26/2018, 13:30,Watford V Crystal Palace,Watford,Crystal Palace 30 | EPL,8/26/2018, 16:00,Newcastle United V Chelsea,Newcastle United,Chelsea 31 | EPL,8/27/2018, 20:00,Manchester United V Tottenham Hotspur,Manchester United,Tottenham Hotspur 32 | EPL,9/1/2018, 12:30,Leicester City V Liverpool,Leicester City,Liverpool 33 | EPL,9/1/2018, 15:00,Brighton and Hove Albion V Fulham,Brighton and Hove Albion,Fulham 34 | EPL,9/1/2018, 15:00,Burnley V Manchester United,Burnley,Manchester United 35 | EPL,9/1/2018, 15:00,Chelsea V Bournemouth,Chelsea,Bournemouth 36 | EPL,9/1/2018, 15:00,Crystal Palace V Southampton,Crystal Palace,Southampton 37 | EPL,9/1/2018, 15:00,Everton V Huddersfield Town,Everton,Huddersfield Town 38 | EPL,9/1/2018, 15:00,West Ham United V Wolverhampton Wanderers,West Ham United,Wolverhampton Wanderers 39 | EPL,9/1/2018, 17:30,Manchester City V Newcastle United,Manchester City,Newcastle United 40 | EPL,9/2/2018, 13:30,Cardiff City V Arsenal,Cardiff City,Arsenal 41 | EPL,9/2/2018, 16:00,Watford V Tottenham Hotspur,Watford,Tottenham Hotspur 42 | EPL,9/15/2018, 12:30,Tottenham Hotspur V Liverpool,Tottenham Hotspur,Liverpool 43 | EPL,9/15/2018, 15:00,Bournemouth V Leicester City,Bournemouth,Leicester City 44 | EPL,9/15/2018, 15:00,Chelsea V Cardiff City,Chelsea,Cardiff City 45 | EPL,9/15/2018, 15:00,Huddersfield Town V Crystal Palace,Huddersfield Town,Crystal Palace 46 | EPL,9/15/2018, 15:00,Manchester City V Fulham,Manchester City,Fulham 47 | EPL,9/15/2018, 15:00,Newcastle United V Arsenal,Newcastle United,Arsenal 48 | EPL,9/15/2018, 17:30,Watford V Manchester United,Watford,Manchester United 49 | EPL,9/16/2018, 13:30,Wolverhampton Wanderers V Burnley,Wolverhampton Wanderers,Burnley 50 | EPL,9/16/2018, 16:00,Everton V West Ham United,Everton,West Ham United 51 | EPL,9/17/2018, 20:00,Southampton V Brighton and Hove Albion,Southampton,Brighton and Hove Albion 52 | EPL,9/22/2018, 12:30,Fulham V Watford,Fulham,Watford 53 | EPL,9/22/2018, 15:00,Burnley V Bournemouth,Burnley,Bournemouth 54 | EPL,9/22/2018, 15:00,Cardiff City V Manchester City,Cardiff City,Manchester City 55 | EPL,9/22/2018, 15:00,Crystal Palace V Newcastle United,Crystal Palace,Newcastle United 56 | EPL,9/22/2018, 15:00,Leicester City V Huddersfield Town,Leicester City,Huddersfield Town 57 | EPL,9/22/2018, 15:00,Liverpool V Southampton,Liverpool,Southampton 58 | EPL,9/22/2018, 15:00,Manchester United V Wolverhampton Wanderers,Manchester United,Wolverhampton Wanderers 59 | EPL,9/22/2018, 17:30,Brighton and Hove Albion V Tottenham Hotspur,Brighton and Hove Albion,Tottenham Hotspur 60 | EPL,9/23/2018, 13:30,West Ham United V Chelsea,West Ham United,Chelsea 61 | EPL,9/23/2018, 16:00,Arsenal V Everton,Arsenal,Everton 62 | EPL,9/29/2018, 12:30,West Ham United V Manchester United,West Ham United,Manchester United 63 | EPL,9/29/2018, 15:00,Arsenal V Watford,Arsenal,Watford 64 | EPL,9/29/2018, 15:00,Everton V Fulham,Everton,Fulham 65 | EPL,9/29/2018, 15:00,Huddersfield Town V Tottenham Hotspur,Huddersfield Town,Tottenham Hotspur 66 | EPL,9/29/2018, 15:00,Manchester City V Brighton and Hove Albion,Manchester City,Brighton and Hove Albion 67 | EPL,9/29/2018, 15:00,Newcastle United V Leicester City,Newcastle United,Leicester City 68 | EPL,9/29/2018, 15:00,Wolverhampton Wanderers V Southampton,Wolverhampton Wanderers,Southampton 69 | EPL,9/29/2018, 17:30,Chelsea V Liverpool,Chelsea,Liverpool 70 | EPL,9/30/2018, 16:00,Cardiff City V Burnley,Cardiff City,Burnley 71 | EPL,10/1/2018, 20:00,Bournemouth V Crystal Palace,Bournemouth,Crystal Palace 72 | EPL,10/5/2018, 20:00,Brighton and Hove Albion V West Ham United,Brighton and Hove Albion,West Ham United 73 | EPL,10/6/2018, 15:00,Burnley V Huddersfield Town,Burnley,Huddersfield Town 74 | EPL,10/6/2018, 15:00,Crystal Palace V Wolverhampton Wanderers,Crystal Palace,Wolverhampton Wanderers 75 | EPL,10/6/2018, 15:00,Leicester City V Everton,Leicester City,Everton 76 | EPL,10/6/2018, 15:00,Tottenham Hotspur V Cardiff City,Tottenham Hotspur,Cardiff City 77 | EPL,10/6/2018, 15:00,Watford V Bournemouth,Watford,Bournemouth 78 | EPL,10/6/2018, 17:30,Manchester United V Newcastle United,Manchester United,Newcastle United 79 | EPL,10/7/2018, 12:00,Fulham V Arsenal,Fulham,Arsenal 80 | EPL,10/7/2018, 14:15,Southampton V Chelsea,Southampton,Chelsea 81 | EPL,10/7/2018, 16:30,Liverpool V Manchester City,Liverpool,Manchester City 82 | EPL,10/20/2018, 12:30,Chelsea V Manchester United,Chelsea,Manchester United 83 | EPL,10/20/2018, 15:00,Bournemouth V Southampton,Bournemouth,Southampton 84 | EPL,10/20/2018, 15:00,Cardiff City V Fulham,Cardiff City,Fulham 85 | EPL,10/20/2018, 15:00,Manchester City V Burnley,Manchester City,Burnley 86 | EPL,10/20/2018, 15:00,Newcastle United V Brighton and Hove Albion,Newcastle United,Brighton and Hove Albion 87 | EPL,10/20/2018, 15:00,West Ham United V Tottenham Hotspur,West Ham United,Tottenham Hotspur 88 | EPL,10/20/2018, 15:00,Wolverhampton Wanderers V Watford,Wolverhampton Wanderers,Watford 89 | EPL,10/20/2018, 17:30,Huddersfield Town V Liverpool,Huddersfield Town,Liverpool 90 | EPL,10/21/2018, 16:00,Everton V Crystal Palace,Everton,Crystal Palace 91 | EPL,10/22/2018, 20:00,Arsenal V Leicester City,Arsenal,Leicester City 92 | EPL,10/27/2018, 12:30,Manchester United V Everton,Manchester United,Everton 93 | EPL,10/27/2018, 15:00,Brighton and Hove Albion V Wolverhampton Wanderers,Brighton and Hove Albion,Wolverhampton Wanderers 94 | EPL,10/27/2018, 15:00,Fulham V Bournemouth,Fulham,Bournemouth 95 | EPL,10/27/2018, 15:00,Liverpool V Cardiff City,Liverpool,Cardiff City 96 | EPL,10/27/2018, 15:00,Southampton V Newcastle United,Southampton,Newcastle United 97 | EPL,10/27/2018, 15:00,Watford V Huddersfield Town,Watford,Huddersfield Town 98 | EPL,10/27/2018, 17:30,Leicester City V West Ham United,Leicester City,West Ham United 99 | EPL,10/28/2018, 13:30,Burnley V Chelsea,Burnley,Chelsea 100 | EPL,10/28/2018, 13:30,Crystal Palace V Arsenal,Crystal Palace,Arsenal 101 | EPL,10/28/2018, 16:00,Tottenham Hotspur V Manchester City,Tottenham Hotspur,Manchester City 102 | EPL,11/3/2018, 12:30,Bournemouth V Manchester United,Bournemouth,Manchester United 103 | EPL,11/3/2018, 15:00,Cardiff City V Leicester City,Cardiff City,Leicester City 104 | EPL,11/3/2018, 15:00,Everton V Brighton and Hove Albion,Everton,Brighton and Hove Albion 105 | EPL,11/3/2018, 15:00,Manchester City V Southampton,Manchester City,Southampton 106 | EPL,11/3/2018, 15:00,Newcastle United V Watford,Newcastle United,Watford 107 | EPL,11/3/2018, 15:00,West Ham United V Burnley,West Ham United,Burnley 108 | EPL,11/3/2018, 17:30,Arsenal V Liverpool,Arsenal,Liverpool 109 | EPL,11/4/2018, 13:30,Wolverhampton Wanderers V Tottenham Hotspur,Wolverhampton Wanderers,Tottenham Hotspur 110 | EPL,11/4/2018, 16:00,Chelsea V Crystal Palace,Chelsea,Crystal Palace 111 | EPL,11/5/2018, 20:00,Huddersfield Town V Fulham,Huddersfield Town,Fulham 112 | EPL,11/10/2018, 12:30,Cardiff City V Brighton and Hove Albion,Cardiff City,Brighton and Hove Albion 113 | EPL,11/10/2018, 15:00,Huddersfield Town V West Ham United,Huddersfield Town,West Ham United 114 | EPL,11/10/2018, 15:00,Leicester City V Burnley,Leicester City,Burnley 115 | EPL,11/10/2018, 15:00,Newcastle United V Bournemouth,Newcastle United,Bournemouth 116 | EPL,11/10/2018, 15:00,Southampton V Watford,Southampton,Watford 117 | EPL,11/10/2018, 17:30,Crystal Palace V Tottenham Hotspur,Crystal Palace,Tottenham Hotspur 118 | EPL,11/11/2018, 12:00,Liverpool V Fulham,Liverpool,Fulham 119 | EPL,11/11/2018, 14:15,Chelsea V Everton,Chelsea,Everton 120 | EPL,11/11/2018, 16:30,Arsenal V Wolverhampton Wanderers,Arsenal,Wolverhampton Wanderers 121 | EPL,11/11/2018, 16:30,Manchester City V Manchester United,Manchester City,Manchester United 122 | EPL,11/24/2018, 15:00,Brighton and Hove Albion V Leicester City,Brighton and Hove Albion,Leicester City 123 | EPL,11/24/2018, 15:00,Everton V Cardiff City,Everton,Cardiff City 124 | EPL,11/24/2018, 15:00,Fulham V Southampton,Fulham,Southampton 125 | EPL,11/24/2018, 15:00,Manchester United V Crystal Palace,Manchester United,Crystal Palace 126 | EPL,11/24/2018, 15:00,Watford V Liverpool,Watford,Liverpool 127 | EPL,11/24/2018, 15:00,West Ham United V Manchester City,West Ham United,Manchester City 128 | EPL,11/24/2018, 17:30,Tottenham Hotspur V Chelsea,Tottenham Hotspur,Chelsea 129 | EPL,11/25/2018, 13:30,Bournemouth V Arsenal,Bournemouth,Arsenal 130 | EPL,11/25/2018, 16:00,Wolverhampton Wanderers V Huddersfield Town,Wolverhampton Wanderers,Huddersfield Town 131 | EPL,11/26/2018, 20:00,Burnley V Newcastle United,Burnley,Newcastle United 132 | EPL,12/1/2018, 15:00,Arsenal V Tottenham Hotspur,Arsenal,Tottenham Hotspur 133 | EPL,12/1/2018, 15:00,Cardiff City V Wolverhampton Wanderers,Cardiff City,Wolverhampton Wanderers 134 | EPL,12/1/2018, 15:00,Chelsea V Fulham,Chelsea,Fulham 135 | EPL,12/1/2018, 15:00,Crystal Palace V Burnley,Crystal Palace,Burnley 136 | EPL,12/1/2018, 15:00,Huddersfield Town V Brighton and Hove Albion,Huddersfield Town,Brighton and Hove Albion 137 | EPL,12/1/2018, 15:00,Leicester City V Watford,Leicester City,Watford 138 | EPL,12/1/2018, 15:00,Liverpool V Everton,Liverpool,Everton 139 | EPL,12/1/2018, 15:00,Manchester City V Bournemouth,Manchester City,Bournemouth 140 | EPL,12/1/2018, 15:00,Newcastle United V West Ham United,Newcastle United,West Ham United 141 | EPL,12/1/2018, 15:00,Southampton V Manchester United,Southampton,Manchester United 142 | EPL,12/4/2018, 19:45,Bournemouth V Huddersfield Town,Bournemouth,Huddersfield Town 143 | EPL,12/4/2018, 19:45,Brighton and Hove Albion V Crystal Palace,Brighton and Hove Albion,Crystal Palace 144 | EPL,12/4/2018, 19:45,Burnley V Liverpool,Burnley,Liverpool 145 | EPL,12/4/2018, 19:45,Fulham V Leicester City,Fulham,Leicester City 146 | EPL,12/4/2018, 19:45,Watford V Manchester City,Watford,Manchester City 147 | EPL,12/4/2018, 19:45,West Ham United V Cardiff City,West Ham United,Cardiff City 148 | EPL,12/4/2018, 19:45,Wolverhampton Wanderers V Chelsea,Wolverhampton Wanderers,Chelsea 149 | EPL,12/4/2018, 20:00,Manchester United V Arsenal,Manchester United,Arsenal 150 | EPL,12/5/2018, 19:45,Everton V Newcastle United,Everton,Newcastle United 151 | EPL,12/5/2018, 20:00,Tottenham Hotspur V Southampton,Tottenham Hotspur,Southampton 152 | EPL,12/8/2018, 15:00,Arsenal V Huddersfield Town,Arsenal,Huddersfield Town 153 | EPL,12/8/2018, 15:00,Bournemouth V Liverpool,Bournemouth,Liverpool 154 | EPL,12/8/2018, 15:00,Burnley V Brighton and Hove Albion,Burnley,Brighton and Hove Albion 155 | EPL,12/8/2018, 15:00,Cardiff City V Southampton,Cardiff City,Southampton 156 | EPL,12/8/2018, 15:00,Chelsea V Manchester City,Chelsea,Manchester City 157 | EPL,12/8/2018, 15:00,Everton V Watford,Everton,Watford 158 | EPL,12/8/2018, 15:00,Leicester City V Tottenham Hotspur,Leicester City,Tottenham Hotspur 159 | EPL,12/8/2018, 15:00,Manchester United V Fulham,Manchester United,Fulham 160 | EPL,12/8/2018, 15:00,Newcastle United V Wolverhampton Wanderers,Newcastle United,Wolverhampton Wanderers 161 | EPL,12/8/2018, 15:00,West Ham United V Crystal Palace,West Ham United,Crystal Palace 162 | EPL,12/15/2018, 15:00,Brighton and Hove Albion V Chelsea,Brighton and Hove Albion,Chelsea 163 | EPL,12/15/2018, 15:00,Crystal Palace V Leicester City,Crystal Palace,Leicester City 164 | EPL,12/15/2018, 15:00,Fulham V West Ham United,Fulham,West Ham United 165 | EPL,12/15/2018, 15:00,Huddersfield Town V Newcastle United,Huddersfield Town,Newcastle United 166 | EPL,12/15/2018, 15:00,Liverpool V Manchester United,Liverpool,Manchester United 167 | EPL,12/15/2018, 15:00,Manchester City V Everton,Manchester City,Everton 168 | EPL,12/15/2018, 15:00,Southampton V Arsenal,Southampton,Arsenal 169 | EPL,12/15/2018, 15:00,Tottenham Hotspur V Burnley,Tottenham Hotspur,Burnley 170 | EPL,12/15/2018, 15:00,Watford V Cardiff City,Watford,Cardiff City 171 | EPL,12/15/2018, 15:00,Wolverhampton Wanderers V Bournemouth,Wolverhampton Wanderers,Bournemouth 172 | EPL,12/22/2018, 15:00,Arsenal V Burnley,Arsenal,Burnley 173 | EPL,12/22/2018, 15:00,Bournemouth V Brighton and Hove Albion,Bournemouth,Brighton and Hove Albion 174 | EPL,12/22/2018, 15:00,Cardiff City V Manchester United,Cardiff City,Manchester United 175 | EPL,12/22/2018, 15:00,Chelsea V Leicester City,Chelsea,Leicester City 176 | EPL,12/22/2018, 15:00,Everton V Tottenham Hotspur,Everton,Tottenham Hotspur 177 | EPL,12/22/2018, 15:00,Huddersfield Town V Southampton,Huddersfield Town,Southampton 178 | EPL,12/22/2018, 15:00,Manchester City V Crystal Palace,Manchester City,Crystal Palace 179 | EPL,12/22/2018, 15:00,Newcastle United V Fulham,Newcastle United,Fulham 180 | EPL,12/22/2018, 15:00,West Ham United V Watford,West Ham United,Watford 181 | EPL,12/22/2018, 15:00,Wolverhampton Wanderers V Liverpool,Wolverhampton Wanderers,Liverpool 182 | EPL,12/26/2018, 15:00,Brighton and Hove Albion V Arsenal,Brighton and Hove Albion,Arsenal 183 | EPL,12/26/2018, 15:00,Burnley V Everton,Burnley,Everton 184 | EPL,12/26/2018, 15:00,Crystal Palace V Cardiff City,Crystal Palace,Cardiff City 185 | EPL,12/26/2018, 15:00,Fulham V Wolverhampton Wanderers,Fulham,Wolverhampton Wanderers 186 | EPL,12/26/2018, 15:00,Leicester City V Manchester City,Leicester City,Manchester City 187 | EPL,12/26/2018, 15:00,Liverpool V Newcastle United,Liverpool,Newcastle United 188 | EPL,12/26/2018, 15:00,Manchester United V Huddersfield Town,Manchester United,Huddersfield Town 189 | EPL,12/26/2018, 15:00,Southampton V West Ham United,Southampton,West Ham United 190 | EPL,12/26/2018, 15:00,Tottenham Hotspur V Bournemouth,Tottenham Hotspur,Bournemouth 191 | EPL,12/26/2018, 15:00,Watford V Chelsea,Watford,Chelsea 192 | EPL,12/29/2018, 15:00,Brighton and Hove Albion V Everton,Brighton and Hove Albion,Everton 193 | EPL,12/29/2018, 15:00,Burnley V West Ham United,Burnley,West Ham United 194 | EPL,12/29/2018, 15:00,Crystal Palace V Chelsea,Crystal Palace,Chelsea 195 | EPL,12/29/2018, 15:00,Fulham V Huddersfield Town,Fulham,Huddersfield Town 196 | EPL,12/29/2018, 15:00,Leicester City V Cardiff City,Leicester City,Cardiff City 197 | EPL,12/29/2018, 15:00,Liverpool V Arsenal,Liverpool,Arsenal 198 | EPL,12/29/2018, 15:00,Manchester United V Bournemouth,Manchester United,Bournemouth 199 | EPL,12/29/2018, 15:00,Southampton V Manchester City,Southampton,Manchester City 200 | EPL,12/29/2018, 15:00,Tottenham Hotspur V Wolverhampton Wanderers,Tottenham Hotspur,Wolverhampton Wanderers 201 | EPL,12/29/2018, 15:00,Watford V Newcastle United,Watford,Newcastle United 202 | EPL,1/1/2019, 15:00,Arsenal V Fulham,Arsenal,Fulham 203 | EPL,1/1/2019, 15:00,Bournemouth V Watford,Bournemouth,Watford 204 | EPL,1/1/2019, 15:00,Cardiff City V Tottenham Hotspur,Cardiff City,Tottenham Hotspur 205 | EPL,1/1/2019, 15:00,Chelsea V Southampton,Chelsea,Southampton 206 | EPL,1/1/2019, 15:00,Everton V Leicester City,Everton,Leicester City 207 | EPL,1/1/2019, 15:00,Huddersfield Town V Burnley,Huddersfield Town,Burnley 208 | EPL,1/1/2019, 15:00,Manchester City V Liverpool,Manchester City,Liverpool 209 | EPL,1/1/2019, 15:00,Newcastle United V Manchester United,Newcastle United,Manchester United 210 | EPL,1/1/2019, 15:00,West Ham United V Brighton and Hove Albion,West Ham United,Brighton and Hove Albion 211 | EPL,1/1/2019, 15:00,Wolverhampton Wanderers V Crystal Palace,Wolverhampton Wanderers,Crystal Palace 212 | EPL,1/12/2019, 15:00,Brighton and Hove Albion V Liverpool,Brighton and Hove Albion,Liverpool 213 | EPL,1/12/2019, 15:00,Burnley V Fulham,Burnley,Fulham 214 | EPL,1/12/2019, 15:00,Cardiff City V Huddersfield Town,Cardiff City,Huddersfield Town 215 | EPL,1/12/2019, 15:00,Chelsea V Newcastle United,Chelsea,Newcastle United 216 | EPL,1/12/2019, 15:00,Crystal Palace V Watford,Crystal Palace,Watford 217 | EPL,1/12/2019, 15:00,Everton V Bournemouth,Everton,Bournemouth 218 | EPL,1/12/2019, 15:00,Leicester City V Southampton,Leicester City,Southampton 219 | EPL,1/12/2019, 15:00,Manchester City V Wolverhampton Wanderers,Manchester City,Wolverhampton Wanderers 220 | EPL,1/12/2019, 15:00,Tottenham Hotspur V Manchester United,Tottenham Hotspur,Manchester United 221 | EPL,1/12/2019, 15:00,West Ham United V Arsenal,West Ham United,Arsenal 222 | EPL,1/19/2019, 15:00,Arsenal V Chelsea,Arsenal,Chelsea 223 | EPL,1/19/2019, 15:00,Bournemouth V West Ham United,Bournemouth,West Ham United 224 | EPL,1/19/2019, 15:00,Fulham V Tottenham Hotspur,Fulham,Tottenham Hotspur 225 | EPL,1/19/2019, 15:00,Huddersfield Town V Manchester City,Huddersfield Town,Manchester City 226 | EPL,1/19/2019, 15:00,Liverpool V Crystal Palace,Liverpool,Crystal Palace 227 | EPL,1/19/2019, 15:00,Manchester United V Brighton and Hove Albion,Manchester United,Brighton and Hove Albion 228 | EPL,1/19/2019, 15:00,Newcastle United V Cardiff City,Newcastle United,Cardiff City 229 | EPL,1/19/2019, 15:00,Southampton V Everton,Southampton,Everton 230 | EPL,1/19/2019, 15:00,Watford V Burnley,Watford,Burnley 231 | EPL,1/19/2019, 15:00,Wolverhampton Wanderers V Leicester City,Wolverhampton Wanderers,Leicester City 232 | EPL,1/29/2019, 19:45,Arsenal V Cardiff City,Arsenal,Cardiff City 233 | EPL,1/29/2019, 19:45,Bournemouth V Chelsea,Bournemouth,Chelsea 234 | EPL,1/29/2019, 19:45,Fulham V Brighton and Hove Albion,Fulham,Brighton and Hove Albion 235 | EPL,1/29/2019, 19:45,Huddersfield Town V Everton,Huddersfield Town,Everton 236 | EPL,1/29/2019, 19:45,Wolverhampton Wanderers V West Ham United,Wolverhampton Wanderers,West Ham United 237 | EPL,1/29/2019, 20:00,Manchester United V Burnley,Manchester United,Burnley 238 | EPL,1/30/2019, 19:45,Newcastle United V Manchester City,Newcastle United,Manchester City 239 | EPL,1/30/2019, 19:45,Southampton V Crystal Palace,Southampton,Crystal Palace 240 | EPL,1/30/2019, 20:00,Liverpool V Leicester City,Liverpool,Leicester City 241 | EPL,1/30/2019, 20:00,Tottenham Hotspur V Watford,Tottenham Hotspur,Watford 242 | EPL,2/2/2019, 15:00,Brighton and Hove Albion V Watford,Brighton and Hove Albion,Watford 243 | EPL,2/2/2019, 15:00,Burnley V Southampton,Burnley,Southampton 244 | EPL,2/2/2019, 15:00,Cardiff City V Bournemouth,Cardiff City,Bournemouth 245 | EPL,2/2/2019, 15:00,Chelsea V Huddersfield Town,Chelsea,Huddersfield Town 246 | EPL,2/2/2019, 15:00,Crystal Palace V Fulham,Crystal Palace,Fulham 247 | EPL,2/2/2019, 15:00,Everton V Wolverhampton Wanderers,Everton,Wolverhampton Wanderers 248 | EPL,2/2/2019, 15:00,Leicester City V Manchester United,Leicester City,Manchester United 249 | EPL,2/2/2019, 15:00,Manchester City V Arsenal,Manchester City,Arsenal 250 | EPL,2/2/2019, 15:00,Tottenham Hotspur V Newcastle United,Tottenham Hotspur,Newcastle United 251 | EPL,2/2/2019, 15:00,West Ham United V Liverpool,West Ham United,Liverpool 252 | EPL,2/9/2019, 15:00,Brighton and Hove Albion V Burnley,Brighton and Hove Albion,Burnley 253 | EPL,2/9/2019, 15:00,Crystal Palace V West Ham United,Crystal Palace,West Ham United 254 | EPL,2/9/2019, 15:00,Fulham V Manchester United,Fulham,Manchester United 255 | EPL,2/9/2019, 15:00,Huddersfield Town V Arsenal,Huddersfield Town,Arsenal 256 | EPL,2/9/2019, 15:00,Liverpool V Bournemouth,Liverpool,Bournemouth 257 | EPL,2/9/2019, 15:00,Manchester City V Chelsea,Manchester City,Chelsea 258 | EPL,2/9/2019, 15:00,Southampton V Cardiff City,Southampton,Cardiff City 259 | EPL,2/9/2019, 15:00,Tottenham Hotspur V Leicester City,Tottenham Hotspur,Leicester City 260 | EPL,2/9/2019, 15:00,Watford V Everton,Watford,Everton 261 | EPL,2/9/2019, 15:00,Wolverhampton Wanderers V Newcastle United,Wolverhampton Wanderers,Newcastle United 262 | EPL,2/23/2019, 15:00,Arsenal V Southampton,Arsenal,Southampton 263 | EPL,2/23/2019, 15:00,Bournemouth V Wolverhampton Wanderers,Bournemouth,Wolverhampton Wanderers 264 | EPL,2/23/2019, 15:00,Burnley V Tottenham Hotspur,Burnley,Tottenham Hotspur 265 | EPL,2/23/2019, 15:00,Cardiff City V Watford,Cardiff City,Watford 266 | EPL,2/23/2019, 15:00,Chelsea V Brighton and Hove Albion,Chelsea,Brighton and Hove Albion 267 | EPL,2/23/2019, 15:00,Everton V Manchester City,Everton,Manchester City 268 | EPL,2/23/2019, 15:00,Leicester City V Crystal Palace,Leicester City,Crystal Palace 269 | EPL,2/23/2019, 15:00,Manchester United V Liverpool,Manchester United,Liverpool 270 | EPL,2/23/2019, 15:00,Newcastle United V Huddersfield Town,Newcastle United,Huddersfield Town 271 | EPL,2/23/2019, 15:00,West Ham United V Fulham,West Ham United,Fulham 272 | EPL,2/26/2019, 19:45,Arsenal V Bournemouth,Arsenal,Bournemouth 273 | EPL,2/26/2019, 19:45,Cardiff City V Everton,Cardiff City,Everton 274 | EPL,2/26/2019, 19:45,Huddersfield Town V Wolverhampton Wanderers,Huddersfield Town,Wolverhampton Wanderers 275 | EPL,2/26/2019, 19:45,Leicester City V Brighton and Hove Albion,Leicester City,Brighton and Hove Albion 276 | EPL,2/26/2019, 20:00,Crystal Palace V Manchester United,Crystal Palace,Manchester United 277 | EPL,2/27/2019, 19:45,Chelsea V Tottenham Hotspur,Chelsea,Tottenham Hotspur 278 | EPL,2/27/2019, 19:45,Newcastle United V Burnley,Newcastle United,Burnley 279 | EPL,2/27/2019, 19:45,Southampton V Fulham,Southampton,Fulham 280 | EPL,2/27/2019, 20:00,Liverpool V Watford,Liverpool,Watford 281 | EPL,2/27/2019, 20:00,Manchester City V West Ham United,Manchester City,West Ham United 282 | EPL,3/2/2019, 15:00,Bournemouth V Manchester City,Bournemouth,Manchester City 283 | EPL,3/2/2019, 15:00,Brighton and Hove Albion V Huddersfield Town,Brighton and Hove Albion,Huddersfield Town 284 | EPL,3/2/2019, 15:00,Burnley V Crystal Palace,Burnley,Crystal Palace 285 | EPL,3/2/2019, 15:00,Everton V Liverpool,Everton,Liverpool 286 | EPL,3/2/2019, 15:00,Fulham V Chelsea,Fulham,Chelsea 287 | EPL,3/2/2019, 15:00,Manchester United V Southampton,Manchester United,Southampton 288 | EPL,3/2/2019, 15:00,Tottenham Hotspur V Arsenal,Tottenham Hotspur,Arsenal 289 | EPL,3/2/2019, 15:00,Watford V Leicester City,Watford,Leicester City 290 | EPL,3/2/2019, 15:00,West Ham United V Newcastle United,West Ham United,Newcastle United 291 | EPL,3/2/2019, 15:00,Wolverhampton Wanderers V Cardiff City,Wolverhampton Wanderers,Cardiff City 292 | EPL,3/9/2019, 15:00,Arsenal V Manchester United,Arsenal,Manchester United 293 | EPL,3/9/2019, 15:00,Cardiff City V West Ham United,Cardiff City,West Ham United 294 | EPL,3/9/2019, 15:00,Chelsea V Wolverhampton Wanderers,Chelsea,Wolverhampton Wanderers 295 | EPL,3/9/2019, 15:00,Crystal Palace V Brighton and Hove Albion,Crystal Palace,Brighton and Hove Albion 296 | EPL,3/9/2019, 15:00,Huddersfield Town V Bournemouth,Huddersfield Town,Bournemouth 297 | EPL,3/9/2019, 15:00,Leicester City V Fulham,Leicester City,Fulham 298 | EPL,3/9/2019, 15:00,Liverpool V Burnley,Liverpool,Burnley 299 | EPL,3/9/2019, 15:00,Manchester City V Watford,Manchester City,Watford 300 | EPL,3/9/2019, 15:00,Newcastle United V Everton,Newcastle United,Everton 301 | EPL,3/9/2019, 15:00,Southampton V Tottenham Hotspur,Southampton,Tottenham Hotspur 302 | EPL,3/16/2019, 15:00,Bournemouth V Newcastle United,Bournemouth,Newcastle United 303 | EPL,3/16/2019, 15:00,Brighton and Hove Albion V Cardiff City,Brighton and Hove Albion,Cardiff City 304 | EPL,3/16/2019, 15:00,Burnley V Leicester City,Burnley,Leicester City 305 | EPL,3/16/2019, 15:00,Everton V Chelsea,Everton,Chelsea 306 | EPL,3/16/2019, 15:00,Fulham V Liverpool,Fulham,Liverpool 307 | EPL,3/16/2019, 15:00,Manchester United V Manchester City,Manchester United,Manchester City 308 | EPL,3/16/2019, 15:00,Tottenham Hotspur V Crystal Palace,Tottenham Hotspur,Crystal Palace 309 | EPL,3/16/2019, 15:00,Watford V Southampton,Watford,Southampton 310 | EPL,3/16/2019, 15:00,West Ham United V Huddersfield Town,West Ham United,Huddersfield Town 311 | EPL,3/16/2019, 15:00,Wolverhampton Wanderers V Arsenal,Wolverhampton Wanderers,Arsenal 312 | EPL,3/30/2019, 15:00,Arsenal V Newcastle United,Arsenal,Newcastle United 313 | EPL,3/30/2019, 15:00,Brighton and Hove Albion V Southampton,Brighton and Hove Albion,Southampton 314 | EPL,3/30/2019, 15:00,Burnley V Wolverhampton Wanderers,Burnley,Wolverhampton Wanderers 315 | EPL,3/30/2019, 15:00,Cardiff City V Chelsea,Cardiff City,Chelsea 316 | EPL,3/30/2019, 15:00,Crystal Palace V Huddersfield Town,Crystal Palace,Huddersfield Town 317 | EPL,3/30/2019, 15:00,Fulham V Manchester City,Fulham,Manchester City 318 | EPL,3/30/2019, 15:00,Leicester City V Bournemouth,Leicester City,Bournemouth 319 | EPL,3/30/2019, 15:00,Liverpool V Tottenham Hotspur,Liverpool,Tottenham Hotspur 320 | EPL,3/30/2019, 15:00,Manchester United V Watford,Manchester United,Watford 321 | EPL,3/30/2019, 15:00,West Ham United V Everton,West Ham United,Everton 322 | EPL,4/6/2019, 15:00,Bournemouth V Burnley,Bournemouth,Burnley 323 | EPL,4/6/2019, 15:00,Chelsea V West Ham United,Chelsea,West Ham United 324 | EPL,4/6/2019, 15:00,Everton V Arsenal,Everton,Arsenal 325 | EPL,4/6/2019, 15:00,Huddersfield Town V Leicester City,Huddersfield Town,Leicester City 326 | EPL,4/6/2019, 15:00,Manchester City V Cardiff City,Manchester City,Cardiff City 327 | EPL,4/6/2019, 15:00,Newcastle United V Crystal Palace,Newcastle United,Crystal Palace 328 | EPL,4/6/2019, 15:00,Southampton V Liverpool,Southampton,Liverpool 329 | EPL,4/6/2019, 15:00,Tottenham Hotspur V Brighton and Hove Albion,Tottenham Hotspur,Brighton and Hove Albion 330 | EPL,4/6/2019, 15:00,Watford V Fulham,Watford,Fulham 331 | EPL,4/6/2019, 15:00,Wolverhampton Wanderers V Manchester United,Wolverhampton Wanderers,Manchester United 332 | EPL,4/13/2019, 15:00,Brighton and Hove Albion V Bournemouth,Brighton and Hove Albion,Bournemouth 333 | EPL,4/13/2019, 15:00,Burnley V Cardiff City,Burnley,Cardiff City 334 | EPL,4/13/2019, 15:00,Crystal Palace V Manchester City,Crystal Palace,Manchester City 335 | EPL,4/13/2019, 15:00,Fulham V Everton,Fulham,Everton 336 | EPL,4/13/2019, 15:00,Leicester City V Newcastle United,Leicester City,Newcastle United 337 | EPL,4/13/2019, 15:00,Liverpool V Chelsea,Liverpool,Chelsea 338 | EPL,4/13/2019, 15:00,Manchester United V West Ham United,Manchester United,West Ham United 339 | EPL,4/13/2019, 15:00,Southampton V Wolverhampton Wanderers,Southampton,Wolverhampton Wanderers 340 | EPL,4/13/2019, 15:00,Tottenham Hotspur V Huddersfield Town,Tottenham Hotspur,Huddersfield Town 341 | EPL,4/13/2019, 15:00,Watford V Arsenal,Watford,Arsenal 342 | EPL,4/20/2019, 15:00,Arsenal V Crystal Palace,Arsenal,Crystal Palace 343 | EPL,4/20/2019, 15:00,Bournemouth V Fulham,Bournemouth,Fulham 344 | EPL,4/20/2019, 15:00,Cardiff City V Liverpool,Cardiff City,Liverpool 345 | EPL,4/20/2019, 15:00,Chelsea V Burnley,Chelsea,Burnley 346 | EPL,4/20/2019, 15:00,Everton V Manchester United,Everton,Manchester United 347 | EPL,4/20/2019, 15:00,Huddersfield Town V Watford,Huddersfield Town,Watford 348 | EPL,4/20/2019, 15:00,Manchester City V Tottenham Hotspur,Manchester City,Tottenham Hotspur 349 | EPL,4/20/2019, 15:00,Newcastle United V Southampton,Newcastle United,Southampton 350 | EPL,4/20/2019, 15:00,West Ham United V Leicester City,West Ham United,Leicester City 351 | EPL,4/20/2019, 15:00,Wolverhampton Wanderers V Brighton and Hove Albion,Wolverhampton Wanderers,Brighton and Hove Albion 352 | EPL,4/27/2019, 15:00,Brighton and Hove Albion V Newcastle United,Brighton and Hove Albion,Newcastle United 353 | EPL,4/27/2019, 15:00,Burnley V Manchester City,Burnley,Manchester City 354 | EPL,4/27/2019, 15:00,Crystal Palace V Everton,Crystal Palace,Everton 355 | EPL,4/27/2019, 15:00,Fulham V Cardiff City,Fulham,Cardiff City 356 | EPL,4/27/2019, 15:00,Leicester City V Arsenal,Leicester City,Arsenal 357 | EPL,4/27/2019, 15:00,Liverpool V Huddersfield Town,Liverpool,Huddersfield Town 358 | EPL,4/27/2019, 15:00,Manchester United V Chelsea,Manchester United,Chelsea 359 | EPL,4/27/2019, 15:00,Southampton V Bournemouth,Southampton,Bournemouth 360 | EPL,4/27/2019, 15:00,Tottenham Hotspur V West Ham United,Tottenham Hotspur,West Ham United 361 | EPL,4/27/2019, 15:00,Watford V Wolverhampton Wanderers,Watford,Wolverhampton Wanderers 362 | EPL,5/4/2019, 15:00,Arsenal V Brighton and Hove Albion,Arsenal,Brighton and Hove Albion 363 | EPL,5/4/2019, 15:00,Bournemouth V Tottenham Hotspur,Bournemouth,Tottenham Hotspur 364 | EPL,5/4/2019, 15:00,Cardiff City V Crystal Palace,Cardiff City,Crystal Palace 365 | EPL,5/4/2019, 15:00,Chelsea V Watford,Chelsea,Watford 366 | EPL,5/4/2019, 15:00,Everton V Burnley,Everton,Burnley 367 | EPL,5/4/2019, 15:00,Huddersfield Town V Manchester United,Huddersfield Town,Manchester United 368 | EPL,5/4/2019, 15:00,Manchester City V Leicester City,Manchester City,Leicester City 369 | EPL,5/4/2019, 15:00,Newcastle United V Liverpool,Newcastle United,Liverpool 370 | EPL,5/4/2019, 15:00,West Ham United V Southampton,West Ham United,Southampton 371 | EPL,5/4/2019, 15:00,Wolverhampton Wanderers V Fulham,Wolverhampton Wanderers,Fulham 372 | EPL,5/12/2019, 15:00,Brighton and Hove Albion V Manchester City,Brighton and Hove Albion,Manchester City 373 | EPL,5/12/2019, 15:00,Burnley V Arsenal,Burnley,Arsenal 374 | EPL,5/12/2019, 15:00,Crystal Palace V Bournemouth,Crystal Palace,Bournemouth 375 | EPL,5/12/2019, 15:00,Fulham V Newcastle United,Fulham,Newcastle United 376 | EPL,5/12/2019, 15:00,Leicester City V Chelsea,Leicester City,Chelsea 377 | EPL,5/12/2019, 15:00,Liverpool V Wolverhampton Wanderers,Liverpool,Wolverhampton Wanderers 378 | EPL,5/12/2019, 15:00,Manchester United V Cardiff City,Manchester United,Cardiff City 379 | EPL,5/12/2019, 15:00,Southampton V Huddersfield Town,Southampton,Huddersfield Town 380 | EPL,5/12/2019, 15:00,Tottenham Hotspur V Everton,Tottenham Hotspur,Everton 381 | EPL,5/12/2019, 15:00,Watford V West Ham United,Watford,West Ham United 382 | -------------------------------------------------------------------------------- /dog_home.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/dog_home.png -------------------------------------------------------------------------------- /images/bayesian.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/bayesian.png -------------------------------------------------------------------------------- /images/dog_home.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/dog_home.png -------------------------------------------------------------------------------- /images/epl.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/epl.png -------------------------------------------------------------------------------- /images/fireworks.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/fireworks.gif -------------------------------------------------------------------------------- /images/messi-scribble.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/messi-scribble.png -------------------------------------------------------------------------------- /images/network.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/network.png -------------------------------------------------------------------------------- /images/paul.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/paul.png -------------------------------------------------------------------------------- /images/scrible-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/scrible-test.png -------------------------------------------------------------------------------- /images/selfie.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/selfie.png -------------------------------------------------------------------------------- /images/selfie2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/selfie2.png -------------------------------------------------------------------------------- /images/test.R: -------------------------------------------------------------------------------- 1 | # test file 2 | -------------------------------------------------------------------------------- /llm_bots/animeyourself/anime_yourself.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This bot uses all options provided by the Poe protocol. 4 | 5 | """ 6 | from __future__ import annotations 7 | 8 | import asyncio 9 | import json 10 | from typing import AsyncIterable 11 | 12 | from fastapi_poe import PoeBot, run 13 | from fastapi_poe.types import ( 14 | ContentType, 15 | QueryRequest, 16 | ReportFeedbackRequest, 17 | SettingsRequest, 18 | SettingsResponse, 19 | ) 20 | import os 21 | import replicate 22 | import re 23 | import random 24 | import textwrap 25 | import asyncio 26 | from sse_starlette.sse import ServerSentEvent 27 | 28 | SETTINGS = SettingsResponse( 29 | context_clear_window_secs=60 * 60, allow_user_context_clear=True 30 | ) 31 | 32 | MODEL_URL = "mcai/dreamshaper-v6-img2img:c7959eb3a86c09b449dacc11ce8bba295fda466fc6935ab8709e35f4f48c980c" 33 | 34 | _WAIT_TIMEOUT_S = 1 35 | 36 | 37 | def parse_text(txt): 38 | # Define a regular expression to match the fields and their values 39 | pattern = re.compile(r'(image|prompt):\s*"([^"]+)"') 40 | matches = pattern.findall(txt) 41 | result = {} 42 | 43 | for field, value in matches: 44 | result[field] = value 45 | return result 46 | 47 | core_positive_prompt = """ 48 | portrait closeup, best quality, intricately detailed, 49 | moe manga style, finely detailed features perfect art, 50 | professional majestic impressionism oil painting by Waterhouse, 51 | John Constable, Ed Blinkey, Atey Ghailan, Studio Ghibli, 52 | by Jeremy Mann, Greg Manchess, Antonio Moro, trending on ArtStation, 53 | trending on CGSociety, cinematic lighting, hand drawn, hand colored. 54 | """ 55 | 56 | alternative_prompt = """ 57 | portrait closeup, best quality, 58 | moe manga style, finely detailed features perfect art, 59 | anime style, 8k, artwork in the style of guweiz, 60 | cinematic lighting, hand drawn, hand colored. 61 | """ 62 | 63 | negative_prompt = """ 64 | disfigured, kitsch, ugly, oversaturated, greain, low-res, deformed, blurry, bad anatomy, 65 | poorly drawn face, mutation, mutated, extra limb, missing limb, 66 | floating limbs, disconnected limbs, malformed hands, extra fingers, poorly drawn hands, 67 | """ 68 | 69 | def error_message(): 70 | msg = textwrap.dedent(f""" 71 | Sorry, I cannot parse your input. Please try again and make sure your input has the format: 72 | 73 | ```python 74 | image: "" 75 | prompt: (Optional) "" #no worry, we will generate an anime of you to start 76 | ``` 77 | 78 | """ 79 | ) 80 | return msg 81 | 82 | def _get_complete_message(second, input_url, output_url): 83 | _COMPLETE_MESSAGE = f""" 84 | Completed! (took {second}s) 85 | 86 | This is you: 87 | 88 | ![]({input_url}) 89 | 90 | This is the anime version of yourself. 91 | 92 | ![]({output_url}) 93 | 94 | """ 95 | return textwrap.dedent(_COMPLETE_MESSAGE) 96 | 97 | class AnimeYourself(PoeBot): 98 | async def get_response(self, query: QueryRequest) -> AsyncIterable[ServerSentEvent]: 99 | """Return an async iterator of events to send to the user.""" 100 | last_message = query.query[-1].content.lower() 101 | response_content_type: ContentType = ("text/markdown") 102 | yield self.meta_event( 103 | content_type=response_content_type, 104 | linkify=False, 105 | refetch_settings=False, 106 | suggested_replies=False, 107 | ) 108 | 109 | input_dict = parse_text(last_message) 110 | if "image" not in input_dict: 111 | yield self.text_event(error_message()) 112 | else: 113 | ### call the model to get results: 114 | input_prompt = "" if 'prompt' not in input_dict else input_dict['prompt'] 115 | 116 | generated_image_task = asyncio.create_task( 117 | self._generate_image( 118 | image_url = input_dict['image'], 119 | prompt = "mksks style," + input_prompt + "," + alternative_prompt 120 | ) 121 | ) 122 | 123 | i = 0 124 | while True: 125 | done, _ = await asyncio.wait( 126 | [generated_image_task], timeout=_WAIT_TIMEOUT_S 127 | ) 128 | if done: 129 | output = done.pop().result() 130 | break 131 | yield self.replace_response_event(f"Generating your image: {i}s elapsed...") 132 | i += 1 133 | 134 | if len(output) != 1: 135 | yield self.replace_response_event( 136 | textwrap.dedent( 137 | f""" 138 | 139 | Sorry, something seems to go wrong. 140 | 141 | Please don't blame the developer. He's trying ᕕ( ᐛ )ᕗ. 142 | 143 | But he does want you to know that you look amazing who you are. 144 | 145 | ![]({input_dict['image']}) 146 | """ 147 | ) 148 | ) 149 | else: 150 | yield self.replace_response_event( 151 | textwrap.dedent( 152 | _get_complete_message( 153 | second = i, 154 | input_url = input_dict['image'], 155 | output_url = output[0]) 156 | ) 157 | ) 158 | 159 | async def _generate_image(self, image_url: str, prompt: str): 160 | loop = asyncio.get_running_loop() 161 | output = await loop.run_in_executor( 162 | None, 163 | lambda: replicate.run( 164 | MODEL_URL, 165 | input={ 166 | "image": image_url, 167 | "prompt": prompt, 168 | "negative_prompt": negative_prompt, 169 | "num_inference_steps": 50, 170 | } 171 | ) 172 | ) 173 | return output 174 | 175 | async def on_feedback(self, feedback: ReportFeedbackRequest) -> None: 176 | """Called when we receive user feedback such as likes.""" 177 | print( 178 | f"User {feedback.user_id} gave feedback on {feedback.conversation_id}" 179 | f"message {feedback.message_id}: {feedback.feedback_type}" 180 | ) 181 | 182 | async def get_settings(self, settings: SettingsRequest) -> SettingsResponse: 183 | """Return the settings for this bot.""" 184 | return SETTINGS 185 | 186 | 187 | if __name__ == "__main__": 188 | run(AnimeYourself()) -------------------------------------------------------------------------------- /llm_bots/animeyourself/main.py: -------------------------------------------------------------------------------- 1 | from fastapi_poe import make_app 2 | import modal 3 | from modal import Image, Stub, asgi_app 4 | from anime_yourself import AnimeYourself 5 | import os 6 | 7 | # specific to hosting with modal.com 8 | image = Image.debian_slim().pip_install_from_requirements( 9 | "requirements.txt" 10 | ) 11 | stub = Stub("animeyourself") 12 | 13 | 14 | @stub.function(image=image, secret=modal.Secret.from_name("my-replicate-key")) 15 | @asgi_app() 16 | def fastapi_app(): 17 | bot = AnimeYourself() 18 | app = make_app(bot, allow_without_key=True) 19 | return app 20 | 21 | -------------------------------------------------------------------------------- /llm_bots/animeyourself/readme.md: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /llm_bots/animeyourself/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi-poe==0.0.14 2 | replicate 3 | -------------------------------------------------------------------------------- /llm_bots/scribble2img/README.md: -------------------------------------------------------------------------------- 1 | # Poe API Bot tutorial 2 | 3 | This is the companion repository to the Poe API bot 4 | [quick start](https://developer.poe.com/api-bots/quick-start). Please follow that guide 5 | for instructions on how to use this repository. 6 | -------------------------------------------------------------------------------- /llm_bots/scribble2img/main.py: -------------------------------------------------------------------------------- 1 | from fastapi_poe import make_app 2 | import modal 3 | from modal import Image, Stub, asgi_app 4 | from scribble2image import Scribble2ImageBot 5 | import os 6 | 7 | # specific to hosting with modal.com 8 | image = Image.debian_slim().pip_install_from_requirements( 9 | "requirements.txt" 10 | ) 11 | stub = Stub("scribble2image") 12 | 13 | 14 | @stub.function(image=image, secret=modal.Secret.from_name("my-replicate-key")) 15 | @asgi_app() 16 | def fastapi_app(): 17 | bot = Scribble2ImageBot() 18 | app = make_app(bot, allow_without_key=True) 19 | return app 20 | -------------------------------------------------------------------------------- /llm_bots/scribble2img/requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi-poe==0.0.14 2 | replicate==0.8.3 3 | -------------------------------------------------------------------------------- /llm_bots/scribble2img/scribble2image.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | This bot uses all options provided by the Poe protocol. You can use it to get examples 4 | of all the protocol has to offer. 5 | 6 | """ 7 | from __future__ import annotations 8 | 9 | import asyncio 10 | import json 11 | from typing import AsyncIterable 12 | 13 | from fastapi_poe import PoeBot, run 14 | from fastapi_poe.types import ( 15 | ContentType, 16 | QueryRequest, 17 | ReportFeedbackRequest, 18 | SettingsRequest, 19 | SettingsResponse, 20 | ) 21 | import os 22 | import replicate 23 | import re 24 | import random 25 | import textwrap 26 | import asyncio 27 | from sse_starlette.sse import ServerSentEvent 28 | 29 | SETTINGS = SettingsResponse( 30 | context_clear_window_secs=60 * 60, allow_user_context_clear=True 31 | ) 32 | 33 | MODEL_URL = "jagilley/controlnet-scribble:435061a1b5a4c1e26740464bf786efdfa9cb3a3ac488595a2de23e143fdb0117" 34 | 35 | _WAIT_TIMEOUT_S = 1 36 | 37 | encouraging_msgs = [ 38 | "The developer loves your scribble 😍", 39 | "With the drawing skill like this, do you even need this bot?", 40 | "Wow! Your scribbling skill is hella strong", 41 | "Interesting interesting! I gotcha", 42 | "Your doodle is simply world class. Let me see what else I can add." 43 | ] 44 | 45 | def _get_complete_message(second, encouraging_msg, input_url, output_url): 46 | _COMPLETE_MESSAGE = f""" 47 | Completed! (took {second}s) 48 | 49 | This is the original. {encouraging_msg}: 50 | 51 | ![]({input_url}) 52 | 53 | This is your scribble brought to life: 54 | 55 | ![]({output_url}) 56 | 57 | """ 58 | return _COMPLETE_MESSAGE 59 | 60 | 61 | def parse_text(txt): 62 | # Define a regular expression to match the fields and their values 63 | pattern = re.compile(r'(image|prompt):\s*"([^"]+)"') 64 | matches = pattern.findall(txt) 65 | result = {} 66 | 67 | for field, value in matches: 68 | result[field] = value 69 | return result 70 | 71 | def error_message(missing_image=True, image_url=None): 72 | missing_piece = "image" if missing_image else "prompt" 73 | if image_url: 74 | additional_txt = f""" 75 | 76 | But I just wanna say that I love your scribble. 77 | 78 | ![]({image_url}) 79 | """ 80 | else: 81 | additional_txt = "" 82 | msg = textwrap.dedent(f""" 83 | Sorry, I cannot parse your {missing_piece}. Please try again and make sure your input has the format: 84 | 85 | ```python 86 | image: "" 87 | prompt: "" 88 | ``` 89 | 90 | {additional_txt} 91 | """ 92 | ) 93 | return msg 94 | 95 | class Scribble2ImageBot(PoeBot): 96 | 97 | async def get_response(self, query: QueryRequest) -> AsyncIterable[ServerSentEvent]: 98 | """Return an async iterator of events to send to the user.""" 99 | last_message = query.query[-1].content.lower() 100 | response_content_type: ContentType = ("text/markdown") 101 | yield self.meta_event( 102 | content_type=response_content_type, 103 | linkify=False, 104 | refetch_settings=False, 105 | suggested_replies=False, 106 | ) 107 | 108 | input_dict = parse_text(last_message) 109 | if "image" not in input_dict: 110 | yield self.text_event(error_message()) 111 | elif "prompt" not in input_dict: 112 | yield self.text_event(error_message( 113 | missing_image=False, 114 | image_url = input_dict['image'] 115 | ) 116 | ) 117 | else: 118 | ### call the model to get results: 119 | generated_image_task = asyncio.create_task( 120 | self._generate_image(input_dict['image'], input_dict['prompt']) 121 | ) 122 | 123 | i = 0 124 | while True: 125 | done, _ = await asyncio.wait( 126 | [generated_image_task], timeout=_WAIT_TIMEOUT_S 127 | ) 128 | if done: 129 | output = done.pop().result() 130 | break 131 | yield self.replace_response_event(f"Generating your image: {i}s elapsed...") 132 | i += 1 133 | 134 | if len(output) != 2: 135 | yield self.replace_response_event( 136 | textwrap.dedent( 137 | f""" 138 | 139 | Sorry, something seems to go wrong. 140 | 141 | Please don't blame the developer. He's trying ᕕ( ᐛ )ᕗ. 142 | 143 | But he does want you to know that he loves your scribble. 144 | 145 | ![]({input_dict['image']}) 146 | """ 147 | ) 148 | ) 149 | else: 150 | yield self.replace_response_event( 151 | textwrap.dedent( 152 | _get_complete_message( 153 | second = i, 154 | encouraging_msg = random.choice(encouraging_msgs), 155 | input_url = input_dict['image'], 156 | output_url = output[1]) 157 | ) 158 | ) 159 | 160 | async def _generate_image(self, image_url: str, prompt: str): 161 | loop = asyncio.get_running_loop() 162 | output = await loop.run_in_executor( 163 | None, 164 | lambda: replicate.run( 165 | MODEL_URL, 166 | input={ 167 | "image": image_url, 168 | "prompt": prompt, 169 | } 170 | ) 171 | ) 172 | return output 173 | 174 | 175 | async def on_feedback(self, feedback: ReportFeedbackRequest) -> None: 176 | """Called when we receive user feedback such as likes.""" 177 | print( 178 | f"User {feedback.user_id} gave feedback on {feedback.conversation_id}" 179 | f"message {feedback.message_id}: {feedback.feedback_type}" 180 | ) 181 | 182 | async def get_settings(self, settings: SettingsRequest) -> SettingsResponse: 183 | """Return the settings for this bot.""" 184 | return SETTINGS 185 | 186 | 187 | if __name__ == "__main__": 188 | run(Scribble2ImageBot()) --------------------------------------------------------------------------------