├── Python
├── CollaborativeFiltering
│ ├── Goodreads_surprise.py
│ └── TrainingModule.py
├── fireworks.py
├── flights_networkx.py
├── football_visual.ipynb
├── image.jpg
├── images
│ └── networkx_basemap
│ │ ├── map_0.png
│ │ ├── map_1.png
│ │ ├── map_2.png
│ │ ├── map_3.png
│ │ ├── table_1.PNG
│ │ ├── table_2.PNG
│ │ ├── table_3.PNG
│ │ └── table_5.PNG
├── lincoln_estimate.py
├── mbappe.jpg
├── n_dimensionalNormal.py
└── optimal_dating.py
├── R
├── EPL
│ ├── Agg.png
│ ├── Last.png
│ ├── Misc
│ │ └── TeamEvaluate2015.R
│ ├── betting
│ │ ├── Portfolio-xkcd.png
│ │ ├── bet_strategy.R
│ │ ├── clean_data.R
│ │ └── prediction.R
│ ├── penalty
│ │ ├── Scraping.ipynb
│ │ └── penalty.R
│ ├── prediction
│ │ ├── clean_data.R
│ │ ├── match_simulate.R
│ │ ├── sim.R
│ │ └── visualize.R
│ └── xkcd.ttf
├── Paul_hypothesis_test.R
├── RuleOfThree.R
├── bayes_god.R
├── bayesian_gym.R
├── dating_sim.R
├── end_to_end_projects.R
└── lindy
│ ├── Inverse_Random_Sampling.pdf
│ └── lindy_simulation.R
├── README.md
├── data
├── Team2015season.csv
├── Vietnamese_2016.csv
├── all_games.csv
├── all_penalties.csv
├── fixtures.csv
├── history.csv
├── housing.csv
└── national_longitudinal_survey.csv
├── dog_home.png
├── images
├── bayesian.png
├── dog_home.png
├── epl.png
├── fireworks.gif
├── messi-scribble.png
├── network.png
├── paul.png
├── scrible-test.png
├── selfie.png
├── selfie2.png
└── test.R
└── llm_bots
├── animeyourself
├── anime_yourself.py
├── main.py
├── readme.md
└── requirements.txt
└── scribble2img
├── README.md
├── main.py
├── requirements.txt
└── scribble2image.py
/Python/CollaborativeFiltering/Goodreads_surprise.py:
--------------------------------------------------------------------------------
1 | """
2 | Code for Collaborative Filtering project
3 |
4 | Instead of using the module from TrainingModule.py,
5 | this script uses the surprise package, which is a lot more efficient
6 | in terms of sparse matrix handling.
7 | """
8 |
9 | import numpy as np
10 | import pandas as pd
11 | from surprise import SVD
12 | from surprise import Dataset
13 | from surprise import Reader
14 | from surprise import accuracy
15 | from surprise import BaselineOnly
16 | from surprise.model_selection import train_test_split
17 |
# Ratings table (about 6 million rows). user_id is cast to str so it has the
# same type as the hand-made user id added below.
df = pd.read_csv('/ratings.csv')
df['user_id'] = df['user_id'].astype(str)
df.head()

# Book metadata (about 10k books).
books = pd.read_csv('/books.csv')

# Map book_id -> title. itertuples() puts the index at position 0, so row[1]
# is the first CSV column and row[11] the eleventh.
# NOTE(review): assumes column 1 is the book id and column 11 the title --
# confirm against books.csv.
# (The original iterated over an undefined `new_books`; `books` is the only
# frame loaded from books.csv.)
id_to_name = {row[1]: row[11] for row in books.itertuples()}

# Add my own book ratings under a dedicated user id.
my_book_ids = [13, 119, 240, 283, 479, 1100, 2205, 2246, 3227, 7210, 5]
my_rating = {'user_id': [str(53425) for _ in my_book_ids],
             'book_id': my_book_ids,
             'rating': [5, 3, 4, 4, 4, 5, 2, 4, 4, 3, 3]}
print([id_to_name[book_id] for book_id in my_book_ids])

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# NOTE(review): the original appended an undefined `new_rating`; `df` is the
# only ratings frame in this script -- confirm that is what was meant.
full_rating = pd.concat([pd.DataFrame(my_rating), df])
37 |
38 |
# Wrap the ratings frame in surprise's Dataset; ratings are on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(full_rating[['user_id', 'book_id', 'rating']], reader)

# Hold out 10% of the ratings for testing. The training ratings are also
# converted back into a test set so the training error can be measured.
trainset, testset = train_test_split(data, test_size=.10)
train_eval = trainset.build_testset()

# Train a Funk SGD-SVD model for an increasing number of epochs and record
# the train/test MSE of each run for the learning curve.
epochs = [1, 5, 10, 20, 40, 80, 100, 120, 150]
train_mse = []
test_mse = []
for n_epoch in epochs:
    print("Number of epochs trained", n_epoch)
    algo = SVD(n_factors=40, lr_all=0.001, n_epochs=n_epoch)
    algo.fit(trainset)

    # Compute each metric exactly once (the original evaluated it three
    # times); verbose=False keeps accuracy.mse from printing its own line
    # in addition to the summary below.
    epoch_train_mse = accuracy.mse(algo.test(train_eval), verbose=False)
    epoch_test_mse = accuracy.mse(algo.test(testset), verbose=False)

    train_mse.append(epoch_train_mse)
    test_mse.append(epoch_test_mse)
    print(epoch_train_mse, epoch_test_mse)
60 |
def plot_learning_curve(iter_array, train_accuracy, test_accuracy, xlabel = 'iterations'):
    """Plot train and test MSE against the number of training iterations.

    Params
    ======
    iter_array : (list)
        x values, e.g. the number of epochs of each training run.
    train_accuracy : (list)
        Training MSE for each entry of iter_array.
    test_accuracy : (list)
        Test MSE for each entry of iter_array.
    xlabel : (str)
        Label of the x axis.
    """
    # Imported locally: this script never imports matplotlib at module
    # level, so the original body failed with a NameError on `plt`.
    import matplotlib.pyplot as plt

    plt.plot(iter_array, train_accuracy, label='Train mse', linewidth=5)
    plt.plot(iter_array, test_accuracy, label='Test mse', linewidth=5)

    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.xlabel(xlabel, fontsize=30)
    plt.ylabel('MSE', fontsize=30)
    plt.legend(loc='best', fontsize=20)
74 |
plot_learning_curve(epochs, train_mse, test_mse)

# Train on the full dataset before making predictions.
full_trainset = data.build_full_trainset()
algo = SVD(n_factors=40, lr_all=0.001, verbose=True, n_epochs=100)
algo.fit(full_trainset)

# Estimate a rating for every book id present in the ratings for user
# 53425, then print the titles of the ten highest estimates.
all_book_id = full_rating.book_id.unique()
top_n = [algo.predict(uid=str(53425), iid=book_id) for book_id in all_book_id]
top_n.sort(key=lambda pred: pred.est, reverse=True)
# The original line was missing its closing parenthesis (SyntaxError).
print([id_to_name[pred.iid] for pred in top_n[:10]])
89 |
--------------------------------------------------------------------------------
/Python/CollaborativeFiltering/TrainingModule.py:
--------------------------------------------------------------------------------
1 | %matplotlib inline
2 | import matplotlib.pyplot as plt
3 | import seaborn as sns
4 | sns.set()
5 | import numpy as np
6 | import pandas as pd
7 | import io
8 | from collections import defaultdict
9 | from sklearn.metrics import mean_squared_error
10 | from numpy.linalg import solve
11 | np.random.seed(0)
12 |
def create_rating_matrix(df):
    """Build a dense user x item rating matrix from a long-format frame.

    Params
    ======
    df : (DataFrame)
        Frame with columns user_id and item_id (used to size the matrix)
        whose first three columns are, in order, user id, item id and
        rating (cells are filled by column position).

    Returns
    =======
    ratings : (ndarray)
        n_users x n_items matrix; cells without a rating stay 0.

    Ids are used directly as 1-based row/column indices, so they must run
    from 1 to the number of distinct users/items; larger ids raise
    IndexError.
    """
    n_users = df.user_id.unique().shape[0]
    n_items = df.item_id.unique().shape[0]
    ratings = np.zeros((n_users, n_items))
    for row in df.itertuples():
        # row[0] is the index; ids are shifted by one to become 0-based.
        ratings[row[1] - 1, row[2] - 1] = row[3]
    # The original ended with a bare `ratings` expression and so returned None.
    return ratings
24 |
25 |
def calculate_sparsity(rating_matrix):
    """Return the percentage of cells in a 2D matrix that are non-zero.

    Note that, despite the name, the value grows with the number of
    ratings: a matrix with a rating in every cell yields 100.0.

    Raises ZeroDivisionError when the matrix has no cells.
    """
    n_cells = rating_matrix.shape[0] * rating_matrix.shape[1]
    # nonzero() returns one index array per axis; the length of the first
    # is the number of non-zero cells.
    n_rated = len(rating_matrix.nonzero()[0])
    sparsity = float(n_rated) * 100 / n_cells
    return sparsity
30 |
31 |
def train_test_split(ratings, pct):
    """Split a user x item rating matrix into disjoint train and test parts.

    For every user, int(pct * number of rated items) of that user's
    non-zero ratings are drawn at random (numpy global RNG, without
    replacement), moved into the test matrix and zeroed in the train
    matrix.

    Returns
    =======
    (train, test) : two ndarrays with the shape of `ratings`.
    """
    train = ratings.copy()
    test = np.zeros(ratings.shape)

    for user, user_row in enumerate(ratings):
        rated_items = user_row.nonzero()[0]
        n_held_out = int(len(rated_items) * pct)
        held_out = np.random.choice(rated_items, size=n_held_out, replace=False)
        test[user, held_out] = ratings[user, held_out]
        train[user, held_out] = 0.

    # Sanity check: no cell may carry a rating in both matrices.
    assert np.all(train * test == 0)
    return train, test
47 |
48 |
def get_mse(pred, actual):
    """Mean squared error restricted to the cells where `actual` is non-zero.

    `pred` and `actual` are indexed with the same non-zero positions of
    `actual`, so zero cells of `actual` do not contribute to the error.
    """
    observed = actual.nonzero()
    predicted_values = pred[observed].flatten()
    actual_values = actual[observed].flatten()
    return mean_squared_error(predicted_values, actual_values)
54 |
55 |
class AlternatingLeastSquareMF():
    """Matrix factorization fitted with alternating least squares (ALS).

    The rating matrix is approximated by user_vecs . item_vecs^T. Every
    cell of the matrix, zeros included, enters the least-squares fit.
    """

    def __init__(self,
                 ratings,
                 n_factors=40,
                 item_reg=0.0,
                 user_reg=0.0
                 ):
        """
        Train a matrix factorization model to predict empty
        entries in a matrix.

        Params
        ======
        ratings : (ndarray)
            User x Item matrix with corresponding ratings

        n_factors : (int)
            Number of latent factors (k) to use in model

        item_reg : (float)
            Regularization term for item latent factors

        user_reg : (float)
            Regularization term for user latent factors
        """

        self.ratings = ratings
        self.n_users, self.n_items = ratings.shape
        self.n_factors = n_factors
        self.item_reg = item_reg
        self.user_reg = user_reg

    def alternating_step(self,
                         latent_vectors,
                         fixed_vecs,
                         ratings,
                         _lambda,
                         type='user'):
        """
        One of the two ALS steps. Solve for the latent vectors
        specified by type while the other side (fixed_vecs) is held fixed.

        Params
        ======
        latent_vectors : (ndarray)
            Vectors to solve for; overwritten in place and returned.
        fixed_vecs : (ndarray)
            Latent vectors of the other side.
        ratings : (ndarray)
            User x Item rating matrix.
        _lambda : (float)
            Regularization strength added to the diagonal of the
            normal-equation matrix.
        type : (str)
            'user' to solve for user vectors, 'item' for item vectors.

        Raises
        ======
        ValueError
            If type is neither 'user' nor 'item' (the original silently
            returned the vectors unchanged).
        numpy.linalg.LinAlgError
            Raised by solve() when the regularized matrix is singular.
        """
        if type == 'user':
            # One right-hand side per user: row u is ratings[u, :] . fixed_vecs
            targets = ratings.dot(fixed_vecs)
        elif type == 'item':
            # One right-hand side per item: row i is ratings[:, i] . fixed_vecs
            targets = ratings.T.dot(fixed_vecs)
        else:
            raise ValueError("type must be 'user' or 'item', got %r" % (type,))

        gram = fixed_vecs.T.dot(fixed_vecs)
        lambdaI = np.eye(gram.shape[0]) * _lambda

        # Every row shares the same left-hand matrix, so all systems are
        # solved in a single call instead of one solve() per row.
        latent_vectors[:] = solve(gram + lambdaI, targets.T).T
        return latent_vectors

    def train(self, n_iter=10):
        """ Train model for n_iter iterations from scratch."""
        # initialize latent vectors
        self.user_vecs = np.random.random((self.n_users, self.n_factors))
        self.item_vecs = np.random.random((self.n_items, self.n_factors))

        for _ in range(n_iter):
            self.user_vecs = self.alternating_step(self.user_vecs,
                                                   self.item_vecs,
                                                   self.ratings,
                                                   self.user_reg,
                                                   type='user')
            self.item_vecs = self.alternating_step(self.item_vecs,
                                                   self.user_vecs,
                                                   self.ratings,
                                                   self.item_reg,
                                                   type='item')

    def predict_all(self):
        """ Predict ratings for every user and item. """
        # Same values as calling predict(u, i) for every cell, computed as
        # one matrix product.
        return self.user_vecs.dot(self.item_vecs.T)

    def predict(self, u, i):
        """ Single user and item prediction. """
        return self.user_vecs[u, :].dot(self.item_vecs[i, :].T)

    def calculate_learning_curve(self, iter_array, test):
        """
        Keep track of MSE as a function of training iterations.

        Params
        ======
        iter_array : (list)
            List of numbers of iterations to train for each step of
            the learning curve. e.g. [1, 5, 10, 20]
            The list is sorted in place, so the caller's list ends up in
            the same order as train_mse / test_mse.
        test : (2D ndarray)
            Testing dataset (assumed to be user x item).

        The function creates two new class attributes:

        train_mse : (list)
            Training data MSE values for each value of iter_array
        test_mse : (list)
            Test data MSE values for each value of iter_array
        """
        iter_array.sort()
        self.train_mse = []
        self.test_mse = []
        for n_iter in iter_array:
            # train() restarts from fresh random vectors on every call
            self.train(n_iter)
            predictions = self.predict_all()

            self.train_mse += [get_mse(predictions, self.ratings)]
            self.test_mse += [get_mse(predictions, test)]
            print('Train mse: ' + str(self.train_mse[-1]))
            print('Test mse: ' + str(self.test_mse[-1]))
182 |
class SGDMF():
    """Matrix factorization with bias terms, fitted by stochastic gradient descent.

    A rating is predicted as
    global_bias + user_bias[u] + item_bias[i] + user_vecs[u] . item_vecs[i].
    Only the non-zero cells of the rating matrix are used as training samples.
    """

    def __init__(self,
                 ratings,
                 n_factors=40,
                 item_fact_reg=0.0,
                 user_fact_reg=0.0,
                 item_bias_reg=0.0,
                 user_bias_reg=0.0,
                 verbose=False
                 ):
        """
        Train an SGD matrix factorization model to predict empty
        entries in a matrix.

        Params
        ======
        ratings : (ndarray)
            User x Item matrix with corresponding ratings
        n_factors : (int)
            Number of latent factors to use in model
        item_fact_reg, user_fact_reg : (float)
            Regularization terms for item / user latent factors
        item_bias_reg, user_bias_reg : (float)
            Regularization terms for item / user biases
        verbose : (bool)
            Stored on the instance; no method of this class reads it.
        """

        self.ratings = ratings
        self.n_users, self.n_items = ratings.shape
        self.n_factors = n_factors
        self.item_fact_reg = item_fact_reg
        self.user_fact_reg = user_fact_reg
        self.item_bias_reg = item_bias_reg
        self.user_bias_reg = user_bias_reg
        # The original accepted `verbose` but dropped it.
        self.verbose = verbose
        # Coordinates of the observed (non-zero) ratings = training samples.
        self.sample_row, self.sample_col = self.ratings.nonzero()
        self.n_samples = len(self.sample_row)

    def sgd(self):
        """Run one pass over the samples in the order of self.training_indices."""
        for idx in self.training_indices:
            u = self.sample_row[idx]
            i = self.sample_col[idx]
            prediction = self.predict(u, i)
            e = (self.ratings[u, i] - prediction)  # error

            # Update biases
            self.user_bias[u] += self.learning_rate * \
                (e - self.user_bias_reg * self.user_bias[u])
            self.item_bias[i] += self.learning_rate * \
                (e - self.item_bias_reg * self.item_bias[i])

            # Update latent factors. The item update below reads the user
            # vector that was just updated on the line above.
            self.user_vecs[u, :] += self.learning_rate * \
                (e * self.item_vecs[i, :] -
                 self.user_fact_reg * self.user_vecs[u, :])
            self.item_vecs[i, :] += self.learning_rate * \
                (e * self.user_vecs[u, :] -
                 self.item_fact_reg * self.item_vecs[i, :])

    def train(self, n_iter=10, learning_rate=0.1):
        """ Train model for n_iter iterations from scratch."""
        # initialize latent vectors
        self.user_vecs = np.random.random(size=(self.n_users, self.n_factors))
        self.item_vecs = np.random.random(size=(self.n_items, self.n_factors))

        self.learning_rate = learning_rate
        self.user_bias = np.zeros(self.n_users)
        self.item_bias = np.zeros(self.n_items)
        # Mean of the observed (non-zero) ratings.
        self.global_bias = np.mean(self.ratings[np.where(self.ratings != 0)])

        for _ in range(n_iter):
            # Visit the samples in a fresh random order on every pass.
            self.training_indices = np.arange(self.n_samples)
            np.random.shuffle(self.training_indices)
            self.sgd()

    def predict(self, u, i):
        """ Single user and item prediction. """
        prediction = self.global_bias + self.user_bias[u] + self.item_bias[i]
        prediction += self.user_vecs[u, :].dot(self.item_vecs[i, :].T)
        return prediction

    def predict_all(self):
        """ Predict ratings for every user and item."""
        # Same values as predict(u, i) for every cell, computed with one
        # matrix product plus broadcast biases instead of a Python loop
        # over all user/item pairs.
        predictions = self.user_vecs.dot(self.item_vecs.T)
        predictions += self.global_bias
        predictions += self.user_bias[:, np.newaxis]
        predictions += self.item_bias[np.newaxis, :]
        return predictions

    def calculate_learning_curve(self, iter_array, test, learning_rate=0.1):
        """
        Keep track of MSE as a function of training iterations.

        iter_array is sorted in place; for each entry the model is trained
        from scratch for that many iterations. Results are stored in the
        attributes train_mse and test_mse (one value per entry).
        """
        iter_array.sort()
        self.train_mse = []
        self.test_mse = []
        for n_iter in iter_array:
            self.train(n_iter, learning_rate)

            predictions = self.predict_all()

            self.train_mse += [get_mse(predictions, self.ratings)]
            self.test_mse += [get_mse(predictions, test)]
            print('Train mse: ' + str(self.train_mse[-1]))
            print('Test mse: ' + str(self.test_mse[-1]))
275 |
276 |
277 |
def plot_learning_curve(iter_array, model):
    """Plot the train and test MSE stored on `model` against iter_array.

    `model` must expose the train_mse and test_mse lists.
    """
    curves = (('Training', model.train_mse), ('Test', model.test_mse))
    for curve_label, mse_values in curves:
        plt.plot(iter_array, mse_values, label=curve_label, linewidth=5)

    plt.xticks(fontsize=16)
    plt.yticks(fontsize=16)
    plt.xlabel('iterations', fontsize=30)
    plt.ylabel('MSE', fontsize=30)
    plt.legend(loc='best', fontsize=20)
290 |
291 |
292 |
293 |
294 |
--------------------------------------------------------------------------------
/Python/fireworks.py:
--------------------------------------------------------------------------------
1 | '''
2 | FIREWORKS SIMULATION WITH TKINTER
3 |
4 | *self-contained code
5 | *to run: simply type python fireworks.py in your console
6 | *compatible with both Python 2 and Python 3
7 | *Dependencies: tkinter, Pillow (only for background image)
8 | *The design is based on high school physics, with some small twists only for aesthetics purpose
9 |
10 | '''
11 | import tkinter as tk
12 | #from tkinter import messagebox
13 | #from tkinter import PhotoImage
14 | from PIL import Image, ImageTk
15 | from time import time, sleep
16 | from random import choice, uniform, randint
17 | from math import sin, cos, radians
18 |
19 | # gravity, act as our constant g, you can experiment by changing it
20 | GRAVITY = 0.05
21 | # list of color, can choose randomly or use as a queue (FIFO)
22 | colors = ['red', 'blue', 'yellow', 'white', 'green', 'orange', 'purple', 'seagreen','indigo', 'cornflowerblue']
23 |
class part:
    '''
    Generic class for particles

    particles are emitted almost randomly on the sky, forming a circle
    (a star) before falling and getting removed from canvas

    Attributes:
        - id: identifier of a particular particle in a star
        - x, y: x,y-coordinate of a star (point of explosion)
        - vx, vy: speed of particle in x, y coordinate
        - initial_speed: distance moved per update during the expansion
        - total: total number of particle in a star
        - age: how long has the particle last on canvas
        - color: self-explanatory
        - cv: canvas
        - cid: canvas item id of the oval, None once removed
        - lifespan: how long a particle will last on canvas
    '''

    def __init__(self, cv, idx, total, explosion_speed, x=0., y=0., vx = 0., vy = 0., size=2., color = 'red', lifespan = 2, **kwargs):
        # **kwargs is accepted and ignored.
        self.id = idx
        self.x = x
        self.y = y
        self.initial_speed = explosion_speed
        self.vx = vx
        self.vy = vy
        self.total = total
        self.age = 0
        self.color = color
        self.cv = cv
        self.cid = self.cv.create_oval(
            x - size, y - size, x + size,
            y + size, fill=self.color)
        self.lifespan = lifespan

    def update(self, dt):
        '''Advance the particle by dt seconds and redraw or remove it.'''
        self.age += dt

        # particle expansions: move outwards along this particle's angle
        if self.alive() and self.expand():
            move_x = cos(radians(self.id*360/self.total))*self.initial_speed
            move_y = sin(radians(self.id*360/self.total))*self.initial_speed
            self.cv.move(self.cid, move_x, move_y)
            # dt can be 0 when two updates share a clock reading; keep the
            # previous vx then instead of dividing by zero
            if dt != 0:
                self.vx = move_x/(float(dt)*1000)

        # falling down in projectile motion
        elif self.alive():
            move_x = cos(radians(self.id*360/self.total))
            # we technically don't need to update x, y because move will do the job
            self.cv.move(self.cid, self.vx + move_x, self.vy+GRAVITY*dt)
            self.vy += GRAVITY*dt

        # remove particle if it is over the lifespan
        elif self.cid is not None:
            # use the particle's own canvas; the original relied on a
            # module-level `cv` that exists only when run as a script
            self.cv.delete(self.cid)
            self.cid = None

    # define time frame for expansion
    def expand (self):
        return self.age <= 1.2

    # check if particle is still alive in lifespan
    def alive(self):
        return self.age <= self.lifespan
87 |
def simulate(cv):
    '''
    Firework simulation loop:
    Reschedules itself to repeatedly emit new fireworks on canvas

    a list of list (list of stars, each of which is a list of particles)
    is created and drawn on canvas at every call,
    via update protocol inside each 'part' object

    The call animates for about 1.8 seconds (blocking, with the canvas
    refreshed on every step) and then schedules the next call on the
    canvas after a random delay of 10-100 ms.
    '''
    t = time()
    explode_points = []
    wait_time = randint(10,100)
    numb_explode = randint(6,10)
    # create list of list of all particles in all simultaneous explosion
    for point in range(numb_explode):
        x_cordi = randint(50,550)
        y_cordi = randint(50, 150)
        speed = uniform (0.5, 1.5)
        size = uniform (0.5,3)
        color = choice(colors)
        explosion_speed = uniform(0.2, 1)
        total_particles = randint(10,50)
        # idx runs over 0..total-1 so the angles idx*360/total cover the
        # whole circle; the original range(1, total) created one particle
        # fewer and left a gap at angle 0
        objects = [part(cv, idx = i, total = total_particles, explosion_speed = explosion_speed,
                        x = x_cordi, y = y_cordi, vx = speed, vy = speed,
                        color=color, size = size, lifespan = uniform(0.6,1.75))
                   for i in range(total_particles)]
        explode_points.append(objects)

    total_time = .0
    # keeps updating within a timeframe of 1.8 second
    while total_time < 1.8:
        sleep(0.01)
        tnew = time()
        t, dt = tnew, tnew - t
        for point in explode_points:
            for item in point:
                item.update(dt)
        cv.update()
        total_time += dt
    # schedule the next round of explosions; done through the canvas so the
    # function does not depend on a module-level `root`
    cv.after(wait_time, simulate, cv)
130 |
def close(*ignore):
    """Stops simulation loop and closes the window."""
    # `root` is only read here, so no `global` declaration is needed.
    root.quit()
135 |
if __name__ == '__main__':
    # Window with a 600x600 drawing area.
    root = tk.Tk()
    cv = tk.Canvas(root, height=600, width=600)

    # Background picture anchored at the top-left corner of the canvas.
    image = Image.open("image.jpg")
    photo = ImageTk.PhotoImage(image)
    cv.create_image(0, 0, anchor='nw', image=photo)
    cv.pack()

    # Closing the window stops the main loop.
    root.protocol("WM_DELETE_WINDOW", close)

    # Start the first round of fireworks 100 ms after the loop starts.
    root.after(100, simulate, cv)
    root.mainloop()
150 |
--------------------------------------------------------------------------------
/Python/flights_networkx.py:
--------------------------------------------------------------------------------
1 | # import libaries
2 | import pandas as pd
3 | import numpy as np
4 | import networkx as nx
5 | import matplotlib.pyplot as plt
6 | from mpl_toolkits.basemap import Basemap as Basemap
7 | import matplotlib.lines as mlines
8 |
def main():
    """Download the OpenFlights airport and route tables and draw both maps."""
    # airport info data (first column is used as the index)
    airport_col = ['ID', 'Name', 'City', 'Country','IATA', 'ICAO', 'Lat', 'Long', 'Alt',
                   'Timezone', 'DST', 'Tz database time zone', 'type', 'source']
    airport_df = pd.read_csv(
        "https://raw.githubusercontent.com/jpatokal/openflights/master/data/airports.dat",
        names = airport_col, index_col = 0)

    # flight routes data
    route_cols = ['Airline', 'Airline ID', 'Source Airport', 'Source Airport ID',
                  'Dest Airport', 'Dest Airport ID', 'Codeshare', 'Stops', 'equipment']
    routes_df = pd.read_csv(
        "https://raw.githubusercontent.com/jpatokal/openflights/master/data/routes.dat",
        names = route_cols)

    # clean up: airport ids become numeric (unparseable values turn into
    # NaN) and rows missing either id are dropped
    id_columns = ["Source Airport ID", "Dest Airport ID"]
    for id_column in id_columns:
        routes_df[id_column] = pd.to_numeric(routes_df[id_column].astype(str), errors='coerce')
    routes_df = routes_df.dropna(subset=id_columns)

    simple_visualization(airport_df, routes_df)
    advanced_visualization(airport_df, routes_df)
29 |
30 | ##### Part 1: simple network visualization, Alaska and other non-mainland territories included ####
31 | ###################################################################################################
32 | # extract country and then extra columns
def simple_visualization (airport_df, routes_df):
    """Draw the US route network, first plain and then on top of a Basemap.

    Params
    ======
    airport_df : (DataFrame)
        Airport table indexed by airport id, with at least the columns
        Country, Name, Lat, Long, IATA and ICAO.
    routes_df : (DataFrame)
        Route table with the columns 'Source Airport', 'Dest Airport',
        'Source Airport ID' and 'Dest Airport ID'.

    Saves map_0.png and map_2.png under ./images/networkx_basemap/ and
    shows both figures. Returns 0 after drawing, None when either input
    is None.
    """
    if (airport_df is None) or (routes_df is None):
        # was a Python 2 print statement (SyntaxError on Python 3)
        print("Data cannot be retrieved and read")
        return None

    airport_us = airport_df[(airport_df.Country == "United States")][['Name','Lat', 'Long', 'IATA', 'ICAO']]
    us_airport_ix = airport_us.index.values
    # keep routes that fly from AND to the USA
    routes_us = routes_df[(routes_df['Source Airport ID'].isin(us_airport_ix)) &
                          (routes_df['Dest Airport ID'].isin(us_airport_ix))]
    routes_us = pd.DataFrame(routes_us.groupby(['Source Airport', 'Dest Airport']).size().reset_index(name='counts'))

    # number of distinct routes touching each airport: count the rows in
    # which it appears as source, plus those where it is the destination
    # of a route that is not a self-loop.
    # Series.append was removed in pandas 2.0; pd.concat replaces it.
    non_loop_dest = routes_us.loc[routes_us['Source Airport'] != routes_us['Dest Airport'], 'Dest Airport']
    counts = pd.concat([routes_us['Source Airport'], non_loop_dest]).value_counts()
    # create a data frame of position based on names in count
    counts = pd.DataFrame({'IATA': counts.index, 'total_flight': counts})
    pos_data = counts.merge(airport_us, on = 'IATA')

    # Create graph
    graph = nx.from_pandas_edgelist(routes_us, source = 'Source Airport', target = 'Dest Airport',
                                    edge_attr = 'counts',create_using = nx.DiGraph())

    # default graph using Networkx inbuilt graph tools
    plt.figure(figsize = (10,9))
    nx.draw_networkx(graph)
    plt.savefig("./images/networkx_basemap/map_0.png", format = "png", dpi = 300)
    plt.show()

    # Set up base map
    plt.figure(figsize=(15,20))
    m = Basemap(
        projection='merc',
        llcrnrlon=-180,
        llcrnrlat=10,
        urcrnrlon=-50,
        urcrnrlat=70,
        lat_ts=0,
        resolution='l',
        suppress_ticks=True)

    # project long/lat to map coordinates and key them by IATA code
    mx, my = m(pos_data['Long'].values, pos_data['Lat'].values)
    pos = {}
    for count, elem in enumerate (pos_data['IATA']):
        pos[elem] = (mx[count], my[count])

    # draw nodes and edges and overlay on basemap
    nodes = list(graph.nodes())
    # the keyword is `nodelist`; the original passed `node_list`
    nx.draw_networkx_nodes(G = graph, pos = pos, nodelist = nodes, node_color = 'r', alpha = 0.8,
                           node_size = [counts['total_flight'][s]*3 for s in nodes])
    # read the widths from the edge attribute so they follow the order of
    # graph.edges() rather than the row order of routes_us
    edge_widths = [graph[u][v]['counts']*0.75 for u, v in graph.edges()]
    nx.draw_networkx_edges(G = graph, pos = pos, edge_color='g', width = edge_widths,
                           alpha=0.2, arrows = False)

    m.drawcountries(linewidth = 3)
    m.drawstates(linewidth = 0.2)
    m.drawcoastlines(linewidth=3)
    plt.tight_layout()
    plt.savefig("./images/networkx_basemap/map_2.png", format = "png", dpi = 300)
    plt.show()
    print ("successful visualization")
    return 0
91 |
92 | ##### Part 2: more on visualization, only mainland territories with more features ####
93 | ######################################################################################
94 | # extract country and then extra columns
def advanced_visualization (airport_df, routes_df):
    """Draw the route network of US airports inside a lat/long box on a Basemap.

    Only airports with Country == "United States", 25 < Lat < 50 and
    -130 < Long < -60 are kept. Airports with at least 100 routes are
    drawn in red and labelled, the others in blue.

    Params
    ======
    airport_df : (DataFrame)
        Airport table indexed by airport id, with at least the columns
        Country, Lat, Long and IATA.
    routes_df : (DataFrame)
        Route table with the columns 'Source Airport', 'Dest Airport',
        'Source Airport ID' and 'Dest Airport ID'.

    Saves map_3.png under ./images/networkx_basemap/ and shows the figure.
    Returns 0 after drawing, None when either input is None.
    """
    if (airport_df is None) or (routes_df is None):
        print ("Data cannot be retrieved and read")
        return None

    airport_us = airport_df[(airport_df.Country == "United States") & (airport_df.Lat > 25)
                            & (airport_df.Lat < 50) & (airport_df.Long > -130) & (airport_df.Long < -60)]
    us_airport_ix = airport_us.index.values
    # keep routes that fly from AND to the selected airports
    routes_us = routes_df[(routes_df['Source Airport ID'].isin(us_airport_ix)) &
                          (routes_df['Dest Airport ID'].isin(us_airport_ix))]
    routes_us = pd.DataFrame(routes_us.groupby(['Source Airport', 'Dest Airport']).size().reset_index(name='counts'))

    # number of distinct routes touching each airport: count the rows in
    # which it appears as source, plus those where it is the destination
    # of a route that is not a self-loop.
    # Series.append was removed in pandas 2.0; pd.concat replaces it.
    non_loop_dest = routes_us.loc[routes_us['Source Airport'] != routes_us['Dest Airport'], 'Dest Airport']
    counts = pd.concat([routes_us['Source Airport'], non_loop_dest]).value_counts()
    # create a data frame of position based on names in count
    counts = pd.DataFrame({'IATA': counts.index, 'total_flight': counts})
    pos_data = counts.merge(airport_us, on = 'IATA')

    # Create graph
    graph = nx.from_pandas_edgelist(routes_us, source = 'Source Airport', target = 'Dest Airport',
                                    edge_attr = 'counts',create_using = nx.DiGraph())

    # Set up base map
    plt.figure(figsize=(15,20))
    m = Basemap(
        projection='merc',
        llcrnrlon=-180,
        llcrnrlat=10,
        urcrnrlon=-50,
        urcrnrlat=70,
        lat_ts=0,
        resolution='l',
        suppress_ticks=True)

    # project long/lat to map coordinates and key them by IATA code
    mx, my = m(pos_data['Long'].values, pos_data['Lat'].values)
    pos = {}
    for count, elem in enumerate (pos_data['IATA']):
        pos[elem] = (mx[count], my[count])

    # split the airports once into large (>= 100 routes) and small ones
    total_flight = counts['total_flight']
    large_airports = [x for x in graph.nodes() if total_flight[x] >= 100]
    small_airports = [x for x in graph.nodes() if total_flight[x] < 100]

    # draw nodes and edges and overlay on basemap
    nx.draw_networkx_nodes(G = graph, pos = pos, nodelist = large_airports,
                           node_color = 'r', alpha = 0.8,
                           node_size = [total_flight[x]*4 for x in large_airports])

    nx.draw_networkx_labels(G = graph, pos = pos, font_size=10,
                            labels = {x:x for x in large_airports})

    nx.draw_networkx_nodes(G = graph, pos = pos, nodelist = small_airports,
                           node_color = 'b', alpha = 0.6,
                           node_size = [total_flight[x]*4 for x in small_airports])

    # read the widths from the edge attribute so they follow the order of
    # graph.edges() rather than the row order of routes_us
    edge_widths = [graph[u][v]['counts']*0.75 for u, v in graph.edges()]
    nx.draw_networkx_edges(G = graph, pos = pos, edge_color = 'g', width = edge_widths,
                           alpha=0.06, arrows = False)

    m.drawcountries(linewidth = 3)
    m.drawstates(linewidth = 0.2)
    m.drawcoastlines(linewidth=1)
    m.fillcontinents(alpha = 0.3)
    # proxy artists for the legend
    line1 = mlines.Line2D(range(1), range(1), color="white", marker='o', markerfacecolor="red")
    line2 = mlines.Line2D(range(1), range(1), color="white", marker='o',markerfacecolor="blue")
    line3 = mlines.Line2D(range(1), range(1), color="green", marker='',markerfacecolor="green")
    plt.legend((line1, line2, line3), ('Large Airport > 100 routes', 'Smaller airports', 'routes'),
               loc=4, fontsize = 'xx-large')
    plt.title("Network graph of flight routes in the USA", fontsize = 30)
    plt.tight_layout()
    plt.savefig("./images/networkx_basemap/map_3.png", format = "png", dpi = 300)
    plt.show()
    print ("successful visualization")
    return 0
165 |
166 | if __name__ == "__main__":
167 | main()
168 |
--------------------------------------------------------------------------------
/Python/image.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/image.jpg
--------------------------------------------------------------------------------
/Python/images/networkx_basemap/map_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/map_0.png
--------------------------------------------------------------------------------
/Python/images/networkx_basemap/map_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/map_1.png
--------------------------------------------------------------------------------
/Python/images/networkx_basemap/map_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/map_2.png
--------------------------------------------------------------------------------
/Python/images/networkx_basemap/map_3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/map_3.png
--------------------------------------------------------------------------------
/Python/images/networkx_basemap/table_1.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/table_1.PNG
--------------------------------------------------------------------------------
/Python/images/networkx_basemap/table_2.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/table_2.PNG
--------------------------------------------------------------------------------
/Python/images/networkx_basemap/table_3.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/table_3.PNG
--------------------------------------------------------------------------------
/Python/images/networkx_basemap/table_5.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/images/networkx_basemap/table_5.PNG
--------------------------------------------------------------------------------
/Python/lincoln_estimate.py:
--------------------------------------------------------------------------------
import random

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
2 |
def like_insta_post(p):
    """Return 1 with probability p and 0 otherwise (one random draw)."""
    if random.random() < p:
        return 1
    return 0
6 |
def simulate(true_audience, p1, p2, reps=10000):
    """Simulate the naive and the Lincoln estimate of an audience size.

    In every repetition each of the `true_audience` people likes post 1
    with probability p1 and post 2 with probability p2. The naive estimate
    is the number of people who liked at least one post; the Lincoln
    estimate is likes1 * likes2 / overlap and is skipped for repetitions
    in which nobody liked both posts.

    Returns (naive_estimates, lincoln_estimates), two lists; the second
    can be shorter than `reps`.
    """
    naive_estimates, lincoln_estimates = [], []

    for _ in range(reps):
        like_post_1 = np.array([like_insta_post(p1) for _ in range(true_audience)])
        like_post_2 = np.array([like_insta_post(p2) for _ in range(true_audience)])

        like_post1_count = sum(like_post_1)
        like_post2_count = sum(like_post_2)
        # people who liked both posts
        overlap = np.sum(like_post_1 & like_post_2)

        union_count = like_post1_count + like_post2_count - overlap
        naive_estimates.append(union_count)

        if overlap > 0:
            product = like_post1_count*like_post2_count
            lincoln_estimates.append(product / float(overlap))

    return naive_estimates, lincoln_estimates
26 |
def calc_stats(arr):
    """Summarise a sample of estimates.

    Returns a 4-tuple: the mean, the sample standard deviation (``ddof=1``),
    and the mean minus / plus 1.96 standard deviations.
    """
    center = np.mean(arr)
    spread = np.std(arr, ddof=1)
    half_width = 1.96 * spread
    return (center, spread, center - half_width, center + half_width)
34 |
# each entry holds the like-probabilities (p1, p2) of one scenario
sims = [[0.3, 0.5], [0.6, 0.4], [0.7, 0.8], [0.9, 0.9]]
# two parallel lists: one results data frame and one subplot title per scenario
res_arr = []
title_arr = []

for p in sims:
    naive_estimates, lincoln_estimates = simulate(100, *p, reps=100000)
    naive_stats = calc_stats(naive_estimates)
    lincoln_stats = calc_stats(lincoln_estimates)
    # only the mean and the standard deviation are plotted
    naive_mean, naive_std = naive_stats[:2]
    lincoln_mean, lincoln_std = lincoln_stats[:2]

    pd_res = pd.DataFrame(
        [
            ["naive", naive_mean, naive_std],
            ["Lincoln", lincoln_mean, lincoln_std],
        ],
        columns=["method", "estimate", "std"],
    )
    res_arr.append(pd_res)
    title_arr.append(f" p1={str(p[0])}\n p2={str(p[1])}")
55 |
colors = ['blue', 'orange']
fig, axes = plt.subplots(1, 4, figsize=(18, 6), sharey=True)
ax1, ax2, ax3, ax4 = axes

# one bar chart per scenario: mean estimate with a std error bar per method
for ax, dat_df, title in zip(axes, res_arr, title_arr):
    chart = dat_df.plot(
        x='method', y='estimate', yerr='std', kind='bar', color=colors,
        ax=ax, legend=False, xlabel='', ylabel='mean of estimates',
    )
    chart.set_title(title)

for ax in axes:
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    # hide the frame, keeping only the left spine of the first panel
    for side in ('right', 'top', 'left'):
        if side == 'left' and ax == ax1:
            continue
        ax.spines[side].set_visible(False)
75 |
--------------------------------------------------------------------------------
/Python/mbappe.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/Python/mbappe.jpg
--------------------------------------------------------------------------------
/Python/n_dimensionalNormal.py:
--------------------------------------------------------------------------------
import numpy as np
# pyplot (not the top-level matplotlib package) provides hist/plot/show/...
import matplotlib.pyplot as plt
3 |
def _std_interval(x, k):
    """Return ``(mean - k * std, mean + k * std)`` for the sample ``x``."""
    center = x.mean()
    spread = x.std()
    return (center - k * spread, center + k * spread)


def get_3_std_estimates(x):
    """Return the interval 3 standard deviations either side of the mean of ``x``."""
    return _std_interval(x, 3)


def get_2_std_estimates(x):
    """Return the interval 2 standard deviations either side of the mean of ``x``."""
    return _std_interval(x, 2)


def get_1_std_estimates(x):
    """Return the interval 1 standard deviation either side of the mean of ``x``."""
    return _std_interval(x, 1)
12 |
13 |
N = 10000  # number of points drawn for every simulated sample


def get_graph(n, title):
    """Plot the squared distance from the origin for an n-dimensional sample.

    Draws ``N`` points from an ``n``-dimensional Normal distribution, plots a
    density histogram of their squared Euclidean norms and marks the
    mean +/- 2 standard deviation bounds with red vertical lines.

    Args:
        n: Number of dimensions of each sampled point.
        title: Title shown above the histogram.
    """
    sample = np.random.normal(size=(N, n))
    dist = np.square(np.linalg.norm(sample, axis=1))
    lower_bound, upper_bound = get_2_std_estimates(dist)
    # hist()'s return value is discarded so the `n` argument is not shadowed
    plt.hist(dist, bins='auto', density=True)
    plt.axvline(x=lower_bound, color='red')
    plt.axvline(x=upper_bound, color='red')
    plt.title(title, fontdict={'fontsize': 20})
    plt.show()


get_graph(100, "Distribution of distance from origin for n = 100")
32 |
33 |
def get_boundary(n):
    """Estimate the 2-standard-deviation bounds of the squared norm.

    Samples ``N`` points from an ``n``-dimensional Normal distribution,
    squares their Euclidean norms and returns the
    ``(lower_bound, upper_bound)`` pair computed by ``get_2_std_estimates``.
    """
    points = np.random.normal(size=(N, n))
    squared_norms = np.square(np.linalg.norm(points, axis=1))
    low, high = get_2_std_estimates(squared_norms)
    return (low, high)
45 |
### simulation
n_range = range(1, 5001)
lower_bounds = []
upper_bounds = []

# track how the 2-std bounds, divided by the dimension, evolve as n grows
for n in n_range:
    lower_bound, upper_bound = get_boundary(n)
    lower_bounds.append(lower_bound/n)
    upper_bounds.append(upper_bound/n)

# matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so use whichever one is installed
for style_name in ('seaborn-v0_8-notebook', 'seaborn-notebook'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
plt.plot(n_range, lower_bounds, label = 'lower_bounds\ndivided by n')
plt.plot(n_range, upper_bounds, label = 'upper_bounds\ndivided by n')
#plt.axvline(x=1000, color = 'red', linestyle = '--')
plt.legend(prop={'size': 13})
plt.xlim(1, 5000)
plt.xlabel("dimensions")
plt.title("Ratio between 2-standard devation boundaries and n as n increases", fontdict = {'fontsize': 16})
plt.show()
65 |
66 |
67 |
68 | #### how many points lie in the 10% period or outside
def get_pct_for_interval(n):
    """Split a simulated sample by how far the squared norm is from ``n``.

    Samples ``N`` points from an ``n``-dimensional Normal distribution and
    returns the fractions of points whose squared Euclidean norm is below
    ``0.95 * n``, within ``[0.95 * n, 1.05 * n]``, and above ``1.05 * n``.
    """
    points = np.random.normal(size=(N, n))
    squared_norms = np.square(np.linalg.norm(points, axis=1))

    low_edge = n*0.95
    high_edge = n*1.05
    below = np.count_nonzero(squared_norms < low_edge)
    within = np.count_nonzero((squared_norms >= low_edge) & (squared_norms <= high_edge))
    above = np.count_nonzero(squared_norms > high_edge)

    return below/N, within/N, above/N
78 |
# share of points below / inside / above the +/-5% band, for every dimension
interval_shares = [get_pct_for_interval(n) for n in n_range]
lower_intervals = [shares[0] for shares in interval_shares]
middle_intervals = [shares[1] for shares in interval_shares]
large_intervals = [shares[2] for shares in interval_shares]

band_labels = ['d^2 < 0.95n',
               '0.95n <= d^2 <= 1.05n',
               'd^2 > 1.05n']
plt.stackplot(n_range,
              lower_intervals,
              middle_intervals,
              large_intervals,
              labels=band_labels)
plt.legend()
plt.xlabel("dimensions")
plt.title("Probability that a sample point will be at some distance from the origin", fontdict = {'fontsize': 16})
plt.ylim(0, 1)
plt.xlim(1, 5000)
101 |
--------------------------------------------------------------------------------
/Python/optimal_dating.py:
--------------------------------------------------------------------------------
import random

# pyplot (not the top-level matplotlib package) provides figure/title/xlabel/...
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
4 |
5 | #################
6 | # Top k algorithm
7 | #################
8 |
9 |
def perm_rank(n):
    """Return the ranks 1..n in a random order."""
    ranks = range(1, n + 1)
    return random.sample(ranks, k=n)
13 |
14 |
def top_k_selection_algo(array, m, k):
    """Apply the top-k selection rule to one ordering of candidates.

    Ranks are "lower is better" (rank 1 is the best candidate). The first
    ``m - 1`` candidates are only observed; their best rank becomes the
    benchmark. From position ``m`` onwards, the first ``k`` candidates that
    beat the benchmark are picked.

    Args:
        array: Sequence of ranks in the order the candidates are met.
        m: 1-based position of the first candidate that may be picked.
        k: Number of candidates to pick.

    Returns:
        1 when exactly ``k`` candidates were picked and the worst of them has
        rank ``k`` (i.e. the picks are the overall top k), otherwise 0.
    """
    # With nobody observed (m == 1) there is no benchmark, so any candidate
    # qualifies; min() on the empty slice would raise otherwise.
    benchmark = min(array[:m - 1], default=float("inf"))
    picked = [rank for rank in array[m - 1:] if rank < benchmark][:k]

    if len(picked) == k and max(picked) == k:
        return 1
    return 0
30 |
31 |
def simulation_top_k(n, k, iters):
    """Estimate the top-k success rate for every exploration cutoff.

    For each cutoff ``m`` from 2 to ``n`` inclusive, runs ``iters`` random
    orderings of ``n`` candidates through ``top_k_selection_algo`` and
    records the share of successful runs.

    Args:
        n: Number of candidates in every ordering.
        k: Number of top candidates to identify.
        iters: Number of random orderings simulated per cutoff.

    Returns:
        A list with one success rate per cutoff, in increasing cutoff order.
    """
    result = []
    for m in range(2, n + 1):
        outcomes = [
            top_k_selection_algo(perm_rank(n), m, k) for _ in range(iters)
        ]
        result.append(np.mean(outcomes))
    return result
47 |
48 |
# Number of random orderings simulated per cutoff.
# NOTE(review): `iters` had no definition anywhere in this script; 10000 is a
# placeholder value - confirm the intended number of iterations.
iters = 10000

result = simulation_top_k(100, 1, iters)
result_3 = simulation_top_k(100, 3, iters)
result_5 = simulation_top_k(100, 5, iters)
result_10 = simulation_top_k(100, 10, iters)
53 |
54 |
plt.style.use('fivethirtyeight')
plt.figure(figsize=(13,6))
cutoffs = np.arange(2, 101)
# x is passed by keyword: seaborn >= 0.12 only accepts `data` positionally
for success_rates, label in (
    (result, "k = 1"),
    (result_3, "k = 3"),
    (result_5, "k = 5"),
    (result_10, "k = 10"),
):
    sns.scatterplot(x=cutoffs, y=success_rates, label=label)
plt.grid(False)
plt.title("Probability of finding top k partners\n by exploring first r values")
plt.xlabel("r values")
plt.ylabel("Probability")
65 |
66 |
67 | ##############################
68 | # Top candidate with p success
69 | ##############################
70 |
def selection_algo_with_success_rate(array, m, p):
    """Apply the single-pick selection rule when candidates may be unavailable.

    Ranks are "lower is better" (rank 1 is the best candidate). The first
    ``m - 1`` candidates are only observed and set the benchmark. From
    position ``m`` onwards the first *available* candidate that beats the
    benchmark is accepted; each candidate is available independently with
    probability ``p``.

    Args:
        array: Sequence of ranks in the order the candidates are met.
        m: 1-based position of the first candidate that may be accepted.
        p: Probability that a candidate is available.

    Returns:
        1 when the accepted candidate has rank 1, otherwise 0 (including
        when nobody is accepted).
    """
    # Availability is drawn for every position, before any early return.
    is_available = np.random.binomial(1, p, len(array)) == 1

    # With nobody observed (m == 1) there is no benchmark, so any candidate
    # qualifies; min() on the empty slice would raise otherwise.
    benchmark = min(array[:m - 1], default=float("inf"))
    if benchmark == 1:
        # the best candidate was already passed over during exploration
        return 0

    candidates = np.array(array[m - 1:])
    reachable = candidates[is_available[m - 1:]]
    acceptable = reachable[reachable < benchmark]
    if len(acceptable) == 0:
        return 0
    return 1 if acceptable[0] == 1 else 0
94 |
def simulate_with_success_rate(n, p, iters):
    """Estimate the success rate of the availability-aware rule per cutoff.

    For each cutoff from 2 to ``n`` inclusive, runs ``iters`` random
    orderings of ``n`` candidates through
    ``selection_algo_with_success_rate`` with availability probability ``p``
    and returns the list of mean outcomes, one per cutoff.
    """
    success_rates = []
    for cutoff in range(2, n + 1):
        outcomes = [
            selection_algo_with_success_rate(perm_rank(n), cutoff, p)
            for _ in range(iters)
        ]
        success_rates.append(np.mean(outcomes))
    return success_rates
105 |
# success rates for four availability probabilities (variable suffix != p)
result_avail_1 = simulate_with_success_rate(100, 1, iters)
result_avail_2 = simulate_with_success_rate(100, 0.25, iters)
result_avail_5 = simulate_with_success_rate(100, 0.5, iters)
result_avail_7 = simulate_with_success_rate(100, 0.75, iters)
110 |
plt.style.use('fivethirtyeight')
plt.figure(figsize=(13,6))
cutoffs = np.arange(2, 101)
# x is passed by keyword: seaborn >= 0.12 only accepts `data` positionally
for success_rates, label in (
    (result_avail_1, "p = 1"),
    (result_avail_2, "p = 0.25"),
    (result_avail_5, "p = 0.5"),
    (result_avail_7, "p = 0.75"),
):
    sns.scatterplot(x=cutoffs, y=success_rates, label=label)
plt.title("Probability of finding top partner at different success rate\n by exploring first r values")
plt.grid(False)
plt.xlabel("r values")
plt.ylabel("Probability")
121 |
--------------------------------------------------------------------------------
/R/EPL/Agg.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/R/EPL/Agg.png
--------------------------------------------------------------------------------
/R/EPL/Last.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/R/EPL/Last.png
--------------------------------------------------------------------------------
/R/EPL/Misc/TeamEvaluate2015.R:
--------------------------------------------------------------------------------
# Load libraries and read files
# tibble supplies remove_rownames() / column_to_rownames(), which dplyr
# does not re-export, so it has to be attached explicitly.
packages <- c("dplyr", "tibble", "fpc", "cluster",
              "factoextra", "dendextend",
              "psych", "qgraph")
# invisible() suppresses the list that lapply() would otherwise print
invisible(lapply(packages, library, character.only = TRUE))

raw_df <- read.csv("./Team2015season.csv", header = TRUE)

# Use the team names as row names, then standardise every column with scale()
scaled_data <- raw_df %>%
  remove_rownames() %>%
  column_to_rownames("Team") %>%
  scale()
14 |
15 |
16 | #######################################
17 | # Hierarchical Cluster Analysis
18 | # Useful tutorial:
19 | # https://uc-r.github.io/hc_clustering
20 | #######################################
21 |
# Euclidean distance, Ward's method
# "ward.D2" is the hclust() option that implements Ward's criterion on
# unsquared distances such as dist() output; "ward.D" only does so when it
# is given squared distances.
d_1 <- dist(scaled_data, method = "euclidean")
clust_1 <- hclust(d_1, method = "ward.D2")
# draw the dendrogram and outline the 4-cluster solution
plot(clust_1,
     cex = 0.7,
     xlab = "",
     ylab = "Distance",
     main = "Clusterings of 60 European teams")
rect.hclust(clust_1, k = 4, border = 2:5)

# get the membership vector and preview it next to the scaled data
cuts <- cutree(clust_1, k = 4)
scaled_data %>%
  as.data.frame() %>%
  mutate(cluster = cuts) %>%
  head
39 |
# Distance matrix shared by both clusterings
res.dist <- dist(scaled_data, method = "euclidean")

# Two hierarchical clusterings (complete linkage vs. Ward),
# converted to dendrograms so their group partitions can be compared
hc1 <- hclust(res.dist, method = "complete")
dend1 <- as.dendrogram(hc1)
hc2 <- hclust(res.dist, method = "ward.D2")
dend2 <- as.dendrogram(hc2)

dend_list <- dendlist(dend1, dend2)

# the entanglement of the two trees is reported in the plot title
tangle_title <- paste("entanglement =", round(entanglement(dend_list), 2))

tanglegram(dend1, dend2,
           lwd = 1,
           edge.lwd = 1,
           lab.cex = 0.5,
           columns_width = c(8, 3, 8),
           highlight_distinct_edges = FALSE,      # Turn-off dashed lines
           common_subtrees_color_lines = FALSE,   # Turn-off line colors
           common_subtrees_color_branches = TRUE, # Color common branches
           main = tangle_title
)
63 |
64 | ###########################################
65 | # K-means clustering
66 | # Useful tutorial:
67 | # https://uc-r.github.io/kmeans_clustering
68 | ###########################################
69 |
# use the 4 centers that hierarchical clustering suggests
# nstart: attempts multiple initial configurations
# and reports on the best one.
# The seed is set first so the random starts are reproducible.
set.seed(123)
km_results <- kmeans(scaled_data, centers = 4, nstart = 100)
km_results

# fviz_cluster does PCA and plots the data points
# according to the first two PCs that explain the majority of the variance
fviz_cluster(km_results, data = scaled_data)

# Evaluating clustering
# Best number of clusters using scree-plot (elbow method):
# optimal total within-cluster sum of squares
set.seed(123)
fviz_nbclust(scaled_data, kmeans, method = "wss")

# Average Silhouette method
# measures the quality of the clusters
# by how well each object lies within its cluster;
# try to maximize the average silhouette
fviz_nbclust(scaled_data, kmeans, method = "silhouette")

# GAP statistics method
# can apply to both kmeans and HC
# compares the total intracluster variation
# with its expected value
# under a null reference distribution of the data
# at various values of k
set.seed(123)
gap_stat <- clusGap(scaled_data,
                    FUN = kmeans,
                    nstart = 100,
                    K.max = 10,
                    B = 50)
# Print the result
print(gap_stat, method = "firstmax")
fviz_gap_stat(gap_stat)
107 |
108 | ###################################################################
109 | # Factor analysis
110 | # Useful tutorial:
111 | # http://www.di.fc.ul.pt/~jpn/r/factoranalysis/factoranalysis.html
112 | # https://rpubs.com/aaronsc32/factor-analysis-introduction
113 | ###################################################################
# Parallel analysis (scree plot) to decide how many factors to use
parallel <- fa.parallel(scaled_data, fm = 'minres', fa = 'fa')

# Rotation options:
#   none    - factors are left as extracted
#   Varimax - assumes the factors are completely uncorrelated
#   Oblique - allows correlations between the factors
#
# Estimation method: factanal only supports maximum likelihood.
# In fa (psych) we can use "PAF (pa)" or "minres"; the latter provides
# results similar to maximum likelihood without assuming a multivariate
# normal distribution, and derives solutions through iterative eigen
# decomposition like principal axis.

# two factors, no rotation, regression scores
fa1 <- factanal(scaled_data, factors = 2, rotation = "none", scores = "regression")

# three factors, oblimin rotation, minres
fa2 <- fa(scaled_data, nfactors = 3, rotate = "oblimin", fm = "minres")
fa1

# biplot of the first two factor scores against the loadings
biplot(fa1$scores[, 1:2], loadings(fa1), cex = c(0.7, 0.8))
# qgraph
# a different visualization of biplot
qg.fa1 <- qgraph(fa1)
147 |
148 | # NOTE:
149 | # - after Exploratory Factor Analysis (EFA),
150 | # - the next step could be Confirmatory Factor Analysis
151 | # - which is part of a larger subset: Structual Equation Modelling
152 | # - https://socialsciences.mcmaster.ca/jfox/Misc/sem/SEM-paper.pdf
153 |
154 |
155 | # we can get some flexibility from the "psych" package
# Run psych::fa() on data_set and report fit diagnostics.
#   data_set: numeric matrix or data frame of observed variables
#   factor:   number of factors to extract
#   rotate:   rotation passed to fa()
#   fm:       factoring method passed to fa()
# Prints the fa() result, the residual correlation matrix, the root-mean
# squared residual and the share of residuals above 0.05 in absolute value,
# draws a loading plot, and returns the fa() result invisibly.
fa_analysis <- function(data_set, factor,
                        rotate = "varimax", fm = "pa"){
  res <- fa(data_set, nfactors = factor,
            rotate = rotate, fm = fm)
  print("Factor Analysis results:")
  print(res)

  # loading plot with one labelled point per variable
  plot(res$loadings, pch = 18, col = 'red')
  abline(h = 0)
  abline(v = 0)
  # colnames() works for matrices and data frames; names() is NULL on a matrix
  text(res$loadings, labels = colnames(data_set), cex = 0.8)

  # reproduced correlation matrix
  # NOTE(review): loadings %*% t(loadings) ignores factor correlations, so
  # this presumably only holds for orthogonal rotations - confirm.
  repro <- res$loadings %*% t(res$loadings)
  # residual correlation matrix
  residual <- cor(data_set) - repro
  print("Residual correlation matrx")
  print(round(residual, 2))

  # root-mean squared residuals over the upper triangle
  upper_resid <- residual[upper.tri(residual)]
  len <- length(upper_resid)
  RMSR <- sqrt(sum(upper_resid^2) / len)
  # paste() builds the message; print()'s second argument is `digits`
  print(paste("Root-mean squared residuals:", RMSR))

  # proportion of residuals greater than 0.05 in absolute value
  prop <- sum(abs(upper_resid) > 0.05) / len
  print(paste("Proportion of residuals greater than 0.05 in absolute value:", prop))

  invisible(res)
}
185 |
# NOTE(review): this script never defines `soccer`; scaled_data (the matrix
# analysed everywhere else in this script) is assumed to be the intended
# input - confirm.

# varimax - paf
fa_analysis(scaled_data, 3)

# quartimax - paf
fa_analysis(scaled_data, 3, "quartimax", "pa")
191 |
--------------------------------------------------------------------------------
/R/EPL/betting/Portfolio-xkcd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/R/EPL/betting/Portfolio-xkcd.png
--------------------------------------------------------------------------------
/R/EPL/betting/bet_strategy.R:
--------------------------------------------------------------------------------
1 | library(tidyverse)
2 | source("prediction.R")
3 | # in MAC, may have to go to font book to activate xkcd.ttf
4 | #library(extrafont)
5 | #font_import(path = ".", pattern="xkcd")
6 | #fonts()
7 | #loadfonts()
8 |
9 | betting_house <- c("B365", "BW", "IW", "PS", "WH", "VC")
10 |
11 | # easy computation of max odd or mean probability
12 | # find max_odd if find_max is TRUE, else return Consensus Probability of event
row_max_prob <- function(df, row_idx, find_max){
  # outcome predicted for this match; NA means there is nothing to look up
  predicted <- df[row_idx, "predict_outcome"]
  if (is.na(predicted)) {
    return (NA)
  }

  # odds quoted by every betting house for the predicted outcome
  odds_cols <- paste0(betting_house, predicted)
  quoted <- df[row_idx, odds_cols]

  if (find_max) {
    val <- max(quoted)
  } else {
    # consensus probability: inverse of the mean quoted odd
    val <- 1 / mean(as.numeric(quoted))
  }
  return (val)
}
20 |
21 |
22 | ##### find total return at every round
23 | # based on prediction, max_odd, Consensus Probability and amount of capital to bet
24 | # input in Round (Matchweek), method ("poisson", "merson", "random") and Amount of available capital
# Total profit of one matchweek.
#   round:   matchweek to bet on (matched against df_prediction$Round)
#   method:  "poisson", "random", or anything else for the Merson prediction
#   capital: amount of money spread over the matches of this round
# Returns the sum of the per-match profits, ignoring matches without a bet.
betting_round <- function (round, method, capital){
  round_data <- df_prediction %>%
    filter(Round == round) %>%
    mutate(method = method,
           predict_outcome = ifelse(method == "random",
                                    sample(c("H", "D", "A"), n(), replace = TRUE),
                                    ifelse(method == 'poisson', poisson_predict, Merson_predict)))

  # seq_len() gives an empty sequence for a round without matches,
  # whereas 1:0 would visit rows 1 and 0
  match_idx <- seq_len(nrow(round_data))
  round_data$max_odd <- vapply(match_idx,
                               function(x) as.numeric(row_max_prob(round_data, x, TRUE)),
                               numeric(1))
  round_data$prob <- vapply(match_idx,
                            function(x) as.numeric(row_max_prob(round_data, x, FALSE)),
                            numeric(1))

  # NOTE(review): `fraction` plugs max_odd into the Kelly formula as if it
  # were net odds, while `payoff` treats it as decimal odds (stake included)
  # - confirm which convention the odds columns use.
  round_data <- round_data %>%
    mutate (fraction = ((prob*max_odd - (1-prob))/max_odd),
            # normalise so the stakes of the round add up to `capital`
            f_normalize = fraction/sum(fraction, na.rm = TRUE),
            bet_amount = f_normalize * capital,
            payoff = ifelse(FTR == predict_outcome, bet_amount*max_odd, 0),
            profit = payoff-bet_amount)

  return (sum(round_data$profit, na.rm = TRUE))
}
46 |
# initiate a table to store the return of every round
# (Paul Merson's bets are left out)
n_rounds <- 30
stake_per_round <- 1000 / n_rounds
return_table <- data.frame(round = 0:n_rounds,
                           Poisson = numeric(n_rounds + 1),
                           random_bet = numeric(n_rounds + 1))

# row 1 (round 0) holds the starting capital
return_table[1, c("Poisson", "random_bet")] <- rep(1000, 2)

for (i in seq_len(n_rounds)){
  Poisson_return <- betting_round(i, "poisson", stake_per_round)
  random_return <- betting_round(i, "random", stake_per_round)

  return_table[i + 1, "Poisson"] <- Poisson_return
  return_table[i + 1, "random_bet"] <- random_return
}

# we are interested in the change of the portfolio over time,
# so turn the per-round returns into running totals
for (strategy in c("Poisson", "random_bet")){
  return_table[[strategy]] <- cumsum(return_table[[strategy]])
}
69 |
# long format: one row per (round, strategy) with a readable strategy label
portfolio_long <- return_table %>%
  gather("method", "value", -round) %>%
  mutate(method = factor(method,
                         levels = c('Poisson', 'random_bet'),
                         labels = c('Poisson prediction', 'random prediction')))

# one line per strategy, minimal theme with the xkcd font
ggplot(portfolio_long, aes(x = round, y = value, group = method)) +
  geom_line(aes(color = method)) +
  scale_x_continuous(breaks = seq(0, 30, by = 5)) +
  labs(title = "Portfolio value at the end of every matchweek",
       x = 'Matchweek',
       y = 'Portfolio total value in dollars') +
  theme(axis.line = element_line(size = 1, colour = "black"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        panel.border = element_blank(),
        panel.background = element_blank(),
        plot.title = element_text(size = 18, family = "xkcd"),
        text = element_text(size = 13, family = "xkcd"),
        axis.text.x = element_text(colour = "black", size = 12),
        axis.text.y = element_text(colour = "black", size = 12))
89 |
90 |
91 | #############
92 | # Extra note
93 | # If you can invest as much as you want with a $1000 buffer,
94 | # meaning that you set out to invest 1000/30 per round
95 | # but if the Kelly criterion asks for more, you can still afford it
96 | ############
97 |
98 | ##### Method to combine Kelly criterion and odds
# Total return of one matchweek when every match is staked independently.
#   round:          matchweek to bet on (matched against df_prediction$Round)
#   predict_method: "random", or the name of the df_prediction column that
#                   holds the predicted outcome
#   capital:        amount the Kelly-style fraction is applied to, per match
# Returns the summed profit/loss over the matches that have a prediction.
betting_round <- function (round, predict_method, capital){
  total_return <- 0

  round_data <- df_prediction %>%
    filter(Round == round)

  # seq_len() gives an empty sequence for a round without matches,
  # whereas 1:0 would visit rows 1 and 0
  for (i in seq_len(nrow(round_data))){
    if (predict_method == "random") {
      predict <- sample(c("H", "D", "A"), 1)
    } else {
      predict <- round_data[i, predict_method]
    }
    # no prediction, no bet
    if (is.na(predict)) next

    # odds of every betting house for the predicted outcome; bet at the best
    odds <- as.numeric(unlist(round_data[i, paste0(betting_house, predict)]))
    odd <- max(odds)

    # consensus probability: inverse of the mean quoted odd
    predict_prob <- 1 / mean(odds)
    bet_amount <- ((predict_prob*odd - (1-predict_prob))/odd)*capital

    total_return <- ifelse(round_data[i, "FTR"] == predict,
                           total_return + bet_amount*(odd-1),
                           total_return - bet_amount)
  }
  return (total_return)
}
126 |
--------------------------------------------------------------------------------
/R/EPL/betting/clean_data.R:
--------------------------------------------------------------------------------
1 | ########################################
2 | # scripts to clean data to usable format
3 | # source:
4 | # - fixtures.csv: dedicatedexcel.com
5 | # - Historical results: https://www.kaggle.com/thefc17/epl-results-19932018
6 | #########################################
7 | library (dplyr)
8 |
link_fixture = "https://raw.githubusercontent.com/tuangauss/DataScienceProjects/master/data/all_games.csv"
link_history = "https://raw.githubusercontent.com/tuangauss/DataScienceProjects/master/data/history.csv"

# fixtures of the season to predict, and the teams that appear in them
fixtures <- read.csv(link_fixture, stringsAsFactors = FALSE)
teams <- unique(fixtures$HOME.TEAM)

# historic results
history <- read.csv(link_history, stringsAsFactors = FALSE)

# season labels "2010-11" ... "2017-18"
seasons <- paste0(2000 + 10:17, '-', 11:18)

# keep those seasons, division 'E0' only
recent.pl <- history %>%
  filter(Season %in% seasons, div == 'E0')

# the two data sets come from different sources, so some team names don't match
teams[!(teams %in% recent.pl$HomeTeam)]
unique(recent.pl$HomeTeam)
29 |
30 | # now we need to fix it
# each pair is c(name used in the fixtures, name used in the history)
pair_fix <- list(c('Manchester United', 'Man United'),
                 c('Newcastle United', 'Newcastle'),
                 c('Huddersfield Town', 'Huddersfield'),
                 c('Wolverhampton Wanderers', 'Wolves'),
                 c('Cardiff City', 'Cardiff'),
                 c('Leicester City', 'Leicester'),
                 c('Tottenham Hotspur', 'Tottenham'),
                 c('West Ham United', 'West Ham'),
                 c('Manchester City', "Man City"),
                 c('Brighton and Hove Albion', 'Brighton'))

# rename the teams in recent.pl so both data sets use the same names
for (i in seq_along(pair_fix)){
  fixture_name <- pair_fix[[i]][1]
  history_name <- pair_fix[[i]][2]
  recent.pl$HomeTeam <- replace(recent.pl$HomeTeam,
                                recent.pl$HomeTeam == history_name,
                                fixture_name)
  recent.pl$AwayTeam <- replace(recent.pl$AwayTeam,
                                recent.pl$AwayTeam == history_name,
                                fixture_name)
}
53 |
54 |
55 | # a bland average dataframe
# per-team averages of goals scored / conceded at home ...
ave_home <- recent.pl %>%
  filter(HomeTeam %in% teams) %>%
  group_by(HomeTeam) %>%
  summarize(ave_scored_h = mean(FTHG), ave_conceded_h = mean(FTAG)) %>%
  rename(Team = HomeTeam)

# ... and away
ave_away <- recent.pl %>%
  filter(AwayTeam %in% teams) %>%
  group_by(AwayTeam) %>%
  summarize(ave_scored_a = mean(FTAG), ave_conceded_a = mean(FTHG)) %>%
  rename(Team = AwayTeam)

ave <- merge(ave_home, ave_away, by = 'Team')


# more precise result with pairwise records:
# number of matches and average goals for every (home, away) pairing
hist_pair.pl <- recent.pl %>%
  filter(HomeTeam %in% teams, AwayTeam %in% teams) %>%
  group_by(HomeTeam, AwayTeam) %>%
  summarize(match = n(), ave_home_scored = mean(FTHG), ave_away_scored = mean(FTAG))

# data set for new season
# just clean the column names for readability
new_season <- rename(fixtures,
                     HomeTeam = HOME.TEAM,
                     AwayTeam = AWAY.TEAM)

# clean intermediate data from memory
rm(history, seasons, recent.pl, pair_fix, ave_home, ave_away, fixtures)
83 |
--------------------------------------------------------------------------------
/R/EPL/betting/prediction.R:
--------------------------------------------------------------------------------
1 | library (dplyr)
2 | source ('clean_data.R')
3 |
4 | # function to simplify result
5 | # from scoreline to who wins the match, H (Home), A(Away) or D(Draw)
result_calc <- function (h_goal, a_goal){
  # vectorised: 'H' when home scored more, 'A' when away scored more, else 'D'
  home_or_draw <- ifelse(h_goal > a_goal, 'H', 'D')
  result <- ifelse(h_goal < a_goal, 'A', home_or_draw)
  return (result)
}
10 |
11 | # function to calibrate results
12 | # The idea is to make sure that if Probability of wining of Home and Away is tight
13 | # e.g: 0.451(H) vs 0.447 (A)
14 | # then it should be thought as a draw
result_calibrate <- function(prob_h, prob_d, prob_a){
  # highest of the three outcome probabilities, element-wise
  best <- pmax(prob_h, prob_d, prob_a)
  # home and away within 0.01 of each other is called a draw
  too_close <- abs(prob_h - prob_a) < 0.01
  most_likely <- ifelse(prob_h == best, "H",
                        ifelse(prob_d == best, "D", "A"))
  result <- ifelse(too_close, "D", most_likely)
  return (result)
}
21 |
22 |
23 | # get most frequent score line of a match after n, sim time
# Simulate a fixture nsim times and return the share of each outcome.
#   home, away: team names as used in hist_pair.pl and ave
#   nsim:       number of simulated matches
# Returns a named numeric vector c(H = , D = , A = ) of outcome frequencies.
get_score <- function (home, away, nsim){
  # history of exactly this pairing: `home` at home AND `away` away
  pair_hist <- hist_pair.pl[which(hist_pair.pl$HomeTeam == home &
                                    hist_pair.pl$AwayTeam == away), ]

  if (nrow(pair_hist) == 1 && isTRUE(pair_hist$match[1] > 3)){
    # enough head-to-head matches: use the pairing's own scoring averages
    lambda_home <- pair_hist$ave_home_scored[1]
    lambda_away <- pair_hist$ave_away_scored[1]
  } else {
    # otherwise blend one side's attacking record with the other's defence
    home_row <- ave[ave$Team == home, ]
    away_row <- ave[ave$Team == away, ]
    lambda_home <- 1/2 * (home_row$ave_scored_h + away_row$ave_conceded_a)
    lambda_away <- 1/2 * (away_row$ave_scored_a + home_row$ave_conceded_h)
  }

  # all nsim scorelines are drawn at once; the rates do not change per draw
  h_scored <- rpois(nsim, lambda_home)
  a_scored <- rpois(nsim, lambda_away)

  # fixed levels keep an outcome that never occurred at 0 instead of NA
  result <- factor(result_calc(h_scored, a_scored), levels = c("H", "D", "A"))
  result_tab <- table(result) / nsim
  return (c(H = result_tab[["H"]], D = result_tab[["D"]], A = result_tab[["A"]]))
}
52 |
nsim = 10000
# one c(H, D, A) frequency vector per fixture of the new season
matches <- mapply(get_score, new_season$HomeTeam, new_season$AwayTeam, nsim, SIMPLIFY = FALSE)

# spread the three frequencies into the columns H, D and A
outcome_cols <- c("H", "D", "A")
for (j in seq_along(outcome_cols)){
  new_season[[outcome_cols[j]]] <- sapply(matches, function(x) x[j])
}

df_prediction <- mutate(new_season,
                        poisson_predict = result_calibrate(H, D, A))
61 |
62 | # The data about Paul Merson's prediction seems to get lost somehow
63 | #df_prediction <- new_season %>%
64 | # mutate(poisson_predict = result_calibrate(H,D,A),
65 | # Merson_predict = result_calc(Merson.H, Merson.A))
66 |
--------------------------------------------------------------------------------
/R/EPL/penalty/Scraping.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 34,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import requests \n",
10 | "from bs4 import BeautifulSoup\n",
11 | "import time\n",
12 | "import random\n",
13 | "import pandas as pd\n",
14 | "import numpy as np"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 14,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "root = \"https://www.statbunker.com/competitions/Penalties?comp_id=\""
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 5,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "PL = [200,243,279,323,373,415,449,481,515,556,586,614,639]\n",
33 | "year_list = [str(i)+\"/\" + str(i+1) for i in range (7,20)]\n",
34 | "year_europa = [str(i)+\"/\" + str(i+1) for i in range (9,20)]"
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 2,
40 | "metadata": {},
41 | "outputs": [],
42 | "source": [
43 | "Laliga = [212,259,293,336,378,413,461,485,518,564,600,622,648] #start 07/08\n",
44 | "Seria = [211,258,292,337,377,414,462,486,517,562,593,623,649]\n",
45 | "bundes = [204,250,285,330,374,416,447,483,516,561,591,620,646]\n",
46 | "france = [202,251,284,331,375,412,454,484,514,563,594,621,647]\n",
47 | "championship = [207,246,280,325,370,420,451,488,524,557,587,615,640]\n",
48 | "scottish = [205,249,283,329,369,419,455,491,521,566,590,618,643]\n",
49 | "CL = [203,261,295,332,366,429,468,500,540,571,601,628,655]#07/08\n",
50 | "europa = [296,335,362,430,470,501,541,572,602,629,656] #09/10"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 28,
56 | "metadata": {},
57 | "outputs": [],
58 | "source": [
59 | "cup_dict = {\"Premier League\": PL,\n",
60 | " \"La Liga\": Laliga,\n",
61 | " \"Bundesliga\": bundes,\n",
62 | " \"Ligue One\": france,\n",
63 | " \"English Championship\": championship,\n",
64 | " \"Scottish Premiership\": scottish,\n",
65 | " \"Champion League\": CL,\n",
66 | " \"Europa Cup\": europa}"
67 | ]
68 | },
69 | {
70 | "cell_type": "code",
71 | "execution_count": 29,
72 | "metadata": {},
73 | "outputs": [],
74 | "source": [
75 | "# scrape for PL first then append the others later\n",
76 | "full_name, full_club, full_year, full_league, full_penalties, full_home, full_away, full_scored, full_missed, full_saved = ([] for i in range(10))"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 31,
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "name": "stdout",
86 | "output_type": "stream",
87 | "text": [
88 | "Done Premier League, season 7/8\n",
89 | "Done Premier League, season 8/9\n",
90 | "Done Premier League, season 9/10\n",
91 | "Done Premier League, season 10/11\n",
92 | "Done Premier League, season 11/12\n",
93 | "Done Premier League, season 12/13\n",
94 | "Done Premier League, season 13/14\n",
95 | "Done Premier League, season 14/15\n",
96 | "Done Premier League, season 15/16\n",
97 | "Done Premier League, season 16/17\n",
98 | "Done Premier League, season 17/18\n",
99 | "Done Premier League, season 18/19\n",
100 | "Done Premier League, season 19/20\n",
101 | "Done La Liga, season 7/8\n",
102 | "Done La Liga, season 8/9\n",
103 | "Done La Liga, season 9/10\n",
104 | "Done La Liga, season 10/11\n",
105 | "Done La Liga, season 11/12\n",
106 | "Done La Liga, season 12/13\n",
107 | "Done La Liga, season 13/14\n",
108 | "Done La Liga, season 14/15\n",
109 | "Done La Liga, season 15/16\n",
110 | "Done La Liga, season 16/17\n",
111 | "Done La Liga, season 17/18\n",
112 | "Done La Liga, season 18/19\n",
113 | "Done La Liga, season 19/20\n",
114 | "Done Bundesliga, season 7/8\n",
115 | "Done Bundesliga, season 8/9\n",
116 | "Done Bundesliga, season 9/10\n",
117 | "Done Bundesliga, season 10/11\n",
118 | "Done Bundesliga, season 11/12\n",
119 | "Done Bundesliga, season 12/13\n",
120 | "Done Bundesliga, season 13/14\n",
121 | "Done Bundesliga, season 14/15\n",
122 | "Done Bundesliga, season 15/16\n",
123 | "Done Bundesliga, season 16/17\n",
124 | "Done Bundesliga, season 17/18\n",
125 | "Done Bundesliga, season 18/19\n",
126 | "Done Bundesliga, season 19/20\n",
127 | "Done Ligue One, season 7/8\n",
128 | "Done Ligue One, season 8/9\n",
129 | "Done Ligue One, season 9/10\n",
130 | "Done Ligue One, season 10/11\n",
131 | "Done Ligue One, season 11/12\n",
132 | "Done Ligue One, season 12/13\n",
133 | "Done Ligue One, season 13/14\n",
134 | "Done Ligue One, season 14/15\n",
135 | "Done Ligue One, season 15/16\n",
136 | "Done Ligue One, season 16/17\n",
137 | "Done Ligue One, season 17/18\n",
138 | "Done Ligue One, season 18/19\n",
139 | "Done Ligue One, season 19/20\n",
140 | "Done English Championship, season 7/8\n",
141 | "Done English Championship, season 8/9\n",
142 | "Done English Championship, season 9/10\n",
143 | "Done English Championship, season 10/11\n",
144 | "Done English Championship, season 11/12\n",
145 | "Done English Championship, season 12/13\n",
146 | "Done English Championship, season 13/14\n",
147 | "Done English Championship, season 14/15\n",
148 | "Done English Championship, season 15/16\n",
149 | "Done English Championship, season 16/17\n",
150 | "Done English Championship, season 17/18\n",
151 | "Done English Championship, season 18/19\n",
152 | "Done English Championship, season 19/20\n",
153 | "Done Scottish Premiership, season 7/8\n",
154 | "Done Scottish Premiership, season 8/9\n",
155 | "Done Scottish Premiership, season 9/10\n",
156 | "Done Scottish Premiership, season 10/11\n",
157 | "Done Scottish Premiership, season 11/12\n",
158 | "Done Scottish Premiership, season 12/13\n",
159 | "Done Scottish Premiership, season 13/14\n",
160 | "Done Scottish Premiership, season 14/15\n",
161 | "Done Scottish Premiership, season 15/16\n",
162 | "Done Scottish Premiership, season 16/17\n",
163 | "Done Scottish Premiership, season 17/18\n",
164 | "Done Scottish Premiership, season 18/19\n",
165 | "Done Scottish Premiership, season 19/20\n",
166 | "Done Champion League, season 7/8\n",
167 | "Done Champion League, season 8/9\n",
168 | "Done Champion League, season 9/10\n",
169 | "Done Champion League, season 10/11\n",
170 | "Done Champion League, season 11/12\n",
171 | "Done Champion League, season 12/13\n",
172 | "Done Champion League, season 13/14\n",
173 | "Done Champion League, season 14/15\n",
174 | "Done Champion League, season 15/16\n",
175 | "Done Champion League, season 16/17\n",
176 | "Done Champion League, season 17/18\n",
177 | "Done Champion League, season 18/19\n",
178 | "Done Champion League, season 19/20\n",
179 | "Done Europa Cup, season 9/10\n",
180 | "Done Europa Cup, season 10/11\n",
181 | "Done Europa Cup, season 11/12\n",
182 | "Done Europa Cup, season 12/13\n",
183 | "Done Europa Cup, season 13/14\n",
184 | "Done Europa Cup, season 14/15\n",
185 | "Done Europa Cup, season 15/16\n",
186 | "Done Europa Cup, season 16/17\n",
187 | "Done Europa Cup, season 17/18\n",
188 | "Done Europa Cup, season 18/19\n",
189 | "Done Europa Cup, season 19/20\n"
190 | ]
191 | }
192 | ],
193 | "source": [
194 | "for cup in cup_dict:\n",
195 | " if cup == \"Europa Cup\":\n",
196 | " season = year_europa\n",
197 | " else:\n",
198 | " season = year_list\n",
199 | " \n",
200 | " name, club, year, league, penalties, home, away, scored, missed, saved = ([] for i in range(10))\n",
201 | " code_enum = cup_dict[cup]\n",
202 | " for count, el in enumerate(code_enum):\n",
203 | " URL = root + str(el)\n",
204 | " r = requests.get(URL)\n",
205 | " soup = BeautifulSoup(r.content, 'html5lib')\n",
206 | " details = soup.findAll(True, {'class':['odd', 'even']})\n",
207 | " for row in details:\n",
208 | " el_list = list(row.strings)\n",
209 | " name.append(el_list[0])\n",
210 | " club.append(el_list[1])\n",
211 | " year.append(season[count])\n",
212 | " league.append(cup)\n",
213 | " penalties.append(el_list[2])\n",
214 | " home.append(el_list[3])\n",
215 | " away.append(el_list[4])\n",
216 | " scored.append(el_list[5])\n",
217 | " missed.append(el_list[6])\n",
218 | " saved.append(el_list[7])\n",
219 | " print (\"Done \" + cup + \", season \" + season[count])\n",
220 | " time.sleep(random.randint(1,5))\n",
221 | " full_name += name\n",
222 | " full_club += club\n",
223 | " full_year += year\n",
224 | " full_league += league\n",
225 | " full_penalties += penalties\n",
226 | " full_home += home\n",
227 | " full_away += away\n",
228 | " full_scored += scored\n",
229 | " full_missed += missed\n",
230 | " full_saved += saved "
231 | ]
232 | },
233 | {
234 | "cell_type": "code",
235 | "execution_count": 32,
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "table = pd.DataFrame({'name': full_name, 'club': full_club,\n",
240 | " 'year': full_year, 'league': full_league,\n",
241 | " 'penalties': full_penalties, 'home': full_home, 'away': full_away,\n",
242 | " 'scored': full_scored, 'missed': full_missed, 'saved': full_saved})"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 35,
248 | "metadata": {},
249 | "outputs": [
250 | {
251 | "data": {
252 | "text/plain": [
253 | "(3937, 10)"
254 | ]
255 | },
256 | "execution_count": 35,
257 | "metadata": {},
258 | "output_type": "execute_result"
259 | }
260 | ],
261 | "source": [
262 | "np.shape(table)"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 38,
268 | "metadata": {},
269 | "outputs": [
270 | {
271 | "data": {
272 | "text/html": [
273 | "
\n",
274 | "\n",
287 | "
\n",
288 | " \n",
289 | " \n",
290 | " | \n",
291 | " name | \n",
292 | " club | \n",
293 | " year | \n",
294 | " league | \n",
295 | " penalties | \n",
296 | " home | \n",
297 | " away | \n",
298 | " scored | \n",
299 | " missed | \n",
300 | " saved | \n",
301 | "
\n",
302 | " \n",
303 | " \n",
304 | " \n",
305 | " 3927 | \n",
306 | " Claudiu Keseru | \n",
307 | " Ludogorets Razgrad | \n",
308 | " 19/20 | \n",
309 | " Europa Cup | \n",
310 | " 1 | \n",
311 | " 1 | \n",
312 | " - | \n",
313 | " 1 | \n",
314 | " - | \n",
315 | " - | \n",
316 | "
\n",
317 | " \n",
318 | " 3928 | \n",
319 | " Adem Ljajic | \n",
320 | " Besiktas | \n",
321 | " 19/20 | \n",
322 | " Europa Cup | \n",
323 | " 1 | \n",
324 | " - | \n",
325 | " 1 | \n",
326 | " 1 | \n",
327 | " - | \n",
328 | " - | \n",
329 | "
\n",
330 | " \n",
331 | " 3929 | \n",
332 | " Andraz Sporar | \n",
333 | " Slovan Bratislava | \n",
334 | " 19/20 | \n",
335 | " Europa Cup | \n",
336 | " 1 | \n",
337 | " - | \n",
338 | " 1 | \n",
339 | " - | \n",
340 | " - | \n",
341 | " 1 | \n",
342 | "
\n",
343 | " \n",
344 | " 3930 | \n",
345 | " James Tavernier | \n",
346 | " Rangers | \n",
347 | " 19/20 | \n",
348 | " Europa Cup | \n",
349 | " 1 | \n",
350 | " 1 | \n",
351 | " - | \n",
352 | " - | \n",
353 | " 1 | \n",
354 | " - | \n",
355 | "
\n",
356 | " \n",
357 | " 3931 | \n",
358 | " Ryan Christie | \n",
359 | " Celtic | \n",
360 | " 19/20 | \n",
361 | " Europa Cup | \n",
362 | " 1 | \n",
363 | " - | \n",
364 | " 1 | \n",
365 | " 1 | \n",
366 | " - | \n",
367 | " - | \n",
368 | "
\n",
369 | " \n",
370 | " 3932 | \n",
371 | " Tomas de Vincenti | \n",
372 | " APOEL Nicosia | \n",
373 | " 19/20 | \n",
374 | " Europa Cup | \n",
375 | " 1 | \n",
376 | " 1 | \n",
377 | " - | \n",
378 | " 1 | \n",
379 | " - | \n",
380 | " - | \n",
381 | "
\n",
382 | " \n",
383 | " 3933 | \n",
384 | " Bruno Fernandes | \n",
385 | " Sporting Lisbon | \n",
386 | " 19/20 | \n",
387 | " Europa Cup | \n",
388 | " 1 | \n",
389 | " - | \n",
390 | " 1 | \n",
391 | " 1 | \n",
392 | " - | \n",
393 | " - | \n",
394 | "
\n",
395 | " \n",
396 | " 3934 | \n",
397 | " Ciprian Deac | \n",
398 | " CFR Cluj | \n",
399 | " 19/20 | \n",
400 | " Europa Cup | \n",
401 | " 1 | \n",
402 | " 1 | \n",
403 | " - | \n",
404 | " 1 | \n",
405 | " - | \n",
406 | " - | \n",
407 | "
\n",
408 | " \n",
409 | " 3935 | \n",
410 | " M'Baye Niang | \n",
411 | " Stade Rennes | \n",
412 | " 19/20 | \n",
413 | " Europa Cup | \n",
414 | " 1 | \n",
415 | " 1 | \n",
416 | " - | \n",
417 | " 1 | \n",
418 | " - | \n",
419 | " - | \n",
420 | "
\n",
421 | " \n",
422 | " 3936 | \n",
423 | " Bibras Natcho | \n",
424 | " Partizan Belgrade | \n",
425 | " 19/20 | \n",
426 | " Europa Cup | \n",
427 | " 1 | \n",
428 | " 1 | \n",
429 | " - | \n",
430 | " 1 | \n",
431 | " - | \n",
432 | " - | \n",
433 | "
\n",
434 | " \n",
435 | "
\n",
436 | "
"
437 | ],
438 | "text/plain": [
439 | " name club year league penalties \\\n",
440 | "3927 Claudiu Keseru Ludogorets Razgrad 19/20 Europa Cup 1 \n",
441 | "3928 Adem Ljajic Besiktas 19/20 Europa Cup 1 \n",
442 | "3929 Andraz Sporar Slovan Bratislava 19/20 Europa Cup 1 \n",
443 | "3930 James Tavernier Rangers 19/20 Europa Cup 1 \n",
444 | "3931 Ryan Christie Celtic 19/20 Europa Cup 1 \n",
445 | "3932 Tomas de Vincenti APOEL Nicosia 19/20 Europa Cup 1 \n",
446 | "3933 Bruno Fernandes Sporting Lisbon 19/20 Europa Cup 1 \n",
447 | "3934 Ciprian Deac CFR Cluj 19/20 Europa Cup 1 \n",
448 | "3935 M'Baye Niang Stade Rennes 19/20 Europa Cup 1 \n",
449 | "3936 Bibras Natcho Partizan Belgrade 19/20 Europa Cup 1 \n",
450 | "\n",
451 | " home away scored missed saved \n",
452 | "3927 1 - 1 - - \n",
453 | "3928 - 1 1 - - \n",
454 | "3929 - 1 - - 1 \n",
455 | "3930 1 - - 1 - \n",
456 | "3931 - 1 1 - - \n",
457 | "3932 1 - 1 - - \n",
458 | "3933 - 1 1 - - \n",
459 | "3934 1 - 1 - - \n",
460 | "3935 1 - 1 - - \n",
461 | "3936 1 - 1 - - "
462 | ]
463 | },
464 | "execution_count": 38,
465 | "metadata": {},
466 | "output_type": "execute_result"
467 | }
468 | ],
469 | "source": [
470 | "table.tail(10)"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": 37,
476 | "metadata": {},
477 | "outputs": [],
478 | "source": [
479 | "table.to_csv (r'./all_penalties.csv', index = None, header=True)"
480 | ]
481 | },
482 | {
483 | "cell_type": "code",
484 | "execution_count": null,
485 | "metadata": {},
486 | "outputs": [],
487 | "source": []
488 | }
489 | ],
490 | "metadata": {
491 | "kernelspec": {
492 | "display_name": "Python 3",
493 | "language": "python",
494 | "name": "python3"
495 | },
496 | "language_info": {
497 | "codemirror_mode": {
498 | "name": "ipython",
499 | "version": 3
500 | },
501 | "file_extension": ".py",
502 | "mimetype": "text/x-python",
503 | "name": "python",
504 | "nbconvert_exporter": "python",
505 | "pygments_lexer": "ipython3",
506 | "version": "3.7.3"
507 | }
508 | },
509 | "nbformat": 4,
510 | "nbformat_minor": 2
511 | }
512 |
--------------------------------------------------------------------------------
/R/EPL/penalty/penalty.R:
--------------------------------------------------------------------------------
1 | ################################
2 | #### Data Science Project #####
3 | # Article: #
4 | # https://tinyurl.com/y2ynruqo #
5 | ################################
6 |
7 | library(MASS)
8 | library(tidyverse)
9 | library(betareg)
10 | library(xkcd)
11 |
12 | # read raw_data
13 | raw_data <- read.csv("./all_penalties.csv",
14 | stringsAsFactors = FALSE)
15 |
# Basic cleaning, then aggregate per player.
# The scraped table uses '-' for "none". A column that contains any '-' is
# read as character, so both count columns are recoded to 0 AND coerced to
# numeric before they are summed (sum() fails on character input).
player_data <- raw_data %>%
  mutate(name = str_squish(name),
         penalties = as.numeric(ifelse(penalties == '-', 0, penalties)),
         scored = as.numeric(ifelse(scored == '-', 0, scored))) %>%
  group_by(name) %>%
  # total penalties taken and total converted, per player
  summarise(total = sum(penalties),
            total_score = sum(scored)) %>%
  mutate(ratio = total_score / total) %>%
  # keep players with at least 4 penalties and a ratio strictly inside (0, 1)
  filter(total >= 4 & ratio > 0 & ratio < 1) %>%
  na.omit()
27 |
28 | #### draw xkcd with dataman
29 | xrange = c(0.2,1.0)
30 | yrange = c(0,4)
31 | ratioxy <- diff(xrange) / diff(yrange)
32 | mapping <- aes(x=x,
33 | y=y,
34 | scale=scale,
35 | ratioxy=ratioxy,
36 | angleofspine = angleofspine,
37 | anglerighthumerus = anglerighthumerus,
38 | anglelefthumerus = anglelefthumerus,
39 | anglerightradius = anglerightradius,
40 | angleleftradius = angleleftradius,
41 | anglerightleg = anglerightleg,
42 | angleleftleg = angleleftleg,
43 | angleofneck = angleofneck)
44 |
45 | dataman <- data.frame( x= 0.3, y=3,
46 | scale = 0.5,
47 | ratioxy = ratioxy,
48 | angleofspine = -pi/2,
49 | anglerighthumerus = -pi/6,
50 | anglelefthumerus = -pi/2 -pi/6,
51 | anglerightradius = pi/5,
52 | angleleftradius = pi/5,
53 | angleleftleg = 3*pi/2 + pi / 12 ,
54 | anglerightleg = 3*pi/2 - pi / 12,
55 | angleofneck = runif(1, min = 3 * pi / 2 - pi/10 , max = 3 * pi / 2 + pi/10))
56 |
57 | # draw histogram of conversion rates
58 | player_data %>%
59 | ggplot(aes(ratio)) +
60 | geom_histogram(breaks = 5:25/25,
61 | fill = hcl(0, 50, 80)) +
62 | xkcdaxis(c(0.1,1), c(0,80)) +
63 | labs (x = "\nHistogram of penalties conversion rate", y = "Count") +
64 | theme_xkcd()
65 |
66 | # fit a beta distribution on the histogram
67 | m <- MASS::fitdistr(player_data$ratio, dbeta,
68 | start = list(shape1 = 10, shape2 = 1),
69 | lower=c(0.1,0.1))
70 | alpha0 <- m$estimate[1]
71 | beta0 <- m$estimate[2]
72 |
# Plot the fitted beta density on top of the conversion-rate histogram,
# decorated with an xkcd stick figure and a hand-drawn pointer line.
ggplot(player_data) +
  # density-scaled histogram so it is comparable with the fitted pdf
  geom_histogram(aes(ratio, y = ..density..),
                 breaks = 5:25/25,
                 fill = hcl(0, 50, 80)) +
  # fitted Beta(alpha0, beta0) density
  stat_function(fun = function(x) dbeta(x, alpha0, beta0), color = "red",
                size = 1) +
  xlab("\n Penalty Conversion Rate") +
  theme_xkcd() +
  xkcdaxis(xrange, yrange) +
  xkcdman(mapping, dataman) +
  annotate("text", x=0.4, y = 4,
           label = "Does not look an amazing good fit\nBut it's okay",
           family="xkcd") +
  # line from the stick figure towards the annotation text
  xkcdline(aes(x=xbegin,y=ybegin,xend=xend,yend=yend),
           data.frame(xbegin=0.36,ybegin=3,xend=0.42,yend=3.5),
           xjitteramount = 0.01)
90 |
91 | # adjusted ratio:
92 | adjusted_ratio <- player_data %>%
93 | mutate(eb_estimate = (total_score + alpha0) / (total + alpha0 + beta0)) %>%
94 | arrange(desc(eb_estimate))
95 |
96 | # posterior plots for specific players:
97 | specific_players <- adjusted_ratio %>%
98 | filter(name %in% c("Cristiano Ronaldo",
99 | "Nicolas Pepe",
100 | "Alexis Sanchez",
101 | "Antoine Griezmann")) %>%
102 | mutate(alpha = total_score + alpha0,
103 | beta = total - total_score + beta0)
104 |
105 | # draw posterior beta distribution for these players
106 | specific_players %>%
107 | crossing(x=seq(0.4,0.99,.002)) %>%
108 | ungroup() %>%
109 | mutate(density=dbeta(x,alpha,beta)) %>%
110 | ggplot(aes(x, density, color = name)) +
111 | geom_line() +
112 | stat_function(fun=function(x) dbeta(x, alpha0, beta0), lty = 2, color = 'black') +
113 | xlab("Conversion rate") +
114 | theme_xkcd()
115 |
116 | # draw actual vs adjusted ratio plot
117 | ggplot(adjusted_ratio, aes(ratio, eb_estimate, color = total)) +
118 | geom_hline(yintercept = alpha0 / (alpha0 + beta0), color = "red", lty = 2) +
119 | geom_point() +
120 | geom_abline(color = "red") +
121 | scale_colour_gradient(breaks = c(0,20,30,50,70)) +
122 | xlim(0.5,1) +
123 | ylim(0.5,1) +
124 | xlab("Actual goal scoring average") +
125 | ylab("Posterior goal scoring average")
126 |
127 |
#### When a unimodal beta distribution does not look like a good fit,
#### we can use the E-M algorithm (implemented in the betareg package)
#### to fit a mixture of 2 beta distributions.
# NOTE(review): k = 1:3 asks betamix to consider 1, 2 and 3 components, but
# everything below only uses components 1 and 2 (a[1], a[2], post[,1],
# post[,2]). Confirm that the selected model really has 2 components, or
# pin k = 2.
m<- betamix(ratio ~ 1| 1, data = player_data, k = 1:3)

# per-component coefficients: column 1 is mapped through the inverse logit
# to a mean (mu), column 2 through exp to a precision (phi)
mu <- plogis(coef(m)[,1])
phi <- exp(coef(m)[,2])
# convert (mu, phi) to the usual beta shape parameters
a <- mu*phi
b <- (1-mu)*phi
# get the cluster assigned to each player
cl <- clusters(m)
139 |
140 | # plotting
141 | ## separate histograms for both clusters
142 | ## TODO: convert back to ggplot code
143 | hist(subset(player_data, cl == 1)$ratio, breaks = 5:25/25, freq = FALSE,
144 | col = hcl(0, 50, 80), main = "", xlab = "Penalty Conversion Rate", ylim = c(0, 9))
145 |
146 | hist(subset(player_data, cl == 2)$ratio, breaks = 5:25/25, freq = FALSE,
147 | col = hcl(240, 50, 80), main = "", xlab = "Penalty Conversion Rate", ylim = c(0, 9), add = TRUE)
148 |
149 | ## lines for fitted densities
150 | ys <- seq(0, 1, by = 0.01)
151 | lines(ys, dbeta(ys, shape1 = a[1], shape2 = b[1]),
152 | col = hcl(0, 80, 50), lwd = 2)
153 | lines(ys, dbeta(ys, shape1 = a[2], shape2 = b[2]),
154 | col = hcl(240, 80, 50), lwd = 2)
155 |
156 | ## lines for corresponding means
157 | abline(v = mu[1], col = hcl(0, 80, 50), lty = 2, lwd = 2)
158 | abline(v = mu[2], col = hcl(240, 80, 50), lty = 2, lwd = 2)
159 |
160 | ## repeat Bayesian updating
161 | ## only group specific this time
162 | post <- posterior(m)
163 | post[,1]
# posterior probabilities of being assigned to each group
165 | player_data$post_1 <- post[,1]
166 | player_data$post_2 <- post[,2]
167 |
168 | player_data <- player_data %>%
169 | mutate(shrunkage_1 = (total_score + a[1])/(total + a[1] + b[1]),
170 | shrunkage_2 = (total_score + a[2])/(total + a[2] + b[2]),
171 | shrunkage_ave = (post_1*shrunkage_1 + post_2*shrunkage_2)) %>%
172 | arrange(desc(shrunkage_ave))
173 |
174 | # plot
175 | player_data %>%
176 | gather(type, value, ratio, shrunkage_ave)%>%
177 | mutate(type = ifelse(type == 'ratio',
178 | 'Raw scoring ratio',
179 | 'Average posterior'),
180 | type = relevel(factor(type), 'Raw scoring ratio')) %>%
181 | ggplot(aes(total_score, value)) +
182 | geom_point() +
183 | facet_wrap(~ type) +
184 | ylab("Estimate") +
185 | theme_bw()
186 |
--------------------------------------------------------------------------------
/R/EPL/prediction/clean_data.R:
--------------------------------------------------------------------------------
1 | ########################################
2 | # scripts to clean data to usable format
3 | # pipe directly to sim.R
4 | # source:
5 | # - fixtures.csv: dedicatedexcel.com
6 | # - Historical results: https://www.kaggle.com/thefc17/epl-results-19932018
7 | #########################################
8 | library (dplyr)
9 |
10 | fixtures <- read.csv("fixtures.csv", stringsAsFactors = FALSE)
11 |
12 | # get the team
13 | teams <- unique(fixtures$HOME.TEAM)
14 |
15 | # extract historic results
16 | history <- read.csv("history.csv", stringsAsFactors = FALSE)
17 |
18 | # get info from the 2010 up to 2018
19 | seasons <- sapply(10:17, function(x) paste0(2000+x,'-',x+1))
20 |
21 | recent.pl <- history %>%
22 | filter(Season %in% seasons, div == 'E0')
23 |
# the two datasets come from different sources, so the team names don't match
25 | teams[!teams %in% recent.pl$HomeTeam]
26 | unique(recent.pl$HomeTeam)
27 |
28 | # now we need to fix it
29 | pair_fix <- list(c('Manchester United', 'Man United'), c('Newcastle United', 'Newcastle'),
30 | c('Huddersfield Town', 'Huddersfield'), c('Wolverhampton Wanderers', 'Wolves'),
31 | c('Cardiff City', 'Cardiff'), c('Leicester City', 'Leicester'),
32 | c('Tottenham Hotspur', 'Tottenham'), c('West Ham United', 'West Ham'),
33 | c('Manchester City', "Man City"), c('Brighton and Hove Albion', 'Brighton'))
34 |
# Rename teams in recent.pl: each pair in pair_fix is c(new_name, old_name),
# and every occurrence of old_name (as home or away side) becomes new_name.
for (i in seq_along(pair_fix)){
  recent.pl <- mutate(
    recent.pl,
    HomeTeam = replace(HomeTeam, HomeTeam == pair_fix[[i]][2], pair_fix[[i]][1]),
    AwayTeam = replace(AwayTeam, AwayTeam == pair_fix[[i]][2], pair_fix[[i]][1])
  )
}
41 |
42 |
43 | # a bland average dataframe
44 | ave_home <- recent.pl %>%
45 | group_by(HomeTeam) %>%
46 | summarize (ave_scored_h = mean(FTHG), ave_conceded_h = mean(FTAG)) %>%
47 | filter (HomeTeam %in% teams) %>% rename(Team = HomeTeam)
48 |
49 | ave_away <- recent.pl %>%
50 | group_by(AwayTeam) %>%
51 | summarize (ave_scored_a = mean(FTAG), ave_conceded_a = mean(FTHG)) %>%
52 | filter (AwayTeam %in% teams) %>% rename(Team = AwayTeam)
53 |
54 | ave <- merge(ave_home, ave_away, by = 'Team')
55 |
56 |
57 | # more precise result with pairwise
58 | hist_pair.pl <- recent.pl %>%
59 | group_by(HomeTeam, AwayTeam) %>%
60 | filter (HomeTeam %in% teams, AwayTeam %in% teams) %>%
61 | summarize (match = n(), ave_home_scored = mean(FTHG), ave_away_scored = mean(FTAG))
62 |
63 | rm(history, seasons, pair_fix, ave_home, ave_away)
64 |
--------------------------------------------------------------------------------
/R/EPL/prediction/match_simulate.R:
--------------------------------------------------------------------------------
1 | library (dplyr)
2 | source ('clean_data.R')
3 |
4 | # get most frequent score line of a match after n, sim time
5 | nsim = 100
get_score <- function (home, away, nsim){
  # Simulate one fixture `nsim` times and report the most frequent score line.
  #
  # Args:
  #   home, away: team names, looked up in the globals `hist_pair.pl` and
  #               `ave` (both created by clean_data.R).
  #   nsim:       number of simulated matches.
  #
  # Returns:
  #   list(score_line, count): the modal "h-a" score line and the number of
  #   simulations (out of nsim) that produced it.

  # head-to-head record for exactly this pairing: BOTH the home and the away
  # team must match (same rule as get_score in sim.R)
  pair_hist <- hist_pair.pl[which(hist_pair.pl$HomeTeam == home &
                                    hist_pair.pl$AwayTeam == away), ]

  # The Poisson rates do not change between simulations, so choose them once.
  if (nrow(pair_hist) == 1 && pair_hist$match[1] > 3){
    # at least 4 past meetings: use the head-to-head scoring averages
    lambda_h <- pair_hist$ave_home_scored[1]
    lambda_a <- pair_hist$ave_away_scored[1]
  }
  else{
    # otherwise blend the attacking average of one side with the
    # conceding average of the other
    home_row <- ave[ave$Team == home, ]
    away_row <- ave[ave$Team == away, ]
    lambda_h <- 1/2 * (home_row$ave_scored_h + away_row$ave_conceded_a)
    lambda_a <- 1/2 * (away_row$ave_scored_a + home_row$ave_conceded_h)
  }

  # one slot per simulation
  score_line <- character(nsim)
  for (i in seq_len(nsim)){
    h_scored <- rpois(1, lambda_h)
    a_scored <- rpois(1, lambda_a)
    score_line[i] <- paste0(h_scored, '-', a_scored)
  }

  freq <- table(score_line)
  return (list(names(which.max(freq)), max(freq)))
}
34 |
# Predict the first 10 fixtures: most frequent simulated score line per match.
round_1 <- head(fixtures,10)
matches <- mapply(get_score, round_1$HOME.TEAM, round_1$AWAY.TEAM, nsim, SIMPLIFY = FALSE)
# Extract with [[ ]] so the new columns are plain character / numeric vectors;
# x[1] would keep every value wrapped in a list and create list-columns.
round_1$score_line <- vapply(matches, function(x) x[[1]], character(1), USE.NAMES = FALSE)
# number of the nsim simulations that produced the modal score line
round_1$prob <- vapply(matches, function(x) as.numeric(x[[2]]), numeric(1), USE.NAMES = FALSE)
39 |
--------------------------------------------------------------------------------
/R/EPL/prediction/sim.R:
--------------------------------------------------------------------------------
1 | library (dplyr)
2 | source ('clean_data.R')
3 |
4 | # get score of a match
get_score <- function (home, away){
  # Draw one simulated result for `home` (at home) against `away`.
  # Reads the globals `hist_pair.pl` and `ave` built by clean_data.R and
  # returns list(home_goals, away_goals), each a single Poisson draw.
  same_fixture <- hist_pair.pl$HomeTeam == home & hist_pair.pl$AwayTeam == away
  pair_hist <- hist_pair.pl[which(same_fixture), ]

  # head-to-head averages are used only when there are at least 4 meetings
  enough_history <- (nrow(pair_hist) == 1) & (pair_hist$match[1] > 3)
  if (enough_history){
    rate_home <- pair_hist$ave_home_scored[1]
    rate_away <- pair_hist$ave_away_scored[1]
  }
  else{
    # no usable head-to-head record: average one side's scoring rate
    # with the other side's conceding rate
    home_row <- ave[ave$Team == home,]
    away_row <- ave[ave$Team == away,]
    rate_home <- 1/2 * (home_row$ave_scored_h + away_row$ave_conceded_a)
    rate_away <- 1/2 * (away_row$ave_scored_a + home_row$ave_conceded_h)
  }

  # home goals are drawn first, then away goals
  h_scored <- rpois(1, rate_home)
  a_scored <- rpois(1, rate_away)
  return (list(h_scored, a_scored))
}
23 |
rank <- function (m_result){
  # Build the league table from a set of played fixtures.
  #
  # Args:
  #   m_result: data frame with columns HOME.TEAM, AWAY.TEAM, h_scored and
  #             a_scored (one row per match).
  #
  # Returns:
  #   data frame with one row per entry of the global `teams` and columns
  #   name, goal_score, goal_conceded, point, goal_dif, sorted by points,
  #   then goal difference, then goals scored (all descending).
  #
  # NOTE: this definition masks base::rank for code that looks it up globally.

  # size the table from the team list rather than assuming 20 clubs
  n_teams <- length(teams)
  table <- data.frame(name = teams,
                      goal_score = rep(0, n_teams),
                      goal_conceded = rep(0, n_teams),
                      point = rep(0, n_teams))

  # loop through all the results and update both sides of each match;
  # seq_len() makes an empty result set a no-op
  for (i in seq_len(nrow(m_result))){
    is_home <- table$name == m_result$HOME.TEAM[i]
    is_away <- table$name == m_result$AWAY.TEAM[i]
    h_goal <- m_result$h_scored[i]
    a_goal <- m_result$a_scored[i]

    # goals for / against
    table$goal_score[is_home] <- table$goal_score[is_home] + h_goal
    table$goal_conceded[is_home] <- table$goal_conceded[is_home] + a_goal
    table$goal_score[is_away] <- table$goal_score[is_away] + a_goal
    table$goal_conceded[is_away] <- table$goal_conceded[is_away] + h_goal

    # points: 3 for a win, 1 each for a draw
    if (h_goal > a_goal){
      table$point[is_home] <- table$point[is_home] + 3
    }
    else if (h_goal < a_goal){
      table$point[is_away] <- table$point[is_away] + 3
    }
    else{
      table$point[is_home] <- table$point[is_home] + 1
      table$point[is_away] <- table$point[is_away] + 1
    }
  }

  table$goal_dif <- table$goal_score - table$goal_conceded
  table <- table[order(-table$point, -table$goal_dif, -table$goal_score), ]

  return (table)
}
61 |
simulate <- function(fixtures){
  # Play every fixture once with get_score() and return the league table
  # that rank() builds from the simulated results.
  results <- mapply(get_score, fixtures$HOME.TEAM, fixtures$AWAY.TEAM, SIMPLIFY = FALSE)
  # each result is list(home_goals, away_goals); pull out one side at a time
  fixtures$h_scored <- unlist(sapply(results, "[", 1))
  fixtures$a_scored <- unlist(sapply(results, "[", 2))
  return (rank(fixtures))
}
69 |
70 |
# Monte Carlo driver: simulate the whole season nsim times and count how
# often each team finishes champion, runner-up, top 4, top 6 or bottom 3.
nsim = 10000
# size the counters from the team list instead of assuming 20 clubs
n_teams <- length(teams)
tabulate_data <- data.frame(name = teams,
                            champion = rep(0, n_teams),
                            runner_up = rep(0, n_teams),
                            top_4 = rep(0, n_teams),
                            top_6 = rep(0, n_teams),
                            relegate = rep(0, n_teams))
pb <- txtProgressBar(min = 0, max = nsim, style = 3)

for (sim in 1:nsim){
  # final table of one simulated season, best team first
  table = simulate(fixtures)

  first = table$name[1]
  second = table$name[2]
  first_4 = table$name[1:4]
  first_6 = table$name[1:6]
  # bottom three rows, whatever the length of the table
  last_3 = tail(table$name, 3)

  tabulate_data <- tabulate_data %>%
    mutate(champion = ifelse(name == first, champion+1, champion),
           runner_up = ifelse(name == second, runner_up+1, runner_up),
           top_4 = ifelse(name %in% first_4, top_4+1, top_4),
           top_6 = ifelse(name %in% first_6, top_6+1, top_6),
           relegate = ifelse(name %in% last_3, relegate+1, relegate))
  setTxtProgressBar(pb, sim)
}
# finish the progress-bar line on the console
close(pb)

# convert counts to proportions of the nsim simulated seasons
tabulate_data <- tabulate_data %>%
  mutate (champion = champion/nsim,
          runner_up = runner_up/nsim,
          top_4 = top_4/nsim,
          top_6 = top_6/nsim,
          relegate = relegate/nsim)

# write result into csv
write.csv(tabulate_data, "tabulate_data.csv", row.names = FALSE)
108 |
109 |
110 |
--------------------------------------------------------------------------------
/R/EPL/prediction/visualize.R:
--------------------------------------------------------------------------------
1 | library (dplyr)
2 | library (ggplot2)
3 | library (xkcd)
4 | library (extrafont)
5 |
# Download the xkcd font, copy it into the user's font folder and register it
# with extrafont so that theme_xkcd() can use it.
# NOTE(review): `cp` and ~/Library/Fonts look macOS-specific, although the
# branch below also handles Windows - confirm the intended platforms.
# NOTE(review): the font is fetched over plain http on every run - confirm
# this is acceptable.
download.file("http://simonsoftware.se/other/xkcd.ttf",
              dest="xkcd.ttf", mode="wb")
system("cp xkcd.ttf ~/Library/Fonts")
# import only fonts whose file name matches "xkcd", without the interactive prompt
font_import(path="~/Library/Fonts", pattern = "xkcd", prompt=FALSE)
# list what extrafont now knows about (printed when run interactively)
fonts()
fonttable()
if(.Platform$OS.type != "unix") {
  ## Register fonts for Windows bitmap output
  loadfonts(device="win")
} else {
  loadfonts()
}
18 |
19 | # extract historic results
20 | history <- read.csv("https://raw.githubusercontent.com/tuangauss/Various-projects/master/data/history.csv", stringsAsFactors = FALSE)
21 |
22 | # get info from the 2010 up to 2018
23 | seasons <- sapply(10:17, function(x) paste0(2000+x,'-',x+1))
24 |
25 |
graph_func <- function(season){
  # Plot the observed distribution of total goals per match (bars) for the
  # given season label(s), with the Poisson probabilities for the same mean
  # goal count overlaid (red line, black points).
  # Reads the global `history`; only division 'E0' rows are used.
  title <- if (season[1] == "2017-18") "Last season: 2017-2018" else "From 2010-11 to 2017-18"

  matches <- history %>%
    filter (Season %in% season, div == 'E0') %>%
    mutate (total = FTAG + FTHG)

  n_matches <- nrow(matches)
  ave_score <- mean(matches$total)

  # share of matches for each observed total
  prob_data <- matches %>%
    group_by(total) %>%
    summarize (prob = n()/n_matches)

  # Poisson pmf at each observed total, shared by the line and point layers
  poisson_fit <- aes(x = total, y = dpois(x=total, lambda = ave_score))

  ggplot(data=prob_data, aes(x=total, y=prob)) +
    geom_bar(stat="identity", color="blue", fill="grey") +
    scale_x_continuous(breaks=seq(0,10,1)) +
    geom_line(poisson_fit, col = "red", size = 0.5) +
    geom_point(poisson_fit, col = "black", size = 3) +
    ggtitle(title) + labs (x = "Total Goal", y = "Probability") +
    theme_xkcd()
}
53 |
54 | graph_func(seasons)
55 | graph_func(c('2017-18'))
56 |
--------------------------------------------------------------------------------
/R/EPL/xkcd.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/R/EPL/xkcd.ttf
--------------------------------------------------------------------------------
/R/Paul_hypothesis_test.R:
--------------------------------------------------------------------------------
1 | #######################################
2 | ## Hypothesis testing procedure
3 | ## The curious case of Paul the Octopus
4 | ## Fisher vs N-p approach
5 | #######################################
6 |
# The script is self-contained: no extra packages or libraries are needed
8 |
9 |
# Draw the Binomial(n, p) probability mass function as a bar plot and colour
# the outcomes at or beyond `value` red.
#   n, p  : size and success probability of the binomial distribution
#   value : threshold outcome; bars for outcomes >= value are highlighted
# Returns (invisibly) whatever barplot() returns.
graph <- function(n,p, value){
  x <- seq(0,n)
  prob <- dbinom(x,size=n,prob=p)
  # per outcome: index 1 (grey) below `value`, index 2 (red) at or above it
  cols <- c("grey","red")[(x >= value) + 1]
  barplot(prob, names.arg = x, col = cols,
          main = paste('binomial distribution, size:', n, "prob:", p))
}
18 |
19 | graph(14,0.5,12)
20 |
21 |
# calculate the p-value for the binomial distribution, at x = 12
23 | p_value = 1-pbinom(11,14,0.5)
24 | #p_value = dbinom(12,14,0.5) + dbinom(13,14,0.5) + dbinom(14,14,0.5)
25 |
# Neyman-Pearson approach
# calculate current power
# a. Assuming type 1 error = 0.01
# p_value[k + 1] = P(X > k) for k = 0..14 under Binomial(14, 0.5)
p_value = 1-pbinom(0:14, 14,0.5)
# first k whose tail probability P(X > k) drops below 0.01
# (which() gives a 1-based index, hence the -1)
critical_value = which(p_value == p_value[p_value < 0.01][1])-1
# P(X <= critical_value - 1) under Binomial(14, 0.75)
# NOTE(review): the type 1 error above is P(X > critical_value), while this
# is P(X <= critical_value - 1); the outcome X == critical_value is counted
# in neither. Confirm which rejection region is intended.
type2 = pbinom(critical_value-1,14,0.75)
32 |
# b. More interesting problem
# You should try first before looking up the code
# If we want to achieve type 1 error < 1% and power > 90%, how many observations do we need to make?

# Brute-force search: for each sample size n (up to 50) try every threshold k
# and stop at the first (n, k) pair that meets both error targets.
# NOTE(review): type1 is P(X > k | p = 0.5) but type2 is P(X <= k - 1 | p = 0.75),
# so the outcome X == k is counted in neither error. Confirm the intended
# rejection region before relying on the reported n.
stop = FALSE
for (n in 1:50){
  for (k in 0:n){
    type1 <- 1- pbinom(k,n,0.5)
    type2 <- pbinom(k-1,n,0.75)
    if (type1 < 0.01 & type2 <0.1){
      print (paste("n is ",toString(n),", k is", toString(k)))
      # `break` only leaves the inner loop; the flag ends the outer one too
      stop = TRUE
      break
    }
  }
  if (stop) break
}
# need 42 observations


# if we cut it some slack
# type 1 of 5% and type 2 of 20%
# (same search as above with looser thresholds; the same NOTE(review) applies)
stop = FALSE
for (n in 1:50){
  for (k in 0:n){
    type1 <- 1- pbinom(k,n,0.5)
    type2 <- pbinom(k-1,n,0.75)
    if (type1 < 0.05 & type2 <0.2){
      print (paste("n is ",toString(n),", k is", toString(k)))
      stop = TRUE
      break
    }
  }
  if (stop) break
}
# still need 16 observations
69 |
--------------------------------------------------------------------------------
/R/RuleOfThree.R:
--------------------------------------------------------------------------------
1 | library(ggplot2)
2 |
3 | true_p <- 0.001
iter <- function(size){
  # One simulated experiment for the "rule of three".
  # Draws `size` 0/1 trials in which 1 has probability `true_p` (global),
  # counts the trials seen before the first 1, and returns 3 / count,
  # capped at 1.
  samp <- sample(x = c(1,0),
                 size = size,
                 prob = c(true_p, 1 - true_p),
                 replace = TRUE)
  # position of the first 1, or NA when the sample contains none
  first_event <- match(1, samp)
  # With no 1 at all, every one of the `size` trials was event-free.
  # (which.max() would return 1 for an all-zero sample, i.e. a count of 0.)
  n_clear <- if (is.na(first_event)) size else first_event - 1
  upper_bound <- min(3/n_clear, 1)
  return(upper_bound)
}
13 |
14 | res <- replicate(n = 100000, iter(size = 10000))
15 | sum(res > true_p)
16 |
17 | ggplot() +
18 | aes(res) +
19 | geom_histogram(colour="black", fill="grey", bins = 100) +
20 | geom_vline(aes(xintercept = true_p), color = "red") +
21 | xlim(c(0, .05)) +
22 | theme_bw() +
23 | theme(legend.position = "none") +
24 | labs(x = "Upper Bound")
25 |
--------------------------------------------------------------------------------
/R/bayes_god.R:
--------------------------------------------------------------------------------
1 | ########################
2 | # Illustrative calculation
3 | # Article: A Bayesian quest to find God
4 | # Published: July 19, 2019
5 | ########################
6 |
7 | library (tidyverse)
8 |
9 | # Bayes' rule: updated probability of a hypothesis after one observation.
10 | #   x    : probability of the hypothesis before the observation
11 | #   y_x  : probability of the observation if the hypothesis holds
12 | #   y_nx : probability of the observation if it does not hold
13 | bayes <- function(x, y_x, y_nx){
14 |   weight_true <- y_x * x
15 |   weight_false <- y_nx * (1-x)
16 |   weight_true / (weight_true + weight_false)
17 | }
14 |
15 | bayes(0.01, 1, 1/7)
16 |
17 | days <- seq(0,10,1)
18 |
19 | # One update per day with the same evidence (likelihoods 1 and 1/7); the
20 | # posterior of one day is the prior of the next. Element 1 is the starting prior.
21 | daily_update <- function(prior, day) bayes(prior, 1, 1/7)
22 | posterior <- Reduce(daily_update, days[-1], accumulate = TRUE, 0.01)
23 |
24 | # same updates, starting from a much smaller prior
25 | posterior_2 <- Reduce(daily_update, days[-1], accumulate = TRUE, 0.0001)
26 |
27 | #https://www.datanovia.com/en/blog/ggplot-legend-title-position-and-labels/
28 | df <- data.frame(days, posterior, posterior_2)
29 |
30 | # single trajectory (prior 0.01)
31 | vis1 <- ggplot(df, aes(x=days, y = posterior)) +
32 |   geom_point() +
33 |   scale_x_continuous(breaks = days) +
34 |   labs (title = " Posterior estimate") +
35 |   theme_classic()
36 |
37 | # both trajectories, one colour per starting prior
38 | df_long <- gather(df, prior, value, -days)
39 | vis2 <- ggplot(df_long, aes(x=days, y = value, color = prior)) +
40 |   geom_point() +
41 |   scale_color_discrete(name = "Value of prior \n on Day-1",
42 |                        labels = c(0.01, 0.0001)) +
43 |   scale_x_continuous(breaks = days) +
44 |   labs (title = " Posterior estimate") +
45 |   theme_classic()
50 |
--------------------------------------------------------------------------------
/R/bayesian_gym.R:
--------------------------------------------------------------------------------
1 | # load libraries
2 | library(rjags)
3 | library(dplyr)
4 | library(MASS)
5 | library(ggplot2)
6 |
7 | # load data (semicolon-separated file)
8 | raw_data <- read.csv("~/data/Vietnamese_2016.csv", header = TRUE, sep = ";")
9 | head(raw_data)
10 | summary(raw_data$Age_gr)
11 |
12 | # clean data: keep men aged 18-29 and turn the decimal commas in height and
13 | # weight into decimal points so the columns can be read as numbers
14 | comma_to_number <- function(v) as.numeric(gsub(",", ".", v, fixed = TRUE))
15 | data <- raw_data %>%
16 |   filter(Age_gr == "18-29", Sex == "male") %>%
17 |   dplyr::select(height, weight, BMI) %>%
18 |   mutate(height = comma_to_number(height),
19 |          weight = comma_to_number(weight))
20 |
21 | # my info
22 | m_height <- 168
23 | m_weight <- 58
24 | m_BMI <- m_weight / (m_height/100)^2
25 |
26 | # visualization:
27 | # weight histogram with my weight (black), the median (red) and the mean (orange)
28 | truehist(data$weight,nbins = 50,
29 |          main = paste("Histogram of Vietnamese male weight"), xlab = "Weight in kg")
30 | abline(v=m_weight,col="black", lwd = 4)
31 | abline(v=median(data$weight), col = "red", lty = 4, lwd = 4)
32 | abline(v=mean(data$weight), col ="orange", lty = 4, lwd = 4)
33 | text(m_weight-2, 0.12, "Me!!!")
34 |
35 | # scatter plot shared by the two figures below
36 | my_data <- data.frame(height = m_height, weight = m_weight)
37 | base_scatter <- ggplot(data, aes(height, weight)) +
38 |   geom_point(shape = 16, size = 5, show.legend = FALSE, colour = "blue", alpha = 0.4) +
39 |   theme_minimal()
40 | title_theme <- theme(plot.title = element_text(color="#666666", face="bold", size=20, hjust=0))
41 |
42 | base_scatter +
43 |   geom_point(data = my_data, color ="red", size = 5) +
44 |   labs (title = "Weight versus Height plot of 383 Vietnamese male and Tuan", subtitle = "***Red point is author's own measurement") +
45 |   title_theme
46 |
47 |
48 | # add standard least square line
49 | model <- lm(weight ~ height, data = data) #fit linear model
50 | label_text <- paste('Fitted model: ', round(coef(model)[1], 3), ' + ', round(coef(model)[2], 3), ' x', sep = '')
51 | base_scatter +
52 |   geom_smooth(method = "lm", fullrange=TRUE, color = "red") +
53 |   # annotate() draws the label once; geom_text(aes(...)) with constant values
54 |   # would draw one copy per data row on top of each other
55 |   annotate("text", x = 143, y = 55, label = label_text, hjust = 0, size = 6) +
56 |   geom_point(data = my_data, color ="red", size = 5) +
57 |   labs (title = "Weight versus Height plot of 383 Vietnamese male and Tuan") +
58 |   title_theme
52 |
53 |
54 |
55 | #### Running JAGS model ####
56 | ############################
57 |
58 | n <- nrow(data) #383 data points
59 |
60 | # Linear regression of y on x with normal errors. tau is the precision of the
61 | # errors and sig = 1/sqrt(tau) the matching standard deviation.
62 | mymodel <- "
63 | model{
64 | for(i in 1:n){
65 | y[i] ~ dnorm(a + b*x[i], tau)
66 | }
67 | a ~ dnorm(0, 1e-6)
68 | b ~ dnorm(0, 1e-6)
69 | tau ~ dgamma(.01,.01)
70 | sig <- 1/sqrt(tau)
71 | }
72 | "
73 |
74 | # x = height, y = weight
75 | jags_input <- list(n=n, x=data$height, y=data$weight)
76 | jm <- jags.model(file = textConnection(mymodel), data = jags_input)
77 | cs <- coda.samples(jm, c("a","b","sig"), 11000)
78 | # keep the first chain and drop its first 1000 draws
79 | discard <- 1:1000
80 | sample_data <- as.data.frame(cs[[1]][-discard,])
81 |
82 | # "conditional mean": expected weight at my height, one value per posterior draw
83 | cmean <- with(sample_data, a + b*m_height)
84 |
85 | # percentile of my weight among people of my height, one value per draw
86 | m_perc <- pnorm(q = m_weight, mean = cmean, sd = sample_data$sig)
87 | truehist(m_perc, main = "Posterior distribution for my weight percentile",
88 |          xlab = "percentile", ylab = "Frequency")
89 | mean(m_perc<=0.4)
90 | mean(m_perc)
83 |
84 |
85 | ### What happens if I compare myself to American men
86 | nls_raw <- read.csv("~/data/national_longitudinal_survey.csv", header = TRUE)
87 | # men only; inches -> cm and lbs -> kg
88 | nls_data <- nls_raw %>%
89 |   filter(Gender == "Male") %>%
90 |   mutate (height = Height..inches.*2.54,
91 |           weight = Weight..lbs./2.2046) %>%
92 |   dplyr::select(height,weight)
93 |
94 | #4150 data points
95 | n <- nrow(nls_data)
96 |
97 | # Linear regression of y on x with normal errors; sig = 1/sqrt(tau).
98 | mymodel <- "
99 | model{
100 | for(i in 1:n){
101 | y[i] ~ dnorm(a + b*x[i], tau)
102 | }
103 | a ~ dnorm(0, 1e-6)
104 | b ~ dnorm(0, 1e-6)
105 | tau ~ dgamma(.01,.01)
106 | sig <- 1/sqrt(tau)
107 | }
108 | "
109 |
110 | jags_input <- list(n=n, x=nls_data$height, y=nls_data$weight)
111 | jm <- jags.model(file = textConnection(mymodel), data = jags_input)
112 | cs <- coda.samples(jm, c("a","b","sig"), 11000)
113 | # keep the first chain and drop its first 1000 draws
114 | discard <- 1:1000
115 | sample_data <- as.data.frame(cs[[1]][-discard,])
116 |
117 | # expected weight at my height and the percentile of my weight, per draw
118 | cmean <- with(sample_data, a + b*m_height)
119 | m_perc <- pnorm(q = m_weight, mean = cmean, sd = sample_data$sig)
120 | truehist(m_perc)
115 |
--------------------------------------------------------------------------------
/R/dating_sim.R:
--------------------------------------------------------------------------------
1 | ############################################
2 | ## The Optimal dating strategy
3 | ## Why we should always reject the first 37%
4 | ## An MC simulation
5 | ############################################
6 |
7 | # calculate the theoretical probability of P(S_n,k)
8 | #   x : position from which we start accepting (the first x-1 are rejected)
9 | #   n : total number of candidates (defaults to 100)
10 | # For x == 1 the first candidate is accepted, so the probability is 1/n.
11 | theo_prob <- function(x, n = 100){
12 |   if (x == 1) return (1/n)
13 |   (x-1)/n * sum(1/((x:n)-1))
14 | }
12 |
13 | # a util function to simulate the 'best-partner rank':
14 | # returns the ranks 1..n in a random order
15 | perm_rank <- function(n){
16 |   sample.int(n)
17 | }
18 | # simulation(n) will run a MC simulation for the case of N=n
19 | # returning the optimal M and the corresponding optimal probability
20 | simulation <- function(n){
21 | M_range <- 2:n
22 | niter <- 1000 #for each value of M, we simulate 1000 times
23 |
24 | # declare a vector to store results,
25 | # notice that if M = 1, the probability is 1/100
26 | prob_result <- rep(1/100, 100)
27 |
28 | # do a simulation for each value of M
29 | for (M in M_range){
30 | result <- rep(0, niter)
31 | for (i in 1:niter){
32 | order <- perm_rank(n) #simulate the order
33 | # find the best among the first M-1 that gets rejected
34 | highest_reject <- min(head(order, M-1))
35 | if (highest_reject != 1){
36 | accept <- order[order < highest_reject][1]
37 | # we consider ourselves successful if:
38 | # - rank 1 is not included in the first M-1 candidates
39 | # - rank 1 is the first person who is better than all we have seen
40 | if (accept == 1){
41 | result[i] <- 1
42 | }
43 | }
44 | }
45 | prob_result[M] <- mean(result)
46 | }
47 | return (c(max(prob_result), which.max(prob_result)/n))
48 | }
49 |
50 | # applying simulation(n) to different values of n
51 | # Each n is simulated once, so the probability and the ratio plotted below
52 | # come from the same run (row 1: optimal probability, row 2: optimal M/N).
53 | n_values <- 2:30
54 | sim_results <- sapply(n_values, simulation)
55 |
56 | opt_p <- sim_results[1, ]
57 | plot(n_values, opt_p, ylim = c(0.2,1), main = 'Optimal probability \n P(S_n,k)',
58 |      xlab = 'N', ylab = 'Probability')
59 |
60 | opt_ratio <- sim_results[2, ]
61 | plot(n_values, opt_ratio, ylim = c(0.2,1.1), main = 'Optimal ratio \n M/N',
62 |      xlab = 'N', ylab = 'Ratio')
58 |
--------------------------------------------------------------------------------
/R/end_to_end_projects.R:
--------------------------------------------------------------------------------
1 | # import necessary libraries
2 | library(MASS)
3 | library(dplyr)
4 | library(caret)
5 | library (ggplot2)
6 | library(rpart)
7 | library(e1071)
8 | library (leaps)
9 |
10 | # download and extract dataset from source
11 | link <- "http://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.tgz"
12 | data_dir <- path.expand("~/data")
13 | archive <- file.path(data_dir, "cal_housing.tgz")
14 | # mode = "wb": the archive is binary
15 | download.file(link, destfile = archive, mode = "wb")
16 | # extract the file that was just downloaded, next to it
17 | untar(archive, exdir = data_dir)
18 | # NOTE(review): the rest of the script uses named columns such as
19 | # ocean_proximity and total_bedrooms -- confirm this file provides them.
20 | cal_housing <- read.csv(file.path(data_dir, "CaliforniaHousing", "cal_housing.data"))
15 |
16 | # explore and visualize data
17 | head(cal_housing)
18 | dim(cal_housing)
19 | str(cal_housing)
20 | summary(cal_housing) #very useful
21 |
22 | # categorical var: unique() lists the categories whether the column was read
23 | # as character or as factor (levels() returns NULL for a character column)
24 | unique(as.character(cal_housing$ocean_proximity))
25 |
26 | # plot numerical vars: one histogram per column on a 3 x 3 grid
27 | cal_housing_num <- subset(cal_housing,
28 |                           select = -c(ocean_proximity))
29 | par(mfrow= c(3,3))
30 | for (col_name in names(cal_housing_num)){
31 |   truehist(cal_housing_num[,col_name],
32 |            main = paste("Histogram of ", col_name),
33 |            xlab = NA)
34 | }
35 |
36 | # scatter plot with ggplots
37 | g <- ggplot(cal_housing, aes(x = longitude, y = latitude, colour = median_income))
38 | g + geom_point() + scale_color_distiller(palette = "Spectral") +
39 |   labs(title = "Plot of data points by location and median_income") +
40 |   theme(plot.title = element_text(color="black", size=14, face="bold.italic"))
41 |
42 | # correlation of every numerical variable with the target
43 | cor(cal_housing_num,
44 |     use = "pairwise.complete.obs")[,"median_house_value"]
42 |
43 | #### Data Wrangling #####
44 | #########################
45 |
46 | # dealing with missing data: 3 options -- they are alternatives, use ONE of them
47 | # (running all three in sequence would delete total_bedrooms before imputing it)
48 |
49 | # option 1: delete the column
50 | # cal_housing <- subset(cal_housing, select = -c(total_bedrooms))
51 |
52 | # option 2: remove missing entries
53 | # cal_housing <- cal_housing[complete.cases(cal_housing),]
54 |
55 | # option 3 (used here): impute NAs with a good statistic (eg: median)
56 | bedrooms_median <- median(cal_housing$total_bedrooms, na.rm=TRUE)
57 | cal_housing$total_bedrooms[is.na(cal_housing$total_bedrooms)] <- bedrooms_median
58 |
59 | # other cleaning tasks
60 | # centre and scale a column, returning a plain numeric vector
61 | standardize <- function(v) as.numeric(scale(v))
62 | cal_housing <- cal_housing %>%
63 |   filter(median_house_value < 500000) %>%
64 |   mutate(rooms_per_house = total_rooms / households) %>%
65 |   mutate(population_per_house = population / households) %>%
66 |   mutate(ocean_proximity = as.factor(ocean_proximity)) %>%
67 |   # a plain function instead of the deprecated funs(); the categorical column,
68 |   # the target and median_income (binned later) are left unscaled
69 |   mutate_at(vars(-ocean_proximity, -median_house_value, -median_income), standardize) %>%
70 |   # data.matrix turns the factor into its integer codes
71 |   data.matrix %>% data.frame
62 |
63 |
64 | #### Split to training set and test set ####
65 | ############################################
66 |
67 | # random sampling: 80% of the rows for training, the rest for testing
68 | set.seed(365)
69 | n_rows <- nrow(cal_housing)
70 | train_id <- sample(n_rows, size = 0.8*n_rows)
71 | train_set <- cal_housing[train_id,]
72 | test_set <- cal_housing[-train_id,]
73 | print (paste(nrow(train_set), "train +", nrow(test_set), "test"))
74 |
75 | # stratified sampling
76 | par(mfrow = c(1,2))
77 | truehist(cal_housing$median_income, main = "Histogram of median income", xlab = NA)
78 | # categorize median income: bins of width 2, everything from 5 upwards merged
79 | cal_housing <- cal_housing %>%
80 |   mutate(income_level = factor(pmin(ceiling(median_income/2), 5))) %>%
81 |   select(-median_income)
82 | plot(cal_housing$income_level, main = "Bar plot of income level", xlab = NA)
83 |
84 | train_str_id <- createDataPartition(cal_housing$income_level, p =.8,
85 |                                     list = FALSE, times = 1)
86 | train_str <- cal_housing[train_str_id,]
87 | test_str <- cal_housing[-train_str_id,]
88 | # test to see if we achieve stratified sampling
89 | table(cal_housing$income_level) / nrow(cal_housing)
90 | table(train_str$income_level) / nrow(train_str)
91 |
92 |
93 | #compare performance of 2 sampling methods:
94 | #share of each income level overall, in the stratified test set and in the
95 | #randomly sampled test set
96 | overall <- as.vector(table(cal_housing$income_level) / nrow(cal_housing))
97 | # test_set was built before median_income was dropped, so bin it the same way
98 | normal_sampling <- factor(pmin(ceiling(test_set$median_income/2), 5))
99 | normal_sampling <- as.vector(table(normal_sampling) / length(normal_sampling))
100 | str_sampling <- as.vector(table(test_str$income_level) / nrow(test_str))
101 | # relative error (in %) of each test set against the overall shares
102 | compare <- data.frame(overall, str_sampling, normal_sampling) %>%
103 |   mutate(rand_error = 100*normal_sampling/overall - 100,
104 |          strat_error = 100*str_sampling/overall-100)
105 |
106 | compare
103 |
104 | #### Fit models ####
105 | ####################
106 |
107 | # observed target on the training set, used for every RMSE below
108 | train_target <- train_str$median_house_value
109 |
110 | # linear model
111 | model_lm <- lm(median_house_value~., train_str)
112 | summary(model_lm)
113 | predict_lm_train <- predict(model_lm, train_str)
114 | sqrt(mean((train_target - predict_lm_train)^2)) #RMSE
115 |
116 |
117 | # Decision tree
118 | tree_control <- rpart.control(minsplit = 2, cp=0.001)
119 | model_decision_tree <- rpart(median_house_value~.,
120 |                              data = train_str, method = "anova",
121 |                              control = tree_control)
122 | predict_decision_tree <- predict(model_decision_tree, train_str)
123 | sqrt(mean((train_target - predict_decision_tree)^2)) #RMSE
124 |
125 | #SVM
126 | model_svm <- svm(median_house_value~.,
127 |                  data = train_str, cost = 10)
128 | predict_svm <- predict(model_svm, train_str)
129 | sqrt(mean((train_target - predict_svm)^2)) #RMSE
126 |
127 | #### 10-fold cross validation:
128 | cal_housing_copy <- cal_housing[sample(nrow(cal_housing)),] # randomly shuffle your data
129 | n_obs <- nrow(cal_housing_copy)
130 |
131 | #Create 10 equally sized folds: a fold number for every row
132 | folds <- cut(seq(1,n_obs),
133 |              breaks=10,labels=FALSE)
134 |
135 | #Perform 10 fold cross validation
136 | #Each MSE is a weighted sum over folds, the weight being the fold's share of rows
137 | MSE_lm <- 0
138 | MSE_tree <- 0
139 | MSE_svm <- 0
140 |
141 | for(i in 1:10){
142 |   #Rows of fold i are held out, all other rows are used for fitting
143 |   held_out <- folds == i
144 |   testData <- cal_housing_copy[held_out, ]
145 |   trainData <- cal_housing_copy[!held_out, ]
146 |   fold_weight <- sum(held_out)/n_obs
147 |
148 |   # fit in the models
149 |   lm_model <- lm(median_house_value~., trainData)
150 |   tree_model <- rpart(median_house_value~.,data = trainData, method = "anova",
151 |                       control = rpart.control(minsplit = 2, cp = 0.001))
152 |   svm_model <- svm(median_house_value~.,data = trainData, cost = 10)
153 |
154 |   # mean squared error of a fitted model on the held-out fold
155 |   fold_mse <- function(fit){
156 |     mean((predict(fit, testData) - testData$median_house_value)^2)
157 |   }
158 |
159 |   #update MSE
160 |   MSE_lm <- MSE_lm + fold_weight * fold_mse(lm_model)
161 |   MSE_tree <- MSE_tree + fold_weight * fold_mse(tree_model)
162 |   MSE_svm <- MSE_svm + fold_weight * fold_mse(svm_model)
163 |
164 | }
165 |
166 | # cross-validated RMSE of each model
167 | sqrt(MSE_lm)
168 | sqrt(MSE_tree)
169 | sqrt(MSE_svm)
166 |
167 | #### Tuning parameters ####
168 | ###########################
169 |
170 | # Decision tree: grid over minsplit and cp
171 | tune_tree <- tune.rpart(median_house_value~.,
172 |                         data = train_str, minsplit = c(5,10,15, 20),
173 |                         cp = c(0.1,0.01,0.001,0.0001))
174 | summary(tune_tree)
175 | plot(tune_tree)
176 |
177 | best_tree <- tune_tree$best.model
178 | predict_tree <- predict(best_tree, train_str)
179 | sqrt(mean((train_str$median_house_value - predict_tree)^2)) #RMSE of best tree model
180 |
181 | # SVM: grid over cost and gamma
182 | # gamma = 0 is left out of the grid: it makes the radial kernel constant, so
183 | # that fit carries no information.
184 | # NOTE(review): the original grid was c(0.1,0,1); confirm the intended values.
185 | tune_svm <- tune.svm(median_house_value ~.,
186 |                      data = train_str,
187 |                      cost=10^(-1:2), gamma=c(0.1,1))
188 | summary(tune_svm)
189 | plot(tune_svm)
190 | best_svm <- tune_svm$best.model
191 | predict_svm <- predict (best_svm, train_str)
192 | sqrt(mean((train_str$median_house_value - predict_svm)^2)) #RMSE of best SVM model
190 |
191 |
192 | #### Applying on test set ####
193 | ##############################
194 |
195 | # observed target on the stratified test set
196 | test_target <- test_str$median_house_value
197 |
198 | # RMSE of the tuned tree
199 | predict_tree_final <- predict(best_tree, test_str)
200 | sqrt(mean((test_target - predict_tree_final)^2))
201 |
202 | # RMSE of the tuned SVM
203 | predict_svm_final <- predict(best_svm, test_str)
204 | sqrt(mean((test_target - predict_svm_final)^2))
--------------------------------------------------------------------------------
/R/lindy/Inverse_Random_Sampling.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/R/lindy/Inverse_Random_Sampling.pdf
--------------------------------------------------------------------------------
/R/lindy/lindy_simulation.R:
--------------------------------------------------------------------------------
1 | library(ggplot2)
2 |
3 | # conditional weibull
4 | # maps uniform draws u to total lifetimes given survival up to t_0, then
5 | # subtracts t_0 to get expected years left
6 | #   lambda, kappa : the two Weibull parameters used in the formula below
7 | sample_w <- function(u, lambda, kappa, t_0){
8 |   total_life <- (t_0^kappa - lambda^kappa*log(1-u))^(1/kappa)
9 |   total_life - t_0
10 | }
8 | # conditional pareto
9 | # maps uniform draws u to total lifetimes given survival up to t_0, then
10 | # subtracts t_0 so the result is the time remaining
11 | sample_p <- function(u, t_0, alpha = 2){
12 |   total_life <- t_0*(1-u)^(-1/alpha)
13 |   total_life - t_0
14 | }
12 |
13 | # Mean remaining years for each number of years already passed, under both
14 | # models. Within one year value both models use the same 10000 uniform draws.
15 | # vapply() fills the results directly instead of growing vectors with c(), and
16 | # the years-passed argument is not called `t`, which would mask base::t.
17 | year_range <- seq(0, 80, 10)
18 | mean_remaining <- vapply(year_range, function(years_passed){
19 |   u <- runif(10000)
20 |   c(mean(sample_w(u, 77.1, 5.05, years_passed)),
21 |     mean(sample_p(u, years_passed)))
22 | }, numeric(2))
23 | result_w <- mean_remaining[1, ]
24 | result_p <- mean_remaining[2, ]
25 |
26 | plot(year_range, result_w,
27 |      type = "l", ylim = c(0, 100),
28 |      main="Expected remaining year",
29 |      xlab = "Year passed",
30 |      ylab = "Years remaining",
31 |      bty = "n")
32 | lines(year_range, result_p, col = "green", lty = 2)
33 | # xpd = TRUE so the legend is not clipped to the plot region
34 | par(xpd=TRUE)
35 | legend(x=4.5, y = 100,
36 |        legend=c("Human life time", "Lindy's good"),
37 |        lty=1:2,
38 |        col = c("black", "green"),
39 |        ncol=2)
37 |
38 | # changing shape parameter
39 | # remaining years after the first 20, for three values of alpha, all three
40 | # computed from the same uniform draws
41 | n_draws <- 100000
42 | u <- runif(n_draws)
43 | samps_p1 <- sample_p(u, 20, alpha = 2)
44 | samps_p2 <- sample_p(u, 20, alpha = 1.5)
45 | samps_p3 <- sample_p(u, 20, alpha = 3)
46 | # long format: one row per draw, labelled with its alpha
47 | alpha_labels <- c("alpha = 1.5", "alpha = 2", "alpha = 3")
48 | df <- data.frame(type = rep(alpha_labels, each = n_draws),
49 |                  value = c(samps_p2, samps_p1, samps_p3))
50 | ggplot(df, aes(x=value, fill = type)) +
51 |   geom_density(alpha = .3) +
52 |   xlim(0, 50) +
53 |   labs(title = "pdf of years remaining after the first 20 years",
54 |        x = "Years remaining",
55 |        y = "Probability")
53 |
54 |
55 |
56 | #after 20 years
57 | # remaining years under both models from the same uniform draws, with the
58 | # mean of each marked by a dashed line
59 | n_draws <- 100000
60 | u <- runif(n_draws)
61 | samps_w <- sample_w(u, 77.1, 5.05, 20)
62 | samps_p <- sample_p(u, 20)
63 | mean_w <- mean(samps_w)
64 | mean_p <- mean(samps_p)
65 | df <- data.frame(type = rep(c("Human life time", "Lindy's good"), each = n_draws),
66 |                  value = c(samps_w, samps_p))
67 | ggplot(df, aes(x=value, fill = type)) +
68 |   geom_density(alpha = .3) +
69 |   geom_vline(xintercept = mean_p, linetype = "dashed", color = "blue") +
70 |   geom_vline(xintercept = mean_w, linetype = "dashed", color = "red") +
71 |   xlim(0, 100) +
72 |   labs(title = "pdf of years remaining after the first 20 years",
73 |        x = "Years remaining",
74 |        y = "Probability")
73 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Overview
2 |
3 | In this repository, you will find the source code for various projects that I have worked on or that are still works in progress. The majority of the projects are accompanied by a Medium blog post at [tuannguyen-doan.medium.com](https://tuannguyen-doan.medium.com/). I publish almost exclusively in the Towards Data Science publication through Medium's Partner Program, so please check out these articles as a way to support me and my future projects. Alternatively, you can also find my blog posts on my personal website [here](https://tuangauss.github.io/).
4 |
5 | My interests lie at the intersection of statistical techniques, data visualization and sports (especially football). All the code is written entirely in Python or R. I don't have a strong preference for, or make a concerted effort to code in, a specific language/platform. The decision is mostly based on how well the specific functionalities needed for a project are supported (scraping in Python and data processing with dplyr piping in R).
6 |
7 | ### I. Statistical application:
8 |
9 | #### The statistics of modern football:
10 | A collection of projects that explore the intricate statistical aspect of the Beautiful Game
11 |
12 | - [Empirical Bayes and penalty taking ability](https://towardsdatascience.com/men-of-steel-finding-the-best-penalty-takers-with-empirical-bayes-estimation-aa0e126fb08b) - Using Bayesian statistics to make meaningful comparison between players across Europe.
13 | - [Poisson process and match prediction](https://towardsdatascience.com/o-jogo-bonito-predicting-the-premier-league-with-a-random-model-1b02fa3a7e5a) - Here we learn about the Poisson process and how a random model outperforms football experts with its prediction.
14 | - [The mathematics of football betting strategies](https://towardsdatascience.com/making-big-bucks-with-a-data-driven-sports-betting-strategy-6c21a6869171) - With the Poisson model and some additional help from mathematical research, can we beat the bookies?
15 | - [Fisher vs Neyman-Pearson debate and Paul the Octopus](https://towardsdatascience.com/what-can-an-octopus-tell-us-about-the-biggest-debate-in-statistical-theory-f017295d781f) - We go over the theory (or many theories) of hypothesis testing and see how it applies to the psychic ability of Paul the Octopus.
16 |
17 | #### Statistical theory and its application:
18 |
19 | - [Bayes theorem and a probabilistic argument for God](https://towardsdatascience.com/a-bayesian-quest-to-find-god-b30934972473) - Bayes theory and how people have been using it to justify the necessary existence of God.
20 | - [Dating with probability theory](https://towardsdatascience.com/probability-theory-and-the-optimal-dating-strategy-for-2018-2b75b26fb0b) - Here we explore what probability theory has to say about the most optimal strategy to find the love of your life.
21 | - [Bayes theorem and why it matters to my workout routine](https://towardsdatascience.com/how-bayesian-statistics-convinced-me-to-hit-the-gym-fa737b0a7ac) - A lightweight introduction to Bayes' theorem and how it helps convince me to hit the gym.
22 | - [The Rule of Three and its application](https://towardsdatascience.com/the-rule-of-three-calculating-the-probability-of-events-that-have-not-yet-occurred-106144dc2c39) - A short introduction of the Rule of Three and how we can apply it to calculate the probability of events that have yet to happen. Application in voting, vaccine development, product quality monitoring, etc.
23 | - [Lindy's effect](https://towardsdatascience.com/a-statistical-rule-to-optimize-your-life-the-lindys-effect-96d2c75b080d) - A (slightly) mathematical description of the Lindy's effect and how one can use it as a guide for life.
24 | - [Normal Distribution with High Dimensionality](https://towardsdatascience.com/disney-movies-were-right-we-are-all-special-and-statistically-so-3bb56e79ab71) - A statistical investigation into the myth of the "average Joe."
25 | - [Mark-Recapture method](https://medium.com/towards-data-science/the-statistical-theory-behind-why-your-instagram-posts-have-so-few-likes-31f46d03448b) - An intro to the statistics behind sampling theory and how you can use it to count *almost* everything
26 |
27 | ### II. External Collaborations:
28 |
29 | #### Published papers:
30 | - [A robust and scalable method to compare Percentile metrics in online experiments (Quora Data Blog, 2022)](https://quoradata.quora.com/A-Robust-and-Scalable-method-to-compare-Percentile-Metrics-in-online-experiments) - Conducting statistical tests for Percentile metrics can be tricky, as they have less neat mathematical properties than other more common metrics, such as averages or ratios. I discuss Quora's method to A/B test these metrics in a statistically valid and scalable manner.
31 | - [How social learning amplifies moral outrage expression in online social networks (Science Advances, 2021)](https://www.science.org/doi/pdf/10.1126/sciadv.abe5641) - Moral outrage shapes fundamental aspects of social life and is now widespread in online social networks. Here, we show how social learning processes amplify online moral outrage expressions over time.
32 | - [Application of machine learning models in predicting length of stay among healthcare workers in underserved communities in South Africa (Human Resources for Health, 2018)](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC6293620/) - We aim to use machine learning methods to predict health professional’s length of practice in the rural public healthcare sector based on their demographic information.
33 |
34 | ### III. General tutorials with Python and R:
35 |
36 | #### Data visualization:
37 | - [NetworkX and Basemap](https://towardsdatascience.com/catching-that-flight-visualizing-social-network-with-networkx-and-basemap-ce4a0d2eaea6) - Here is a comprehensive tutorial of how we can visualize geographical data with powerful tools that support Python.
38 | - [Tkinter and Python](https://towardsdatascience.com/having-your-own-fun-how-to-build-your-own-macys-firework-show-with-python-and-tkinter-79cc31631b44) - Building your own firework shows with Tkinter (and some math chops).
39 | - [Data visualization with Matplotlib and Seaborn](https://towardsdatascience.com/advanced-sports-visualization-with-pandas-matplotlib-and-seaborn-9c16df80a81b) - Learn how to construct publish-worthy visualizations with Matplotlib and Seaborn packages.
40 |
41 | #### Machine Learning practicals:
42 | - [End-to-end Machine Learning project with R](https://github.com/tuangauss/DataScienceProjects/blob/master/R/end_to_end_projects.R) - Here is a full data science project that covers data collection, cleaning, visualization, machine learning and validation.
43 | - [Unsupervised Learning - Clustering method with R](https://github.com/tuangauss/DataScienceProjects/blob/master/R/EPL/Misc/TeamEvaluate2015.R) - An introduction to an array of unsupervised learning algorithms: Hierarchical clustering, k-means, and Factor Analysis.
44 | - [Collaborative Filtering with Python](https://towardsdatascience.com/building-my-own-2021-book-recommendation-engine-903ea10d5021) - A comprehensive guide to the mathematical details and implementation of popular Matrix Factorization methods.
45 |
46 |
47 |
--------------------------------------------------------------------------------
/data/Team2015season.csv:
--------------------------------------------------------------------------------
1 | Team,DribbledPast,Interception,Fouled,Fouls,Yellow,Red,Clearance,Total block,Total goals,Successful Dribble,Total Aerial challenges,AccLB,InAccLB,AccSP,InAccSP,Possession per game,Average salary per week_1,Summer transfer_1,Net spend on transfer,Average attendance per game
2 | Arsenal,376,767,418,376,68,2,1004,610,69,599,1390,877,871,16342,2446,55.7,166000,78000000,46400000,59992
3 | Manchester City,482,571,297,445,77,2,872,670,82,434,1188,902,1031,17790,2381,57,205000,53000000,32000000,45365
4 | Chelsea,387,370,432,382,73,4,1026,564,72,489,1299,1153,1118,15709,2289,54.1,192000,88500000,-800000,41546
5 | Manchester United,302,680,395,453,64,5,1117,513,60,374,1188,1736,1233,16407,1933,58.8,215000,145500000,122000000,75335
6 | Southampton,336,628,396,469,57,3,879,510,49,265,1308,1325,1397,13170,2289,51.9,63000,57500000,-30700000,30652
7 | Liverpool,324,441,466,385,66,3,1177,590,48,438,1161,1052,1097,14988,2240,54.4,144000,117000000,35700000,44659
8 | Tottenham,315,678,383,442,79,4,1045,527,56,413,1232,1113,1248,14575,2392,55.3,100000,27000000,-6500000,35728
9 | Stoke,366,501,409,487,82,1,1170,502,47,357,1655,1166,1257,10747,2206,50.2,61000,3000000,400000,27081
10 | Swansea,372,692,463,398,48,5,1202,426,43,351,1205,1092,1201,13586,1998,50.7,63000,5500000,1300000,20555
11 | West Ham,330,507,419,420,64,2,1474,489,43,260,1608,1098,1448,9022,1998,46.7,64000,23800000,31000000,34871
12 | Leicester,362,743,348,456,50,4,1179,457,45,349,1960,1022,1683,8460,2177,44.8,36300,8000000,8000000,31693
13 | Crystal Palace,340,601,434,527,63,4,1269,442,46,357,1759,945,1665,7435,2024,42.7,46000,12900000,11000000,24421
14 | Everton,341,444,450,390,66,2,1285,483,46,392,1228,1270,1289,14236,2008,53.6,69000,32800000,32800000,38406
15 | Burnley,403,565,398,407,64,2,1473,430,27,243,1888,1296,1724,8292,2315,44,21000,1500000,10000000,19131
16 | West Bromwich Albion,352,588,406,423,64,3,1090,411,37,241,1303,1054,1674,9771,1932,44.9,65000,14000000,13600000,25064
17 | Queens Park Rangers,387,561,392,447,75,3,1445,535,39,307,1732,1307,1698,8362,2294,46.3,78000,28000000,21000000,17809
18 | Hull,378,606,401,451,73,6,1317,429,32,255,1619,1171,1554,10192,1942,45.6,43000,32900000,24400000,23557
19 | Sunderland,436,579,447,441,94,2,993,408,30,341,1176,1081,1404,9922,2043,46.2,70000,14000000,9500000,43157
20 | Newcastle United,321,643,441,434,65,7,1025,468,40,444,1549,952,1424,10593,2052,48.1,78000,29500000,24900000,50359
21 | Aston Villa,286,484,422,401,70,7,1129,417,31,287,1400,1323,1419,11724,2051,49.1,69000,4700000,5900000,34133
22 | Juventus,300,555,501,525,73,5,762,602,71,405,940,1288,843,16023,2249,55.7,122000,46000000,11000000,38553
23 | Lazio,355,751,456,678,104,9,934,551,70,375,1215,1229,1259,12717,2185,52.7,55100,12250000,12750000,35500
24 | Inter,336,632,485,560,95,5,833,615,58,433,1142,952,874,16494,2242,57.8,70200,15700000,-3150000,37372
25 | Roma,295,670,511,521,103,5,910,525,53,420,894,1547,1080,15893,1990,57.5,94500,98600000,9075000,40118
26 | Fiorentina,360,626,555,538,82,3,853,616,61,383,977,1261,1031,14957,1996,55.9,56000,9100000,-24000000,30266
27 | Napoli,380,641,484,524,78,6,842,624,65,303,1062,1187,1153,15406,2213,54.1,67000,34050000,29490000,32266
28 | Genoa,301,592,622,636,103,6,1034,489,58,394,1042,1029,1201,11326,2191,51.2,35500,4340000,10160000,20882
29 | Sampdoria,297,559,600,615,97,3,1043,503,46,346,949,1363,1125,10715,1901,49,38500,24050000,-1700000,22276
30 | Palermo,354,522,549,582,84,4,1048,479,52,403,1144,1101,1201,11345,2182,49,24300,19700000,2030000,17466
31 | Torino,291,617,529,545,97,5,806,546,48,339,975,1340,1092,11818,1845,48.8,22500,34200000,3500000,17234
32 | AC Milan,261,682,615,541,81,13,905,450,54,294,898,1392,1225,12564,1906,51.2,97000,20600000,-15600000,36730
33 | Sassuolo,382,732,668,586,101,9,1231,448,46,264,1121,1069,1369,9976,1927,47.2,28000,16000000,2200000,13086
34 | Chievo,338,665,579,625,82,3,926,458,27,258,1162,1234,1449,8069,2071,43.6,18300,8000000,2510000,10652
35 | Empoli,414,371,610,492,58,4,953,516,43,424,1015,799,1207,13920,2437,51.4,11000,0,3750000,9229
36 | Cagliari,353,585,621,595,109,6,934,495,45,445,948,826,1054,12505,2343,49.1,16000,20200000,1700000,10551
37 | Verona,357,565,511,523,97,7,1159,461,48,239,1005,1142,1159,9497,1774,43.9,20500,16700000,-6200000,19312
38 | Udinese,321,503,504,532,91,6,1151,426,41,277,1290,1093,1484,9529,2007,45.2,25000,29200000,14850000,9132
39 | Atalanta,336,603,604,630,103,9,1041,461,37,254,1298,1158,1312,9702,2183,46.2,19700,0,-5400000,15673
40 | Cesena,370,563,557,632,102,4,1167,360,34,232,1303,1325,1294,9231,2243,45.2,11500,825000,950000,16204
41 | Parma,357,607,484,662,100,8,853,418,31,270,904,1218,1316,10543,2060,45.2,26200,4320000,-6360000,11758
42 | Barcelona,260,496,557,371,67,3,494,625,108,527,739,1315,682,21439,2371,65.3,347900,166000000,85000000,77632
43 | Real Madrid,343,482,544,406,85,3,658,686,116,477,925,1285,803,16083,1952,56.2,328000,135500000,21500000,73081
44 | Atletico Madrid,372,606,539,544,110,5,871,437,65,270,1596,1034,1242,10810,2443,49.4,105000,117600000,29600000,46603
45 | Valencia,294,683,613,568,103,9,1026,441,65,313,1351,1242,1223,10432,2102,50.7,73000,53300000,850000,43205
46 | Sevilla,350,614,517,609,116,2,811,463,68,320,1283,1092,1347,10597,2306,49.9,78600,20950000,-29600000,30671
47 | Villarreal,248,721,443,526,94,2,993,507,48,278,1302,1047,1287,10524,2535,48.9,44200,18700000,-3800000,16040
48 | Celta Vigo,334,673,495,586,117,5,771,499,47,375,1452,1233,1392,13577,2409,55.2,18700,0,0,19156
49 | Athletic Club,320,751,511,532,96,4,848,443,41,270,1699,1100,1645,10929,2753,50.8,38900,1000000,-35000000,43454
50 | Real Sociedad,261,606,559,514,88,1,1040,429,41,265,1533,1246,1366,10110,2415,50.3,49900,12000000,-33200000,22103
51 | Espanyol,328,637,535,594,111,6,964,368,47,266,1405,1013,1496,8546,2208,46.4,32200,2200000,-8800000,18693
52 | Rayo Vallecano,320,658,525,554,106,6,760,492,46,301,1421,1396,1368,12218,2309,55.4,15800,500000,150000,10628
53 | Malaga,400,631,509,571,107,8,830,461,40,297,1248,1157,1324,9233,2285,48.9,27500,1800000,-17600000,24530
54 | Deportivo La Coruna,351,748,498,554,103,4,892,399,33,284,1348,1044,1432,9884,2216,48.3,15300,200000,200000,21271
55 | Getafe,332,757,474,542,96,4,901,421,32,364,1301,1069,1439,9340,2169,47.3,18700,0,-8000000,7753
56 | Almeria,270,660,518,560,107,11,1021,394,33,300,1296,1117,1490,7317,2019,46.1,11700,0,-3000000,10405
57 | Eibar,343,653,410,568,103,5,963,354,33,216,1515,1257,1682,7366,2335,44.6,12800,162000,-5162000,4780
58 | Cordoba,338,692,496,487,95,9,1029,389,21,337,1433,1014,1383,9839,2126,47.4,16300,500000,500000,16126
59 | Granada,252,668,488,572,112,5,976,401,29,337,1307,1091,1331,8511,1992,46.9,22800,5750000,-10000000,17248
60 | Levante,296,688,480,558,107,3,835,404,34,286,1495,1092,1555,8046,2153,44.1,17000,0,0,15267
61 | Elche,318,562,485,588,103,6,834,407,35,248,1287,1279,1564,9889,2138,47.9,12500,1350000,-4650000,21684
62 |
--------------------------------------------------------------------------------
/data/fixtures.csv:
--------------------------------------------------------------------------------
1 | DIVISION,DATE,TIME,FIXTURE,HOME TEAM,AWAY TEAM
2 | EPL,8/10/2018,20:00,Manchester United V Leicester City,Manchester United,Leicester City
3 | EPL,8/11/2018,12:30,Newcastle United V Tottenham Hotspur,Newcastle United,Tottenham Hotspur
4 | EPL,8/11/2018,15:00,Bournemouth V Cardiff City,Bournemouth,Cardiff City
5 | EPL,8/11/2018,15:00,Fulham V Crystal Palace,Fulham,Crystal Palace
6 | EPL,8/11/2018,15:00,Huddersfield Town V Chelsea,Huddersfield Town,Chelsea
7 | EPL,8/11/2018,15:00,Watford V Brighton and Hove Albion,Watford,Brighton and Hove Albion
8 | EPL,8/11/2018,17:30,Wolverhampton Wanderers V Everton,Wolverhampton Wanderers,Everton
9 | EPL,8/12/2018, 13:30,Liverpool V West Ham United,Liverpool,West Ham United
10 | EPL,8/12/2018, 13:30,Southampton V Burnley,Southampton,Burnley
11 | EPL,8/12/2018, 16:00,Arsenal V Manchester City,Arsenal,Manchester City
12 | EPL,8/18/2018, 12:30,Cardiff City V Newcastle United,Cardiff City,Newcastle United
13 | EPL,8/18/2018, 15:00,Everton V Southampton,Everton,Southampton
14 | EPL,8/18/2018, 15:00,Leicester City V Wolverhampton Wanderers,Leicester City,Wolverhampton Wanderers
15 | EPL,8/18/2018, 15:00,Tottenham Hotspur V Fulham,Tottenham Hotspur,Fulham
16 | EPL,8/18/2018, 15:00,West Ham United V Bournemouth,West Ham United,Bournemouth
17 | EPL,8/18/2018, 17:30,Chelsea V Arsenal,Chelsea,Arsenal
18 | EPL,8/19/2018, 13:30,Burnley V Watford,Burnley,Watford
19 | EPL,8/19/2018, 13:30,Manchester City V Huddersfield Town,Manchester City,Huddersfield Town
20 | EPL,8/19/2018, 16:00,Brighton and Hove Albion V Manchester United,Brighton and Hove Albion,Manchester United
21 | EPL,8/20/2018, 20:00,Crystal Palace V Liverpool,Crystal Palace,Liverpool
22 | EPL,8/25/2018, 12:30,Wolverhampton Wanderers V Manchester City,Wolverhampton Wanderers,Manchester City
23 | EPL,8/25/2018, 15:00,Arsenal V West Ham United,Arsenal,West Ham United
24 | EPL,8/25/2018, 15:00,Bournemouth V Everton,Bournemouth,Everton
25 | EPL,8/25/2018, 15:00,Fulham V Burnley,Fulham,Burnley
26 | EPL,8/25/2018, 15:00,Huddersfield Town V Cardiff City,Huddersfield Town,Cardiff City
27 | EPL,8/25/2018, 15:00,Southampton V Leicester City,Southampton,Leicester City
28 | EPL,8/25/2018, 17:30,Liverpool V Brighton and Hove Albion,Liverpool,Brighton and Hove Albion
29 | EPL,8/26/2018, 13:30,Watford V Crystal Palace,Watford,Crystal Palace
30 | EPL,8/26/2018, 16:00,Newcastle United V Chelsea,Newcastle United,Chelsea
31 | EPL,8/27/2018, 20:00,Manchester United V Tottenham Hotspur,Manchester United,Tottenham Hotspur
32 | EPL,9/1/2018, 12:30,Leicester City V Liverpool,Leicester City,Liverpool
33 | EPL,9/1/2018, 15:00,Brighton and Hove Albion V Fulham,Brighton and Hove Albion,Fulham
34 | EPL,9/1/2018, 15:00,Burnley V Manchester United,Burnley,Manchester United
35 | EPL,9/1/2018, 15:00,Chelsea V Bournemouth,Chelsea,Bournemouth
36 | EPL,9/1/2018, 15:00,Crystal Palace V Southampton,Crystal Palace,Southampton
37 | EPL,9/1/2018, 15:00,Everton V Huddersfield Town,Everton,Huddersfield Town
38 | EPL,9/1/2018, 15:00,West Ham United V Wolverhampton Wanderers,West Ham United,Wolverhampton Wanderers
39 | EPL,9/1/2018, 17:30,Manchester City V Newcastle United,Manchester City,Newcastle United
40 | EPL,9/2/2018, 13:30,Cardiff City V Arsenal,Cardiff City,Arsenal
41 | EPL,9/2/2018, 16:00,Watford V Tottenham Hotspur,Watford,Tottenham Hotspur
42 | EPL,9/15/2018, 12:30,Tottenham Hotspur V Liverpool,Tottenham Hotspur,Liverpool
43 | EPL,9/15/2018, 15:00,Bournemouth V Leicester City,Bournemouth,Leicester City
44 | EPL,9/15/2018, 15:00,Chelsea V Cardiff City,Chelsea,Cardiff City
45 | EPL,9/15/2018, 15:00,Huddersfield Town V Crystal Palace,Huddersfield Town,Crystal Palace
46 | EPL,9/15/2018, 15:00,Manchester City V Fulham,Manchester City,Fulham
47 | EPL,9/15/2018, 15:00,Newcastle United V Arsenal,Newcastle United,Arsenal
48 | EPL,9/15/2018, 17:30,Watford V Manchester United,Watford,Manchester United
49 | EPL,9/16/2018, 13:30,Wolverhampton Wanderers V Burnley,Wolverhampton Wanderers,Burnley
50 | EPL,9/16/2018, 16:00,Everton V West Ham United,Everton,West Ham United
51 | EPL,9/17/2018, 20:00,Southampton V Brighton and Hove Albion,Southampton,Brighton and Hove Albion
52 | EPL,9/22/2018, 12:30,Fulham V Watford,Fulham,Watford
53 | EPL,9/22/2018, 15:00,Burnley V Bournemouth,Burnley,Bournemouth
54 | EPL,9/22/2018, 15:00,Cardiff City V Manchester City,Cardiff City,Manchester City
55 | EPL,9/22/2018, 15:00,Crystal Palace V Newcastle United,Crystal Palace,Newcastle United
56 | EPL,9/22/2018, 15:00,Leicester City V Huddersfield Town,Leicester City,Huddersfield Town
57 | EPL,9/22/2018, 15:00,Liverpool V Southampton,Liverpool,Southampton
58 | EPL,9/22/2018, 15:00,Manchester United V Wolverhampton Wanderers,Manchester United,Wolverhampton Wanderers
59 | EPL,9/22/2018, 17:30,Brighton and Hove Albion V Tottenham Hotspur,Brighton and Hove Albion,Tottenham Hotspur
60 | EPL,9/23/2018, 13:30,West Ham United V Chelsea,West Ham United,Chelsea
61 | EPL,9/23/2018, 16:00,Arsenal V Everton,Arsenal,Everton
62 | EPL,9/29/2018, 12:30,West Ham United V Manchester United,West Ham United,Manchester United
63 | EPL,9/29/2018, 15:00,Arsenal V Watford,Arsenal,Watford
64 | EPL,9/29/2018, 15:00,Everton V Fulham,Everton,Fulham
65 | EPL,9/29/2018, 15:00,Huddersfield Town V Tottenham Hotspur,Huddersfield Town,Tottenham Hotspur
66 | EPL,9/29/2018, 15:00,Manchester City V Brighton and Hove Albion,Manchester City,Brighton and Hove Albion
67 | EPL,9/29/2018, 15:00,Newcastle United V Leicester City,Newcastle United,Leicester City
68 | EPL,9/29/2018, 15:00,Wolverhampton Wanderers V Southampton,Wolverhampton Wanderers,Southampton
69 | EPL,9/29/2018, 17:30,Chelsea V Liverpool,Chelsea,Liverpool
70 | EPL,9/30/2018, 16:00,Cardiff City V Burnley,Cardiff City,Burnley
71 | EPL,10/1/2018, 20:00,Bournemouth V Crystal Palace,Bournemouth,Crystal Palace
72 | EPL,10/5/2018, 20:00,Brighton and Hove Albion V West Ham United,Brighton and Hove Albion,West Ham United
73 | EPL,10/6/2018, 15:00,Burnley V Huddersfield Town,Burnley,Huddersfield Town
74 | EPL,10/6/2018, 15:00,Crystal Palace V Wolverhampton Wanderers,Crystal Palace,Wolverhampton Wanderers
75 | EPL,10/6/2018, 15:00,Leicester City V Everton,Leicester City,Everton
76 | EPL,10/6/2018, 15:00,Tottenham Hotspur V Cardiff City,Tottenham Hotspur,Cardiff City
77 | EPL,10/6/2018, 15:00,Watford V Bournemouth,Watford,Bournemouth
78 | EPL,10/6/2018, 17:30,Manchester United V Newcastle United,Manchester United,Newcastle United
79 | EPL,10/7/2018, 12:00,Fulham V Arsenal,Fulham,Arsenal
80 | EPL,10/7/2018, 14:15,Southampton V Chelsea,Southampton,Chelsea
81 | EPL,10/7/2018, 16:30,Liverpool V Manchester City,Liverpool,Manchester City
82 | EPL,10/20/2018, 12:30,Chelsea V Manchester United,Chelsea,Manchester United
83 | EPL,10/20/2018, 15:00,Bournemouth V Southampton,Bournemouth,Southampton
84 | EPL,10/20/2018, 15:00,Cardiff City V Fulham,Cardiff City,Fulham
85 | EPL,10/20/2018, 15:00,Manchester City V Burnley,Manchester City,Burnley
86 | EPL,10/20/2018, 15:00,Newcastle United V Brighton and Hove Albion,Newcastle United,Brighton and Hove Albion
87 | EPL,10/20/2018, 15:00,West Ham United V Tottenham Hotspur,West Ham United,Tottenham Hotspur
88 | EPL,10/20/2018, 15:00,Wolverhampton Wanderers V Watford,Wolverhampton Wanderers,Watford
89 | EPL,10/20/2018, 17:30,Huddersfield Town V Liverpool,Huddersfield Town,Liverpool
90 | EPL,10/21/2018, 16:00,Everton V Crystal Palace,Everton,Crystal Palace
91 | EPL,10/22/2018, 20:00,Arsenal V Leicester City,Arsenal,Leicester City
92 | EPL,10/27/2018, 12:30,Manchester United V Everton,Manchester United,Everton
93 | EPL,10/27/2018, 15:00,Brighton and Hove Albion V Wolverhampton Wanderers,Brighton and Hove Albion,Wolverhampton Wanderers
94 | EPL,10/27/2018, 15:00,Fulham V Bournemouth,Fulham,Bournemouth
95 | EPL,10/27/2018, 15:00,Liverpool V Cardiff City,Liverpool,Cardiff City
96 | EPL,10/27/2018, 15:00,Southampton V Newcastle United,Southampton,Newcastle United
97 | EPL,10/27/2018, 15:00,Watford V Huddersfield Town,Watford,Huddersfield Town
98 | EPL,10/27/2018, 17:30,Leicester City V West Ham United,Leicester City,West Ham United
99 | EPL,10/28/2018, 13:30,Burnley V Chelsea,Burnley,Chelsea
100 | EPL,10/28/2018, 13:30,Crystal Palace V Arsenal,Crystal Palace,Arsenal
101 | EPL,10/28/2018, 16:00,Tottenham Hotspur V Manchester City,Tottenham Hotspur,Manchester City
102 | EPL,11/3/2018, 12:30,Bournemouth V Manchester United,Bournemouth,Manchester United
103 | EPL,11/3/2018, 15:00,Cardiff City V Leicester City,Cardiff City,Leicester City
104 | EPL,11/3/2018, 15:00,Everton V Brighton and Hove Albion,Everton,Brighton and Hove Albion
105 | EPL,11/3/2018, 15:00,Manchester City V Southampton,Manchester City,Southampton
106 | EPL,11/3/2018, 15:00,Newcastle United V Watford,Newcastle United,Watford
107 | EPL,11/3/2018, 15:00,West Ham United V Burnley,West Ham United,Burnley
108 | EPL,11/3/2018, 17:30,Arsenal V Liverpool,Arsenal,Liverpool
109 | EPL,11/4/2018, 13:30,Wolverhampton Wanderers V Tottenham Hotspur,Wolverhampton Wanderers,Tottenham Hotspur
110 | EPL,11/4/2018, 16:00,Chelsea V Crystal Palace,Chelsea,Crystal Palace
111 | EPL,11/5/2018, 20:00,Huddersfield Town V Fulham,Huddersfield Town,Fulham
112 | EPL,11/10/2018, 12:30,Cardiff City V Brighton and Hove Albion,Cardiff City,Brighton and Hove Albion
113 | EPL,11/10/2018, 15:00,Huddersfield Town V West Ham United,Huddersfield Town,West Ham United
114 | EPL,11/10/2018, 15:00,Leicester City V Burnley,Leicester City,Burnley
115 | EPL,11/10/2018, 15:00,Newcastle United V Bournemouth,Newcastle United,Bournemouth
116 | EPL,11/10/2018, 15:00,Southampton V Watford,Southampton,Watford
117 | EPL,11/10/2018, 17:30,Crystal Palace V Tottenham Hotspur,Crystal Palace,Tottenham Hotspur
118 | EPL,11/11/2018, 12:00,Liverpool V Fulham,Liverpool,Fulham
119 | EPL,11/11/2018, 14:15,Chelsea V Everton,Chelsea,Everton
120 | EPL,11/11/2018, 16:30,Arsenal V Wolverhampton Wanderers,Arsenal,Wolverhampton Wanderers
121 | EPL,11/11/2018, 16:30,Manchester City V Manchester United,Manchester City,Manchester United
122 | EPL,11/24/2018, 15:00,Brighton and Hove Albion V Leicester City,Brighton and Hove Albion,Leicester City
123 | EPL,11/24/2018, 15:00,Everton V Cardiff City,Everton,Cardiff City
124 | EPL,11/24/2018, 15:00,Fulham V Southampton,Fulham,Southampton
125 | EPL,11/24/2018, 15:00,Manchester United V Crystal Palace,Manchester United,Crystal Palace
126 | EPL,11/24/2018, 15:00,Watford V Liverpool,Watford,Liverpool
127 | EPL,11/24/2018, 15:00,West Ham United V Manchester City,West Ham United,Manchester City
128 | EPL,11/24/2018, 17:30,Tottenham Hotspur V Chelsea,Tottenham Hotspur,Chelsea
129 | EPL,11/25/2018, 13:30,Bournemouth V Arsenal,Bournemouth,Arsenal
130 | EPL,11/25/2018, 16:00,Wolverhampton Wanderers V Huddersfield Town,Wolverhampton Wanderers,Huddersfield Town
131 | EPL,11/26/2018, 20:00,Burnley V Newcastle United,Burnley,Newcastle United
132 | EPL,12/1/2018, 15:00,Arsenal V Tottenham Hotspur,Arsenal,Tottenham Hotspur
133 | EPL,12/1/2018, 15:00,Cardiff City V Wolverhampton Wanderers,Cardiff City,Wolverhampton Wanderers
134 | EPL,12/1/2018, 15:00,Chelsea V Fulham,Chelsea,Fulham
135 | EPL,12/1/2018, 15:00,Crystal Palace V Burnley,Crystal Palace,Burnley
136 | EPL,12/1/2018, 15:00,Huddersfield Town V Brighton and Hove Albion,Huddersfield Town,Brighton and Hove Albion
137 | EPL,12/1/2018, 15:00,Leicester City V Watford,Leicester City,Watford
138 | EPL,12/1/2018, 15:00,Liverpool V Everton,Liverpool,Everton
139 | EPL,12/1/2018, 15:00,Manchester City V Bournemouth,Manchester City,Bournemouth
140 | EPL,12/1/2018, 15:00,Newcastle United V West Ham United,Newcastle United,West Ham United
141 | EPL,12/1/2018, 15:00,Southampton V Manchester United,Southampton,Manchester United
142 | EPL,12/4/2018, 19:45,Bournemouth V Huddersfield Town,Bournemouth,Huddersfield Town
143 | EPL,12/4/2018, 19:45,Brighton and Hove Albion V Crystal Palace,Brighton and Hove Albion,Crystal Palace
144 | EPL,12/4/2018, 19:45,Burnley V Liverpool,Burnley,Liverpool
145 | EPL,12/4/2018, 19:45,Fulham V Leicester City,Fulham,Leicester City
146 | EPL,12/4/2018, 19:45,Watford V Manchester City,Watford,Manchester City
147 | EPL,12/4/2018, 19:45,West Ham United V Cardiff City,West Ham United,Cardiff City
148 | EPL,12/4/2018, 19:45,Wolverhampton Wanderers V Chelsea,Wolverhampton Wanderers,Chelsea
149 | EPL,12/4/2018, 20:00,Manchester United V Arsenal,Manchester United,Arsenal
150 | EPL,12/5/2018, 19:45,Everton V Newcastle United,Everton,Newcastle United
151 | EPL,12/5/2018, 20:00,Tottenham Hotspur V Southampton,Tottenham Hotspur,Southampton
152 | EPL,12/8/2018, 15:00,Arsenal V Huddersfield Town,Arsenal,Huddersfield Town
153 | EPL,12/8/2018, 15:00,Bournemouth V Liverpool,Bournemouth,Liverpool
154 | EPL,12/8/2018, 15:00,Burnley V Brighton and Hove Albion,Burnley,Brighton and Hove Albion
155 | EPL,12/8/2018, 15:00,Cardiff City V Southampton,Cardiff City,Southampton
156 | EPL,12/8/2018, 15:00,Chelsea V Manchester City,Chelsea,Manchester City
157 | EPL,12/8/2018, 15:00,Everton V Watford,Everton,Watford
158 | EPL,12/8/2018, 15:00,Leicester City V Tottenham Hotspur,Leicester City,Tottenham Hotspur
159 | EPL,12/8/2018, 15:00,Manchester United V Fulham,Manchester United,Fulham
160 | EPL,12/8/2018, 15:00,Newcastle United V Wolverhampton Wanderers,Newcastle United,Wolverhampton Wanderers
161 | EPL,12/8/2018, 15:00,West Ham United V Crystal Palace,West Ham United,Crystal Palace
162 | EPL,12/15/2018, 15:00,Brighton and Hove Albion V Chelsea,Brighton and Hove Albion,Chelsea
163 | EPL,12/15/2018, 15:00,Crystal Palace V Leicester City,Crystal Palace,Leicester City
164 | EPL,12/15/2018, 15:00,Fulham V West Ham United,Fulham,West Ham United
165 | EPL,12/15/2018, 15:00,Huddersfield Town V Newcastle United,Huddersfield Town,Newcastle United
166 | EPL,12/15/2018, 15:00,Liverpool V Manchester United,Liverpool,Manchester United
167 | EPL,12/15/2018, 15:00,Manchester City V Everton,Manchester City,Everton
168 | EPL,12/15/2018, 15:00,Southampton V Arsenal,Southampton,Arsenal
169 | EPL,12/15/2018, 15:00,Tottenham Hotspur V Burnley,Tottenham Hotspur,Burnley
170 | EPL,12/15/2018, 15:00,Watford V Cardiff City,Watford,Cardiff City
171 | EPL,12/15/2018, 15:00,Wolverhampton Wanderers V Bournemouth,Wolverhampton Wanderers,Bournemouth
172 | EPL,12/22/2018, 15:00,Arsenal V Burnley,Arsenal,Burnley
173 | EPL,12/22/2018, 15:00,Bournemouth V Brighton and Hove Albion,Bournemouth,Brighton and Hove Albion
174 | EPL,12/22/2018, 15:00,Cardiff City V Manchester United,Cardiff City,Manchester United
175 | EPL,12/22/2018, 15:00,Chelsea V Leicester City,Chelsea,Leicester City
176 | EPL,12/22/2018, 15:00,Everton V Tottenham Hotspur,Everton,Tottenham Hotspur
177 | EPL,12/22/2018, 15:00,Huddersfield Town V Southampton,Huddersfield Town,Southampton
178 | EPL,12/22/2018, 15:00,Manchester City V Crystal Palace,Manchester City,Crystal Palace
179 | EPL,12/22/2018, 15:00,Newcastle United V Fulham,Newcastle United,Fulham
180 | EPL,12/22/2018, 15:00,West Ham United V Watford,West Ham United,Watford
181 | EPL,12/22/2018, 15:00,Wolverhampton Wanderers V Liverpool,Wolverhampton Wanderers,Liverpool
182 | EPL,12/26/2018, 15:00,Brighton and Hove Albion V Arsenal,Brighton and Hove Albion,Arsenal
183 | EPL,12/26/2018, 15:00,Burnley V Everton,Burnley,Everton
184 | EPL,12/26/2018, 15:00,Crystal Palace V Cardiff City,Crystal Palace,Cardiff City
185 | EPL,12/26/2018, 15:00,Fulham V Wolverhampton Wanderers,Fulham,Wolverhampton Wanderers
186 | EPL,12/26/2018, 15:00,Leicester City V Manchester City,Leicester City,Manchester City
187 | EPL,12/26/2018, 15:00,Liverpool V Newcastle United,Liverpool,Newcastle United
188 | EPL,12/26/2018, 15:00,Manchester United V Huddersfield Town,Manchester United,Huddersfield Town
189 | EPL,12/26/2018, 15:00,Southampton V West Ham United,Southampton,West Ham United
190 | EPL,12/26/2018, 15:00,Tottenham Hotspur V Bournemouth,Tottenham Hotspur,Bournemouth
191 | EPL,12/26/2018, 15:00,Watford V Chelsea,Watford,Chelsea
192 | EPL,12/29/2018, 15:00,Brighton and Hove Albion V Everton,Brighton and Hove Albion,Everton
193 | EPL,12/29/2018, 15:00,Burnley V West Ham United,Burnley,West Ham United
194 | EPL,12/29/2018, 15:00,Crystal Palace V Chelsea,Crystal Palace,Chelsea
195 | EPL,12/29/2018, 15:00,Fulham V Huddersfield Town,Fulham,Huddersfield Town
196 | EPL,12/29/2018, 15:00,Leicester City V Cardiff City,Leicester City,Cardiff City
197 | EPL,12/29/2018, 15:00,Liverpool V Arsenal,Liverpool,Arsenal
198 | EPL,12/29/2018, 15:00,Manchester United V Bournemouth,Manchester United,Bournemouth
199 | EPL,12/29/2018, 15:00,Southampton V Manchester City,Southampton,Manchester City
200 | EPL,12/29/2018, 15:00,Tottenham Hotspur V Wolverhampton Wanderers,Tottenham Hotspur,Wolverhampton Wanderers
201 | EPL,12/29/2018, 15:00,Watford V Newcastle United,Watford,Newcastle United
202 | EPL,1/1/2019, 15:00,Arsenal V Fulham,Arsenal,Fulham
203 | EPL,1/1/2019, 15:00,Bournemouth V Watford,Bournemouth,Watford
204 | EPL,1/1/2019, 15:00,Cardiff City V Tottenham Hotspur,Cardiff City,Tottenham Hotspur
205 | EPL,1/1/2019, 15:00,Chelsea V Southampton,Chelsea,Southampton
206 | EPL,1/1/2019, 15:00,Everton V Leicester City,Everton,Leicester City
207 | EPL,1/1/2019, 15:00,Huddersfield Town V Burnley,Huddersfield Town,Burnley
208 | EPL,1/1/2019, 15:00,Manchester City V Liverpool,Manchester City,Liverpool
209 | EPL,1/1/2019, 15:00,Newcastle United V Manchester United,Newcastle United,Manchester United
210 | EPL,1/1/2019, 15:00,West Ham United V Brighton and Hove Albion,West Ham United,Brighton and Hove Albion
211 | EPL,1/1/2019, 15:00,Wolverhampton Wanderers V Crystal Palace,Wolverhampton Wanderers,Crystal Palace
212 | EPL,1/12/2019, 15:00,Brighton and Hove Albion V Liverpool,Brighton and Hove Albion,Liverpool
213 | EPL,1/12/2019, 15:00,Burnley V Fulham,Burnley,Fulham
214 | EPL,1/12/2019, 15:00,Cardiff City V Huddersfield Town,Cardiff City,Huddersfield Town
215 | EPL,1/12/2019, 15:00,Chelsea V Newcastle United,Chelsea,Newcastle United
216 | EPL,1/12/2019, 15:00,Crystal Palace V Watford,Crystal Palace,Watford
217 | EPL,1/12/2019, 15:00,Everton V Bournemouth,Everton,Bournemouth
218 | EPL,1/12/2019, 15:00,Leicester City V Southampton,Leicester City,Southampton
219 | EPL,1/12/2019, 15:00,Manchester City V Wolverhampton Wanderers,Manchester City,Wolverhampton Wanderers
220 | EPL,1/12/2019, 15:00,Tottenham Hotspur V Manchester United,Tottenham Hotspur,Manchester United
221 | EPL,1/12/2019, 15:00,West Ham United V Arsenal,West Ham United,Arsenal
222 | EPL,1/19/2019, 15:00,Arsenal V Chelsea,Arsenal,Chelsea
223 | EPL,1/19/2019, 15:00,Bournemouth V West Ham United,Bournemouth,West Ham United
224 | EPL,1/19/2019, 15:00,Fulham V Tottenham Hotspur,Fulham,Tottenham Hotspur
225 | EPL,1/19/2019, 15:00,Huddersfield Town V Manchester City,Huddersfield Town,Manchester City
226 | EPL,1/19/2019, 15:00,Liverpool V Crystal Palace,Liverpool,Crystal Palace
227 | EPL,1/19/2019, 15:00,Manchester United V Brighton and Hove Albion,Manchester United,Brighton and Hove Albion
228 | EPL,1/19/2019, 15:00,Newcastle United V Cardiff City,Newcastle United,Cardiff City
229 | EPL,1/19/2019, 15:00,Southampton V Everton,Southampton,Everton
230 | EPL,1/19/2019, 15:00,Watford V Burnley,Watford,Burnley
231 | EPL,1/19/2019, 15:00,Wolverhampton Wanderers V Leicester City,Wolverhampton Wanderers,Leicester City
232 | EPL,1/29/2019, 19:45,Arsenal V Cardiff City,Arsenal,Cardiff City
233 | EPL,1/29/2019, 19:45,Bournemouth V Chelsea,Bournemouth,Chelsea
234 | EPL,1/29/2019, 19:45,Fulham V Brighton and Hove Albion,Fulham,Brighton and Hove Albion
235 | EPL,1/29/2019, 19:45,Huddersfield Town V Everton,Huddersfield Town,Everton
236 | EPL,1/29/2019, 19:45,Wolverhampton Wanderers V West Ham United,Wolverhampton Wanderers,West Ham United
237 | EPL,1/29/2019, 20:00,Manchester United V Burnley,Manchester United,Burnley
238 | EPL,1/30/2019, 19:45,Newcastle United V Manchester City,Newcastle United,Manchester City
239 | EPL,1/30/2019, 19:45,Southampton V Crystal Palace,Southampton,Crystal Palace
240 | EPL,1/30/2019, 20:00,Liverpool V Leicester City,Liverpool,Leicester City
241 | EPL,1/30/2019, 20:00,Tottenham Hotspur V Watford,Tottenham Hotspur,Watford
242 | EPL,2/2/2019, 15:00,Brighton and Hove Albion V Watford,Brighton and Hove Albion,Watford
243 | EPL,2/2/2019, 15:00,Burnley V Southampton,Burnley,Southampton
244 | EPL,2/2/2019, 15:00,Cardiff City V Bournemouth,Cardiff City,Bournemouth
245 | EPL,2/2/2019, 15:00,Chelsea V Huddersfield Town,Chelsea,Huddersfield Town
246 | EPL,2/2/2019, 15:00,Crystal Palace V Fulham,Crystal Palace,Fulham
247 | EPL,2/2/2019, 15:00,Everton V Wolverhampton Wanderers,Everton,Wolverhampton Wanderers
248 | EPL,2/2/2019, 15:00,Leicester City V Manchester United,Leicester City,Manchester United
249 | EPL,2/2/2019, 15:00,Manchester City V Arsenal,Manchester City,Arsenal
250 | EPL,2/2/2019, 15:00,Tottenham Hotspur V Newcastle United,Tottenham Hotspur,Newcastle United
251 | EPL,2/2/2019, 15:00,West Ham United V Liverpool,West Ham United,Liverpool
252 | EPL,2/9/2019, 15:00,Brighton and Hove Albion V Burnley,Brighton and Hove Albion,Burnley
253 | EPL,2/9/2019, 15:00,Crystal Palace V West Ham United,Crystal Palace,West Ham United
254 | EPL,2/9/2019, 15:00,Fulham V Manchester United,Fulham,Manchester United
255 | EPL,2/9/2019, 15:00,Huddersfield Town V Arsenal,Huddersfield Town,Arsenal
256 | EPL,2/9/2019, 15:00,Liverpool V Bournemouth,Liverpool,Bournemouth
257 | EPL,2/9/2019, 15:00,Manchester City V Chelsea,Manchester City,Chelsea
258 | EPL,2/9/2019, 15:00,Southampton V Cardiff City,Southampton,Cardiff City
259 | EPL,2/9/2019, 15:00,Tottenham Hotspur V Leicester City,Tottenham Hotspur,Leicester City
260 | EPL,2/9/2019, 15:00,Watford V Everton,Watford,Everton
261 | EPL,2/9/2019, 15:00,Wolverhampton Wanderers V Newcastle United,Wolverhampton Wanderers,Newcastle United
262 | EPL,2/23/2019, 15:00,Arsenal V Southampton,Arsenal,Southampton
263 | EPL,2/23/2019, 15:00,Bournemouth V Wolverhampton Wanderers,Bournemouth,Wolverhampton Wanderers
264 | EPL,2/23/2019, 15:00,Burnley V Tottenham Hotspur,Burnley,Tottenham Hotspur
265 | EPL,2/23/2019, 15:00,Cardiff City V Watford,Cardiff City,Watford
266 | EPL,2/23/2019, 15:00,Chelsea V Brighton and Hove Albion,Chelsea,Brighton and Hove Albion
267 | EPL,2/23/2019, 15:00,Everton V Manchester City,Everton,Manchester City
268 | EPL,2/23/2019, 15:00,Leicester City V Crystal Palace,Leicester City,Crystal Palace
269 | EPL,2/23/2019, 15:00,Manchester United V Liverpool,Manchester United,Liverpool
270 | EPL,2/23/2019, 15:00,Newcastle United V Huddersfield Town,Newcastle United,Huddersfield Town
271 | EPL,2/23/2019, 15:00,West Ham United V Fulham,West Ham United,Fulham
272 | EPL,2/26/2019, 19:45,Arsenal V Bournemouth,Arsenal,Bournemouth
273 | EPL,2/26/2019, 19:45,Cardiff City V Everton,Cardiff City,Everton
274 | EPL,2/26/2019, 19:45,Huddersfield Town V Wolverhampton Wanderers,Huddersfield Town,Wolverhampton Wanderers
275 | EPL,2/26/2019, 19:45,Leicester City V Brighton and Hove Albion,Leicester City,Brighton and Hove Albion
276 | EPL,2/26/2019, 20:00,Crystal Palace V Manchester United,Crystal Palace,Manchester United
277 | EPL,2/27/2019, 19:45,Chelsea V Tottenham Hotspur,Chelsea,Tottenham Hotspur
278 | EPL,2/27/2019, 19:45,Newcastle United V Burnley,Newcastle United,Burnley
279 | EPL,2/27/2019, 19:45,Southampton V Fulham,Southampton,Fulham
280 | EPL,2/27/2019, 20:00,Liverpool V Watford,Liverpool,Watford
281 | EPL,2/27/2019, 20:00,Manchester City V West Ham United,Manchester City,West Ham United
282 | EPL,3/2/2019, 15:00,Bournemouth V Manchester City,Bournemouth,Manchester City
283 | EPL,3/2/2019, 15:00,Brighton and Hove Albion V Huddersfield Town,Brighton and Hove Albion,Huddersfield Town
284 | EPL,3/2/2019, 15:00,Burnley V Crystal Palace,Burnley,Crystal Palace
285 | EPL,3/2/2019, 15:00,Everton V Liverpool,Everton,Liverpool
286 | EPL,3/2/2019, 15:00,Fulham V Chelsea,Fulham,Chelsea
287 | EPL,3/2/2019, 15:00,Manchester United V Southampton,Manchester United,Southampton
288 | EPL,3/2/2019, 15:00,Tottenham Hotspur V Arsenal,Tottenham Hotspur,Arsenal
289 | EPL,3/2/2019, 15:00,Watford V Leicester City,Watford,Leicester City
290 | EPL,3/2/2019, 15:00,West Ham United V Newcastle United,West Ham United,Newcastle United
291 | EPL,3/2/2019, 15:00,Wolverhampton Wanderers V Cardiff City,Wolverhampton Wanderers,Cardiff City
292 | EPL,3/9/2019, 15:00,Arsenal V Manchester United,Arsenal,Manchester United
293 | EPL,3/9/2019, 15:00,Cardiff City V West Ham United,Cardiff City,West Ham United
294 | EPL,3/9/2019, 15:00,Chelsea V Wolverhampton Wanderers,Chelsea,Wolverhampton Wanderers
295 | EPL,3/9/2019, 15:00,Crystal Palace V Brighton and Hove Albion,Crystal Palace,Brighton and Hove Albion
296 | EPL,3/9/2019, 15:00,Huddersfield Town V Bournemouth,Huddersfield Town,Bournemouth
297 | EPL,3/9/2019, 15:00,Leicester City V Fulham,Leicester City,Fulham
298 | EPL,3/9/2019, 15:00,Liverpool V Burnley,Liverpool,Burnley
299 | EPL,3/9/2019, 15:00,Manchester City V Watford,Manchester City,Watford
300 | EPL,3/9/2019, 15:00,Newcastle United V Everton,Newcastle United,Everton
301 | EPL,3/9/2019, 15:00,Southampton V Tottenham Hotspur,Southampton,Tottenham Hotspur
302 | EPL,3/16/2019, 15:00,Bournemouth V Newcastle United,Bournemouth,Newcastle United
303 | EPL,3/16/2019, 15:00,Brighton and Hove Albion V Cardiff City,Brighton and Hove Albion,Cardiff City
304 | EPL,3/16/2019, 15:00,Burnley V Leicester City,Burnley,Leicester City
305 | EPL,3/16/2019, 15:00,Everton V Chelsea,Everton,Chelsea
306 | EPL,3/16/2019, 15:00,Fulham V Liverpool,Fulham,Liverpool
307 | EPL,3/16/2019, 15:00,Manchester United V Manchester City,Manchester United,Manchester City
308 | EPL,3/16/2019, 15:00,Tottenham Hotspur V Crystal Palace,Tottenham Hotspur,Crystal Palace
309 | EPL,3/16/2019, 15:00,Watford V Southampton,Watford,Southampton
310 | EPL,3/16/2019, 15:00,West Ham United V Huddersfield Town,West Ham United,Huddersfield Town
311 | EPL,3/16/2019, 15:00,Wolverhampton Wanderers V Arsenal,Wolverhampton Wanderers,Arsenal
312 | EPL,3/30/2019, 15:00,Arsenal V Newcastle United,Arsenal,Newcastle United
313 | EPL,3/30/2019, 15:00,Brighton and Hove Albion V Southampton,Brighton and Hove Albion,Southampton
314 | EPL,3/30/2019, 15:00,Burnley V Wolverhampton Wanderers,Burnley,Wolverhampton Wanderers
315 | EPL,3/30/2019, 15:00,Cardiff City V Chelsea,Cardiff City,Chelsea
316 | EPL,3/30/2019, 15:00,Crystal Palace V Huddersfield Town,Crystal Palace,Huddersfield Town
317 | EPL,3/30/2019, 15:00,Fulham V Manchester City,Fulham,Manchester City
318 | EPL,3/30/2019, 15:00,Leicester City V Bournemouth,Leicester City,Bournemouth
319 | EPL,3/30/2019, 15:00,Liverpool V Tottenham Hotspur,Liverpool,Tottenham Hotspur
320 | EPL,3/30/2019, 15:00,Manchester United V Watford,Manchester United,Watford
321 | EPL,3/30/2019, 15:00,West Ham United V Everton,West Ham United,Everton
322 | EPL,4/6/2019, 15:00,Bournemouth V Burnley,Bournemouth,Burnley
323 | EPL,4/6/2019, 15:00,Chelsea V West Ham United,Chelsea,West Ham United
324 | EPL,4/6/2019, 15:00,Everton V Arsenal,Everton,Arsenal
325 | EPL,4/6/2019, 15:00,Huddersfield Town V Leicester City,Huddersfield Town,Leicester City
326 | EPL,4/6/2019, 15:00,Manchester City V Cardiff City,Manchester City,Cardiff City
327 | EPL,4/6/2019, 15:00,Newcastle United V Crystal Palace,Newcastle United,Crystal Palace
328 | EPL,4/6/2019, 15:00,Southampton V Liverpool,Southampton,Liverpool
329 | EPL,4/6/2019, 15:00,Tottenham Hotspur V Brighton and Hove Albion,Tottenham Hotspur,Brighton and Hove Albion
330 | EPL,4/6/2019, 15:00,Watford V Fulham,Watford,Fulham
331 | EPL,4/6/2019, 15:00,Wolverhampton Wanderers V Manchester United,Wolverhampton Wanderers,Manchester United
332 | EPL,4/13/2019, 15:00,Brighton and Hove Albion V Bournemouth,Brighton and Hove Albion,Bournemouth
333 | EPL,4/13/2019, 15:00,Burnley V Cardiff City,Burnley,Cardiff City
334 | EPL,4/13/2019, 15:00,Crystal Palace V Manchester City,Crystal Palace,Manchester City
335 | EPL,4/13/2019, 15:00,Fulham V Everton,Fulham,Everton
336 | EPL,4/13/2019, 15:00,Leicester City V Newcastle United,Leicester City,Newcastle United
337 | EPL,4/13/2019, 15:00,Liverpool V Chelsea,Liverpool,Chelsea
338 | EPL,4/13/2019, 15:00,Manchester United V West Ham United,Manchester United,West Ham United
339 | EPL,4/13/2019, 15:00,Southampton V Wolverhampton Wanderers,Southampton,Wolverhampton Wanderers
340 | EPL,4/13/2019, 15:00,Tottenham Hotspur V Huddersfield Town,Tottenham Hotspur,Huddersfield Town
341 | EPL,4/13/2019, 15:00,Watford V Arsenal,Watford,Arsenal
342 | EPL,4/20/2019, 15:00,Arsenal V Crystal Palace,Arsenal,Crystal Palace
343 | EPL,4/20/2019, 15:00,Bournemouth V Fulham,Bournemouth,Fulham
344 | EPL,4/20/2019, 15:00,Cardiff City V Liverpool,Cardiff City,Liverpool
345 | EPL,4/20/2019, 15:00,Chelsea V Burnley,Chelsea,Burnley
346 | EPL,4/20/2019, 15:00,Everton V Manchester United,Everton,Manchester United
347 | EPL,4/20/2019, 15:00,Huddersfield Town V Watford,Huddersfield Town,Watford
348 | EPL,4/20/2019, 15:00,Manchester City V Tottenham Hotspur,Manchester City,Tottenham Hotspur
349 | EPL,4/20/2019, 15:00,Newcastle United V Southampton,Newcastle United,Southampton
350 | EPL,4/20/2019, 15:00,West Ham United V Leicester City,West Ham United,Leicester City
351 | EPL,4/20/2019, 15:00,Wolverhampton Wanderers V Brighton and Hove Albion,Wolverhampton Wanderers,Brighton and Hove Albion
352 | EPL,4/27/2019, 15:00,Brighton and Hove Albion V Newcastle United,Brighton and Hove Albion,Newcastle United
353 | EPL,4/27/2019, 15:00,Burnley V Manchester City,Burnley,Manchester City
354 | EPL,4/27/2019, 15:00,Crystal Palace V Everton,Crystal Palace,Everton
355 | EPL,4/27/2019, 15:00,Fulham V Cardiff City,Fulham,Cardiff City
356 | EPL,4/27/2019, 15:00,Leicester City V Arsenal,Leicester City,Arsenal
357 | EPL,4/27/2019, 15:00,Liverpool V Huddersfield Town,Liverpool,Huddersfield Town
358 | EPL,4/27/2019, 15:00,Manchester United V Chelsea,Manchester United,Chelsea
359 | EPL,4/27/2019, 15:00,Southampton V Bournemouth,Southampton,Bournemouth
360 | EPL,4/27/2019, 15:00,Tottenham Hotspur V West Ham United,Tottenham Hotspur,West Ham United
361 | EPL,4/27/2019, 15:00,Watford V Wolverhampton Wanderers,Watford,Wolverhampton Wanderers
362 | EPL,5/4/2019, 15:00,Arsenal V Brighton and Hove Albion,Arsenal,Brighton and Hove Albion
363 | EPL,5/4/2019, 15:00,Bournemouth V Tottenham Hotspur,Bournemouth,Tottenham Hotspur
364 | EPL,5/4/2019, 15:00,Cardiff City V Crystal Palace,Cardiff City,Crystal Palace
365 | EPL,5/4/2019, 15:00,Chelsea V Watford,Chelsea,Watford
366 | EPL,5/4/2019, 15:00,Everton V Burnley,Everton,Burnley
367 | EPL,5/4/2019, 15:00,Huddersfield Town V Manchester United,Huddersfield Town,Manchester United
368 | EPL,5/4/2019, 15:00,Manchester City V Leicester City,Manchester City,Leicester City
369 | EPL,5/4/2019, 15:00,Newcastle United V Liverpool,Newcastle United,Liverpool
370 | EPL,5/4/2019, 15:00,West Ham United V Southampton,West Ham United,Southampton
371 | EPL,5/4/2019, 15:00,Wolverhampton Wanderers V Fulham,Wolverhampton Wanderers,Fulham
372 | EPL,5/12/2019, 15:00,Brighton and Hove Albion V Manchester City,Brighton and Hove Albion,Manchester City
373 | EPL,5/12/2019, 15:00,Burnley V Arsenal,Burnley,Arsenal
374 | EPL,5/12/2019, 15:00,Crystal Palace V Bournemouth,Crystal Palace,Bournemouth
375 | EPL,5/12/2019, 15:00,Fulham V Newcastle United,Fulham,Newcastle United
376 | EPL,5/12/2019, 15:00,Leicester City V Chelsea,Leicester City,Chelsea
377 | EPL,5/12/2019, 15:00,Liverpool V Wolverhampton Wanderers,Liverpool,Wolverhampton Wanderers
378 | EPL,5/12/2019, 15:00,Manchester United V Cardiff City,Manchester United,Cardiff City
379 | EPL,5/12/2019, 15:00,Southampton V Huddersfield Town,Southampton,Huddersfield Town
380 | EPL,5/12/2019, 15:00,Tottenham Hotspur V Everton,Tottenham Hotspur,Everton
381 | EPL,5/12/2019, 15:00,Watford V West Ham United,Watford,West Ham United
382 |
--------------------------------------------------------------------------------
/dog_home.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/dog_home.png
--------------------------------------------------------------------------------
/images/bayesian.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/bayesian.png
--------------------------------------------------------------------------------
/images/dog_home.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/dog_home.png
--------------------------------------------------------------------------------
/images/epl.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/epl.png
--------------------------------------------------------------------------------
/images/fireworks.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/fireworks.gif
--------------------------------------------------------------------------------
/images/messi-scribble.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/messi-scribble.png
--------------------------------------------------------------------------------
/images/network.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/network.png
--------------------------------------------------------------------------------
/images/paul.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/paul.png
--------------------------------------------------------------------------------
/images/scrible-test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/scrible-test.png
--------------------------------------------------------------------------------
/images/selfie.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/selfie.png
--------------------------------------------------------------------------------
/images/selfie2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tuangauss/DataScienceProjects/9b0dab074cbf05c270124b6857a748e6f1ee73b9/images/selfie2.png
--------------------------------------------------------------------------------
/images/test.R:
--------------------------------------------------------------------------------
1 | # test file
2 |
--------------------------------------------------------------------------------
/llm_bots/animeyourself/anime_yourself.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Poe bot that turns a user-supplied image into an anime-style picture using an img2img model hosted on Replicate.
4 |
5 | """
6 | from __future__ import annotations
7 |
8 | import asyncio
9 | import json
10 | from typing import AsyncIterable
11 |
12 | from fastapi_poe import PoeBot, run
13 | from fastapi_poe.types import (
14 | ContentType,
15 | QueryRequest,
16 | ReportFeedbackRequest,
17 | SettingsRequest,
18 | SettingsResponse,
19 | )
20 | import os
21 | import replicate
22 | import re
23 | import random
24 | import textwrap
25 | import asyncio
26 | from sse_starlette.sse import ServerSentEvent
27 |
# Settings object returned unchanged by the bot's `get_settings` handler.
SETTINGS = SettingsResponse(
    context_clear_window_secs=60 * 60, allow_user_context_clear=True
)

# Model reference handed to `replicate.run` as its first argument.
MODEL_URL = "mcai/dreamshaper-v6-img2img:c7959eb3a86c09b449dacc11ce8bba295fda466fc6935ab8709e35f4f48c980c"

# Timeout, in seconds, of each `asyncio.wait` poll while an image is generated.
_WAIT_TIMEOUT_S = 1
35 |
36 |
def parse_text(txt):
    """Extract the quoted ``image`` and ``prompt`` fields from a chat message.

    A field is a label, a colon, optional whitespace and a non-empty
    double-quoted value, e.g. ``image: "https://host/Pic.png"``.  Labels are
    matched case-insensitively (``Image:``, ``PROMPT:``), so a caller does not
    need to lower-case the whole message, which would also mangle
    case-sensitive values such as image URLs.

    Args:
        txt: Raw message text.

    Returns:
        A dict mapping the lower-cased label (``"image"`` / ``"prompt"``) to
        its value exactly as written.  Labels that do not occur are absent;
        when a label occurs more than once the last occurrence wins.
    """
    pattern = re.compile(r'(image|prompt):\s*"([^"]+)"', re.IGNORECASE)
    # Keys are normalised to lower case so lookups such as `"image" in result`
    # keep working whatever capitalisation the user typed.
    return {field.lower(): value for field, value in pattern.findall(txt)}
46 |
# Positive style prompt (oil-painting / Studio Ghibli flavour).
# NOTE(review): not referenced anywhere else in this module.
core_positive_prompt = """
portrait closeup, best quality, intricately detailed,
moe manga style, finely detailed features perfect art,
professional majestic impressionism oil painting by Waterhouse,
John Constable, Ed Blinkey, Atey Ghailan, Studio Ghibli,
by Jeremy Mann, Greg Manchess, Antonio Moro, trending on ArtStation,
trending on CGSociety, cinematic lighting, hand drawn, hand colored.
"""

# Style keywords appended to every prompt sent to the model.
alternative_prompt = """
portrait closeup, best quality,
moe manga style, finely detailed features perfect art,
anime style, 8k, artwork in the style of guweiz,
cinematic lighting, hand drawn, hand colored.
"""

# Traits the model is asked to avoid; sent as the "negative_prompt" input.
# The misspelled keyword "greain" is corrected to "grain".
negative_prompt = """
disfigured, kitsch, ugly, oversaturated, grain, low-res, deformed, blurry, bad anatomy,
poorly drawn face, mutation, mutated, extra limb, missing limb,
floating limbs, disconnected limbs, malformed hands, extra fingers, poorly drawn hands,
"""
68 |
def error_message():
    """Return the markdown reply sent when no ``image`` field could be parsed.

    The text shows the expected ``image: "..."`` / ``prompt: "..."`` layout.
    It contains no substitutions, so a plain string literal is used instead
    of a placeholder-less f-string.

    Returns:
        The dedented markdown message.
    """
    return textwrap.dedent(
        """
        Sorry, I cannot parse your input. Please try again and make sure your input has the format:

        ```python
        image: ""
        prompt: (Optional) "" #no worry, we will generate an anime of you to start
        ```

        """
    )
81 |
82 | def _get_complete_message(second, input_url, output_url):
83 | _COMPLETE_MESSAGE = f"""
84 | Completed! (took {second}s)
85 |
86 | This is you:
87 |
88 | 
89 |
90 | This is the anime version of yourself.
91 |
92 | 
93 |
94 | """
95 | return textwrap.dedent(_COMPLETE_MESSAGE)
96 |
class AnimeYourself(PoeBot):
    """Poe bot that redraws a user-supplied picture in an anime style.

    The last chat message must carry an ``image: "<url>"`` field and may carry
    a ``prompt: "<text>"`` field.  The picture is produced by running the
    Replicate model named in ``MODEL_URL``.
    """

    async def get_response(self, query: QueryRequest) -> AsyncIterable[ServerSentEvent]:
        """Return an async iterator of events to send to the user.

        Emits a meta event, then either a parse-error message or a progress
        line that is refreshed until the image task finishes, followed by the
        final success or failure message.
        """
        # Lower-case only the field labels so "Image:" / "PROMPT:" are still
        # recognised by `parse_text`.  Lower-casing the whole message would
        # corrupt case-sensitive image URLs and alter the user's prompt.
        last_message = re.sub(
            r'(?i)(image|prompt)(?=:\s*")',
            lambda label: label.group(0).lower(),
            query.query[-1].content,
        )
        response_content_type: ContentType = "text/markdown"
        yield self.meta_event(
            content_type=response_content_type,
            linkify=False,
            refetch_settings=False,
            suggested_replies=False,
        )

        input_dict = parse_text(last_message)
        if "image" not in input_dict:
            yield self.text_event(error_message())
            return

        # The prompt is optional; the style keywords are always appended.
        input_prompt = input_dict.get("prompt", "")
        generated_image_task = asyncio.create_task(
            self._generate_image(
                image_url=input_dict["image"],
                prompt="mksks style," + input_prompt + "," + alternative_prompt,
            )
        )

        # Wake up every _WAIT_TIMEOUT_S seconds to refresh the progress line
        # until the generation task is done; `i` counts those wake-ups.
        i = 0
        while True:
            done, _ = await asyncio.wait(
                [generated_image_task], timeout=_WAIT_TIMEOUT_S
            )
            if done:
                output = done.pop().result()
                break
            yield self.replace_response_event(f"Generating your image: {i}s elapsed...")
            i += 1

        if len(output) != 1:
            # Anything other than exactly one output item is reported as a failure.
            yield self.replace_response_event(
                textwrap.dedent(
                    f"""

                    Sorry, something seems to go wrong.

                    Please don't blame the developer. He's trying ᕕ( ᐛ )ᕗ.

                    But he does want you to know that you look amazing who you are.

                    
                    """
                )
            )
        else:
            yield self.replace_response_event(
                textwrap.dedent(
                    _get_complete_message(
                        second=i,
                        input_url=input_dict["image"],
                        output_url=output[0],
                    )
                )
            )

    async def _generate_image(self, image_url: str, prompt: str):
        """Run the Replicate model in the default executor and return its output.

        `replicate.run` is a blocking call, so it is handed to
        `run_in_executor` to keep the event loop free while it runs.
        """
        loop = asyncio.get_running_loop()
        output = await loop.run_in_executor(
            None,
            lambda: replicate.run(
                MODEL_URL,
                input={
                    "image": image_url,
                    "prompt": prompt,
                    "negative_prompt": negative_prompt,
                    "num_inference_steps": 50,
                },
            ),
        )
        return output

    async def on_feedback(self, feedback: ReportFeedbackRequest) -> None:
        """Called when we receive user feedback such as likes."""
        # A space now separates the conversation id from "message"; the two
        # adjacent literals used to be joined without one.
        print(
            f"User {feedback.user_id} gave feedback on {feedback.conversation_id} "
            f"message {feedback.message_id}: {feedback.feedback_type}"
        )

    async def get_settings(self, settings: SettingsRequest) -> SettingsResponse:
        """Return the settings for this bot."""
        return SETTINGS
185 |
186 |
# When executed as a script, start the bot through fastapi_poe's `run` helper.
if __name__ == "__main__":
    run(AnimeYourself())
--------------------------------------------------------------------------------
/llm_bots/animeyourself/main.py:
--------------------------------------------------------------------------------
1 | from fastapi_poe import make_app
2 | import modal
3 | from modal import Image, Stub, asgi_app
4 | from anime_yourself import AnimeYourself
5 | import os
6 |
# Hosting on modal.com: a Debian-slim container image with the packages from
# requirements.txt installed, and the stub the web function is registered on.
image = Image.debian_slim().pip_install_from_requirements("requirements.txt")
stub = Stub("animeyourself")


@stub.function(image=image, secret=modal.Secret.from_name("my-replicate-key"))
@asgi_app()
def fastapi_app():
    """Create the ASGI application that serves the AnimeYourself bot."""
    return make_app(AnimeYourself(), allow_without_key=True)
20 |
21 |
--------------------------------------------------------------------------------
/llm_bots/animeyourself/readme.md:
--------------------------------------------------------------------------------
1 |
2 |
--------------------------------------------------------------------------------
/llm_bots/animeyourself/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi-poe==0.0.14
2 | replicate
3 |
--------------------------------------------------------------------------------
/llm_bots/scribble2img/README.md:
--------------------------------------------------------------------------------
1 | # Poe API Bot tutorial
2 |
3 | This is the companion repository to the Poe API bot
4 | [quick start](https://developer.poe.com/api-bots/quick-start). Please follow that guide
5 | for instructions on how to use this repository.
6 |
--------------------------------------------------------------------------------
/llm_bots/scribble2img/main.py:
--------------------------------------------------------------------------------
1 | from fastapi_poe import make_app
2 | import modal
3 | from modal import Image, Stub, asgi_app
4 | from scribble2image import Scribble2ImageBot
5 | import os
6 |
# Hosting on modal.com: a Debian-slim container image with the packages from
# requirements.txt installed, and the stub the web function is registered on.
image = Image.debian_slim().pip_install_from_requirements("requirements.txt")
stub = Stub("scribble2image")


@stub.function(image=image, secret=modal.Secret.from_name("my-replicate-key"))
@asgi_app()
def fastapi_app():
    """Create the ASGI application that serves the Scribble2ImageBot bot."""
    return make_app(Scribble2ImageBot(), allow_without_key=True)
20 |
--------------------------------------------------------------------------------
/llm_bots/scribble2img/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi-poe==0.0.14
2 | replicate==0.8.3
3 |
--------------------------------------------------------------------------------
/llm_bots/scribble2img/scribble2image.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | Poe bot that turns a user's scribble and a text prompt into a generated image using the
4 | ControlNet scribble model hosted on Replicate.
5 |
6 | """
7 | from __future__ import annotations
8 |
9 | import asyncio
10 | import json
11 | from typing import AsyncIterable
12 |
13 | from fastapi_poe import PoeBot, run
14 | from fastapi_poe.types import (
15 | ContentType,
16 | QueryRequest,
17 | ReportFeedbackRequest,
18 | SettingsRequest,
19 | SettingsResponse,
20 | )
21 | import os
22 | import replicate
23 | import re
24 | import random
25 | import textwrap
26 | import asyncio
27 | from sse_starlette.sse import ServerSentEvent
28 |
# Settings object returned unchanged by the bot's `get_settings` handler.
SETTINGS = SettingsResponse(
    context_clear_window_secs=60 * 60, allow_user_context_clear=True
)

# Model reference handed to `replicate.run` as its first argument.
MODEL_URL = "jagilley/controlnet-scribble:435061a1b5a4c1e26740464bf786efdfa9cb3a3ac488595a2de23e143fdb0117"

# Timeout, in seconds, of each `asyncio.wait` poll while an image is generated.
_WAIT_TIMEOUT_S = 1

# Compliments about the user's scribble; one is picked at random for the
# completion message.
encouraging_msgs = [
    "The developer loves your scribble 😍",
    "With the drawing skill like this, do you even need this bot?",
    "Wow! Your scribbling skill is hella strong",
    "Interesting interesting! I gotcha",
    "Your doodle is simply world class. Let me see what else I can add."
]
44 |
45 | def _get_complete_message(second, encouraging_msg, input_url, output_url):
46 | _COMPLETE_MESSAGE = f"""
47 | Completed! (took {second}s)
48 |
49 | This is the original. {encouraging_msg}:
50 |
51 | 
52 |
53 | This is your scribble brought to life:
54 |
55 | 
56 |
57 | """
58 | return _COMPLETE_MESSAGE
59 |
60 |
def parse_text(txt):
    """Extract the quoted ``image`` and ``prompt`` fields from a chat message.

    A field is a label, a colon, optional whitespace and a non-empty
    double-quoted value, e.g. ``image: "https://host/Pic.png"``.  Labels are
    matched case-insensitively (``Image:``, ``PROMPT:``), so a caller does not
    need to lower-case the whole message, which would also mangle
    case-sensitive values such as image URLs.

    Args:
        txt: Raw message text.

    Returns:
        A dict mapping the lower-cased label (``"image"`` / ``"prompt"``) to
        its value exactly as written.  Labels that do not occur are absent;
        when a label occurs more than once the last occurrence wins.
    """
    pattern = re.compile(r'(image|prompt):\s*"([^"]+)"', re.IGNORECASE)
    # Keys are normalised to lower case so lookups such as `"image" in result`
    # keep working whatever capitalisation the user typed.
    return {field.lower(): value for field, value in pattern.findall(txt)}
70 |
71 | def error_message(missing_image=True, image_url=None):
72 | missing_piece = "image" if missing_image else "prompt"
73 | if image_url:
74 | additional_txt = f"""
75 |
76 | But I just wanna say that I love your scribble.
77 |
78 | 
79 | """
80 | else:
81 | additional_txt = ""
82 | msg = textwrap.dedent(f"""
83 | Sorry, I cannot parse your {missing_piece}. Please try again and make sure your input has the format:
84 |
85 | ```python
86 | image: ""
87 | prompt: ""
88 | ```
89 |
90 | {additional_txt}
91 | """
92 | )
93 | return msg
94 |
class Scribble2ImageBot(PoeBot):
    """Poe bot that turns a scribble plus a text prompt into a generated image.

    The last chat message must carry both an ``image: "<url>"`` and a
    ``prompt: "<text>"`` field.  The image is produced by running the
    Replicate model named in ``MODEL_URL``.
    """

    async def get_response(self, query: QueryRequest) -> AsyncIterable[ServerSentEvent]:
        """Return an async iterator of events to send to the user.

        Emits a meta event, then either a parse-error message or a progress
        line that is refreshed until the image task finishes, followed by the
        final success or failure message.
        """
        # Lower-case only the field labels so "Image:" / "PROMPT:" are still
        # recognised by `parse_text`.  Lower-casing the whole message would
        # corrupt case-sensitive image URLs and alter the user's prompt.
        last_message = re.sub(
            r'(?i)(image|prompt)(?=:\s*")',
            lambda label: label.group(0).lower(),
            query.query[-1].content,
        )
        response_content_type: ContentType = "text/markdown"
        yield self.meta_event(
            content_type=response_content_type,
            linkify=False,
            refetch_settings=False,
            suggested_replies=False,
        )

        input_dict = parse_text(last_message)
        if "image" not in input_dict:
            yield self.text_event(error_message())
            return
        if "prompt" not in input_dict:
            yield self.text_event(
                error_message(missing_image=False, image_url=input_dict["image"])
            )
            return

        generated_image_task = asyncio.create_task(
            self._generate_image(input_dict["image"], input_dict["prompt"])
        )

        # Wake up every _WAIT_TIMEOUT_S seconds to refresh the progress line
        # until the generation task is done; `i` counts those wake-ups.
        i = 0
        while True:
            done, _ = await asyncio.wait(
                [generated_image_task], timeout=_WAIT_TIMEOUT_S
            )
            if done:
                output = done.pop().result()
                break
            yield self.replace_response_event(f"Generating your image: {i}s elapsed...")
            i += 1

        if len(output) != 2:
            # Exactly two output items are expected (the second one is shown
            # to the user); anything else is reported as a failure.
            yield self.replace_response_event(
                textwrap.dedent(
                    f"""

                    Sorry, something seems to go wrong.

                    Please don't blame the developer. He's trying ᕕ( ᐛ )ᕗ.

                    But he does want you to know that he loves your scribble.

                    
                    """
                )
            )
        else:
            yield self.replace_response_event(
                textwrap.dedent(
                    _get_complete_message(
                        second=i,
                        encouraging_msg=random.choice(encouraging_msgs),
                        input_url=input_dict["image"],
                        output_url=output[1],
                    )
                )
            )

    async def _generate_image(self, image_url: str, prompt: str):
        """Run the Replicate model in the default executor and return its output.

        `replicate.run` is a blocking call, so it is handed to
        `run_in_executor` to keep the event loop free while it runs.
        """
        loop = asyncio.get_running_loop()
        output = await loop.run_in_executor(
            None,
            lambda: replicate.run(
                MODEL_URL,
                input={
                    "image": image_url,
                    "prompt": prompt,
                },
            ),
        )
        return output

    async def on_feedback(self, feedback: ReportFeedbackRequest) -> None:
        """Called when we receive user feedback such as likes."""
        # A space now separates the conversation id from "message"; the two
        # adjacent literals used to be joined without one.
        print(
            f"User {feedback.user_id} gave feedback on {feedback.conversation_id} "
            f"message {feedback.message_id}: {feedback.feedback_type}"
        )

    async def get_settings(self, settings: SettingsRequest) -> SettingsResponse:
        """Return the settings for this bot."""
        return SETTINGS
185 |
186 |
# When executed as a script, start the bot through fastapi_poe's `run` helper.
if __name__ == "__main__":
    run(Scribble2ImageBot())
--------------------------------------------------------------------------------