├── .gitignore ├── PythonOOP ├── FarolModule │ ├── __init__.py │ ├── agent.py │ ├── simulation.py │ └── strategy.py ├── Farol_logfile.txt ├── Presentation_helper.ipynb ├── README.md ├── diagram.jpg └── elFarol.py ├── PythonWebScraping ├── PreProcessingScript.R ├── README.md ├── countries.csv ├── preprocessed.csv ├── rugby_stats_final.csv.csv └── scraper.py ├── README.md └── SQL ├── README.md └── sql_bigquery.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | credentials/ 2 | data/ 3 | .ipynb_checkpoints 4 | */.ipynb_checkpoints/* 5 | -------------------------------------------------------------------------------- /PythonOOP/FarolModule/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boudrejp/DataSciencePortfolio/c71899edbf52c8e68ffca49e29afe73dcdfd7037/PythonOOP/FarolModule/__init__.py -------------------------------------------------------------------------------- /PythonOOP/FarolModule/agent.py: -------------------------------------------------------------------------------- 1 | ## __author__ = John Boudreaux 2 | import random 3 | import FarolModule.strategy as strat 4 | 5 | class agent: 6 | '''A class to store properties about agents for El Farol and minority 7 | problem games. Expects a memory length as an integer (default is 2), 8 | and a number of strategies as an integer''' 9 | 10 | def __init__(self, memory_length=3, num_strats=3): 11 | # need to select n at random from uniform distribution, 12 | # strategy at random from uniform distribution 13 | available_strats = strat.strategy.approved_strats 14 | self.current_strat = {"index": None, "expected_val": None, "go": None} 15 | 16 | self.strategies = {} 17 | for i in range(0, num_strats): 18 | picked_strat = random.choice(available_strats) 19 | picked_n = random.choice(list(range(1, memory_length+1))) 20 | self.strategies[i] = {'strat': strat.strategy(picked_strat, picked_n),\ 21 | 'n': picked_n, 'votes': 0, 22 | 'strat_type': picked_strat} 23 | 24 | def select_and_eval_strat(self, history, cutoff_val, num_of_agents): 25 | # although it may seem a bit much to have both selection and evaluation 26 | # in one method, it will reduce the number of temporary objects 27 | votes_list = [self.strategies[i]['votes'] for i in range(len(self.strategies))] 28 | max_votes = max(votes_list) 29 | max_indices = [index for index, val in enumerate(votes_list) if val == max_votes] 30 | # choose the strategy with the most votes. if tied, choose random 31 | if len(max_indices) > 1: 32 | which_index = random.choice(max_indices) 33 | else: 34 | which_index = max_indices[0] 35 | active_strat = self.strategies[which_index] 36 | self.current_strat['index'] = which_index 37 | # evaluate the selected strategy, bring back the expected val 38 | # provide convenient vars for readability 39 | local_n = self.strategies[which_index]['n'] 40 | local_strat_type = self.strategies[which_index]['strat'].strat_type 41 | expected_val = active_strat['strat'].strat_evaluate(num_of_agents, \ 42 | history, n = local_n, strat_type = local_strat_type) 43 | self.current_strat['expected_val'] = expected_val 44 | 45 | if expected_val < cutoff_val: 46 | self.current_strat['go'] = 1 47 | else: 48 | self.current_strat['go'] = 0 49 | 50 | def update_votes(self, winning_num): 51 | #function to update the votes for strategies 52 | which_index = self.current_strat['index'] 53 | 54 | if self.current_strat['go'] == winning_num: 55 | self.strategies[which_index]['votes'] += 1 56 | else: 57 | self.strategies[which_index]['votes'] -= 1 58 | -------------------------------------------------------------------------------- /PythonOOP/FarolModule/simulation.py: -------------------------------------------------------------------------------- 1 | # __author__ = John Boudreaux 2 | import math 3 | from FarolModule.agent import agent 4 | 5 | class simulation: 6 | '''A class to house all the parameters for a simulation of the El Farol 7 | problem, and to store its outputs''' 8 | # going to need to pass all parameters through here, since agents are More 9 | # or less a child class, and strategies are a child class of agents 10 | 11 | def __init__(self, num_of_agents, iterations, cutoff_val, mem_length, num_strats): 12 | '''Simulation object expects agents of class agent, 13 | and a number of iterations that is an integer''' 14 | self.num_of_agents = num_of_agents 15 | self.mem_length = mem_length 16 | self.target_value = math.ceil(cutoff_val * self.num_of_agents) 17 | self.iterations = iterations 18 | self.num_strats = num_strats 19 | self.history = [] 20 | # create an object to contain all the agents and create them 21 | self.agents = {} 22 | for i in range(self.num_of_agents): 23 | self.agents[i] = agent(memory_length = self.mem_length, num_strats = self.num_strats) 24 | 25 | 26 | def run_simulation(self): 27 | '''Code to loop through and run simulation. Takes variables from 28 | namespace of simulation''' 29 | print("Beginning simulation") 30 | for i in range(self.iterations): 31 | agents_going_to_bar = 0 32 | for j in range(len(self.agents)): 33 | self.agents[j].select_and_eval_strat(history = self.history, \ 34 | cutoff_val = self.target_value, num_of_agents = self.num_of_agents) 35 | agents_going_to_bar += self.agents[j].current_strat['go'] 36 | 37 | # if the total amount of agents at the bar is above the target value, 38 | # I would have had more fun staying home 39 | self.history.append(agents_going_to_bar) 40 | 41 | if agents_going_to_bar > self.target_value: 42 | winning_val = 0 43 | else: 44 | winning_val = 1 45 | 46 | for j in range(len(self.agents)): 47 | self.agents[j].update_votes(winning_val) 48 | 49 | if i == self.iterations / 2: 50 | print("50% Completed!") 51 | 52 | print("Simulation completed!") 53 | -------------------------------------------------------------------------------- /PythonOOP/FarolModule/strategy.py: -------------------------------------------------------------------------------- 1 | # __author__ = John Boudreaux 2 | import random 3 | 4 | class strategy: 5 | '''An object to contain different types of strategies for the El Farol 6 | problem that can be adopted by different agents. Expects a "strat_type" arg, 7 | which descibes the type of strategy adopted. Valid strategy options include: 8 | "same_as_past", which picks a strategy from n iterations ago, 9 | "flip", which takes the number of attendees at week n and subtracts it 10 | from the total number of agents, 11 | "minimum_of_last", which will return the minimum attendees from 12 | the last n iterations, 13 | "avg", which takes the average of the past n weeks, 14 | "maximum_of_last", which will return the maximum attendees from the last 15 | n iterations''' 16 | 17 | approved_strats = ["same_as_past", "flip", "minimum_of_last", "avg", \ 18 | "maximum_of_last", "rand_n"] 19 | 20 | def __init__(self, strat_type, n): 21 | if strat_type not in strategy.approved_strats: 22 | raise ValueError("Please pick an approved strategy from documentation") 23 | else: 24 | self.strat_type = strat_type 25 | self.n = n 26 | 27 | def strat_evaluate(self, num_of_agents, history, n, strat_type): 28 | '''A script to evaluate the strategy for a given agent and the history''' 29 | # note to self: I wish Python had a convenient switch/case protocol 30 | # all scripts here follow similar order: what to do if the length of the 31 | # history is zero, then if it is less than the required n, and then the 32 | # general case. 33 | 34 | if strat_type == "same_as_past": 35 | if len(history) < n: 36 | if len(history) == 0: 37 | expected_num = 0 38 | else: 39 | expected_num = history[0] 40 | else: 41 | expected_num = history[len(history)-n] 42 | 43 | elif strat_type == "flip": 44 | if len(history) < n: 45 | expected_num = num_of_agents 46 | else: 47 | expected_num = num_of_agents - history[len(history)-n] 48 | 49 | elif strat_type == "minimum_of_last": 50 | if len(history) < n: 51 | if len(history) == 0: 52 | expected_num = 0 53 | else: 54 | valid_search = history[0:len(history)] 55 | expected_num = min(valid_search) 56 | else: 57 | valid_search = history[(len(history)-n):len(history)] 58 | expected_num = min(valid_search) 59 | 60 | elif strat_type == "maximum_of_last": 61 | if len(history) < n: 62 | if len(history) == 0: 63 | expected_num = num_of_agents 64 | else: 65 | valid_search = history[0:len(history)] 66 | expected_num = max(valid_search) 67 | else: 68 | valid_search = history[(len(history)-n):len(history)] 69 | expected_num = max(valid_search) 70 | 71 | elif strat_type == "avg": 72 | if len(history) < n: 73 | if len(history) == 0: 74 | expected_num = round(num_of_agents / 2) 75 | else: 76 | valid_search = history[0:len(history)] 77 | expected_num = round(sum(valid_search)/len(valid_search)) 78 | else: 79 | valid_search = history[(len(history)-n):len(history)] 80 | expected_num = round(sum(valid_search)/len(valid_search)) 81 | 82 | elif strat_type == "rand_n": 83 | if len(history) < n: 84 | if len(history) == 0: 85 | expected_num = round(num_of_agents / 2) 86 | else: 87 | valid_search = history[0:len(history)] 88 | expected_num = random.choice(valid_search) 89 | else: 90 | valid_search = history[(len(history)-n):len(history)] 91 | expected_num = random.choice(valid_search) 92 | 93 | return(expected_num) 94 | -------------------------------------------------------------------------------- /PythonOOP/Farol_logfile.txt: -------------------------------------------------------------------------------- 1 | Iterations: 100 2 | Number of agents: 1000 3 | Agent memory length: 5 4 | Attendance percentage cutoff: 60.0 5 | Random seed: 42 6 | 7 | History: 8 | 686, 39, 496, 753, 458, 746, 530, 742, 506, 694, 570, 640, 585, 653, 588, 592, 851, 662, 479, 674, 532, 619, 692, 390, 586, 716, 515, 635, 603, 423, 617, 573, 646, 546, 680, 546, 642, 584, 587, 813, 642, 481, 658, 543, 603, 666, 410, 612, 555, 596, 753, 652, 411, 628, 542, 645, 575, 640, 538, 647, 581, 604, 646, 315, 565, 671, 522, 629, 576, 592, 780, 626, 429, 611, 537, 635, 631, 418, 570, 729, 590, 677, 610, 393, 551, 653, 578, 620, 583, 640, 536, 655, 587, 591, 767, 625, 490, 622, 529, 637 -------------------------------------------------------------------------------- /PythonOOP/Presentation_helper.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# The El Farol Bar Problem\n", 8 | "### Should I go to the bar tonight?\n", 9 | "\n", 10 | "![diagram](./diagram.jpg)" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "### Hyperparameter Inputs:\n", 18 | "* number of iterations\n", 19 | "* number of agents\n", 20 | "* maximum memory length of agents\n", 21 | "* number of strategies per agent\n", 22 | "* cutoff value (at what point is it no longer fun to go?)\n", 23 | "* seed (for reproducibility)\n", 24 | "\n", 25 | "In my application, there's a few extra prompts for input...\n", 26 | "* do you want a log file generated at the end of the simulation?\n", 27 | "* what kind of plot do you want to generate for bar attendance? Moving average or every iteration?" 28 | ] 29 | } 30 | ], 31 | "metadata": { 32 | "kernelspec": { 33 | "display_name": "Python 3", 34 | "language": "python", 35 | "name": "python3" 36 | }, 37 | "language_info": { 38 | "codemirror_mode": { 39 | "name": "ipython", 40 | "version": 3 41 | }, 42 | "file_extension": ".py", 43 | "mimetype": "text/x-python", 44 | "name": "python", 45 | "nbconvert_exporter": "python", 46 | "pygments_lexer": "ipython3", 47 | "version": "3.6.3" 48 | } 49 | }, 50 | "nbformat": 4, 51 | "nbformat_minor": 2 52 | } 53 | -------------------------------------------------------------------------------- /PythonOOP/README.md: -------------------------------------------------------------------------------- 1 | # Design Document 2 | Object Oriented Programming Example by John Boudreaux 3 | 4 | ## How to use 5 | * Have Python >= 3.5 installed on your machine 6 | * Clone or download this repository 7 | * In a shell, execute the `elFarol.py` script with Python 3 8 | 9 | ## Contents 10 | * __Farol Module__: A folder containing Python code for all the applicable classes included in the simulation; see *Object Based Considerations* below 11 | * __Farol_logfile.txt__: An example log file if the log option is shown 12 | * __Presentation_Helper.ipynb__: A brief Jupyter notebook introducing the problem with a visual 13 | * __README.md__: What you are reading right now :) 14 | * __diagram.jpg__: Picture used in `Presentation_Helper.ipynb` 15 | * __elFarol.py__: Main script for running El Farol simulation 16 | 17 | 18 | ## Background 19 | * Simulations of the El Farol problem 20 | * https://en.wikipedia.org/wiki/El_Farol_Bar_problem 21 | 22 | The overall goal of this project would be to run simulations around the El Farol problem which tries to answer the question "should I go to the bar tonight?" The underlying premises are that there is a threshold value of people at the bar that, above the threshold 23 | the bar is too crowded and you would have had more fun at home, and below the threshold 24 | you would have had more fun at the bar. All of the decision making "agents" must 25 | make a decision to go out or stay in at the same time, and will adjust their strategies 26 | to try to maximize their happiness. Each agent will have a slightly different willingness 27 | to change their strategy, but each will have the same memory length (unless it 28 | looks like I need more lines of code, then maybe we look at agents of different 29 | memory lengths). 30 | 31 | This is a fundamental problem in complexity theory, and shows how even very random 32 | situations can lead to emergent behavior. 33 | 34 | ## Dependencies 35 | * math 36 | * random 37 | * matplotlib 38 | 39 | 40 | ## Object based considerations 41 | * Class Simulation 42 | * attributes: 43 | * num_of_iterations 44 | * num_of_agents 45 | * set_seed 46 | * threshold 47 | * memory_length 48 | * dict_bar_attendance 49 | * methods: 50 | * run_simulation 51 | * go_to_bar 52 | * Class agent(dict) 53 | * attributes: 54 | * strategies 55 | * willingness_to_change 56 | * methods: 57 | * evaluate_best_strategy 58 | * adjust_strategy_weights 59 | 60 | -------------------------------------------------------------------------------- /PythonOOP/diagram.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/boudrejp/DataSciencePortfolio/c71899edbf52c8e68ffca49e29afe73dcdfd7037/PythonOOP/diagram.jpg -------------------------------------------------------------------------------- /PythonOOP/elFarol.py: -------------------------------------------------------------------------------- 1 | # A script to run the main routine of simulating the El Farol problem 2 | # background: https://en.wikipedia.org/wiki/El_Farol_Bar_problem 3 | 4 | import random 5 | import math 6 | import sys 7 | import FarolModule.agent as agent 8 | import FarolModule.simulation as simulation 9 | import FarolModule.strategy as strategy 10 | import matplotlib.pyplot as plt 11 | 12 | # fun text from: https://www.messletters.com/en/big-text/ 13 | print("") 14 | print(""":::::::::: ::: :::::::::: ::: ::::::::: :::::::: ::: 15 | :+: :+: :+: :+: :+: :+: :+: :+: :+: :+: 16 | +:+ +:+ +:+ +:+ +:+ +:+ +:+ +:+ +:+ +:+ 17 | +#++:++# +#+ :#::+::# +#++:++#++: +#++:++#: +#+ +:+ +#+ 18 | +#+ +#+ +#+ +#+ +#+ +#+ +#+ +#+ +#+ +#+ 19 | #+# #+# #+# #+# #+# #+# #+# #+# #+# #+# 20 | ########## ########## ### ### ### ### ### ######## ########## """) 21 | 22 | print("") 23 | print("Let's run a simulation of the El Farol problem. I'll need a few parameters...") 24 | print("") 25 | 26 | # user input for parameter selection 27 | iterations = int(input("Select a number of iterations for your simulation (integer value): ")) 28 | num_of_agents = int(input("Select a number of agents for your simulation (integer value): ")) 29 | mem_length = int(input("Select a number for the length of the agent memories (integer value): ")) 30 | num_strats = int(input("Select a number of strategies for each agent (integer value): ")) 31 | cutoff = int(input("Select a cutoff percentage (integer value): ")) / 100 32 | seed = int(input("Choose a seed value (integer value): ")) 33 | logging = input("Do you want to save a log file? [y/n]: ").lower() 34 | if logging != 'y' and logging != 'n': 35 | raise ValueError("Invalid input for logging- must choose y or n") 36 | 37 | plot_option = input("Moving average plot or plot each iteration? [ma] or [it]: ") 38 | if plot_option == "ma": 39 | moving_avg_num = int(input("Select a number of iterations for moving average plot: ")) 40 | elif plot_option != "it": 41 | raise ValueError("Invalid input for plot options- must choose ma or it") 42 | 43 | # for reproducibility 44 | random.seed(seed) 45 | 46 | print("") 47 | # create and run the simulation 48 | el_farol = simulation.simulation(num_of_agents, iterations, cutoff, mem_length, num_strats) 49 | el_farol.run_simulation() 50 | 51 | # plotting 52 | if plot_option == "it": 53 | # create chart of each iteration 54 | title_string = "El Farol Bar Attendance" 55 | 56 | plt.plot(el_farol.history) 57 | plt.axis([0, iterations, 0, num_of_agents]) 58 | plt.title(title_string) 59 | plt.show() 60 | 61 | elif plot_option == "ma": 62 | # create moving average from history 63 | moving_avg = [] 64 | history_length = len(el_farol.history) 65 | 66 | for i in range(history_length- moving_avg_num): 67 | if i < moving_avg_num: 68 | moving_avg_val = sum(el_farol.history[0:i+1])/\ 69 | len(el_farol.history[0:i+1]) 70 | else: 71 | moving_avg_val = sum(el_farol.history[i-moving_avg_num:i+1])/\ 72 | len(el_farol.history[i-moving_avg_num:i+1]) 73 | moving_avg.append(moving_avg_val) 74 | 75 | # create moving average chart 76 | title_string = "El Farol Bar Attendance: " + str(moving_avg_num) +\ 77 | "-point Moving Average" 78 | 79 | plt.plot(moving_avg) 80 | plt.axis([0, iterations, 0, num_of_agents]) 81 | plt.title(title_string) 82 | plt.show() 83 | 84 | # log the results 85 | def log_results(iterations, num_of_agents, mem_length, cutoff, seed, history): 86 | '''A function that will take the results of a simulation and write them into 87 | a log file in the repository to use for further analysis''' 88 | iter_string = "Iterations: " + str(iterations) 89 | agents_string = "Number of agents: " + str(num_of_agents) 90 | mem_string = "Agent memory length: " + str(mem_length) 91 | cutoff_string = "Attendance percentage cutoff: " + str(cutoff * 100) 92 | seed_string = "Random seed: " + str(seed) 93 | 94 | log_string = iter_string + "\n" + agents_string + "\n" +\ 95 | mem_string + "\n" + cutoff_string + "\n" +\ 96 | seed_string + "\n\n" + "History: " + "\n" +\ 97 | ', '.join(map(str, el_farol.history)) 98 | 99 | fh = open("Farol_logfile.txt", "w") 100 | fh.writelines(log_string) 101 | fh.close() 102 | print("Log file created in this directory at 'Farol_logfile.txt'") 103 | 104 | if logging == "y": 105 | log_results(iterations, num_of_agents, mem_length, cutoff, seed, el_farol.history) 106 | -------------------------------------------------------------------------------- /PythonWebScraping/PreProcessingScript.R: -------------------------------------------------------------------------------- 1 | ###data preprocessing 2 | 3 | 4 | df <- read.csv("C:/Users/John/Documents/GitHub/RugbyStats/testpage.csv") 5 | 6 | ###data tends to look good, other than some strange positions 7 | df$position <- gsub("\\(", "", df$position) 8 | df$position <- gsub("\\)", "", df$position) 9 | 10 | unique.positions <- unique(df$position) 11 | print(unique.positions) 12 | 13 | ### want to treat BP, R, UB somehow 14 | ### first guesses as to what these mean... 15 | ### BR = back row. we can likely set these to flanker (F) as a close approximation 16 | ### R = reserve. unless there is significant scoring here, likely get rid of 17 | ### UB = utility back. unless there is significant scoring here, likely get rid of 18 | 19 | length(which(df$position == "BR" & df$points != 0)) 20 | ### there's only 4 nonzero entries. going to assign these to flanker 21 | df$position[df$position == "BR"] <- "F" 22 | 23 | length(which(df$position == "R" & df$points != 0)) 24 | ### there's 31 nonzero point entries here, compared to 27,000 total entries 25 | ### i can likely get rid of these without significantly changing results 26 | df <- df[-which(df$position == "R"),] 27 | 28 | length(which(df$position == "UB" & df$points != 0)) 29 | ### there's 1 nonzero point entries here, compared to 27,000 total entries 30 | ### i can likely get rid of these without significantly changing results 31 | df <- df[-which(df$position == "UB"),] 32 | 33 | ### check on the points column. this one seems to be weird... 34 | print(unique(df$points)) 35 | 36 | ### take a look into the ones with G's... don't really have an explanation 37 | df[grep("G", df$points),] 38 | length(df[grep("G", df$points),]) 39 | 40 | ### only 10 entries... i can get rid of these without significantly affecting analysis 41 | 42 | df <- df[-grep("G", df$points),] 43 | print(unique(df$points)) 44 | 45 | df$points <- as.numeric(as.character(df$points)) 46 | 47 | ### get rid of all entries that are zero- this doesn't add anything and will just 48 | ### make tableau calculations slower 49 | 50 | df <- df[-which(df$points == 0),] 51 | # df$points <- as.numeric(df$points) 52 | summary(df) 53 | 54 | countries <- read.csv("C:/Users/John/Documents/GitHub/RugbyStats/countries.csv") 55 | 56 | ### add in the tiers for both country, opposition 57 | country.matching <- match(df$country, countries$Country_abbrev) 58 | opposition.matching <- match(df$opposition, countries$Country_name) 59 | df$country_tier <- countries$Country_tier[country.matching] 60 | df$opposition_tier <- countries$Country_tier[opposition.matching] 61 | 62 | ### check any NA's that are left over. They are likely tier 3, but could be typo... 63 | 64 | unique(df[is.na(df$country_tier), "country"]) 65 | ### Lions, Korea, HK, PI, Arab, Cooks 66 | ### give Lions their own category. could be fun to compare them to tier 1 nations 67 | df$country_tier[df$country == "Lions"] <- "Lions" 68 | 69 | ### the rest of these are tier 3 70 | 71 | df$country_tier[is.na(df$country_tier)] <- 3 72 | 73 | unique(df[is.na(df$opposition_tier), "opposition"]) 74 | 75 | ### there's a lot of national developmental teams (NZ Maori, Argentina XV, etc..) 76 | ### for consistency, we will delete these 77 | emerging.natl.teams <- unique(df[is.na(df$opposition_tier), "opposition"])[c(c(10:15),c(17:26))] 78 | 79 | 80 | for (i in 1:length(emerging.natl.teams)){ 81 | rows.to.del <- which(df$opposition == emerging.natl.teams[i]) 82 | df <- df[-rows.to.del,] 83 | } 84 | 85 | ### USA is tier 2 86 | df$opposition_tier[df$opposition == "USA"] <- 2 87 | 88 | ### make separate tier barbarians/lions 89 | df$opposition_tier[df$opposition == "Barbarians" | df$opposition == "Lions"] <- "Lions/Barb" 90 | 91 | ### remaining are all tier 3 92 | df$opposition_tier[is.na(df$opposition_tier)] <- 3 93 | 94 | ### let's make columns for the amount of points scored per each type 95 | 96 | df$points_tries <- df$tries * 5 97 | df$points_convs <- df$conv * 2 98 | df$points_pens <- df$penalties * 3 99 | df$points_drops <- df$drop_kicks * 3 100 | 101 | 102 | 103 | write.csv(df, file = "C:/Users/John/Documents/GitHub/RugbyStats/preprocessed.csv") -------------------------------------------------------------------------------- /PythonWebScraping/README.md: -------------------------------------------------------------------------------- 1 | # RugbyStats 2 | A look into ESPN's available international rugby union scoring stats via StatsGuru 3 | 4 | ## Contents 5 | * __countries.csv__: A list of countries, along with their international rugby tier, and abbreviations used if applicable. 6 | * __preprocessed.csv__: The scraped data from `rugby_stats_final.csv` after processed through `PreProcessingScript.R` with new features 7 | * __PreProcessingScript.R__: R script to do some general processing and addition of new features which may be useful for further analysis 8 | * __rugby_stats_final.csv__: The output of web scraping from the ESPN StatsGuru database via `scraper.py` 9 | * __scraper.py__: Simple web scraper using Python's `beautifulsoup` 10 | * __README.md__: The document you are currently reading 11 | 12 | ## Supplemental Information 13 | Rugby internationals Tier list: 14 | https://en.wikipedia.org/wiki/List_of_international_rugby_union_teams 15 | 16 | ESPN StatsGuru: 17 | http://stats.espnscrum.com/statsguru/rugby/stats/index.html 18 | -------------------------------------------------------------------------------- /PythonWebScraping/countries.csv: -------------------------------------------------------------------------------- 1 | Country_name,Country_tier,Country_abbrev 2 | New Zealand,1,NZ 3 | South Africa,1,SA 4 | England,1,Eng 5 | Australia,1,Aust 6 | France,1,Fra 7 | Ireland,1,Ire 8 | Wales,1,Wales 9 | Argentina,1,Arg 10 | Scotland,1,Scot 11 | Italy,1,Italy 12 | Georgia,2,Georg 13 | Portugal,2,Port 14 | Romania,2,Rom 15 | Russia,2,Russ 16 | Spain,2,Spain 17 | Germany,2,Germ 18 | Canada,2,Can 19 | United States,2,USA 20 | Uruguay,2,Urug 21 | Namibia,2,Namib 22 | Japan,2,JPN 23 | Fiji,2,Fiji 24 | Samoa,2,Samoa 25 | Tonga,2,Tonga 26 | Belgium,3,Belg 27 | Brazil,3,Braz 28 | Chile,3,Chile 29 | Ivory Coast,3, 30 | Kenya,3,Kenya 31 | Zimbabwe,3, 32 | Hong Kong,3, 33 | South Korea,3, 34 | Andorra,3, 35 | Armenia,3, 36 | Austria,3, 37 | Azerbaijan,3, 38 | Bahamas,3, 39 | Bahrain,3, 40 | Barbados,3, 41 | Bermuda,3, 42 | Bosnia and Herzegovina,3, 43 | Botswana,3, 44 | British Virgin Islands,3, 45 | Bulgaria,3, 46 | Burundi,3, 47 | Cambodia,3, 48 | Cameroon,3, 49 | Cayman Islands,3, 50 | China,3, 51 | Chinese Taipei,3, 52 | Colombia,3, 53 | Costa Rica,3, 54 | Croatia,3, 55 | Czech Republic,3,Czech 56 | Denmark,3, 57 | Ecuador,3, 58 | Egypt,3, 59 | Finland,3, 60 | Greece,3, 61 | Ghana,3, 62 | Guam,3, 63 | Guatemala,3, 64 | Guyana,3, 65 | Honduras,3, 66 | Hungary,3, 67 | India,3, 68 | Indonesia,3, 69 | Israel,3, 70 | Jamaica,3, 71 | Egypt,3, 72 | Kazakhstan,3,Kazak 73 | Kyrgyzstan,3, 74 | Laos,3, 75 | Latvia,3, 76 | Lebanon,3, 77 | Lithuania,3, 78 | Luxembourg,3, 79 | Madagascar,3, 80 | Malaysia,3, 81 | Mali,3, 82 | Malta,3, 83 | Montenegro,3, 84 | Mauritania,3, 85 | Mauritius,3, 86 | Mexico,3, 87 | Moldova,3, 88 | Monaco,3, 89 | Mongolia,3, 90 | Morocco,3, 91 | Netherlands,3, 92 | Nigeria,3, 93 | Norway,3, 94 | Pakistan,3, 95 | Paraguay,3, 96 | Peru,3, 97 | Philippines,3,Phil 98 | Poland,3, 99 | Qatar,3, 100 | Rwanda,3, 101 | Senegal,3, 102 | Serbia,3, 103 | Singapore,3,Sing 104 | Slovenia,3, 105 | Sri Lanka,3,SL 106 | Saint Lucia,3, 107 | Saint Vincent and the Grenadines,3, 108 | Swaziland,3, 109 | Sweden,3, 110 | Switzerland,3, 111 | Tanzania,3, 112 | Thailand,3, 113 | Togo,3, 114 | Tunisia,3, 115 | Trinidad and Tobago,3, 116 | Uganda,3, 117 | Ukraine,3,Ukr 118 | United Arab Emirates,3,UAE 119 | Uzbekistan,3, 120 | Venezuela,3, 121 | Zambia,3, 122 | -------------------------------------------------------------------------------- /PythonWebScraping/scraper.py: -------------------------------------------------------------------------------- 1 | ### web scraper to take in data from ESPN Rugby StatsGuru 2 | ### author: boudrejp 3 | ### url: http://stats.espnscrum.com/statsguru/rugby/stats/index.html?class=1;page=1;spanmax1=08+Nov+2017;spanmax2=08+Nov+2017;spanmin1=08+Nov+2007;spanmin2=08+Nov+2015;spanval1=span;spanval2=span;template=results;type=player;view=match 4 | ### can simply loop thru page numbers: this query gives 1061 pages of results 5 | 6 | 7 | import time 8 | from time import sleep 9 | from datetime import datetime 10 | from random import randint 11 | from bs4 import BeautifulSoup 12 | import pandas as pd 13 | import numpy as np 14 | from urllib.request import Request, urlopen 15 | from urllib.error import URLError 16 | 17 | ###all of the variables that we'll be importing 18 | player_list = [] 19 | country_list = [] 20 | position_list = [] 21 | points_list = [] 22 | tries_list = [] 23 | conv_list = [] 24 | pens_list = [] 25 | drops_list = [] 26 | result_list = [] 27 | opp_list = [] 28 | 29 | ###track the error iterateration -> we can go back and get these later 30 | 31 | print('libraries successfully imported') 32 | 33 | def table_url_page(page): 34 | return('http://stats.espnscrum.com/statsguru/rugby/stats/index.html?class=1;page=' + str(page) + 35 | ';spanmax1=08+Nov+2017;spanmax2=08+Nov+2017;spanmin1=08+Nov+2007;spanmin2=08+Nov+2015;' + 36 | 'spanval1=span;spanval2=span;template=results;type=player;view=match') 37 | 38 | ##will need to loop for all pages 39 | ##use a time delay to not use up too many server resources 40 | page_range = range(1,1601+1) 41 | 42 | time_start = datetime.now() 43 | 44 | for i in page_range: 45 | req = Request(table_url_page(i)) 46 | try: 47 | response = urlopen(req) 48 | except URLError as e: 49 | if hasattr(e, 'reason'): 50 | print('We failed to reach a server.') 51 | print('Reason: ', e.reason) 52 | elif hasattr(e, 'code'): 53 | print('The server couldn\'t fulfill the request.') 54 | print('Error code: ', e.code) 55 | else: 56 | sleep(randint(5,30)) 57 | 58 | with urlopen(table_url_page(i)) as url: 59 | s = url.read() 60 | 61 | soup = BeautifulSoup(s) 62 | #print(soup) 63 | table_entries = soup.find_all("tr", class_="data1") 64 | ###returned is a list of all the HTML that is under this class... 65 | ###each instance is saved as a separate entry 66 | #print(letters[1]) 67 | entries_on_page = len(table_entries) 68 | #print(entries_on_page) 69 | 70 | ###now, to trim the data into the categories that we want 71 | ###for loop for each entry in the table on the given page. store all the values 72 | for j in range(len(range(1,entries_on_page + 1))): 73 | entry = table_entries[j] 74 | player = entry.find("a", class_="data-link").get_text() 75 | player = str(player) 76 | 77 | country = entry.find("i").get_text() 78 | country = str(country) 79 | ##get rid of parentheses around country abbreviation 80 | country = country.replace("(", "") 81 | country = country.replace(")", "") 82 | 83 | position = entry.find_all("td", class_="left")[1].get_text() 84 | position = str(position) 85 | 86 | points_scored = entry.find("b").get_text() 87 | points_scored = str(points_scored) 88 | 89 | tries_scored = entry.find_all("td")[3].get_text() 90 | tries_scored = str(tries_scored) 91 | 92 | conversions_scored = entry.find_all("td")[4].get_text() 93 | conversions_scored = str(conversions_scored) 94 | 95 | penalties_scored = entry.find_all("td")[5].get_text() 96 | penalties_scored = str(penalties_scored) 97 | 98 | drops_scored = entry.find_all("td")[6].get_text() 99 | drops_scored = str(drops_scored) 100 | 101 | result = entry.find_all("td", class_="left")[2].get_text() 102 | result = str(result) 103 | 104 | opposition = entry.find_all("td", class_="left")[3].get_text() 105 | opposition = str(opposition) 106 | opposition = opposition.replace("v ", "") 107 | 108 | #print("Player is " + player) 109 | #print("Country is " + country) 110 | #print("Position is " + position) 111 | #print("Scored " + points_scored + " points") 112 | #print("Scored " + tries_scored + " tries") 113 | #print("Scored " + conversions_scored + " conversions") 114 | #print("Scored " + penalties_scored + " pens") 115 | #print("Scored " + drops_scored + " drop goals") 116 | #print("Result of game was " + result) 117 | #print("The opposition was " + opposition) 118 | 119 | player_list.append(player) 120 | country_list.append(country) 121 | position_list.append(position) 122 | points_list.append(points_scored) 123 | tries_list.append(tries_scored) 124 | conv_list.append(conversions_scored) 125 | pens_list.append(penalties_scored) 126 | drops_list.append(drops_scored) 127 | result_list.append(result) 128 | opp_list.append(opposition) 129 | 130 | time_loop = datetime.now() 131 | print("Loop " + str(i) + " completed. Total runtime so far: " + str(time_loop - time_start)) 132 | 133 | if i % 5 == 0: 134 | rugby_df_temp = pd.DataFrame( 135 | {"player": player_list, 136 | "country": country_list, 137 | "position": position_list, 138 | "points": points_list, 139 | "tries": tries_list, 140 | "conv": conv_list, 141 | "penalties": pens_list, 142 | "drop_kicks": drops_list, 143 | "result": result_list, 144 | "opposition": opp_list 145 | }) 146 | 147 | rugby_df_temp.to_csv('testpage.csv', index = False) 148 | print("CSV created with all data from " + str(i) + " iterations") 149 | 150 | 151 | time_end = datetime.now() 152 | time_taken = time_end - time_start 153 | 154 | print("Time taken: " + str(time_taken)) 155 | 156 | rugby_df_final = pd.DataFrame( 157 | {"player": player_list, 158 | "country": country_list, 159 | "position": position_list, 160 | "points": points_list, 161 | "tries": tries_list, 162 | "conv": conv_list, 163 | "penalties": pens_list, 164 | "drop_kicks": drops_list, 165 | "result": result_list, 166 | "opposition": opp_list 167 | }) 168 | 169 | rugby_df_final.to_csv('rugby_stats_final.csv', index = False) 170 | 171 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DataSciencePortfolio 2 | A collection of selected data science projects to demonstrate skill sets, knowledge, and what I find interesting 3 | 4 | ## Contents 5 | * __PythonOOP__: An object oriented programming example in Python 3 which will run simulations of the El Farol problem from complexity theory 6 | * __PythonWebScraping__: Simple web scraping example using Python's `beautifulsoup` and some initial processing, feature generation in R for international rugby scoring statistics. 7 | * __SQL__: Examples of executing and formulating SQL queries for selected simulated business cases. 8 | 9 | ## In Progress 10 | * __Movement of old projects into this repository__ 11 | * Machine learning projects (multiple) 12 | * Dockerized applications 13 | * __New and ongoing projects__ 14 | * More machine learning projects 15 | * Focus on deep learning architectures, reinforcement learning, time series applications 16 | * R Shiny applications, to be deployed to public R Shiny Server 17 | * Other application + visualization tools 18 | * Custom-written ML algorithms 19 | * Re-implementations to dive deeper into theory 20 | * Possible variants on existing algorithms 21 | 22 | -------------------------------------------------------------------------------- /SQL/README.md: -------------------------------------------------------------------------------- 1 | # SQL Projects 2 | In this folder you'll find side projects utilizing various SQL and SQL-like database technologies. In general, these aim to show how I might formulate a query for a business application. 3 | 4 | ### Contents 5 | * __sql_bigquery.ipynb__: Queryies utilizing Google BigQuery for the San Francisco bikeshares dataset, doing some exploratory data analysis and writing queries for a few business problems. 6 | -------------------------------------------------------------------------------- /SQL/sql_bigquery.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Using BigQuery to perform basic data analytics\n", 8 | "\n", 9 | "Here's a sample notebook of executing SQL commands in order to analyze some data, along with some basic visualization.\n", 10 | "\n", 11 | "We'll explore some data sets and reproduce how we might write queries for certain business problems." 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "### Setup" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "# relevant installs\n", 28 | "# !pip install google-cloud \n", 29 | "# !pip install --upgrade google-cloud-bigquery[pandas] \n", 30 | "# !pip install google-cloud-storage" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 2, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "%load_ext google.cloud.bigquery" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "SERVICE_ACCOUNT= 'bq_jupyter'\n", 49 | "JSON_FILE_NAME = '../credentials/ds-portfolio-a04fdb631b73.json'\n", 50 | "GCP_PROJECT_ID = 'ds-portfolio'" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 4, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "import subprocess\n", 60 | "import sys\n", 61 | "import logging\n", 62 | "import pandas as pd\n", 63 | "import numpy as np\n", 64 | "import matplotlib.pyplot as plt\n", 65 | "import seaborn as sns\n", 66 | "from scipy import stats\n", 67 | "\n", 68 | "logger = logging.Logger('catch_all')\n", 69 | "\n", 70 | "def run_command(parameters):\n", 71 | " try:\n", 72 | " # \"\"\"Prints and runs a command.\"\"\"\n", 73 | " return subprocess.check_output(parameters)\n", 74 | " except BaseException as e: \n", 75 | " logger.error(e) \n", 76 | " logger.error('ERROR: Looking in jupyter console for more information')" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 5, 82 | "metadata": {}, 83 | "outputs": [], 84 | "source": [ 85 | "%matplotlib inline" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "### Queries" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "We'll be using the San Francisco Bikeshares dataset, which contains information around trips for the bikeshare program in San Francisco." 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 6, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "from google.cloud import bigquery\n", 109 | "\n", 110 | "client = bigquery.Client.from_service_account_json(JSON_FILE_NAME)\n", 111 | "\n", 112 | "def query_to_df(query):\n", 113 | " # transfers query results to pandas dataframe for easy manipulating\n", 114 | " return(client.query(query).result().to_dataframe())\n", 115 | "\n", 116 | "def get_schema(table):\n", 117 | " # retreives the schema as a printed object\n", 118 | " return(client.get_table(table).schema)" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 124 | "source": [ 125 | "There's 4 different tables in this database. As a first step, we should look at the schema of all of these tables and see where we might be able to join for insight in future queries.\n", 126 | "\n", 127 | "The tables are...\n", 128 | "* bikeshare_regions\n", 129 | "* bikeshare_station_info\n", 130 | "* bikeshare_station_status\n", 131 | "* bikeshare_trips" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 7, 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "data": { 141 | "text/plain": [ 142 | "[SchemaField('region_id', 'INTEGER', 'REQUIRED', 'Unique identifier for the region', ()),\n", 143 | " SchemaField('name', 'STRING', 'REQUIRED', 'Public name for this region', ())]" 144 | ] 145 | }, 146 | "execution_count": 7, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "#bikeshare_regions\n", 153 | "table = 'bigquery-public-data.san_francisco_bikeshare.bikeshare_regions'\n", 154 | "get_schema(table)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 8, 160 | "metadata": {}, 161 | "outputs": [ 162 | { 163 | "data": { 164 | "text/plain": [ 165 | "[SchemaField('station_id', 'INTEGER', 'REQUIRED', 'Unique identifier of a station.', ()),\n", 166 | " SchemaField('name', 'STRING', 'REQUIRED', 'Public name of the station', ()),\n", 167 | " SchemaField('short_name', 'STRING', 'NULLABLE', 'Short name or other type of identifier, as used by the data publisher', ()),\n", 168 | " SchemaField('lat', 'FLOAT', 'REQUIRED', 'The latitude of station. The field value must be a valid WGS 84 latitude in decimal degrees format. See: http://en.wikipedia.org/wiki/World_Geodetic_System, https://en.wikipedia.org/wiki/Decimal_degrees', ()),\n", 169 | " SchemaField('lon', 'FLOAT', 'REQUIRED', 'The longitude of station. The field value must be a valid WGS 84 longitude in decimal degrees format. See: http://en.wikipedia.org/wiki/World_Geodetic_System, https://en.wikipedia.org/wiki/Decimal_degrees', ()),\n", 170 | " SchemaField('region_id', 'INTEGER', 'NULLABLE', 'ID of the region where station is located', ()),\n", 171 | " SchemaField('rental_methods', 'STRING', 'NULLABLE', 'Array of enumerables containing the payment methods accepted at this station. Current valid values (in CAPS) are: KEY (i.e. operator issued bike key / fob / card) CREDITCARD PAYPASS APPLEPAY ANDROIDPAY TRANSITCARD ACCOUNTNUMBER PHONE This list is intended to be as comprehensive at the time of publication as possible but is subject to change, as defined in File Requirements above', ()),\n", 172 | " SchemaField('capacity', 'INTEGER', 'NULLABLE', 'Number of total docking points installed at this station, both available and unavailable', ()),\n", 173 | " SchemaField('external_id', 'STRING', 'NULLABLE', '', ()),\n", 174 | " SchemaField('rental_url', 'STRING', 'NULLABLE', '', ()),\n", 175 | " SchemaField('eightd_has_key_dispenser', 'BOOLEAN', 'NULLABLE', '', ()),\n", 176 | " SchemaField('has_kiosk', 'BOOLEAN', 'NULLABLE', '', ()),\n", 177 | " SchemaField('station_geom', 'GEOGRAPHY', 'NULLABLE', '', ())]" 178 | ] 179 | }, 180 | "execution_count": 8, 181 | "metadata": {}, 182 | "output_type": "execute_result" 183 | } 184 | ], 185 | "source": [ 186 | "#bikeshare_station_info\n", 187 | "table = 'bigquery-public-data.san_francisco_bikeshare.bikeshare_station_info'\n", 188 | "get_schema(table)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 9, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "[SchemaField('station_id', 'INTEGER', 'REQUIRED', 'Unique identifier of a station', ()),\n", 200 | " SchemaField('num_bikes_available', 'INTEGER', 'REQUIRED', 'Number of bikes available for rental', ()),\n", 201 | " SchemaField('num_bikes_disabled', 'INTEGER', 'NULLABLE', 'Number of disabled bikes at the station. Vendors who do not want to publicize the number of disabled bikes or docks in their system can opt to omit station capacity (in station_information), num_bikes_disabled and num_docks_disabled. If station capacity is published then broken docks/bikes can be inferred (though not specifically whether the decreased capacity is a broken bike or dock)', ()),\n", 202 | " SchemaField('num_docks_available', 'INTEGER', 'REQUIRED', 'Number of docks accepting bike returns', ()),\n", 203 | " SchemaField('num_docks_disabled', 'INTEGER', 'NULLABLE', 'Number of empty but disabled dock points at the station. This value remains as part of the spec as it is possibly useful during development', ()),\n", 204 | " SchemaField('is_installed', 'BOOLEAN', 'REQUIRED', '1/0 boolean - is the station currently on the street', ()),\n", 205 | " SchemaField('is_renting', 'BOOLEAN', 'REQUIRED', '1/0 boolean - is the station currently renting bikes (even if the station is empty, if it is set to allow rentals this value should be 1)', ()),\n", 206 | " SchemaField('is_returning', 'BOOLEAN', 'REQUIRED', '1/0 boolean - is the station accepting bike returns (if a station is full but would allow a return if it was not full then this value should be 1)', ()),\n", 207 | " SchemaField('last_reported', 'INTEGER', 'REQUIRED', 'Integer POSIX timestamp indicating the last time this station reported its status to the backend', ()),\n", 208 | " SchemaField('num_ebikes_available', 'INTEGER', 'NULLABLE', '', ()),\n", 209 | " SchemaField('eightd_has_available_keys', 'BOOLEAN', 'NULLABLE', '', ())]" 210 | ] 211 | }, 212 | "execution_count": 9, 213 | "metadata": {}, 214 | "output_type": "execute_result" 215 | } 216 | ], 217 | "source": [ 218 | "#bikeshare_station_status\n", 219 | "table = 'bigquery-public-data.san_francisco_bikeshare.bikeshare_station_status'\n", 220 | "get_schema(table)" 221 | ] 222 | }, 223 | { 224 | "cell_type": "code", 225 | "execution_count": 10, 226 | "metadata": {}, 227 | "outputs": [ 228 | { 229 | "data": { 230 | "text/plain": [ 231 | "[SchemaField('trip_id', 'INTEGER', 'REQUIRED', 'Numeric ID of bike trip', ()),\n", 232 | " SchemaField('duration_sec', 'INTEGER', 'NULLABLE', 'Time of trip in seconds', ()),\n", 233 | " SchemaField('start_date', 'TIMESTAMP', 'NULLABLE', 'Start date of trip with date and time, in PST', ()),\n", 234 | " SchemaField('start_station_name', 'STRING', 'NULLABLE', 'Station name of start station', ()),\n", 235 | " SchemaField('start_station_id', 'INTEGER', 'NULLABLE', 'Numeric reference for start station', ()),\n", 236 | " SchemaField('end_date', 'TIMESTAMP', 'NULLABLE', 'End date of trip with date and time, in PST', ()),\n", 237 | " SchemaField('end_station_name', 'STRING', 'NULLABLE', 'Station name for end station', ()),\n", 238 | " SchemaField('end_station_id', 'INTEGER', 'NULLABLE', 'Numeric reference for end station', ()),\n", 239 | " SchemaField('bike_number', 'INTEGER', 'NULLABLE', 'ID of bike used', ()),\n", 240 | " SchemaField('zip_code', 'STRING', 'NULLABLE', 'Home zip code of subscriber (customers can choose to manually enter zip at kiosk however data is unreliable)', ()),\n", 241 | " SchemaField('subscriber_type', 'STRING', 'NULLABLE', 'Subscriber = annual or 30-day member; Customer = 24-hour or 3-day member', ()),\n", 242 | " SchemaField('c_subscription_type', 'STRING', 'NULLABLE', '', ()),\n", 243 | " SchemaField('start_station_latitude', 'FLOAT', 'NULLABLE', '', ()),\n", 244 | " SchemaField('start_station_longitude', 'FLOAT', 'NULLABLE', '', ()),\n", 245 | " SchemaField('end_station_latitude', 'FLOAT', 'NULLABLE', '', ()),\n", 246 | " SchemaField('end_station_longitude', 'FLOAT', 'NULLABLE', '', ()),\n", 247 | " SchemaField('member_birth_year', 'INTEGER', 'NULLABLE', '', ()),\n", 248 | " SchemaField('member_gender', 'STRING', 'NULLABLE', '', ()),\n", 249 | " SchemaField('bike_share_for_all_trip', 'STRING', 'NULLABLE', '', ()),\n", 250 | " SchemaField('start_station_geom', 'GEOGRAPHY', 'NULLABLE', '', ()),\n", 251 | " SchemaField('end_station_geom', 'GEOGRAPHY', 'NULLABLE', '', ())]" 252 | ] 253 | }, 254 | "execution_count": 10, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "#bikeshare_trips\n", 261 | "table = 'bigquery-public-data.san_francisco_bikeshare.bikeshare_trips'\n", 262 | "get_schema(table)" 263 | ] 264 | }, 265 | { 266 | "cell_type": "markdown", 267 | "metadata": {}, 268 | "source": [ 269 | "So when we take a look at the schemas, we can see that each table gives us some different information. A few things that jump out:\n", 270 | "* We get some interesting information from the `station_info` table regarding payment types. It could be interesting to look if stations with different payment types are associated with more or less rides.\n", 271 | "* The `bikeshare_trips` table will give us information around ride-by-ride stats and has unique identifiers around customers/members that use them.\n", 272 | " * We have additional information for members, but not for customers\n", 273 | " * This will allow us to take a look at where popular routes might be\n", 274 | " \n", 275 | "At this point, we can probably start looking at doing some queries for some explorative work, and seeing where we might be able to answer questions for real business impact.\n", 276 | "\n", 277 | "Let's start out by looking how many rides each of the bikes in our dataset have on them. This might give us an idea how much wear and tear these bikes have." 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 11, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "data": { 287 | "text/plain": [ 288 | "Text(0.5, 1.0, 'Number of rides per bike')" 289 | ] 290 | }, 291 | "execution_count": 11, 292 | "metadata": {}, 293 | "output_type": "execute_result" 294 | }, 295 | { 296 | "data": { 297 | "image/png": "\n", 298 | "text/plain": [ 299 | "
" 300 | ] 301 | }, 302 | "metadata": { 303 | "needs_background": "light" 304 | }, 305 | "output_type": "display_data" 306 | } 307 | ], 308 | "source": [ 309 | "# q1\n", 310 | "# which bikes have been used the most?\n", 311 | "QUERY = (\n", 312 | " \"\"\"\n", 313 | " SELECT\n", 314 | " COUNT(trip_id) AS num_trips, bike_number \n", 315 | " FROM\n", 316 | " `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`\n", 317 | " GROUP BY\n", 318 | " bike_number\n", 319 | " ORDER BY\n", 320 | " num_trips DESC\n", 321 | " \"\"\")\n", 322 | "\n", 323 | "\n", 324 | "ret_df = query_to_df(QUERY)\n", 325 | "plt.hist(ret_df.num_trips, bins = 20)\n", 326 | "plt.title(\"Number of rides per bike\")" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "So here we see that the distribution is not normal, and looks like there's two different fundamental groups that we're dealing with. We have one group of bikes that is used less than about 1000 times, and another normal-ish looking distribution centered around 2750. It might be interesting to look at the differences between these two groups of bikes- maybe they tend to be found on different routes? Maybe they have less miles on them, but just many more frequent rides?\n", 334 | "\n", 335 | "We'll start off by looking at the differences between the average ride time between the many-rides group and the few-rides group." 336 | ] 337 | }, 338 | { 339 | "cell_type": "code", 340 | "execution_count": 12, 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "data": { 345 | "text/plain": [ 346 | "Text(0, 0.5, 'Density/Frequency')" 347 | ] 348 | }, 349 | "execution_count": 12, 350 | "metadata": {}, 351 | "output_type": "execute_result" 352 | }, 353 | { 354 | "data": { 355 | "image/png": "\n", 356 | "text/plain": [ 357 | "
" 358 | ] 359 | }, 360 | "metadata": { 361 | "needs_background": "light" 362 | }, 363 | "output_type": "display_data" 364 | } 365 | ], 366 | "source": [ 367 | "# q2\n", 368 | "# compare average ride times for bikes above/below 1500 bikes\n", 369 | "\n", 370 | "# high rides query\n", 371 | "q2_a = (\n", 372 | " \"\"\"\n", 373 | " SELECT\n", 374 | " avg(duration_sec)/60 AS avg_trip_length_min, \n", 375 | " COUNT(trip_id) AS num_trips,\n", 376 | " bike_number \n", 377 | " FROM\n", 378 | " `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`\n", 379 | " GROUP BY\n", 380 | " bike_number\n", 381 | " HAVING\n", 382 | " num_trips >= 1500\n", 383 | " \"\"\")\n", 384 | "\n", 385 | "ret_df_a = query_to_df(q2_a)\n", 386 | "\n", 387 | "# low rides query\n", 388 | "q2_b = (\n", 389 | " \"\"\"\n", 390 | " SELECT\n", 391 | " avg(duration_sec)/60 AS avg_trip_length_min,\n", 392 | " COUNT(trip_id) AS num_trips,\n", 393 | " bike_number\n", 394 | " FROM\n", 395 | " `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`\n", 396 | " GROUP BY\n", 397 | " bike_number\n", 398 | " HAVING\n", 399 | " num_trips < 1500\n", 400 | " \"\"\")\n", 401 | "\n", 402 | "ret_df_b = query_to_df(q2_b)\n", 403 | "\n", 404 | "# plot them on the same plot with density lines rather than histograms\n", 405 | "sns.distplot(ret_df_a['avg_trip_length_min'], hist = False, kde = True,\n", 406 | " kde_kws = {'linewidth': 3},\n", 407 | " label = \"High Number of Trips\")\n", 408 | "sns.distplot(ret_df_b['avg_trip_length_min'], hist = False, kde = True,\n", 409 | " kde_kws = {'linewidth': 3},\n", 410 | " label = \"Low Number of Trips\")\n", 411 | "plt.legend(prop={'size':10})\n", 412 | "plt.title(\"Comparing High Usage Bikes to Low Usage\")\n", 413 | "plt.xlabel(\"Average ridetime\")\n", 414 | "plt.ylabel(\"Density/Frequency\")" 415 | ] 416 | }, 417 | { 418 | "cell_type": "markdown", 419 | "metadata": {}, 420 | "source": [ 421 | "So we can see that our bikes with lower number of trips have a higher variance around the average trip length, whereas the high usage bikes have much lower variance. This gives some evidence to our theory that perhaps the higher usage bikes are going on certain high-traffic routes (that presumably are around 17 minutes or so).\n", 422 | "\n", 423 | "Note that I used a density plot instead of comparing histograms. From a data visualization perspective, we want these plotted on the same axes to make this comparison easy to make. Doing overlapping histograms can get cluttered, so we opt instead for the density plot which comes across much cleaner with the same kind of takeaway as the histogram.\n", 424 | "\n", 425 | "In the query, we also use `HAVING` instead of `WHERE` since the condition is applied after our grouping aggregation.\n", 426 | "\n", 427 | "Interestingly, the overall mean of both of these appear to be the same. We'll calculate some basic statics below to confirm." 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 13, 433 | "metadata": {}, 434 | "outputs": [ 435 | { 436 | "data": { 437 | "text/html": [ 438 | "
\n", 439 | "\n", 452 | "\n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | "
avg_trip_length_minnum_tripsbike_number
count358.000000358.000000358.000000
mean16.1715502665.849162438.709497
std6.458857409.004550139.299144
min13.0698681517.00000016.000000
25%14.8661112469.750000349.250000
50%15.5456872754.000000447.500000
75%16.4817812947.000000545.500000
max133.6713343394.000000878.000000
\n", 512 | "
" 513 | ], 514 | "text/plain": [ 515 | " avg_trip_length_min num_trips bike_number\n", 516 | "count 358.000000 358.000000 358.000000\n", 517 | "mean 16.171550 2665.849162 438.709497\n", 518 | "std 6.458857 409.004550 139.299144\n", 519 | "min 13.069868 1517.000000 16.000000\n", 520 | "25% 14.866111 2469.750000 349.250000\n", 521 | "50% 15.545687 2754.000000 447.500000\n", 522 | "75% 16.481781 2947.000000 545.500000\n", 523 | "max 133.671334 3394.000000 878.000000" 524 | ] 525 | }, 526 | "execution_count": 13, 527 | "metadata": {}, 528 | "output_type": "execute_result" 529 | } 530 | ], 531 | "source": [ 532 | "# high usage\n", 533 | "ret_df_a.describe()" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 14, 539 | "metadata": {}, 540 | "outputs": [ 541 | { 542 | "data": { 543 | "text/html": [ 544 | "
\n", 545 | "\n", 558 | "\n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | "
avg_trip_length_minnum_tripsbike_number
count3594.0000003594.0000003594.000000
mean16.913051276.3063442152.675849
std5.593865205.0366521090.074206
min3.8583331.0000009.000000
25%13.594331114.2500001276.250000
50%15.837713230.5000002175.500000
75%18.869190416.0000003076.750000
max86.2119101471.0000004073.000000
\n", 618 | "
" 619 | ], 620 | "text/plain": [ 621 | " avg_trip_length_min num_trips bike_number\n", 622 | "count 3594.000000 3594.000000 3594.000000\n", 623 | "mean 16.913051 276.306344 2152.675849\n", 624 | "std 5.593865 205.036652 1090.074206\n", 625 | "min 3.858333 1.000000 9.000000\n", 626 | "25% 13.594331 114.250000 1276.250000\n", 627 | "50% 15.837713 230.500000 2175.500000\n", 628 | "75% 18.869190 416.000000 3076.750000\n", 629 | "max 86.211910 1471.000000 4073.000000" 630 | ] 631 | }, 632 | "execution_count": 14, 633 | "metadata": {}, 634 | "output_type": "execute_result" 635 | } 636 | ], 637 | "source": [ 638 | "# low usage\n", 639 | "ret_df_b.describe()" 640 | ] 641 | }, 642 | { 643 | "cell_type": "markdown", 644 | "metadata": {}, 645 | "source": [ 646 | "A few more things to note...\n", 647 | "\n", 648 | "* We have a lot more bikes in the low usage group compared to high usage group, by about 9x\n", 649 | "* The means are pretty close, but the standard deviations are less similar. The higher usage has a higher variance but lower mean.\n", 650 | "* We could run a t-test to see if the means are equal... with such large sample sizes we will likely come to the conclusion that they are in fact different. I'll skip over this for now.\n", 651 | "\n", 652 | "Let's take a look into the routes used and see if this explains the differences." 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": 15, 658 | "metadata": {}, 659 | "outputs": [ 660 | { 661 | "data": { 662 | "text/html": [ 663 | "
\n", 664 | "\n", 677 | "\n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | "
tripsstart_station_idend_station_id
087495060
181686965
272816150
366015061
465686569
565576074
660655170
759307050
857907461
957147470
1055975570
1151595070
1251136570
1350866477
1449777055
1549216769
1648877460
1748047764
1845306050
1943186939
2042413969
2142386957
2242317051
2341507074
2441116370
\n", 839 | "
" 840 | ], 841 | "text/plain": [ 842 | " trips start_station_id end_station_id\n", 843 | "0 8749 50 60\n", 844 | "1 8168 69 65\n", 845 | "2 7281 61 50\n", 846 | "3 6601 50 61\n", 847 | "4 6568 65 69\n", 848 | "5 6557 60 74\n", 849 | "6 6065 51 70\n", 850 | "7 5930 70 50\n", 851 | "8 5790 74 61\n", 852 | "9 5714 74 70\n", 853 | "10 5597 55 70\n", 854 | "11 5159 50 70\n", 855 | "12 5113 65 70\n", 856 | "13 5086 64 77\n", 857 | "14 4977 70 55\n", 858 | "15 4921 67 69\n", 859 | "16 4887 74 60\n", 860 | "17 4804 77 64\n", 861 | "18 4530 60 50\n", 862 | "19 4318 69 39\n", 863 | "20 4241 39 69\n", 864 | "21 4238 69 57\n", 865 | "22 4231 70 51\n", 866 | "23 4150 70 74\n", 867 | "24 4111 63 70" 868 | ] 869 | }, 870 | "execution_count": 15, 871 | "metadata": {}, 872 | "output_type": "execute_result" 873 | } 874 | ], 875 | "source": [ 876 | "# routes for high usage\n", 877 | "# q3 \n", 878 | "# maybe there's a more effecient way, but this works\n", 879 | "q3_a = (\n", 880 | " \"\"\"\n", 881 | " SELECT \n", 882 | " SUM(num_trips) as trips,\n", 883 | " start_station_id,\n", 884 | " end_station_id\n", 885 | " FROM\n", 886 | " (SELECT\n", 887 | " COUNT(trip_id) AS num_trips,\n", 888 | " start_station_id,\n", 889 | " end_station_id,\n", 890 | " bike_number\n", 891 | " FROM\n", 892 | " `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`\n", 893 | " WHERE\n", 894 | " bike_number IN \n", 895 | " (SELECT bike_number\n", 896 | " FROM\n", 897 | " (SELECT \n", 898 | " COUNT(trip_id) AS num_trips,\n", 899 | " bike_number\n", 900 | " FROM\n", 901 | " `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`\n", 902 | " GROUP BY\n", 903 | " bike_number\n", 904 | " HAVING\n", 905 | " num_trips >= 1500\n", 906 | " )\n", 907 | " )\n", 908 | " GROUP BY\n", 909 | " start_station_id, end_station_id, bike_number\n", 910 | " )\n", 911 | " GROUP BY\n", 912 | " start_station_id, end_station_id\n", 913 | " ORDER BY\n", 914 | " trips DESC\n", 915 | " LIMIT 25\n", 916 | " \"\"\")\n", 917 | "\n", 918 | "high_vol_routes = query_to_df(q3_a)\n", 919 | "high_vol_routes" 920 | ] 921 | }, 922 | { 923 | "cell_type": "code", 924 | "execution_count": 16, 925 | "metadata": {}, 926 | "outputs": [ 927 | { 928 | "data": { 929 | "text/html": [ 930 | "
\n", 931 | "\n", 944 | "\n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | " \n", 985 | " \n", 986 | " \n", 987 | " \n", 988 | " \n", 989 | " \n", 990 | " \n", 991 | " \n", 992 | " \n", 993 | " \n", 994 | " \n", 995 | " \n", 996 | " \n", 997 | " \n", 998 | " \n", 999 | " \n", 1000 | " \n", 1001 | " \n", 1002 | " \n", 1003 | " \n", 1004 | " \n", 1005 | " \n", 1006 | " \n", 1007 | " \n", 1008 | " \n", 1009 | " \n", 1010 | " \n", 1011 | " \n", 1012 | " \n", 1013 | " \n", 1014 | " \n", 1015 | " \n", 1016 | " \n", 1017 | " \n", 1018 | " \n", 1019 | " \n", 1020 | " \n", 1021 | " \n", 1022 | " \n", 1023 | " \n", 1024 | " \n", 1025 | " \n", 1026 | " \n", 1027 | " \n", 1028 | " \n", 1029 | " \n", 1030 | " \n", 1031 | " \n", 1032 | " \n", 1033 | " \n", 1034 | " \n", 1035 | " \n", 1036 | " \n", 1037 | " \n", 1038 | " \n", 1039 | " \n", 1040 | " \n", 1041 | " \n", 1042 | " \n", 1043 | " \n", 1044 | " \n", 1045 | " \n", 1046 | " \n", 1047 | " \n", 1048 | " \n", 1049 | " \n", 1050 | " \n", 1051 | " \n", 1052 | " \n", 1053 | " \n", 1054 | " \n", 1055 | " \n", 1056 | " \n", 1057 | " \n", 1058 | " \n", 1059 | " \n", 1060 | " \n", 1061 | " \n", 1062 | " \n", 1063 | " \n", 1064 | " \n", 1065 | " \n", 1066 | " \n", 1067 | " \n", 1068 | " \n", 1069 | " \n", 1070 | " \n", 1071 | " \n", 1072 | " \n", 1073 | " \n", 1074 | " \n", 1075 | " \n", 1076 | " \n", 1077 | " \n", 1078 | " \n", 1079 | " \n", 1080 | " \n", 1081 | " \n", 1082 | " \n", 1083 | " \n", 1084 | " \n", 1085 | " \n", 1086 | " \n", 1087 | " \n", 1088 | " \n", 1089 | " \n", 1090 | " \n", 1091 | " \n", 1092 | " \n", 1093 | " \n", 1094 | " \n", 1095 | " \n", 1096 | " \n", 1097 | " \n", 1098 | " \n", 1099 | " \n", 1100 | " \n", 1101 | " \n", 1102 | " \n", 1103 | " \n", 1104 | " \n", 1105 | "
tripsstart_station_idend_station_id
04930156
137582827
234442728
3312942
4309624
52872616
627168115
724693228
82468615
922771581
1022162832
112127166
122098182196
13187066
141841196182
1518255867
161734195182
1716772931
1816713129
1916592230
20162726
2116061727
2216043028
2315712330
2415624567
\n", 1106 | "
" 1107 | ], 1108 | "text/plain": [ 1109 | " trips start_station_id end_station_id\n", 1110 | "0 4930 15 6\n", 1111 | "1 3758 28 27\n", 1112 | "2 3444 27 28\n", 1113 | "3 3129 4 2\n", 1114 | "4 3096 2 4\n", 1115 | "5 2872 6 16\n", 1116 | "6 2716 81 15\n", 1117 | "7 2469 32 28\n", 1118 | "8 2468 6 15\n", 1119 | "9 2277 15 81\n", 1120 | "10 2216 28 32\n", 1121 | "11 2127 16 6\n", 1122 | "12 2098 182 196\n", 1123 | "13 1870 6 6\n", 1124 | "14 1841 196 182\n", 1125 | "15 1825 58 67\n", 1126 | "16 1734 195 182\n", 1127 | "17 1677 29 31\n", 1128 | "18 1671 31 29\n", 1129 | "19 1659 22 30\n", 1130 | "20 1627 2 6\n", 1131 | "21 1606 17 27\n", 1132 | "22 1604 30 28\n", 1133 | "23 1571 23 30\n", 1134 | "24 1562 45 67" 1135 | ] 1136 | }, 1137 | "execution_count": 16, 1138 | "metadata": {}, 1139 | "output_type": "execute_result" 1140 | } 1141 | ], 1142 | "source": [ 1143 | "# routes for low usage\n", 1144 | "# q3 \n", 1145 | "# maybe there's a more effecient way, but this works\n", 1146 | "q3_b = (\n", 1147 | " \"\"\"\n", 1148 | " SELECT \n", 1149 | " SUM(num_trips) as trips,\n", 1150 | " start_station_id,\n", 1151 | " end_station_id\n", 1152 | " FROM\n", 1153 | " (SELECT\n", 1154 | " COUNT(trip_id) AS num_trips,\n", 1155 | " start_station_id,\n", 1156 | " end_station_id,\n", 1157 | " bike_number\n", 1158 | " FROM\n", 1159 | " `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`\n", 1160 | " WHERE\n", 1161 | " bike_number IN \n", 1162 | " (SELECT bike_number\n", 1163 | " FROM\n", 1164 | " (SELECT \n", 1165 | " COUNT(trip_id) AS num_trips,\n", 1166 | " bike_number\n", 1167 | " FROM\n", 1168 | " `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`\n", 1169 | " GROUP BY\n", 1170 | " bike_number\n", 1171 | " HAVING\n", 1172 | " num_trips < 1500\n", 1173 | " )\n", 1174 | " )\n", 1175 | " GROUP BY\n", 1176 | " start_station_id, end_station_id, bike_number\n", 1177 | " )\n", 1178 | " GROUP BY\n", 1179 | " start_station_id, end_station_id\n", 1180 | " ORDER BY\n", 1181 | " trips DESC\n", 1182 | " LIMIT 25\n", 1183 | " \"\"\")\n", 1184 | "\n", 1185 | "low_vol_routes = query_to_df(q3_b)\n", 1186 | "low_vol_routes" 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "markdown", 1191 | "metadata": {}, 1192 | "source": [ 1193 | "Let's do some pandas trickery now that we've gotten the data from our database as a comparison." 1194 | ] 1195 | }, 1196 | { 1197 | "cell_type": "code", 1198 | "execution_count": 17, 1199 | "metadata": {}, 1200 | "outputs": [ 1201 | { 1202 | "data": { 1203 | "text/html": [ 1204 | "
\n", 1205 | "\n", 1218 | "\n", 1219 | " \n", 1220 | " \n", 1221 | " \n", 1222 | " \n", 1223 | " \n", 1224 | " \n", 1225 | " \n", 1226 | " \n", 1227 | " \n", 1228 | " \n", 1229 | " \n", 1230 | " \n", 1231 | " \n", 1232 | " \n", 1233 | " \n", 1234 | " \n", 1235 | " \n", 1236 | " \n", 1237 | " \n", 1238 | " \n", 1239 | " \n", 1240 | " \n", 1241 | " \n", 1242 | " \n", 1243 | " \n", 1244 | " \n", 1245 | " \n", 1246 | " \n", 1247 | " \n", 1248 | " \n", 1249 | " \n", 1250 | " \n", 1251 | " \n", 1252 | " \n", 1253 | " \n", 1254 | " \n", 1255 | " \n", 1256 | " \n", 1257 | " \n", 1258 | " \n", 1259 | " \n", 1260 | " \n", 1261 | " \n", 1262 | " \n", 1263 | " \n", 1264 | " \n", 1265 | " \n", 1266 | " \n", 1267 | " \n", 1268 | " \n", 1269 | " \n", 1270 | " \n", 1271 | " \n", 1272 | " \n", 1273 | " \n", 1274 | " \n", 1275 | " \n", 1276 | " \n", 1277 | " \n", 1278 | " \n", 1279 | " \n", 1280 | " \n", 1281 | " \n", 1282 | " \n", 1283 | " \n", 1284 | " \n", 1285 | " \n", 1286 | " \n", 1287 | " \n", 1288 | " \n", 1289 | " \n", 1290 | " \n", 1291 | " \n", 1292 | " \n", 1293 | " \n", 1294 | " \n", 1295 | " \n", 1296 | " \n", 1297 | " \n", 1298 | " \n", 1299 | " \n", 1300 | " \n", 1301 | " \n", 1302 | " \n", 1303 | " \n", 1304 | " \n", 1305 | " \n", 1306 | " \n", 1307 | " \n", 1308 | " \n", 1309 | " \n", 1310 | " \n", 1311 | " \n", 1312 | " \n", 1313 | " \n", 1314 | " \n", 1315 | " \n", 1316 | " \n", 1317 | " \n", 1318 | " \n", 1319 | " \n", 1320 | " \n", 1321 | " \n", 1322 | " \n", 1323 | " \n", 1324 | " \n", 1325 | " \n", 1326 | " \n", 1327 | " \n", 1328 | " \n", 1329 | " \n", 1330 | " \n", 1331 | " \n", 1332 | " \n", 1333 | " \n", 1334 | " \n", 1335 | " \n", 1336 | " \n", 1337 | " \n", 1338 | " \n", 1339 | " \n", 1340 | " \n", 1341 | " \n", 1342 | " \n", 1343 | " \n", 1344 | " \n", 1345 | " \n", 1346 | " \n", 1347 | " \n", 1348 | " \n", 1349 | " \n", 1350 | " \n", 1351 | " \n", 1352 | " \n", 1353 | " \n", 1354 | " \n", 1355 | " \n", 1356 | " \n", 1357 | " \n", 1358 | " \n", 1359 | " \n", 1360 | " \n", 1361 | " \n", 1362 | " \n", 1363 | " \n", 1364 | " \n", 1365 | " \n", 1366 | " \n", 1367 | " \n", 1368 | " \n", 1369 | " \n", 1370 | " \n", 1371 | " \n", 1372 | " \n", 1373 | " \n", 1374 | " \n", 1375 | " \n", 1376 | " \n", 1377 | " \n", 1378 | " \n", 1379 | " \n", 1380 | " \n", 1381 | " \n", 1382 | " \n", 1383 | " \n", 1384 | " \n", 1385 | " \n", 1386 | " \n", 1387 | " \n", 1388 | " \n", 1389 | " \n", 1390 | " \n", 1391 | " \n", 1392 | " \n", 1393 | " \n", 1394 | " \n", 1395 | " \n", 1396 | " \n", 1397 | " \n", 1398 | " \n", 1399 | " \n", 1400 | " \n", 1401 | " \n", 1402 | " \n", 1403 | " \n", 1404 | " \n", 1405 | " \n", 1406 | " \n", 1407 | " \n", 1408 | " \n", 1409 | " \n", 1410 | " \n", 1411 | " \n", 1412 | " \n", 1413 | " \n", 1414 | " \n", 1415 | " \n", 1416 | " \n", 1417 | " \n", 1418 | " \n", 1419 | " \n", 1420 | " \n", 1421 | " \n", 1422 | " \n", 1423 | " \n", 1424 | " \n", 1425 | " \n", 1426 | " \n", 1427 | " \n", 1428 | " \n", 1429 | " \n", 1430 | " \n", 1431 | "
tripsstart_station_idend_station_idroute_codingin_low
08749506050_60False
18168696569_65False
27281615061_50False
36601506150_61False
46568656965_69False
56557607460_74False
66065517051_70False
75930705070_50False
85790746174_61False
95714747074_70False
105597557055_70False
115159507050_70False
125113657065_70False
135086647764_77False
144977705570_55False
154921676967_69False
164887746074_60False
174804776477_64False
184530605060_50False
194318693969_39False
204241396939_69False
214238695769_57False
224231705170_51False
234150707470_74False
244111637063_70False
\n", 1432 | "
" 1433 | ], 1434 | "text/plain": [ 1435 | " trips start_station_id end_station_id route_coding in_low\n", 1436 | "0 8749 50 60 50_60 False\n", 1437 | "1 8168 69 65 69_65 False\n", 1438 | "2 7281 61 50 61_50 False\n", 1439 | "3 6601 50 61 50_61 False\n", 1440 | "4 6568 65 69 65_69 False\n", 1441 | "5 6557 60 74 60_74 False\n", 1442 | "6 6065 51 70 51_70 False\n", 1443 | "7 5930 70 50 70_50 False\n", 1444 | "8 5790 74 61 74_61 False\n", 1445 | "9 5714 74 70 74_70 False\n", 1446 | "10 5597 55 70 55_70 False\n", 1447 | "11 5159 50 70 50_70 False\n", 1448 | "12 5113 65 70 65_70 False\n", 1449 | "13 5086 64 77 64_77 False\n", 1450 | "14 4977 70 55 70_55 False\n", 1451 | "15 4921 67 69 67_69 False\n", 1452 | "16 4887 74 60 74_60 False\n", 1453 | "17 4804 77 64 77_64 False\n", 1454 | "18 4530 60 50 60_50 False\n", 1455 | "19 4318 69 39 69_39 False\n", 1456 | "20 4241 39 69 39_69 False\n", 1457 | "21 4238 69 57 69_57 False\n", 1458 | "22 4231 70 51 70_51 False\n", 1459 | "23 4150 70 74 70_74 False\n", 1460 | "24 4111 63 70 63_70 False" 1461 | ] 1462 | }, 1463 | "execution_count": 17, 1464 | "metadata": {}, 1465 | "output_type": "execute_result" 1466 | } 1467 | ], 1468 | "source": [ 1469 | "high_vol_routes['route_coding'] = high_vol_routes.start_station_id.astype(str) + \\\n", 1470 | " \"_\" + high_vol_routes.end_station_id.astype(str)\n", 1471 | "low_vol_routes['route_coding'] = low_vol_routes.start_station_id.astype(str) + \\\n", 1472 | " \"_\" + low_vol_routes.end_station_id.astype(str)\n", 1473 | "\n", 1474 | "high_vol_routes['in_low'] = high_vol_routes['route_coding'].\\\n", 1475 | " isin({'route_coding': low_vol_routes.route_coding.values.tolist()})\n", 1476 | "high_vol_routes" 1477 | ] 1478 | }, 1479 | { 1480 | "cell_type": "markdown", 1481 | "metadata": {}, 1482 | "source": [ 1483 | "Interestingly, none of the top 25 routes for the high volume bikes are in the top 25 low volume bike routes. While we could drill a bit deeper into this, with an initial analysis we see evidence that our bikes that have the most rides are fundamentally going on different routes for the higher volume bikes and lower volume bikes. This might be an interesting business result if the company is experiencing inequal wear and tear on the bikes- perhaps bikes from the lower volume routes could be moved to the higher ones and vice versa for more equal wear. Of course, more detailed analysis on a station-by-station level rather than route-level would be warranted in this case.\n", 1484 | "\n", 1485 | "Let's pivot a little more to looking at some of our customers and subscribers.\n", 1486 | "\n", 1487 | "I want to take a look at the cumulative minutes spent on bike rides by our subscribers vs customers on a month-by-month basis for 2015 (only year we have full data)." 1488 | ] 1489 | }, 1490 | { 1491 | "cell_type": "code", 1492 | "execution_count": 18, 1493 | "metadata": {}, 1494 | "outputs": [ 1495 | { 1496 | "data": { 1497 | "text/plain": [ 1498 | "" 1499 | ] 1500 | }, 1501 | "execution_count": 18, 1502 | "metadata": {}, 1503 | "output_type": "execute_result" 1504 | }, 1505 | { 1506 | "data": { 1507 | "image/png": "\n", 1508 | "text/plain": [ 1509 | "
" 1510 | ] 1511 | }, 1512 | "metadata": { 1513 | "needs_background": "light" 1514 | }, 1515 | "output_type": "display_data" 1516 | } 1517 | ], 1518 | "source": [ 1519 | "# q4\n", 1520 | "q4 = '''\n", 1521 | "SELECT\n", 1522 | " SUM(customer_minutes_sum) OVER (ORDER BY end_month ROWS UNBOUNDED PRECEDING)/1000 as cumulative_minutes_cust,\n", 1523 | " SUM(subscriber_minutes_sum) OVER (ORDER BY end_month ROWS UNBOUNDED PRECEDING)/1000 as cumulative_minutes_sub,\n", 1524 | " end_year,\n", 1525 | " end_month\n", 1526 | "FROM\n", 1527 | " (\n", 1528 | " SELECT\n", 1529 | " SUM(CASE WHEN subscriber_type = 'Customer' THEN duration_sec/60 ELSE NULL END) AS customer_minutes_sum,\n", 1530 | " SUM(CASE WHEN subscriber_type = 'Subscriber' THEN duration_sec/60 ELSE NULL END) AS subscriber_minutes_sum,\n", 1531 | " EXTRACT(YEAR FROM end_date) AS end_year,\n", 1532 | " EXTRACT(MONTH FROM end_date) AS end_month\n", 1533 | " FROM\n", 1534 | " `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`\n", 1535 | " GROUP BY\n", 1536 | " end_year, end_month\n", 1537 | " HAVING\n", 1538 | " end_year = 2015\n", 1539 | " )\n", 1540 | "ORDER BY\n", 1541 | " end_year, end_month\n", 1542 | "'''\n", 1543 | "df4 = query_to_df(q4)\n", 1544 | "plt.plot(df4.end_month, df4.cumulative_minutes_cust, label = \"Customers\")\n", 1545 | "plt.plot(df4.end_month, df4.cumulative_minutes_sub, label = \"Subscribers\")\n", 1546 | "plt.title(\"Cumulative minutes ridden by users\")\n", 1547 | "plt.xlabel(\"Month\")\n", 1548 | "plt.ylabel(\"Cumulative Minutes (1000s)\")\n", 1549 | "plt.legend()\n" 1550 | ] 1551 | }, 1552 | { 1553 | "cell_type": "markdown", 1554 | "metadata": {}, 1555 | "source": [ 1556 | "We see a bit of an interesting phenomena here. Our subscribers, AKA our people that pay for longer-term memberships, are using the bikes at a decently consistent rate throughout the year. The customers, AKA people that don't intend to use the bikes that often (either a 3 day membership or single day) really use them a lot more in the summer, months 6 - 8. Overall, the subscribers will spend more time on the bikes over the year with their relatively more consistent usage.\n", 1557 | "\n", 1558 | "Let's change that query up slightly and look how the average ride length changes over months." 1559 | ] 1560 | }, 1561 | { 1562 | "cell_type": "code", 1563 | "execution_count": 19, 1564 | "metadata": {}, 1565 | "outputs": [ 1566 | { 1567 | "data": { 1568 | "text/plain": [ 1569 | "" 1570 | ] 1571 | }, 1572 | "execution_count": 19, 1573 | "metadata": {}, 1574 | "output_type": "execute_result" 1575 | }, 1576 | { 1577 | "data": { 1578 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEWCAYAAAB8LwAVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAIABJREFUeJzt3Xl8VPXV+PHPyR5IIJAEBQKEJYLsm4BCFcStLlCXqrVVbF0frdvT1mo3rY+t9tE+1aqtolVR+YnWvVoXiiCiEARkk52EJcgSEhISINvM+f1x74QhTJIhmclkwnm/XvPK3e+5k2TO3O/33nNFVTHGGGPqiol0AMYYY1onSxDGGGMCsgRhjDEmIEsQxhhjArIEYYwxJiBLEMYYYwKyBGHaPBEpF5E+kY7jWDUUt4hcKyILGlh3nohcH77oIktEvhGRiZGOo62zBHEccD8s9olIYqRjiQRVTVHVvOZup7EP5VALVdzRREReFJEHG1tOVQep6rwWCOm4ZgmijRORbOA7gAJTwrSPuHBs93h1vLyfTTnO4+W9aS0sQbR91wCLgBeBab6JIjJORHaJSKzftItFZKU7HCMi94jIZhEpEpHXRaSzOy9bRFRErhORbcCn7vR/utssFZH5IjLIb9vpIvIvEdkvIl+JyIP+38ZFZICIzBaRYhFZLyKX13dA7hnRgyLypdsM8y93+zP9tp/tt7yKSD93+EUReUpEPhCRMhHJFZG+dY4rrs6+rheRk4GngVPdfZa48xNF5FER2SYiu0XkaRFJdudliMj7IlLiHtfnIhLwf87d760ishHYGCDudBF5zz2+xUDfOuufLSLr3Pf+SUDqzP+JiKx1zyQ/FpFedfZ9s4hsdOc/JSJHrO+37P0i8oaIvOa+f8tEZJjf/G4i8qaIFIpIvojcHmDdV0RkP3BtnW3fCPwQuNv3e3WnbxGRX7p/mwdEJM6ddlYwMZlmUFV7teEXsAm4BRgFVAMn+M3bDJztN/5P4B53+E6cxJIFJALPAK+687JxzkheAtoDye70nwCp7vKPAcv9tj3LfbUDBgLbgQXuvPbu+I+BOGAksBcYVM8xzXOPqy/QEVgDbADOctd/CXjBb3kF+rnDLwLFwBh32ZnArDrHFVdnX9e7w9f6Yvab/xjwHtDZPfZ/AQ+58x7CSSrx7us7gNRzTArMdreTHCDuWcDr7ns1GNjh9/5lAPuBy9z93AXU+MX9Pff9Otk95t8AX9bZ9/tAGtATKATOqyfO+3H+jnz7+jmQ7w7HAEuB3wEJQB8gDzi3zrrfc5dNDrD9F4EH60zbAiwHevi9N1uAsxqLKdL/f9H+ingA9grjLxcmuP84Ge74OuAuv/kPAs+7w6nAAaCXO74WmOy3bFd3W3Ec/iDt08C+09xlOgKx7rr96+zb9wF3BfB5nfWfAe6rZ9vzgF/7jf8Z+NBv/CKOTE51E8RzfvPOB9a5w77jCipB4HxLPwD09Zt2KpDvDj8AvOvbdyO/KwXODDCtn9/7N8Bv3h/93r9rgEV14irwi/tD4Dq/+THAQb/ftQIT/Oa/jvtFIUCc99fZVwywEyf5jQW21Vn+Xtxk7a47v5H34UUCJ4ifBJh2VmMxRfp/MNpf1sTUtk0DPlHVve74/8Ovmckdv0SczutLgGWqutWd1wt4220eKcFJGB7gBL/1t/sGRCRWRB52m6T24/wDg/PtNhMnsWwPtK67r7G+fbn7+yFwYgPHtttv+FCA8ZQG1t3lN3ywkWUbkolzRrTUL+6P3OkAj+B8c/9ERPJE5J5Gtre9numB3r+tfsPd/Oep8ylZ9/193C/GYpwk0t1vmWN5T/z35cVJRt3c/XSr83v8FfX8zRyjxtarLybTDNbh00a57eCXA7Ei4vvnTwTSRGSYqq5Q1TUishX4LnAVTsLw2Y7zre2LANvOdgf9SwFfBUzFaebZgnPmsA/ng6gQp8kjC6cpCJzmAv99faaqZzfpYEPngPuzHU6TDRyZpOqWPt6Lk4wGqeqOuhtT1TLgZ8DPxOmPmSsiX6nqnHr2X19pZd/71wPnLBCcpiCfnfi9n27/Qd339w+qOrOe7R8r/33F4Pxev3VjzFfVnAbWbax8dH3zG1uvvphMM9gZRNv1PZxv/AOB4e7rZOBznCYJn/8H3A6cjtMH4fM08AdfZ6aIZIrI1Ab2lwpUAkU4H7B/9M1QVQ/wFnC/iLQTkQF1YngfOElErhaRePd1itsx3GJUtRCnbf9H7hnRTziyM3g3kCUiCe7yXuBZ4C8i0gVARLqLyLnu8IUi0s/9wN6P8/vwNCGuuu/fQI48E/wAGCQil7gd7LdzZGJ7GrjXTVKISEcR+f6xxuFnlN++7sT5vS8CFgP73Q7lZPc9HCwipxzDtnfj9F2EKibTDJYg2q5pOG2/21R1l+8FPAn8UA5fqfMqMBH41K8pCuBxnM7XT0SkDOefbWwD+3sJp9ljB06ncd1/zp/inFXsAl5291sJtd+0zwGuxPnWtwv4E84ZT0u7AfgFTqIbBHzpN+9T4Btgl4j43qtf4jQjLXKb1v4D9Hfn5bjj5cBC4G/a9Gv3f4rT7LMLp53+Bd8M9/f2feBhN+4c4Au/+W/jvJ+z3BhX45w1NtW7OP1G+4CrgUtUtdpNZBfhfBnJxznDeg7n9x6sfwAD3Saqd5ob0zGsbwIQt1PHmBYlIn8CTlTVaY0ubFoNEbkfp9P9R5GOxac1xtRW2BmEaRHi3OcwVBxjgOuAtyMdlzGmftZJbVpKKk6zUjdgD86lqe9GNCJjTIOsickYY0xA1sRkjDEmoKhuYsrIyNDs7OxIh2GMMVFl6dKle1U1s7HlojpBZGdns2TJkkiHYYwxUcW9QbZR1sRkjDEmoLAlCBF5XkT2iMjqAPN+7pYYznDHRUT+KiKbRGSliIwMV1zGGGOCE84ziBeB8+pOFJEewNnANr/J38W5+zMHuBH4exjjMsYYE4Sw9UGo6ny/om7+/gLczZHXwE8FXnKrUC4SkTQR6aqqO8MVnzGmZVVXV1NQUEBFRUWkQzluJCUlkZWVRXx8fJPWb9FOahGZAuxQ1RVy5AOrunNkOd8Cd9pRCcJ96tSNAD179qw72xjTShUUFJCamkp2djYS+IF1JoRUlaKiIgoKCujdu3eTttFindQi0g74Nc7Tpo6aHWBawDv4VHW6qo5W1dGZmY1epWWMaSUqKipIT0+35NBCRIT09PRmnbG15BlEX6A34Dt7yAKWuXV5Cjiyfr3VcjemDbLk0LKa+3632BmEqq5S1S6qmq2q2ThJYaRbgvo94Br3aqZxQKn1P5jW4MNVO9lRcijSYRgTEeG8zPVVnBr4/UWkQESua2Dxf+M83HwTzgNYbglXXMYEq6i8kv+auYy//mdjpEMxIbJr1y6uvPJK+vbty8CBAzn//PPZsGFD4yv6eeedd1izZk2YImxdwnkV0w8amZ/tN6zAreGKxZimWJxfDMCCTXtRVWseiXKqysUXX8y0adOYNWsWAMuXL2f37t2cdNJJQW/nnXfe4cILL2TgwIHhCvUoHo+H2NjYFtufj91JbUw9ct0EsaPkEFuLDkY4GtNcc+fOJT4+nptvvrl22vDhw/F4PFx44YW1037605/y4osvAnDPPfcwcOBAhg4dys9//nO+/PJL3nvvPX7xi18wfPhwNm/ezPLlyxk3bhxDhw7l4osvZt++fQBMnDiRu+66i9NPP52TTz6Zr776iksuuYScnBx+85vf1O7vlVdeYcyYMQwfPpybbroJj8d5Km1KSgq/+93vGDt2LAsXLjwqlpYQ1bWYjAmnRXlF9Ozcjm3FB1mwaS/ZGe0jHVKb8ft/fcOab/eHdJsDu3XgvosG1Tt/9erVjBo1KujtFRcX8/bbb7Nu3TpEhJKSEtLS0pgyZQoXXnghl112GQBDhw7liSee4IwzzuB3v/sdv//973nssccASEhIYP78+Tz++ONMnTqVpUuX0rlzZ/r27ctdd93Fnj17eO211/jiiy+Ij4/nlltuYebMmVxzzTUcOHCAwYMH88ADD1BcXMx11113RCwtwc4gjAmg5GAV63eXcdmoLLqnJfPFpr2Nr2TalA4dOpCUlMT111/PW2+9Rbt27Y5aprS0lJKSEs444wwApk2bxvz582vnT5kyBYAhQ4YwaNAgunbtSmJiIn369GH79u3MmTOHpUuXcsoppzB8+HDmzJlDXl4eALGxsVx66aVBxxIOdgZhTACL84tRhbG9O1Ow7yAff7Mbj1eJjbF+iFBo6Jt+uAwaNIg33njjqOlxcXF4vd7acd99A3FxcSxevJg5c+Ywa9YsnnzyST799NNj2mdiYiIAMTExtcO+8ZqaGlSVadOm8dBDDx21blJSUm2/QyhiaQo7gzAmgNz8YhLiYhjWI40JOZmUHqpm9Y7SSIdlmuHMM8+ksrKSZ599tnbaV199hcfjYc2aNVRWVlJaWsqcOXMAKC8vp7S0lPPPP5/HHnuM5cuXA5CamkpZWRkAHTt2pFOnTnz++ecAvPzyy7VnE8GYPHkyb7zxBnv27AGcZq2tW4+uxF1fLOFmZxDGBJCbX8SIHmkkxcdyWt90wLmaaViPtAhHZppKRHj77be58847efjhh0lKSiI7O5vHHnuMyy+/nKFDh5KTk8OIESMAKCsrY+rUqVRUVKCq/OUvfwHgyiuv5IYbbuCvf/0rb7zxBjNmzODmm2/m4MGD9OnThxdeeCHomAYOHMiDDz7IOeecg9frJT4+nqeeeopevXodsVx9sYRbVD+TevTo0WoPDDKhtr+imuG//4SfnpnDf5/tXP743cc/Jy05nldvHBfh6KLX2rVrOfnkkyMdxnEn0PsuIktVdXRj61oTkzF1LNlSjFdhXO/OtdMm9Etn6dZ9HKryRDAyY1qWJQhj6sjNKyY+VhjRs1PttAk5mVR5vHy1pTiCkRnTsixBGFPHovxihmWlkZxw+M7VU7I7kRAbY5e7muOKJQhj/JRX1rB6Rylj+3Q+Ynq7hDhG9krj842WIMzxwxKEMX6Wbt2Hx6uM7Z1+1Lzv5GSyZud+isorIxCZMS3PEoQxfnLzioiNEUb16nTUvPH9MgD4cnNRS4dlTERYgjDGT25+MUOzOtI+8ehbhIZ070hqUhwLrJkpav3hD39g0KBBDB06lOHDh5Obm1vvsvfffz+PPvpoSPe/ZMkSbr/99rBtP9TsRjljXIeqPKwsKOG6CX0Czo+NEU7rm27lv6PUwoULef/991m2bBmJiYns3buXqqqqFtt/TU0No0ePZvToRm8/aJSqoqrExIT3O76dQRjjWrZtH9UePaqD2t+EnEwr/x2ldu7cSUZGRm1NpIyMDLp160Z2djZ79zpnhUuWLGHixIm166xYsYIzzzyTnJyc2hIdO3fu5PTTT2f48OEMHjy4tszGRx99xMiRIxk2bBiTJ08GnLOEG2+8kXPOOYdrrrmGefPmHVFaPND2AR555BFOOeUUhg4dyn333QfAli1bOPnkk7nlllsYOXIk27dv59prr2Xw4MEMGTIkLHdX2xmEMa7cvCJiBEYH6H/wmeD2Q1j572b68B7YtSq02zxxCHz34Xpnn3POOTzwwAOcdNJJnHXWWVxxxRWN1k1auXIlixYt4sCBA4wYMYILLriAV199lXPPPZdf//rXeDweDh48SGFhITfccAPz58+nd+/eFBcfvl9m6dKlLFiwgOTkZObNm9fo9levXs3GjRtZvHgxqsqUKVOYP38+PXv2ZP369bzwwgv87W9/Y+nSpezYsYPVq1cDhKUEuJ1BGONalF/M4O4dSU2Kr3eZ7PR2dE9Ltn6IKJSSksLSpUuZPn06mZmZXHHFFbUPBqrP1KlTSU5OJiMjg0mTJrF48WJOOeUUXnjhBe6//35WrVpFamoqixYt4vTTT6d3794AdO58+Cx0ypQpJCcnB739Tz75hE8++YQRI0YwcuRI1q1bx8aNzmNve/XqxbhxTrmXPn36kJeXx2233cZHH31Ehw4dQvAuHcnOIIwBKqo9LN9ewrRTezW4nIgwoV8GH67eaeW/m6OBb/rhFBsby8SJE5k4cSJDhgxhxowZR5T79pX69qnbzyQinH766cyfP58PPviAq6++ml/84hekpaXV2yfVvn39Z5qBtq+q3Hvvvdx0001HzNuyZcsR2+rUqRMrVqzg448/5qmnnuL111/n+eefb/xNOAZ2BmEMsHx7CVU13oD3P9Q1PieD/RU1Vv47yqxfv772mzg4z6Pu1asX2dnZLF26FIA333zziHXeffddKioqKCoqYt68eZxyyils3bqVLl26cMMNN3DdddexbNkyTj31VD777DPy8/MBjmhiakig7Z977rk8//zzlJeXA7Bjx47acuD+9u7di9fr5dJLL+V//ud/WLZsWZPel4bYGYQxOPWXROCU3vV3UPtY+e/oVF5ezm233UZJSQlxcXH069eP6dOns3btWq677jr++Mc/Mnbs2CPWGTNmDBdccAHbtm3jt7/9Ld26dWPGjBk88sgjxMfHk5KSwksvvURmZibTp0/nkksuwev10qVLF2bPnt1oTIG2361bN9auXcupp54KOE1jr7zySu3Dg3x27NjBj3/849qzn0APHWouK/dtDPCD6YsoPVTNv+/4TlDLW/nvY2flviOjVZb7FpHnRWSPiKz2m/aIiKwTkZUi8raIpPnNu1dENonIehE5N1xxGVNXZY2HZdv2NXh5a13fycmw8t+mzQtnH8SLwHl1ps0GBqvqUGADcC+AiAwErgQGuev8TURiMaYFrCwopTLI/gef8f0yrPy3afPCliBUdT5QXGfaJ6pa444uArLc4anALFWtVNV8YBMwJlyxGeMvN8+prTQmiP4HnzHZnUmIjWGBlf8+JtHcpB2Nmvt+R/Iqpp8AH7rD3YHtfvMK3GlHEZEbRWSJiCwpLCwMc4jmeJCbX0z/E1Lp3D4h6HWSE2IZ1auT3Q9xDJKSkigqKrIk0UJUlaKiIpKSkpq8jYhcxSQivwZqgJm+SQEWC/hXpKrTgengdFKHJUBz3Kj2eFm6dR+XjcpqfOE6JuRk8MjH6ykqryQ9JTEM0bUtWVlZFBQUYF/sWk5SUhJZWcf+t+3T4glCRKYBFwKT9fBXiQKgh99iWcC3LR2bOf6s2lHKwSrPMfU/+Izv5ySILzYXMWVYtzBE17bEx8fX3mlsokOLNjGJyHnAL4Epqupf7ew94EoRSRSR3kAOsLglYzPHp9w8p5vsWPoffIZ070iHpDi+sGYm00aF7QxCRF4FJgIZIlIA3Idz1VIiMNu9xXyRqt6sqt+IyOvAGpymp1tV1a4fNGGXm19E38z2ZKYeexORU/47w8p/mzYrbAlCVX8QYPI/Glj+D8AfwhWPMXXVeLws2bKPKcOb3jw0PieDj77Zxdaig1bd1bQ5VovJHLfW7NxPeWUNY5vQvOTjK//9uV3uatogSxDmuOXrfxjX59g7qH185b+tH8K0RZYgzHErN7+I7PR2nNCh6deJ+8p/f7l5Lx6vXXVt2pZGE4SIxIjICBG5QETOFJETWiIwY8LJ41UW5xc36fLWunzlv1dZ+W/TxtTbSS0ifXEuST0L2AgUAknASSJyEHgGmKGq3pYI1JhQWrdrP/srao6pQF99xrvlv7/YtJfhVv7btCENnUE8CLwC9FXVc1X1R6p6mVtobwrQEbi6JYI0JtR8/Q9jm9H/4JOeksjArh2s7IZpc+o9g6jnMlXfvD3AY2GJyJgWkJtfRI/OyXRPC/ys4GM1ISeDF7/YwqEqD8kJVojYtA1BdVKLyGkicpWIXON7hTswY8LFG8L+Bx9f+e/FVv7btCGN3ignIi8DfYHlgO/uZgVeCmNcxoTNxj3l7DtY3az7H+rylf/+YtNezjgpM2TbNSaSgrmTejQwUK1Gr2kjcvOd5z805/6Huqz8t2mLgmliWg2cGO5AjGkpuXnFdOuYRFan0PQ/+EzIyWDNzv3sLa8M6XaNiZRgEkQGsEZEPhaR93yvcAdmTDioKrn5RYztkx7y4nq+shtfbi4K6XaNiZRgmpjuD3cQxrSUzYUH2FteFdL+B5/BfuW/7fkQpi1oNEGo6mctEYgxLWGR+/zpUNz/UJeV/zZtTb1NTCKywP1ZJiL7/V5lIrK/5UI0JnRy84vpkppIdnq7sGx/fE4GO0oOsaXoYOMLG9PKNXSj3AT3Z2rLhWNM+KgquXnh6X/w+Y7bD7Fg01562/MhTJQL9ka5TiIyVERG+l7hDsyYUNtSdJA9ZZVh6X/w6WXlv00bEsyNcv8DXAvkAb7CfAqcGb6wjAm93Dzf/Q/hSxC+8t8frt6Jx6vExlg/hIlewVzFdDlOwb6qcAdjTDjl5heTkZJA38yUsO5nQk4Gry3ZzqodpVbd1US1YG+Us79yE9V8/Q9jencO+9VFp/mV/zYmmgWTIB4CvrYb5Uw0K9h3iG9LK0JaoK8+Vv7btBXBNDHNAP4ErOJwH4QxUeXw/Q/h63/wZ+W/TVsQzBnEXlX9q6rOVdXPfK/GVhKR50Vkj4is9pvWWURmi8hG92cnd7qIyF9FZJOIrLSrpEyo5eYXk9YunpO6tMxV2xOs/LdpA4JJEEtF5CEROfUYL3N9ETivzrR7gDmqmgPMcccBvgvkuK8bgb8HFb0xQcrNL2JMdmdiWuiqolP8yn8bE62CaWIa4f4c5zet0ctcVXW+iGTXmTwVmOgOzwDm4Tz3eirwkltSfJGIpIlIV1XdGUR8xjTo25JDbC8+xLWn9W6xffrKf39u/RAmigVTi2lSCPd3gu9DX1V3ikgXd3p3YLvfcgXutKMShIjciHOWQc+ePUMYmmmrfM9/COcNcoFMyMngkY/Xs7e8koyUxBbdtzGh0FAtph+JSEPz+4rIhBDFEei8P+ADilR1uqqOVtXRmZn25C7TuNy8YlKT4ji5a4cW3a+V/zbRrqEziHScy1uXAkuBQiAJ6AecAezlcB9CsHb7mo5EpCuwx51eAPTwWy4L+PYYt21MQLn5xYzJ7tzidzUP7t6RjsnxVv7bRK16zxBU9XFgJPAqkAlMdsd3AFer6qWquvEY9/ceMM0dnga86zf9GvdqpnFAqfU/mFDYs7+C/L0HWuzyVn9O+e/02vLfxkSbBvsgVNUDzHZfx0REXsXpkM4QkQLgPuBh4HURuQ7YBnzfXfzfwPnAJuAg8ONj3Z8xgSzKdy4zbYkb5AIZ3y+DD1fvYkvRQavuaqJOMFcxNYmq/qCeWZMDLKvAreGKxRy/cvOKSEmMY1C3lu1/8Jlg5b9NFAuq3Lcx0So3v5jR2Z2Ii43Mn7qv/PeCjYUR2b8xzWEJwrRZe8sr2bSnPGLNS+CU//5OTgZfbi7C47V+CBNdgnkeRCJwKZDtv7yqPhC+sIxpvsW+/ocIdFD7G98vg1lfWflvE32COYN4F+dO5xrggN/LmFYtN6+IdgmxDOneMaJxWPlvE62C6aTOUtW6NZWMafVy84sZ1asT8RHqf/BJT0lkULcOfL6xkFsn9YtoLMYci2D+c74UkSFhj8SYENp3oIp1u8pavLxGfSb0y2DZ1hIOVtVEOhRjgtZQKY1VIrISmAAsE5H1bilu33RjWq3c2v6HyHVQ+xvvlv/+asu+SIdiTNAaamK6sMWiMCbEcvOLSIyLYWhWZPsffHzlvxdsLOSMk6yGmIkO9SYIVd0KICIvq+rV/vNE5GXg6oArGtMK5OYVM7JnJxLjWsfT3JITYhmd3YkFm6xwn4kewfRBDPIfEZFYYFR4wjGm+UoPVrN21/6IX95a1/h+GazduZ+95ZWRDsWYoDTUB3GviJQBQ0Vkv/sqw6nA+m596xkTaV9tKUY1cvWX6mPlv020aaia60Oqmgo8oqod3Feqqqar6r0tGKMxxyQ3v4iE2BhG9GxdN6X5yn9b2Q0TLYK5D+KfAZ5BXQpsVVW7Zs+0Orn5xQzvkUZSfOvof/CpLf+90Sn/LdKyz6cw5lgF0wfxN2ARMB141h2eBWwQkXPCGJsxx6ysoprVO0pbXf+Dz/h+GXxbWsGWooORDsWYRgWTILYAI9zHfI4ChgOrgbOA/w1jbMYcsyVb9+Fthf0PPrXlv62ZyUSBYBLEAFX9xjeiqmtwEkZe+MIypmly84qJixFG9mpd/Q8+vdLbkdUpmQVWl8lEgWD6INaLyN9xmpUArsBpXkoEqsMWmTFNkJtfxNCsjrRLCNuzsJpFRJjQL4MPVu3E49UWf062MccimDOIa3EeBXoncBeQ506rBiaFKzBjjtXBqhpWFZS2mvIa9RnfL4OyihpW7SiNdCjGNKjRr1mqegj4s/uqqzzkERnTREu37qPGq62mQF99xvv1Q9jzIUxr1ugZhIiMF5HZIrJBRPJ8r5YIzphjkZtXTGyMMDq7dSeIzu0TGNStg/VDmFYvmIbaf+A0LS0FPOENx5imy80vYnC3DqQkts7+B38T+mXwwhdbOFhV02r7S4wJpg+iVFU/VNU9qlrke4U9MmOOQUW1hxXbW3//g4+v/LfvsajGtEbBJIi5IvKIiJwqIiN9r+bsVETuEpFvRGS1iLwqIkki0ltEckVko4i8JiIJzdmHOb4s27aPKo+31fc/+Izp3ZmEuBh7DKlp1YI5tx3r/hztN02BM5uyQxHpDtwODFTVQyLyOnAlcD7wF1WdJSJPA9cBf2/KPszxJzevmBih1fc/+CTFxzK6l5X/Nq1bo2cQqjopwKtJycFPHJAsInFAO2AnTsJ5w50/A/heM/dhjiO5+UUM7NaBjsnxkQ4laFb+27R2wVzFdIKI/ENEPnTHB4rIdU3doaruAB4FtuEkhlKcDvASv+J/BUD3euK5UUSWiMiSwkIrV2CgssbD19tKWm15jfp8J8e53NWamUxrFUwfxIvAx0A3d3wDzk1zTSIinYCpQG93m+2B7wZYVAOtr6rT3bpQozMz7dGNBlZsL6WyJnr6H3wGdXPKf1uCMK1VMAkiQ1VfB7wA7rf85lzuehaQr6qFqloNvAWcBqS5TU4AWcC3zdiHOY7k5hUh4nT8RpO65b+NaW2CSRAHRCQd9xu9iIzDaRZqqm3AOBFpJ05B/MnAGmAucJm7zDTsqXXQ2Z2NAAAdcUlEQVQmSLn5xfQ/IZW0dtF34duEHKf8d/7eA5EOxZijBJMg/ht4D+grIl8ALwG3NXWHqpqL0xm9DFjlxjAd+CXw3yKyCUjHuUHPmAZV1XhZsrWYcVFy/0NdvvLf1sxkWqNgajEtE5EzgP6AAOvdpqEmU9X7gPvqTM4DxjRnu+b4s2pHCRXV0df/4NOz8+Hy31efmh3pcIw5Qr0JQkQuqWfWSSKCqr4VppiMCdqiPOdO5Gjrf/Cx8t+mNWvoDOKiBuYpTueyacT6XWX8z/trKKus4a9XDqdXevtIh9Sm5OYXk9MlhfSUxEiH0mQTcjKY9dV2VhaUMKJnp0iHY0ytehOEqv64JQNpa0oPVfOX2Rt4edFWUpPiUIULn1jAY1cMZ/LJJ0Q6vDahxuNl6ZZiLh4Z8JaZqHFa38P9EJYgTGsSTCe1OQZer/LaV9s489F5vLRwCz8Y04O5P5vI+7dNoGfndlw3Ywn/N3sDHq9d1thcq7/dz4EqT9TdIFeXlf82rZXVGQ6h5dtLuO/d1awoKGV0r07MmDKGwd07AtCpfQJv/tdp/Oad1fx1zkaWby/h8SuG06l99F2a2Vrk5jl1jMb2ic7+B38TcjJ4fkG+lf82rYqdQYTA3vJK7n5jBd976gt2llbwlyuG8c+bT61NDj5J8bE8ctlQ/njxEBZtLuLCJxawqsAeO9lUufnF9MloT5fUpEiH0mwT+mVQ7VEr/21alWBqMbUTkd+KyLPueI6IXBj+0Fq/ao+X5xfkM+nRebz99Q5uOr0Pn/58IhePyMK5B/BoIsJVY3vy+s2noqpc+vSXvP7V9haOPPp5vMpX+cVt4uwB4JRsK/9tWp9gzmVfwCmmd6o7XgD8E3g/XEFFgy837+X+975hw+5yTj8pk/suGkjfzJSg1x/eI41/3TaB22d9zd1vruTr7fu476JBJMXHhjHqtmPtzv2UVdZEff+Dj5X/Nq1RME1MfVX1f4FqAFU9hHPD3HFpR8khbp25jKuezeVQtYfpV49ixo9POabk4JOekshLPxnLLRP78uri7Vz+zEIK9h0MQ9Rtz6I21P/gMyHHKf+9eoc1O5rWIZgEUSUiyRyuxdQXOO4K2FdUe3hizkYm/3ke/1m7m7vOOonZd53BOYNOrLc5KRixMcLd5w3gmatHkV94gIueWMDnG62MeWNy84vp2bkdXTsmRzqUkJk6vDsndEjk+08v5F8rrFalibxgEsR9wEdADxGZCcwB7g5rVK2IqjJ7zW7O+ct8/jx7A5P6d2HOz87gjrNyQtocdO6gE3n3p+PJTE3kmucX89TcTXjtUtiAvF7lqy3FUVteoz7d05L5120TGNy9A7e9+jUP/XstNR5vpMMyx7FgajHNFpFlwDicpqU7VPW46EnLKyzn9/9aw2cbCunXJYWZ149lvFtcLRz6ZKbwzq3juefNVTzy8Xq+3raPP18+PKqektYS1u8uo+RgNWOjtEBfQ7qkJjHz+nE8+MEanpmfxzff7ueJH4ywy6FNRDRUi2lknUk73Z89RaSnqi4LX1iRVV5ZwxOfbuT5BfkkxcXymwtOZtpp2cTHhv+q4HYJcTx+5XBG9EzjDx+sZeqTC3j66lEMOLFD2PcdLWrvf2hjZxA+CXExPDB1MIO7d+Q3b6/moicX8MzVoxjUrWPjKxsTQlLfg0pEZK47mASMBlbgnEEMBXJVdUKLRNiA0aNH65IlS0K2PVXlvRXf8sd/r2X3/kouG5XF3ef1j9h19ku2FHPLzGXsr6jm4UuG8r0R0V1SIlT+65WlrCwo5Yt7mvto9NZv+fYSbn55KSWHqvjTpUOZOtz+BkzzichSVR3d2HL1fiVW1UmqOgnYCox0H/M5ChgBbApdqK3Dmm/3c8Uzi7hj1nJO6JDEW7ecxqPfHxbRm7BGZ3fm/dsnMLR7Gne+tpz73l1NVc3x3Sat6txM1lbPHuryXQ49NCuNO2Yt58H311i/hGkxwdwHMUBVV/lGVHW1iAwPY0wtquRgFX/+ZAMzc7eS1i6Bhy8ZwuWjexDTSsoud0lNYuYNY3n4w3X8Y0E+q3aU8rcfjuLEjtF/93BTbNpTTtGBqjZ1eWtjMlMTmXn9WP7wwVqeW5DPmp37efKqkXS2fgkTZsE0qq8VkedEZKKInOHeUb023IGFm8erzMzdyqRH5zEzdytXj+vF3J9N5MoxPVtNcvCJj43htxcO5MmrRrBuVxkXPvF57X0Ax5tFbimKtnKDXLDiY2O4f8ogHv3+MJZs3cdFTyyw+yVM2NXbB1G7gEgS8F/A6e6k+cDfVbUizLE1qql9EKsKSrn37ZWs3rGfMb078/spgzi5a3R0Am/cXcZNryxla9FB7jlvANd/p3ez7sOIFgerapi5aBtPf7aZpPhYFvxy0nFx3IGsLHD6JYoOVPHwpUO4eERWpEMyUSbYPohGE0Rr1tQE8dWWYm5/9WvuPf9kLhraNeo+aMoqqvnFP1fy0Te7OH/IifzvZcNISWybFUAPVtXwyqKtTJ+fx97yKib0y+Ce7w44qhDi8WZveSW3zlxGbn4xPxnfm3vPH9AiV9mZtqHZCUJEXlfVy0VkFe5d1P5UdWjzw2ye5lzFVFXjJSEuev+hVJXp8/P400fr6J3RnmeuHkW/LqmRDitkDlbV8PJCJzEUHajiOzkZ3DE5h9HZx0/fQ2OqPV4e+vc6nv8in3F9OvPkVSPJiOIn65mWE4oE0VVVd4pIr0DzVXVrM2NstlBf5hqNvty0l9te/ZqKag+PfH8Y5w/pGumQmuVAZQ0vL9rKs36J4c6zchjVyxJDfd5aVsC9b60ivX0CT189iqFZaZEOybRyIWliEpFY4GNVPSvEwaUBzwGDcc5OfgKsB14DsoEtwOWquq+h7ViCcOwsPcR/vbKM5dtLuPH0Ptx9bn/ioqy54UBlDS8t3Mqzn+dRbInhmK3eUcpNLy+lsLyShy4ewqWjrF/C1C9kfRAi8h5wtaqG7JIJEZkBfK6qz4lIAtAO+BVQrKoPi8g9QCdV/WVD27EEcVhljYcH31/Ly4u2Mq5PZ+67aBADTkxt9f0r5ZU1vLRwC8/Oz2PfwWpOPymTOybnMKqXPZv5WBWVV3Lbq1/z5eYirj0tm19fcLL1S5iAQpkgXsepwzQbOOCbrqq3NzGwDjh3ZfdRv52LyHpgotus1RWYp6r9G9qWJYijvbWsgF+9vYqKai9dOyYxsX8mE/t3YXy/jFbVkV1eWcOML7fw3OdOYjjjpEzuOCuHkT0tMTRHjcfLwx+u47kF+Yzp3ZmnrhpJZqr1S5gjhTJBTAs0XVVnNDGw4cB0YA0wDOdhRHcAO1Q1zW+5fara4KeFJYjA9pRVMHfdHuauK2TBpr2UV9YQHyuM6d2ZSf27MLF/F/pmto/I2UVZRXVtU1LJwWom9nfOGEZYYgipd5fv4JdvrqRTuwSe/tEohvWwfglzWKu9zFVERgOLgPGqmisijwP7gduCSRAiciNwI0DPnj1Hbd0a8b7yVq2qxsuSrcV8tr6Quev3sGF3OQA9OiczqX8XJvXvwrg+6SQnhPdJdmUV1c4Zw4J8Sg5WM6l/JnecdRLD7YMrbPz7JR783mAuH90j0iGZVqI1J4gTgUWqmu2Ofwe4B+iHNTGFXcG+g8xbX8i89Xv4YlMRh6o9JMbFcGrfdPfsIpNe6e1Dtr+yimpe/MJJDKWHqjlzQBfumJxj32hbSPGBKm57dRlfbCri6nG9+O2FA6P68m4TGq02QQCIyOfA9aq6XkTuB3yfSEV+ndSdVbXBBxNZgmieimoPi/OLmbt+D/PWF5K/1+li6pPRnon9uzBpQCZjencmMe7Yzy72u4nhH25imDygC7dbYoiIGo+XRz5ezzPz8xjdqxN/+9HIiBahNJEX8gQhIu1V9UDjSwa1reE4l7kmAHnAj3HqQr0O9AS2Ad9X1eKGtmMJIrTy9x5g3vo9zF1fyKK8IqpqvLRLiOW0vhlMGuB0dndPa/gRn77E8NzneeyvqOGsk53EYNfmR957K77l7jdW0DE5nr//aJRdEHAcC2Un9Wk4H+YpqtpTRIYBN6nqLaEJteksQYTPwaoaFuUVMXddIZ+u28OOkkMA9D8hlYkDMpnUvwujenWqvYyy9JDvjMGXGE7gjsk5DMk6vktitDZrvt3PTa8sYXdpJQ9MHcSVY3pGOiQTAaFMELnAZcB7qjrCnbZaVQeHJNJmsATRMlSVzYXlzF3ndHR/taWYao+SmhjHhJwMsjolM+ur7ZRV1HD2QCcxHO+1klqzfQequH3W13y+cS9nDzyB8X3TGdYjjZO7dgjpc9ZN6xXSBKGqY0Xka78EsUJVh4Uo1iazBBEZZRXVfLGpyG2O2sPu/ZWcM/AEbrfEEDU8XuWx/2xg1lfbKSyrBCA+VhhwYgeGZnVkWFYaw3qk0a9LCrGtrPy9ab5QJog3gP8DnsS5Ye52YLSqXhmKQJvDEkTkqSr7K2romBwf6VBME6gqu/ZXsGJ7KSsKSlhZUMLK7aWUVdYA0C4hlsHdOzIsqyNDs9IY3iONrE7Jrf4OfdOwUCaIDOBx4CycZ1J/AtyhqhF/Yo0lCGNCz+tV8osOsLKgpDZxfPPt/trH3XZun8DQ2oTh/LQqstGlVV/mGiqWIIxpGVU1XjbsLmP59pLaxLFxTxle9+Oje1oyw9xkMSwrjSFZHVtVaRdzpGATRKO/QRH5a4DJpcASVX23KcEZY6JLQlwMg7t3dPuYnCcAHKisYfWOUlYWOGcZKwpK+PeqXQCIQN/MFLcvw+nTGNA19ah7ajxepcbrpcaj1HiVGo/X+ek/7Dl6GY9XqfYqHq+Xao864+70Go9S7fUSFyOM6NmJnC4p1iTWRMGk+CRgAPBPd/xS4BvgOhGZpKp3his4Y0zr1T4xjrF90hnb5/DzwYsPVDl9GW7T1Gcb9vDmsgLA6QRPiot1E4Dz4d8SDRgZKQmM7ZPOaX3TObVPOr0zIlOHLBoFkyD6AWeqag2AiPwdpx/ibGBVGGMzxkSZzu0Tamt8gdMJ/m1pBSu2l7BqRymHqjzExwpxsTHExQhxMTHExQpxMUJsjBAfG+P+FGJjYtyf7nIxQlzs0cv4psfVGT5U7eGr/GIW5hXx5ea9fLByJwAndkji1L7pzqtPOj06t4vkW9aqBZMguuOUwvA9D6I90E1VPSJSGbbIjDFRT0TonpZM97TkiDztsHdGey4/pQeqSv7eA3y5uYiFeUXM31DI21/vAJzClaf2See0vhmc2jedEzpYGRKfYBLE/wLLRWQezlVMpwN/FJH2wH/CGJsxxoSEiNAnM4U+mSn8aFwvVJUNu8tZuHkvX24u4qPVu3h9idMU1iezfW3CGNenM+nH8RVaQV3F5FZXHYOTIBar6rfhDiwYdhWTMSYUPF5l7c79LNzsNEctzi/mQJUHgAEnpjLO7cMY2ye9TdzzE9LLXEWkE5CD02ENgKrOb1aEIWAJwhgTDtUeL6t2lLJwcxELNxexZGsxFdVeRGBwt461fRinZHdu0uW8qkpljZf9FdWUV9RQVlFDeaXzs6yiunY44DR3+lVje3LrpH5NOr5Q3ih3Pc4T37KA5Th3Uy9U1TObFFkIWYIwxrSEyhoPy7eVuB3eRSzfVkKVx0tsjDAsy0kYfTJSOFDl/8F++MO/rPYD//C0Gm/jX86T4mNITYonNTGO1KQ4UpLiSEmMIzUpnskDuvDdJvbrhDJBrAJOwXnIz3ARGQD8XlWvaFJkIWQJwhgTCYeqPCzduo+FeXtZuLmIFQWlePw+8ONjhdSkePfD/PCHemrS4fGUpLgjP/z9lvHN91VLDrWQ3SgHVKhqhYggIomquk5EGnzSmzHGtGXJCbFMyMlgQk4GAOWVNRSWVdZ+uLeVqrjBJIgCEUkD3gFmi8g+oFV0UhtjTGuQkhjXJkuLNHpEqnqxO3i/iMwFOgIfhTUqY4wxEddgghCRGGCl7+FAqvpZi0RljDEm4hrsAVFVL7BCROy5hMYYc5wJptGsK/CNiCwGDvgmquqUsEVljDEm4oJJEL8PexTGGGNanWA6qT8TkV5Ajqr+R0TaAW3jGi5jjDH1avQuDBG5AXgDeMad1B3nktdmEZFYEflaRN53x3uLSK6IbBSR10Qkobn7MMYY03TB3KZ3KzAe2A+gqhuBLiHY9x3AWr/xPwF/UdUcYB9wXQj2YYwxpomCSRCVqlrlGxGROKBZz4ESkSzgAuA5d1yAM3HOVABmAN9rzj6MMcY0TzAJ4jMR+RWQLCJn4zx69F/N3O9jwN2A1x1PB0p8T60DCnCasowxxkRIMAniHqAQ5/GiNwH/Bn7T1B2KyIXAHlVd6j85wKIBz1JE5EYRWSIiSwoLC5sahjHGmEYEc5nrVOAlVX02RPscD0wRkfNxni/RAeeMIk1E4tyziCzqqfekqtOB6eBUcw1RTMYYY+oI5gxiCrBBRF4WkQvcPogmU9V7VTVLVbOBK4FPVfWHwFzgMnexacC7zdmPMcaY5mk0Qajqj4F+OH0PVwGbReS5MMTyS+C/RWQTTp/EP8KwD2OMMUEK6mxAVatF5EOcfoFknGan65u7c1WdB8xzh/NwnnttjDGmFQjmRrnzRORFYBNOE9BzOPWZjDHGtGHBnEFcC8wCblLVyvCGY4wxprUIphbTlf7jIjIeuEpVbw1bVMYYYyIuqD4IERmO00F9OZAPvBXOoIwxxkRevQlCRE7CuQz1B0AR8BogqjqphWIzxhgTQQ2dQawDPgcuUtVNACJyV4tEZYwxJuIauorpUmAXMFdEnhWRyQQuiWGMMaYNqjdBqOrbqnoFMADnXoW7gBNE5O8ick4LxWeMMSZCgrmT+oCqzlTVC3FqJC3HKeBnjDGmDQumFlMtVS1W1WdU9cxwBWSMMaZ1OKYEYYwx5vhhCcIYY0xAliCMMcYEZAnCGGNMQJYgjDHGBGQJwhhjTECWIIwxxgRkCcIYY0xAliCMMcYEZAnCGGNMQJYgjDHGBGQJwhhjTEAtniBEpIeIzBWRtSLyjYjc4U7vLCKzRWSj+7NTS8dmjDHmsEicQdQAP1PVk4FxwK0iMhCnhPgcVc0B5mAlxY0xJqJaPEGo6k5VXeYOlwFrge7AVGCGu9gM4HstHZsxxpjDItoHISLZwAggFzhBVXeCk0SALvWsc6OILBGRJYWFhS0VqjHGHHciliBEJAV4E7hTVfcHu56qTlfV0ao6OjMzM3wBGmPMcS4iCUJE4nGSw0xVfcudvFtEurrzuwJ7IhGbMcYYRySuYhLgH8BaVf0/v1nvAdPc4WnAuy0dmzHGmMPiIrDP8cDVwCoRWe5O+xXwMPC6iFwHbAO+H4HYjDHGuFo8QajqAkDqmT25JWMxxhhTP7uT2hhjTECWIIwxxgRkCcIYY0xAliCMMcYEZAnCGGNMQJYgjDHGBGQJwhhjTECWIEx0qT4ElWXg9UY6EmPavEjcSW2OZ55qqNgPFSXuq9R5HfIb9p9ed56n8vC24ttDYgoktIeEFEhMPTyc0N4d9w2nuMMpAYbbQ0IqxNq/gzH+js//iOoK58OmJakXvDXOy3/YWwNej/uqAfUEnt7Yekes6wEExPeKOfyiznjtfAliGXe47nz1QuX++j/U/T/wq8obfp9i4iCpIySluT87Qsesw8NJac4yVQecbVWWHR6uOgDle6AqzxmuLHf3p8H9jmITAyePuCSIjYeYeIhNcBJJbII77nslOHHFJhyeVrt83fGG1o93/170cNxHDeOMa53jOqZ1fPO8zt+LepyzMvX4jbs/VY+e5vXUWTfQ9Lrb8x7+W4qJBYmt8zPGeQ+OmlbfskFMPyqumgAxBzruQMdWU/975Ds23++y9veeUOdVz/y4epaJiQ3ubzdMjs8EseFD+Oe1kY6i7UrsCMl+H+id+xz+wE9OO/LDvnbYnRffzk1CIaIK1QcPJwtfIvEfr3SnVZUdPa9iP3gKwVMDnirnDMhb7Q6707zVzoeHMaEmMW6ySPRLHO7PUdfCaT8N6+6PzwTRdRhc8H+NLxdKIs43xpjYOt+S4vxeMUeO+74F+Zb3/3nEugHWA+dbEOp+k/N/6ZHDRy0TYB3f9gItA5DUwfnAT0yN+LeeI4i4TUjtgRPCtx+v++3UlzA8vlfV4em+abUJpp5lahOk1D/sO7YjhusuRxDrSJDfzCXAtFjnb6+h6XW3p9r4t/Xas+EGvq0HM1297lmu3/9Rk85IgjyrUa3zu62CmsrDw/7Ta4cbmV9TWc96VZASxr9n1/GZIDr3cV5tnl2D0GJiYiDGbSowjWjDHzuxcRCfHOkoQsY+QYwxxgRkCcIYY0xAliCMMcYEZAnCGGNMQJYgjDHGBGQJwhhjTECWIIwxxgRkCcIYY0xAonXruUQRESkEtkY6jiBlAHsjHUSYtOVjg7Z9fHZs0as5x9dLVTMbWyiqE0Q0EZElqjo60nGEQ1s+Nmjbx2fHFr1a4visickYY0xAliCMMcYEZAmi5UyPdABh1JaPDdr28dmxRa+wH5/1QRhjjAnIziCMMcYEZAnCGGNMQJYgwkxEeojIXBFZKyLfiMgdkY4p1EQkVkS+FpH3Ix1LKIlImoi8ISLr3N/fqZGOKZRE5C73b3K1iLwqIkmRjqmpROR5EdkjIqv9pnUWkdkistH92SmSMTZHPcf3iPu3uVJE3haRtFDv1xJE+NUAP1PVk4FxwK0iMjDCMYXaHcDaSAcRBo8DH6nqAGAYbegYRaQ7cDswWlUHA7HAlZGNqlleBM6rM+0eYI6q5gBz3PFo9SJHH99sYLCqDgU2APeGeqeWIMJMVXeq6jJ3uAznQ6Z7ZKMKHRHJAi4Anot0LKEkIh2A04F/AKhqlaqWRDaqkIsDkkUkDmgHfBvheJpMVecDxXUmTwVmuMMzgO+1aFAhFOj4VPUTVa1xRxcBWaHeryWIFiQi2cAIIDeykYTUY8DdgDfSgYRYH6AQeMFtPntORNpHOqhQUdUdwKPANmAnUKqqn0Q2qpA7QVV3gvNFDegS4XjC6SfAh6HeqCWIFiIiKcCbwJ2quj/S8YSCiFwI7FHVpZGOJQzigJHA31V1BHCA6G6iOILbHj8V6A10A9qLyI8iG5VpChH5NU5T9sxQb9sSRAsQkXic5DBTVd+KdDwhNB6YIiJbgFnAmSLySmRDCpkCoEBVfWd7b+AkjLbiLCBfVQtVtRp4CzgtwjGF2m4R6Qrg/twT4XhCTkSmARcCP9Qw3NRmCSLMRERw2rHXqur/RTqeUFLVe1U1S1WzcTo4P1XVNvEtVFV3AdtFpL87aTKwJoIhhdo2YJyItHP/RifThjrhXe8B09zhacC7EYwl5ETkPOCXwBRVPRiOfViCCL/xwNU4366Xu6/zIx2UCcptwEwRWQkMB/4Y4XhCxj0zegNYBqzC+SyI2tIUIvIqsBDoLyIFInId8DBwtohsBM52x6NSPcf3JJAKzHY/V54O+X6t1IYxxphA7AzCGGNMQJYgjDHGBGQJwhhjTECWIIwxxgRkCcIYY0xAliCMaYCIqIi87DceJyKFTa1c61aIvcVvfGJbq4Jr2g5LEMY07AAwWESS3fGzgR3N2F4acEujSxnTCliCMKZxH+JUrAX4AfCqb4b7zIF33Jr8i0RkqDv9freG/zwRyROR291VHgb6ujc2PeJOS/F77sRM985mYyLOEoQxjZsFXOk+UGcoR1bj/T3wtVuT/1fAS37zBgDnAmOA+9yaXPcAm1V1uKr+wl1uBHAnMBCniuz4cB6MMcGyBGFMI1R1JZCNc/bw7zqzJwAvu8t9CqSLSEd33geqWqmqe3EKxZ1Qzy4Wq2qBqnqB5e6+jIm4uEgHYEyUeA/n+QkTgXS/6YGag3z1ayr9pnmo//8t2OWMaVF2BmFMcJ4HHlDVVXWmzwd+CM4VScDeRp73UYZTYM2YVs++qRgTBFUtwHlGdV334zx1biVwkMPlpevbTpGIfOE+fP5D4INQx2pMqFg1V2OMMQFZE5MxxpiALEEYY4wJyBKEMcaYgCxBGGOMCcgShDHGmIAsQRhjjAnIEoQxxpiA/j8y9RHkLE3QOgAAAABJRU5ErkJggg==\n", 1579 | "text/plain": [ 1580 | "
" 1581 | ] 1582 | }, 1583 | "metadata": { 1584 | "needs_background": "light" 1585 | }, 1586 | "output_type": "display_data" 1587 | } 1588 | ], 1589 | "source": [ 1590 | "q5 = '''\n", 1591 | " SELECT\n", 1592 | " AVG(CASE WHEN subscriber_type = 'Customer' THEN duration_sec/60 ELSE NULL END) AS customer_minutes_avg,\n", 1593 | " AVG(CASE WHEN subscriber_type = 'Subscriber' THEN duration_sec/60 ELSE NULL END) AS subscriber_minutes_avg,\n", 1594 | " EXTRACT(YEAR FROM end_date) AS end_year,\n", 1595 | " EXTRACT(MONTH FROM end_date) AS end_month\n", 1596 | " FROM\n", 1597 | " `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips`\n", 1598 | " GROUP BY\n", 1599 | " end_year, end_month\n", 1600 | " HAVING\n", 1601 | " end_year = 2015\n", 1602 | " ORDER BY\n", 1603 | " end_year, end_month\n", 1604 | "'''\n", 1605 | "df5 = query_to_df(q5)\n", 1606 | "plt.plot(df5.end_month, df5.customer_minutes_avg, label = \"Customers\")\n", 1607 | "plt.plot(df5.end_month, df5.subscriber_minutes_avg, label = \"Subscribers\")\n", 1608 | "plt.title(\"Average minutes ridden per trip\")\n", 1609 | "plt.xlabel(\"Month\")\n", 1610 | "plt.ylabel(\"Average ride length (min)\")\n", 1611 | "plt.legend()" 1612 | ] 1613 | }, 1614 | { 1615 | "cell_type": "markdown", 1616 | "metadata": {}, 1617 | "source": [ 1618 | "This picture very clearly shows the phenomena that we showed with the other query- the average ride length skyrockets over the summer as presumably more casual customers find good weather to take longer bike rides. The subscribers are likely commuting for the most part, or at least keeping their habits very consistent. It's also interesting to note that the average ride length is much longer for the customers even without the spike, indicating again that it seems like the customers might use the bikes for leisure purposes much more than subscribers. Given the fact that the average is so much lower and the previous chart looks the way it did, we can infer that the volume of subscriber rides to customer rides is many times higher.\n", 1619 | "\n", 1620 | "To utilize data from multiple tables, we'll take a look at the origin stations popular with customers and subscribers and see if the capacities seem different for each." 1621 | ] 1622 | }, 1623 | { 1624 | "cell_type": "code", 1625 | "execution_count": 20, 1626 | "metadata": {}, 1627 | "outputs": [ 1628 | { 1629 | "data": { 1630 | "text/html": [ 1631 | "
\n", 1632 | "\n", 1645 | "\n", 1646 | " \n", 1647 | " \n", 1648 | " \n", 1649 | " \n", 1650 | " \n", 1651 | " \n", 1652 | " \n", 1653 | " \n", 1654 | " \n", 1655 | " \n", 1656 | " \n", 1657 | " \n", 1658 | " \n", 1659 | " \n", 1660 | " \n", 1661 | " \n", 1662 | " \n", 1663 | " \n", 1664 | " \n", 1665 | " \n", 1666 | " \n", 1667 | " \n", 1668 | " \n", 1669 | " \n", 1670 | " \n", 1671 | " \n", 1672 | " \n", 1673 | " \n", 1674 | " \n", 1675 | " \n", 1676 | " \n", 1677 | " \n", 1678 | " \n", 1679 | " \n", 1680 | " \n", 1681 | " \n", 1682 | " \n", 1683 | " \n", 1684 | " \n", 1685 | " \n", 1686 | "
cust_trips_milstationcap
01.561808e+15623
11.394113e+151538
28.276351e+14335
37.868649e+146031
47.781771e+147031
\n", 1687 | "
" 1688 | ], 1689 | "text/plain": [ 1690 | " cust_trips_mil station cap\n", 1691 | "0 1.561808e+15 6 23\n", 1692 | "1 1.394113e+15 15 38\n", 1693 | "2 8.276351e+14 3 35\n", 1694 | "3 7.868649e+14 60 31\n", 1695 | "4 7.781771e+14 70 31" 1696 | ] 1697 | }, 1698 | "execution_count": 20, 1699 | "metadata": {}, 1700 | "output_type": "execute_result" 1701 | } 1702 | ], 1703 | "source": [ 1704 | "q6_cust = '''\n", 1705 | "SELECT\n", 1706 | " SUM(CASE WHEN trips.subscriber_type = 'Customer' THEN trips.trip_id/1000000 ELSE NULL END) AS cust_trips_mil,\n", 1707 | " info.station_id AS station,\n", 1708 | " info.capacity AS cap\n", 1709 | "FROM\n", 1710 | " `bigquery-public-data.san_francisco_bikeshare.bikeshare_station_info` AS info\n", 1711 | " INNER JOIN\n", 1712 | " `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips` AS trips\n", 1713 | " ON info.station_id = trips.start_station_id\n", 1714 | "GROUP BY\n", 1715 | " station, cap\n", 1716 | "ORDER BY\n", 1717 | " cust_trips_mil DESC\n", 1718 | "LIMIT\n", 1719 | " 25\n", 1720 | "'''\n", 1721 | "df6_cust = query_to_df(q6_cust)\n", 1722 | "df6_cust.head()\n" 1723 | ] 1724 | }, 1725 | { 1726 | "cell_type": "code", 1727 | "execution_count": 21, 1728 | "metadata": {}, 1729 | "outputs": [ 1730 | { 1731 | "data": { 1732 | "text/html": [ 1733 | "
\n", 1734 | "\n", 1747 | "\n", 1748 | " \n", 1749 | " \n", 1750 | " \n", 1751 | " \n", 1752 | " \n", 1753 | " \n", 1754 | " \n", 1755 | " \n", 1756 | " \n", 1757 | " \n", 1758 | " \n", 1759 | " \n", 1760 | " \n", 1761 | " \n", 1762 | " \n", 1763 | " \n", 1764 | " \n", 1765 | " \n", 1766 | " \n", 1767 | " \n", 1768 | " \n", 1769 | " \n", 1770 | " \n", 1771 | " \n", 1772 | " \n", 1773 | " \n", 1774 | " \n", 1775 | " \n", 1776 | " \n", 1777 | " \n", 1778 | " \n", 1779 | " \n", 1780 | " \n", 1781 | " \n", 1782 | " \n", 1783 | " \n", 1784 | " \n", 1785 | " \n", 1786 | " \n", 1787 | " \n", 1788 | "
sub_trips_milstationcap
03.945897e+157031
13.811147e+153019
23.587979e+155831
33.343885e+158135
43.157456e+151538
\n", 1789 | "
" 1790 | ], 1791 | "text/plain": [ 1792 | " sub_trips_mil station cap\n", 1793 | "0 3.945897e+15 70 31\n", 1794 | "1 3.811147e+15 30 19\n", 1795 | "2 3.587979e+15 58 31\n", 1796 | "3 3.343885e+15 81 35\n", 1797 | "4 3.157456e+15 15 38" 1798 | ] 1799 | }, 1800 | "execution_count": 21, 1801 | "metadata": {}, 1802 | "output_type": "execute_result" 1803 | } 1804 | ], 1805 | "source": [ 1806 | "q6_sub = '''\n", 1807 | "SELECT\n", 1808 | " SUM(CASE WHEN trips.subscriber_type = 'Subscriber' THEN trips.trip_id/1000000 ELSE NULL END) AS sub_trips_mil,\n", 1809 | " info.station_id AS station,\n", 1810 | " info.capacity AS cap\n", 1811 | "FROM\n", 1812 | " `bigquery-public-data.san_francisco_bikeshare.bikeshare_station_info` AS info\n", 1813 | " INNER JOIN\n", 1814 | " `bigquery-public-data.san_francisco_bikeshare.bikeshare_trips` AS trips\n", 1815 | " ON info.station_id = trips.start_station_id\n", 1816 | "GROUP BY\n", 1817 | " station, cap\n", 1818 | "ORDER BY\n", 1819 | " sub_trips_mil DESC\n", 1820 | "LIMIT\n", 1821 | " 25\n", 1822 | "'''\n", 1823 | "df6_sub = query_to_df(q6_sub)\n", 1824 | "df6_sub.head()" 1825 | ] 1826 | }, 1827 | { 1828 | "cell_type": "markdown", 1829 | "metadata": {}, 1830 | "source": [ 1831 | "Right away, we can see that the most frequent stations to start a trip for both subscribers and customers include station 70, indicating this must be an area that a lot of people go to in general. \n", 1832 | "\n", 1833 | "We'll take a look at the mean and standard deviation around each capacity in the 25 stations for both sides." 1834 | ] 1835 | }, 1836 | { 1837 | "cell_type": "code", 1838 | "execution_count": 22, 1839 | "metadata": {}, 1840 | "outputs": [ 1841 | { 1842 | "name": "stdout", 1843 | "output_type": "stream", 1844 | "text": [ 1845 | "Mean of top 25 customer stations capacity:\n", 1846 | "29.2\n", 1847 | "Standard Deviation of top 25 customer stations capacity:\n", 1848 | "6.5482822174979605\n" 1849 | ] 1850 | } 1851 | ], 1852 | "source": [ 1853 | "mean_cust = np.mean(df6_cust.cap)\n", 1854 | "sd_cust = np.std(df6_cust.cap)\n", 1855 | "print(\"Mean of top 25 customer stations capacity:\")\n", 1856 | "print(mean_cust)\n", 1857 | "print(\"Standard Deviation of top 25 customer stations capacity:\")\n", 1858 | "print(sd_cust)" 1859 | ] 1860 | }, 1861 | { 1862 | "cell_type": "code", 1863 | "execution_count": 23, 1864 | "metadata": {}, 1865 | "outputs": [ 1866 | { 1867 | "name": "stdout", 1868 | "output_type": "stream", 1869 | "text": [ 1870 | "Mean of top 25 subscriber stations capacity:\n", 1871 | "29.32\n", 1872 | "Standard Deviation of top 25 subscriber stations capacity:\n", 1873 | "6.024748957425528\n" 1874 | ] 1875 | } 1876 | ], 1877 | "source": [ 1878 | "mean_sub = np.mean(df6_sub.cap)\n", 1879 | "sd_sub = np.std(df6_sub.cap)\n", 1880 | "print(\"Mean of top 25 subscriber stations capacity:\")\n", 1881 | "print(mean_sub)\n", 1882 | "print(\"Standard Deviation of top 25 subscriber stations capacity:\")\n", 1883 | "print(sd_sub)" 1884 | ] 1885 | }, 1886 | { 1887 | "cell_type": "markdown", 1888 | "metadata": {}, 1889 | "source": [ 1890 | "We don't really see a big difference here. We might look into doing some kind of weighted comparison approach and/or hypothesis test in the future to dig into this in the future, but we'll leave the analysis here for now." 1891 | ] 1892 | }, 1893 | { 1894 | "cell_type": "code", 1895 | "execution_count": null, 1896 | "metadata": {}, 1897 | "outputs": [], 1898 | "source": [] 1899 | } 1900 | ], 1901 | "metadata": { 1902 | "kernelspec": { 1903 | "display_name": "Python 3", 1904 | "language": "python", 1905 | "name": "python3" 1906 | }, 1907 | "language_info": { 1908 | "codemirror_mode": { 1909 | "name": "ipython", 1910 | "version": 3 1911 | }, 1912 | "file_extension": ".py", 1913 | "mimetype": "text/x-python", 1914 | "name": "python", 1915 | "nbconvert_exporter": "python", 1916 | "pygments_lexer": "ipython3", 1917 | "version": "3.7.3" 1918 | } 1919 | }, 1920 | "nbformat": 4, 1921 | "nbformat_minor": 2 1922 | } 1923 | --------------------------------------------------------------------------------