├── README.md
├── r_scripts
    ├── markov_chain_viz.r
    ├── career_length_survival_analysis.r
    ├── hof_factor_analysis.r
    ├── similarity_data_viz.r
    ├── anomaly_detection_royals_stats.r
    ├── arrieta_no_hitter_analysis.r
    ├── chris_sale_lca.r
    ├── smoltz_text_analysis.r
    ├── multidimensional_scaling_2016_teams.r
    ├── historical_team_clustering.r
    ├── hosmer_statcast_analysis.r
    ├── mlb_attendance_analysis.r
    ├── rookie_all_star_predictions.Rmd
    └── association_rules_2016_games.r
├── python_scripts
    ├── royals_monte_carlo.py
    ├── pitchfx_scraper.py
    ├── all_star_network_analysis.py
    ├── survival_ingestion_wrangling.py
    ├── home_runs.py
    ├── pitching_markov_chain.py
    ├── pitcher_similarity.py
    └── match_up_simulations.py
└── csv_outputs
    └── historical_team_clustering_results.csv


/README.md:
--------------------------------------------------------------------------------
1 | # baseballdatascience
2 | 
3 | This repository contains R and Python scripts for projects on baseballdatascience.com. 
4 | 


--------------------------------------------------------------------------------
/r_scripts/markov_chain_viz.r:
--------------------------------------------------------------------------------
 1 | # Library imports
 2 | library(ggplot2)
 3 | library(ggpubr)
 4 | 
 5 | # Read in data
 6 | transitions <- read.csv('scherzer_transitions.csv')
 7 | pitch_counts <- read.csv('pitches_in_counts.csv')
 8 | 
 9 | # Transition Probabilities Plot
10 | ggdotchart(transitions, x = "Transition", y = "Probability", 
11 |            color = "Probability",
12 |            sorting = "descending",                       
13 |            rotate = TRUE,                                
14 |            dot.size = 2,                                 
15 |            y.text.col = TRUE,                            
16 |            ggtheme = theme_pubr()
17 | )+
18 |   theme_cleveland() 
19 | 
20 | # Pitch Counts Plot
21 | ggdotchart(pitch_counts, x = "pitch", y = "pitch_percentage",
22 |            color = "count", 
23 |            palette = c("#FF0000", "#00FF00", "#0000FF",
24 |                        "#00FFFF", "#800080", "#FF7F50",
25 |                        "#FF8C00", "#008000", "#00FA9A",
26 |                        "#4169E1", "#00FFFF", "#FFD700"),
27 | 
28 |            sorting = "ascending",                        
29 |            add = "segments",                             
30 |            ggtheme = theme_pubr()                        
31 | )
32 | 
33 | 
34 | 
35 | 
36 | 
37 | 


--------------------------------------------------------------------------------
/r_scripts/career_length_survival_analysis.r:
--------------------------------------------------------------------------------
 1 | # Career Length Survival Analysis
 2 | setwd("C:/Users/Micah/Desktop/Baseball Data Science/Career Length Survival Analysis")
 3 | 
 4 | # Import libraries
 5 | library(reshape)
 6 | library(dplyr)
 7 | library(tidyverse)
 8 | library(survival)
 9 | library(survminer)
10 | 
11 | # Read in data
12 | df <- read.csv('player_data_for_survival_analysis.csv')
13 | 
14 | # Survival Analysis
15 | s <- Surv(df$time_in_mlb, df$status)
16 | class(s)
17 | 
18 | # Survival analysis that doesn't consider any groupings
19 | survfit(s~1)
20 | survfit(Surv(time_in_mlb, status)~1, data=df)
21 | sfit <- survfit(Surv(time_in_mlb, status)~1, data=df)
22 | ggsurvplot(sfit)
23 | 
24 | # Survival analysis with hits
25 | sfit1 <- survfit(Surv(time_in_mlb, status) ~ hits,
26 |                    data=df)
27 | 
28 | summary(sfit1)
29 | summary(sfit1, times=seq(0, 8000, 500))
30 | plot(sfit1)
31 | ggsurvplot(sfit1)
32 | 
33 | # Survival analysis with hits and throws
34 | sfit2 <- survfit(Surv(time_in_mlb, status) ~ hits + throws,
35 |                  data=df)
36 | 
37 | ggsurvplot(sfit2)
38 | 
39 | # Survival analysis with birth_country
40 | sfit3 <- survfit(Surv(time_in_mlb, status) ~ birth_country,
41 |                  data=df)
42 | 
43 | ggsurvplot(sfit3)
44 | 
45 | # Survival analysis with height
46 | sfit4 <- survfit(Surv(time_in_mlb, status) ~ height,
47 |                  data=df)
48 | 
49 | ggsurvplot(sfit4)
50 | 
51 | # Cox regression
52 | fit <- coxph(Surv(time_in_mlb, status) ~ average_salary + 
53 |                birth_country + weight + height + hits +
54 |                throws + age_at_debut, data = df)
55 | 
56 | fit
57 | 
58 | 
59 | 


--------------------------------------------------------------------------------
/r_scripts/hof_factor_analysis.r:
--------------------------------------------------------------------------------
 1 | # Factor analysis on offensive statistics for HoFers
 2 | 
 3 | # Library imports
 4 | library(nFactors)
 5 | library(gplots)
 6 | library(RColorBrewer)
 7 | library(semPlot)
 8 | 
 9 | # Read in data
10 | hof <- read.csv('hof_hitting_stats.csv')
11 |   
12 | # Determine number of factors
13 | nScree(hof)
14 | eigen(cor(hof))
15 | 
16 | # Looks like three factors will be optimal
17 | # Run factor analysis using three factors
18 | fa <- factanal(hof, factors = 3)
19 | print(fa)
20 | 
21 | # Create a heatmap of the loadings
22 | heatmap.2(fa$loadings, col = brewer.pal(9, "Greens"), trace = "none",
23 |           key = FALSE, dend = 'none', Colv = FALSE, cexCol = 1.2,
24 |           main = "\n\n\n\n\nFactor Loadings for HoF Hitting Stats")
25 | 
26 | # Create SEM plot of the factors
27 | semPaths(fa, what = "est", residuals = FALSE, cut = 0.4,
28 |          posCol = c("white", "darkgreen"), 
29 |          negCol = c("white", "red"),
30 |          edge.label.cex = 0.60, nCharNodes = 7)
31 | 
32 | ###########################################################
33 | # Lets do the same for pitching data
34 | 
35 | # Read in data
36 | hof_pitch <- read.csv('hof_pitching_stats.csv')
37 | 
38 | # Determine number of factors
39 | nScree(hof_pitch)
40 | eigen(cor(hof_pitch))
41 | 
42 | # Looks like two factors will be optimal
43 | # Run factor analysis using two factors
44 | # However, the algorithm isn't working on this data
45 | # with two factors, so we'll switch to 3
46 | fa2 <- factanal(hof_pitch, factors = 3)
47 | print(fa2)
48 | 
49 | # Create a heatmap of the loadings
50 | heatmap.2(fa2$loadings, col = brewer.pal(9, "Greens"), trace = "none",
51 |           key = FALSE, dend = 'none', Colv = FALSE, cexCol = 1.2,
52 |           main = "\n\n\n\n\nFactor Loadings for HoF Pitching Stats")
53 | 
54 | # Create SEM plot of the factors
55 | semPaths(fa2, what = "est", residuals = FALSE, cut = 0.4,
56 |          posCol = c("white", "darkgreen"), 
57 |          negCol = c("white", "red"),
58 |          edge.label.cex = 0.60, nCharNodes = 7)
59 | 


--------------------------------------------------------------------------------
/r_scripts/similarity_data_viz.r:
--------------------------------------------------------------------------------
 1 | # Library imports
 2 | 
 3 | library(ggplot2)
 4 | library(MASS)
 5 | library(colorRamps)
 6 | library(FactoMineR)
 7 | library(factoextra)
 8 | library(gplots)
 9 | 
10 | # Parallel coordinates for Pedro
11 | pedro <- read.csv('pedro_sims.csv')
12 | pedro <- pedro[-c(1)]
13 | 
14 | c <- blue2red(15)
15 | r <- cut(pedro$Counter, 15)
16 | parcoord(pedro, col=c[as.numeric(r)])
17 | 
18 | # Jitter plot for Clemens
19 | clemens <- read.csv('clemens1997.csv')
20 | clemens <- clemens[order(clemens$Similarity),] 
21 | 
22 | ggplot(clemens, aes(x=Pitcher_and_Year, y=Similarity)) + 
23 |   geom_jitter(alpha=0.5, position = position_jitter(width = 0.1)) + 
24 |   ggtitle("Pitchers Most Similar to 1997 Roger Clemens") + 
25 |   labs(y="Similarity", x='Player and Year') +
26 |   theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
27 |   theme(plot.title = element_text(hjust = 0.5))
28 | 
29 | # Circular bar chart for Johnson
30 | johnson <- read.csv('johnson2002.csv')
31 | johnson <- johnson[order(johnson$Similarity),] 
32 | 
33 | ggplot(johnson, aes(x = Pitcher_and_Year, y = Similarity,
34 |                     fill = Pitcher_and_Year)) + 
35 |   geom_bar(width = 0.85, stat="identity") +    
36 |   
37 |   # To use a polar plot and not a basic barplot
38 |   coord_polar(theta = "y") +    
39 |   
40 |   #Remove useless labels of axis
41 |   xlab("") + ylab("") +
42 |   
43 |   #Increase ylim to avoid having a complete circle
44 |   ylim(c(0,1.5)) + 
45 |   
46 |   #Add group labels close to the bars :
47 |   geom_text(data = johnson, hjust = 1, size = 3,
48 |             aes(x = Pitcher_and_Year, y = 0, label = Pitcher_and_Year)) +
49 |   
50 |   #Remove useless legend, y axis ticks and y axis text
51 |   theme(legend.position = "none", axis.text.y = element_blank(), 
52 |         axis.ticks = element_blank())
53 | 
54 | # Balloon plot for Greinke
55 | greinke <- read.csv('greinke_sims.csv')
56 | rownames(greinke) <- greinke[,1]
57 | greinke <- greinke[-c(1)]
58 | 
59 | dt <- as.table(as.matrix(greinke))
60 | balloonplot(t(dt), main='Pitching Stats', xlab="", ylab="",
61 |             label=TRUE, show.margins=FALSE)
62 | 


--------------------------------------------------------------------------------
/python_scripts/royals_monte_carlo.py:
--------------------------------------------------------------------------------
 1 | # Library imports
 2 | import pandas as pd
 3 | import random
 4 | 
 5 | 
 6 | # Define batting average class
 7 | class WARSimulation:
 8 |     def __init__(self, df, name):
 9 |         self.df = df
10 |         self.name = name
11 | 
12 |     def monte_carlo(self):
13 |         mean = self.df['war'].mean()
14 |         std = self.df['war'].std()
15 |         selections = random.normalvariate(mean, std)
16 |         return selections
17 | 
18 |     def run_the_simulation(self):
19 |         x = 0
20 |         selection = []
21 |         while x < 100:
22 |             selection.append(self.monte_carlo())
23 |             x += 1
24 | 
25 |         data = pd.DataFrame({'war': selection})
26 |         data['war'] = data['war'].round(decimals=2)
27 |         data.to_csv(self.name + '_results.csv', index=False)
28 |         return
29 | 
30 | 
31 | if __name__ == "__main__":
32 |     # Define dataframes
33 |     hosmer_df = pd.DataFrame({'war': [1.5, -0.4, 3.5, 0.8, 3.6, 1.0]})
34 |     cain_df = pd.DataFrame({'war': [2.0, 3.2, 5.1, 7.2, 2.9]})
35 |     perez_df = pd.DataFrame({'war': [2.9, 4.1, 3.4, 2.3, 2.7]})
36 |     escobar_df = pd.DataFrame({'war': [0.5, 2.7, 3.4, 0.3, 2.5, 0.6, 0.3]})
37 |     gordon_df = pd.DataFrame({'war': [2.0, 2.8, -0.5, 7.2, 6.3, 4.2, 6.6, 2.8, 0.8]})
38 |     moustakes_df = pd.DataFrame({'war': [1.1, 3.1, -0.1, 0.4, 4.4]})
39 | 
40 |     # Hosmer - simulation
41 |     hosmer_selections = WARSimulation(hosmer_df, 'hosmer')
42 |     hosmer_selections.run_the_simulation()
43 | 
44 |     # Cain - simulation
45 |     cain_selections = WARSimulation(cain_df, 'cain')
46 |     cain_selections.run_the_simulation()
47 | 
48 |     # Perez - simulation
49 |     perez_selections = WARSimulation(perez_df, 'perez')
50 |     perez_selections.run_the_simulation()
51 | 
52 |     # Escobar - simulation
53 |     escobar_selections = WARSimulation(escobar_df, 'escobar')
54 |     escobar_selections.monte_carlo()
55 |     escobar_selections.run_the_simulation()
56 | 
57 |     # Gordan - simulation
58 |     gordon_selections = WARSimulation(gordon_df, 'gordon')
59 |     gordon_selections.run_the_simulation()
60 | 
61 |     # Moustakes - simulation
62 |     moustakes_selections = WARSimulation(moustakes_df, 'moustakes')
63 |     moustakes_selections.run_the_simulation()
64 | 


--------------------------------------------------------------------------------
/python_scripts/pitchfx_scraper.py:
--------------------------------------------------------------------------------
 1 | # Import libraries
 2 | from bs4 import BeautifulSoup
 3 | import requests
 4 | import pandas as pd
 5 | 
 6 | 
 7 | def pitch_fx_scraper(base_url):
 8 |     # Isolate the text data
 9 |     data = base_url.text
10 | 
11 |     # Create beautiful soup object from the data
12 |     soup = BeautifulSoup(data)
13 | 
14 |     # Put all the links in a list
15 |     results = []
16 |     for gid in soup.find_all('a'):
17 |         results.append(gid.get('href'))
18 | 
19 |     # Delete unnecessary elements of the list
20 |     del results[0:21]
21 | 
22 |     # Extract the Game ID from each link
23 |     results1 = [i.split('&prevGame=', 1)[1] for i in results]
24 | 
25 |     # Concatenate strings to create URLs
26 |     results2 = list(map('http://www.brooksbaseball.net/pfxVB/tabdel_expanded.php?pitchSel=477132&game={0}'.format, results1))
27 | 
28 |     # Scrape each link
29 |     results3 = []
30 |     for i in results2:
31 |         results3.append(requests.get(i))
32 | 
33 |     # Grab the text for each link
34 |     results4 = []
35 |     for i in results3:
36 |         results4.append(i.text)
37 | 
38 |     # Make each a beautiful soup object
39 |     results5 = []
40 |     for i in results4:
41 |         results5.append(BeautifulSoup(i))
42 | 
43 |     # Extract column headers
44 |     sample = results5[1]
45 |     column_headers = [th.getText() for th in
46 |                       sample.findAll('tr', limit=2)[0].findAll('th')]
47 | 
48 |     # Define data rows
49 |     results6 = []
50 |     for i in results5:
51 |         results6.append(i.findAll('tr'))
52 | 
53 |     # Get data from table
54 |     results7 = []
55 |     for j in results6:
56 |         results7.append([[td.getText() for td in j[i].findAll('td')]
57 |                 for i in range(len(j))])
58 | 
59 |     # Convert to dataframe
60 |     results8 = []
61 |     for i in results7:
62 |         results8.append(pd.DataFrame(i))
63 | 
64 |     for i in results8:
65 |         i.columns = column_headers
66 | 
67 |     return
68 | 
69 | 
70 | if __name__ == "__main__":
71 |     # Get the URL that we'll use to construct game IDs
72 |     r = requests.get("http://www.brooksbaseball.net/tabs.php?player=477132&p_hand=1&ppos=1&cn=200&compType=none&gFilt=&time=month&minmax=ci&var=gl&s_type=2&startDate=03/30/2007&endDate=10/22/2016&balls=1&strikes=1&b_hand=1")
73 |     pitch_fx_scraper(r)


--------------------------------------------------------------------------------
/python_scripts/all_star_network_analysis.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | import numpy as np
 3 | import os
 4 | import networkx as nx
 5 | import community
 6 | 
 7 | 
 8 | # Inspired by Complex Network Analysis in Python (Dmitry Zinoviev)
 9 | def ingest_and_create_dataframe():
10 |     # Read in files from Lahman database
11 |     all_star_df = pd.read_csv('AllStarFull.csv')
12 |     people_df = pd.read_csv('People.csv')
13 | 
14 |     all_star_df = all_star_df.loc[all_star_df['yearID'] >= 1970]
15 |     all_star_df['yearID'] = all_star_df['yearID'].astype('str')
16 |     all_star_df['yearID'] = all_star_df['yearID'].str[2:4]
17 |     all_star_df = all_star_df[['playerID', 'yearID']]
18 | 
19 |     people_df['name'] = people_df['nameFirst'] + ' ' + people_df['nameLast']
20 |     people_df = people_df[['playerID', 'name']]
21 | 
22 |     all_star_df = pd.merge(all_star_df, people_df, how='inner', on='playerID')
23 |     all_star_df.drop('playerID', 1, inplace=True)
24 |     all_star_df.columns = ['yearID', 'name']
25 | 
26 |     all_game_ids = list(set(all_star_df['yearID'].tolist()))
27 |     base_df = pd.DataFrame({'yearID': all_game_ids})
28 |     all_names = list(set(all_star_df['name'].tolist()))
29 | 
30 |     for name in all_names:
31 |         print(name)
32 |         temp_df = all_star_df.loc[all_star_df['name'] == name]
33 |         temp_df['name'] = 1
34 |         temp_df.rename(columns={'name': name}, inplace=True)
35 |         base_df = pd.merge(base_df, temp_df, how='left', on='yearID')
36 | 
37 |     base_df.fillna(value=0, inplace=True)
38 |     base_df.to_csv('all_star_df.csv', index=False)
39 |     return base_df
40 | 
41 | 
42 | def prepare_network(df):
43 |     df.set_index('yearID', inplace=True)
44 | 
45 |     # Create co-occurrence matrix
46 |     cooc = df.dot(df.T) * (1 - np.eye(df.shape[0]))
47 |     cooc.to_csv('cooc.csv')
48 | 
49 |     slicing = 3
50 |     weights = cooc[cooc >= slicing]
51 |     weights = weights.stack()
52 |     weights = weights / weights.max()
53 |     cd_network = weights.to_dict()
54 |     cd_network = {key: float(value) for key, value in cd_network.items()}
55 | 
56 |     player_network = nx.Graph()
57 |     player_network.add_edges_from(cd_network)
58 |     nx.set_edge_attributes(player_network, 'weight', cd_network)
59 | 
60 |     partition = community.best_partition(player_network)
61 |     nx.set_node_attributes(player_network, 'part', partition)
62 | 
63 |     if not os.path.isdir('results'):
64 |         os.mkdir('results')
65 | 
66 |     with open('results/player_network.graphml', 'wb') as ofile:
67 |         nx.write_graphml(player_network, ofile)
68 |     return
69 | 
70 | 
71 | if __name__ == "__main__":
72 |     network_df = ingest_and_create_dataframe()
73 |     prepare_network(network_df)
74 | 


--------------------------------------------------------------------------------
/r_scripts/anomaly_detection_royals_stats.r:
--------------------------------------------------------------------------------
 1 | #Load libraries and read in data
 2 | library(AnomalyDetection)
 3 | library(ggplot2)
 4 | library(XML)
 5 | 
 6 | royals.parse <-htmlParse("http://www.baseball-reference.com/teams/tgl.cgi?team=KCR&t=b&year=2016")
 7 | royals.tab<-readHTMLTable(royals.parse, stringsAsFactors=FALSE)
 8 | royals.df<-royals.tab[[1]]
 9 | write.csv(royals.df, file = "royals2016.csv")
10 | str(royals.df)
11 | 
12 | #The dataframe includes a couple of month headers; let's delete those
13 | #They are rows 24, 53, 81, 108, 138, 
14 | royals <- royals.df[-c(24, 53, 81, 108, 138), ]
15 | 
16 | #R read in all columns as characters; we'll need to covert the 
17 | #appropriate columns to numeric values
18 | cols.num <- royals[c(1:2, 7:30)] 
19 | royals1 <- data.frame(sapply(cols.num, as.numeric))
20 | 
21 | royals_factors <- data.frame(cbind(royals$Date, royals$X, royals$Opp, 
22 |                       royals$Rslt,royals$Thr))
23 | 
24 | royals_final <- data.frame(royals1, royals_factors)
25 | names(royals_final)[27]<-"Date"
26 | names(royals_final)[28]<-"Opponent"
27 | names(royals_final)[29]<-"Result"
28 | names(royals_final)[30]<-"Pitcher_Throws"
29 | str(royals_final)
30 | 
31 | #Anomaly detection on different offensive statistics
32 | 
33 | #Runs
34 | anomr = AnomalyDetectionVec(royals_final$R, max_anoms=0.02, direction="both", 
35 |                             period = 7, plot = TRUE)
36 | anomr$plot
37 | anomr
38 | 
39 | #Home runs
40 | anomhr = AnomalyDetectionVec(royals_final$HR, max_anoms=0.02, direction="both", 
41 |                             period = 7, plot = TRUE)
42 | anomhr$plot
43 | anomhr
44 | 
45 | #Hits
46 | anomh = AnomalyDetectionVec(royals_final$H, max_anoms=0.02, direction="both", 
47 |                              period = 7, plot = TRUE)
48 | anomh$plot
49 | anomh
50 | 
51 | #Base on Balls
52 | anombb = AnomalyDetectionVec(royals_final$BB, max_anoms=0.02, direction="both", 
53 |                             period = 7, plot = TRUE)
54 | anombb$plot
55 | anombb
56 | 
57 | #Strikeouts
58 | anomso = AnomalyDetectionVec(royals_final$SO, max_anoms=0.02, direction="both", 
59 |                              period = 7, plot = TRUE)
60 | anomso$plot
61 | anomso
62 | 
63 | #Stolen Bases
64 | anomsb = AnomalyDetectionVec(royals_final$SB, max_anoms=0.02, direction="both", 
65 |                              period = 7, plot = TRUE)
66 | anomsb$plot
67 | anomsb
68 | 
69 | 
70 | #Left on base
71 | anomlob = AnomalyDetectionVec(royals_final$LOB, max_anoms=0.02, direction="both", 
72 |                               period = 7, plot = TRUE)
73 | anomlob$plot
74 | anomlob
75 | 
76 | #Ground into double play
77 | anomgdp = AnomalyDetectionVec(royals_final$GDP, max_anoms=0.02, direction="both", 
78 |                               period = 7, plot = TRUE)
79 | anomgdp$plot
80 | anomgdp
81 | 
82 | #Hit by pitch
83 | anomhbp = AnomalyDetectionVec(royals_final$HBP, max_anoms=0.02, direction="both", 
84 |                               period = 7, plot = TRUE)
85 | anomhbp$plot
86 | anomhbp
87 | 
88 | 
89 | 
90 | 
91 | 
92 | 


--------------------------------------------------------------------------------
/python_scripts/survival_ingestion_wrangling.py:
--------------------------------------------------------------------------------
  1 | # Citation: https://www.cscu.cornell.edu/news/statnews/stnews67.pdf
  2 | 
  3 | # Library imports
  4 | import pymysql.cursors
  5 | import pandas as pd
  6 | from datetime import datetime
  7 | 
  8 | 
  9 | # Database connection
 10 | connection = pymysql.connect(host='localhost',
 11 |                              user='xxxxx',
 12 |                              password='xxxxx',
 13 |                              db='lahman2016',
 14 |                              charset='utf8mb4',
 15 |                              cursorclass=pymysql.cursors.DictCursor)
 16 |   
 17 |   
 18 | # Data ingestion
 19 | def ingest_data():
 20 |     master_query = '''
 21 |     SELECT * from master;'''
 22 | 
 23 |     pitching_query = '''
 24 |     SELECT sum(GS), playerID from pitching group by playerID;''' 
 25 | 
 26 |     salaries_query = '''
 27 |     SELECT yearID, salary, playerID from salaries;'''
 28 | 
 29 |     master = pd.read_sql(master_query, connection)     
 30 |     pitching = pd.read_sql(pitching_query, connection)    
 31 |     salaries = pd.read_sql(salaries_query, connection)  
 32 |     return master, pitching, salaries
 33 | 
 34 |   
 35 | def data_cleaning(master, pitching, salaries):
 36 |     # Put salaries in 2016 dollars
 37 |     inflation = pd.read_csv('inflation_conversion.csv')
 38 |     salaries = pd.merge(salaries, inflation, how = 'left', on = 'yearID')
 39 |     salaries.fillna(value = 1, inplace = True)
 40 | 
 41 |     salaries['adjusted_salary'] = salaries['salary'] / salaries['CF']
 42 | 
 43 |     # Get each players average salary
 44 |     salaries = salaries.groupby(['playerID'])['adjusted_salary'].mean()
 45 |     salaries = pd.DataFrame(salaries)
 46 |     salaries.reset_index(inplace = True)
 47 | 
 48 |     current = pd.read_csv('http://seamheads.com/baseballgauge/downloads/events.csv')
 49 |     current.rename(columns={' BAT_ID':'retroID'}, inplace=True)
 50 |     current = current[['GAME_ID', 'retroID']]
 51 |     current.drop_duplicates(subset = 'retroID', inplace = True)
 52 | 
 53 |     # Merge the data
 54 |     df = pd.merge(salaries, master, how = 'left', on = 'playerID')
 55 |     df = pd.merge(df, pitching, how = 'left', on = 'playerID')
 56 |     df = pd.merge(df, current, how = 'left', on = 'retroID')
 57 | 
 58 |     # Keep only position players
 59 |     df = df.loc[df['sum(GS)'] == 0]
 60 |     return df
 61 | 
 62 | 
 63 | # Mark current players as being censored
 64 | def prep_data_for_survival_analysis(df):
 65 |     df.rename(columns={'GAME_ID': 'Censored'}, inplace=True)
 66 |     df['Censored'].fillna(value = 2, inplace = True)
 67 | 
 68 |     def label_censor (row):
 69 |        if row['Censored'] == 2:
 70 |           return 2
 71 |        else:
 72 |         return 1
 73 | 
 74 |     df['Censored'] = df.apply (lambda row: label_censor(row), axis=1)
 75 | 
 76 |     # Age at time of debut
 77 |     # Time being in the MLB
 78 |     df['birthYear'] = df['birthYear'].astype('str')
 79 |     df['birthYear'] = df['birthYear'].str[:4]
 80 | 
 81 |     df['birthMonth'] = df['birthMonth'].astype('str')
 82 | 
 83 |     numbers_map = {'1.0': '01', '2.0': '02', '3.0': '03', '4.0': '04', '5.0': '05',
 84 |                    '6.0': '06', '7.0': '07', '8.0': '08', '9.0': '09', '10.0': '10',
 85 |                    '11.0': '11', '12.0': '12'}
 86 | 
 87 |     df['birthMonth'] = df['birthMonth'].map(numbers_map)
 88 | 
 89 |     df['birthDay'] = df['birthDay'].astype('str')
 90 |     df['birthDay'] = df['birthDay'].str[:-2]
 91 | 
 92 |     df['birthday'] = df['birthYear'] + '-' + df['birthMonth'] + '-' + df['birthDay']
 93 | 
 94 |     df['birthday'] = pd.to_datetime(df['birthday'])
 95 |     df['debut'] = pd.to_datetime(df['debut'])
 96 |     df['finalGame'] = pd.to_datetime(df['finalGame'])
 97 | 
 98 |     df['age_at_debut'] = df['debut'] - df['birthday']
 99 |     df['time_in_mlb'] = df['finalGame'] - df['debut']
100 | 
101 |     # Select columns for analysis
102 |     df = df[['adjusted_salary', 'birthCountry', 'weight', 'height', 'bats',
103 |                   'throws', 'debut', 'finalGame', 'age_at_debut',
104 |                   'time_in_mlb', 'Censored']]
105 | 
106 |     df.columns = ['average_salary', 'birth_country', 'weight', 'height', 'hits',
107 |                   'throws', 'debut', 'final_game', 'age_at_debut',
108 |                   'time_in_mlb', 'status']
109 |     return df
110 | 
111 |   
112 | if __name__ == "__main__":
113 |     master, pitching, salaries = ingest_data()
114 |     df = data_cleaning(master, pitching, salaries)
115 |     df = prep_data_for_survival_analysis(df)
116 |     df.to_csv('player_data_for_survival_analysis.csv', index = False)
117 |                             
118 | 


--------------------------------------------------------------------------------
/r_scripts/arrieta_no_hitter_analysis.r:
--------------------------------------------------------------------------------
  1 | #Create simple web scraper to retrieve data from brooksbaseball.net
  2 | library(XML)
  3 | arrieta <-htmlParse("http://www.brooksbaseball.net/pfxVB/tabdel_expanded.php?pitchSel=453562&game=gid_2016_04_21_chnmlb_cinmlb_1/&s_type=3&h_size=700&v_size=50")
  4 | arrieta.tab<-readHTMLTable(arrieta, stringsAsFactors=FALSE)
  5 | arrieta.df<-arrieta.tab[[1]]
  6 | write.csv(arrieta.df, file = "arrieta.csv")
  7 | 
  8 | #Inspect the data
  9 | str(arrieta.df)
 10 | #unfortunately, everything has been read in as characters
 11 | 
 12 | #Instead, let's read in the CSV of the data, which should eliminate this issue
 13 | arrieta <- read.csv("arrieta.csv")
 14 | summary(arrieta)
 15 | 
 16 | #Let's start by looking at Arrietta's pitch speed during the night
 17 | plot(ecdf(arrieta$start_speed),
 18 |      main = "Cumulative Distribution of Pitch Speed",
 19 |      ylab = "Cumulative Proportion",
 20 |      xlab = "Pitch Speed",
 21 |      yaxt = "n")
 22 | axis (side=2, at=seq(0, 1, by=0.1), las=1, labels=paste(seq(0, 100, by=10),
 23 |                                                             "%", sep=" "))
 24 | abline(h=0.9, lty=3)
 25 | abline(v=quantile(arrieta$start_speed, pr=0.9), lty=3)
 26 | 
 27 | #Let's now take a look at horitzontal and vertical movement of his pitches
 28 | library(ggplot2)
 29 | p <- ggplot(arrieta, aes(x=pfx_x, y=pfx_z))
 30 | p + geom_point() + stat_density2d() + ggtitle("Density of Vertical and Hortizontal Pitch Movement")
 31 | 
 32 | #Let's visualize the difference in Arrietta's pitches based on pitch velocity, 
 33 | #movement,and spin 
 34 | ggplot(arrieta, aes(start_speed, fill = mlbam_pitch_name)) +
 35 |   geom_histogram(binwidth = 1) + facet_wrap(~ mlbam_pitch_name) +
 36 |   ggtitle("Pitch Speed Histogram by Pitch Type")
 37 | 
 38 | ggplot(arrieta, aes(x=pfx_x, y=pfx_z)) +
 39 |   geom_point(shape=19) + facet_wrap(~ mlbam_pitch_name) +  
 40 |   geom_smooth() + ggtitle("Vertical and Horizontal Movement by Pitch")
 41 | 
 42 | #Delete rows containing CH
 43 | without_ch <- arrieta[-c(69, 90), ]
 44 | ggplot(without_ch, aes(spin, fill = mlbam_pitch_name)) +
 45 |   geom_density() + facet_wrap(~ mlbam_pitch_name) +
 46 |   ggtitle("Spin by Pitch Type")
 47 | 
 48 | #Subset data by different play result to see if we can deduce any insights
 49 | ball <- subset(arrieta, pdes == "Ball")
 50 | called_strike <- subset(arrieta, pdes == "Called Strike")
 51 | foul <- subset(arrieta, pdes == "Foul")
 52 | in_play_outs <- subset(arrieta, pdes == "In play, out(s)")
 53 | swinging_strike <- subset(arrieta, pdes == "Swinging Strike")
 54 | 
 55 | summary(ball)
 56 | summary(called_strike)
 57 | summary(foul)
 58 | summary(in_play_outs)
 59 | summary(swinging_strike)
 60 | 
 61 | #Look at Arrieta's performance throughout the game
 62 | mean(arrieta$start_speed)
 63 | aggregate(start_speed ~ inning + mlbam_pitch_name, data = arrieta, mean)
 64 | 
 65 | p1 <- ggplot(arrieta, aes(x=inning, y=mlbam_pitch_name, fill=start_speed))
 66 | p1 + geom_tile() + scale_fill_gradient2(midpoint=92, low="blue", high="red") +
 67 |   ggtitle("Pitch Speed by Inning") + 
 68 |   scale_x_continuous(breaks = c(1,2,3,4,5,6,7,8,9))
 69 | 
 70 | library(dplyr)
 71 | early <- filter(arrieta, inning == 1 | inning == 2 | inning == 3 )
 72 | mid <- filter(arrieta, inning == 4 | inning == 5 | inning == 6 )
 73 | late <- filter(arrieta, inning == 7 | inning == 8 | inning == 9 )
 74 | 
 75 | summary(early)
 76 | prop.table(table(early$mlbam_pitch_name))
 77 | mean(early$start_speed)
 78 | 
 79 | summary(mid)
 80 | prop.table(table(mid$mlbam_pitch_name))
 81 | mean(mid$start_speed)
 82 | 
 83 | summary(late)
 84 | summary(mid)
 85 | prop.table(table(late$mlbam_pitch_name))
 86 | mean(late$start_speed)
 87 | 
 88 | #Can we predict which pitch Arrieta will throw next?
 89 | 
 90 | #Read in new dataset
 91 | prediction <- read.csv("arrieta_prediction.csv")
 92 | summary(prediction)
 93 | str(prediction)
 94 | 
 95 | prediction$strikes <- as.factor(prediction$strikes)
 96 | prediction$balls <- as.factor(prediction$balls)
 97 | 
 98 | #Create training and test sets
 99 | library(caret)
100 | 
101 | inTrain <- createDataPartition(y=prediction$pitch_type,p=0.75, list=FALSE)
102 | training <- prediction[inTrain,]
103 | testing <- prediction[-inTrain,]
104 | 
105 | #Decision Tree and Random Forest
106 | library(rpart)
107 | library(rattle)
108 | tree <- rpart(pitch_type ~ ., method="class", data=training)
109 | printcp(tree)
110 | print(tree)
111 | fancyRpartPlot(tree)
112 | 
113 | library(randomForest)
114 | fit_rf <- randomForest(pitch_type ~ ., data=training)
115 | print(fit_rf)
116 | 
117 | head(fit_rf$votes)
118 | importance(fit_rf)
119 | barplot(fit_rf$importance[ , 1 ], main="Importance of Variables in Random Forest", 
120 |         cex.names =0.5) 
121 | 
122 | tree_predictions <- predict(tree, testing, type="class")
123 | table <- data.frame(tree_predictions, testing$pitch_type)
124 | table
125 | confusionMatrix(tree_predictions, testing$pitch_type)
126 | 
127 | rf_predictions <- predict(fit_rf, testing, type="class")
128 | table2 <- data.frame(rf_predictions, testing$pitch_type)
129 | table2
130 | confusionMatrix(rf_predictions, testing$pitch_type)
131 | 
132 | rf_predictions2 <- predict(fit_rf, testing, type="prob")
133 | table3 <- data.frame(rf_predictions2, testing$pitch_type)
134 | table3
135 | 


--------------------------------------------------------------------------------
/r_scripts/chris_sale_lca.r:
--------------------------------------------------------------------------------
  1 | #Can Latent Class Analysis identify Chris Sale's pitches?
  2 | 
  3 | #Libraries
  4 | library(XML)
  5 | library(ggplot2)
  6 | library(poLCA)
  7 | library(dplyr)
  8 | library(MASS)
  9 | library(colorRamps)
 10 | 
 11 | #Create simple web scraper to retrieve data from brooksbaseball.net
 12 | #Let's scrape the last five games of the 2016 regular season
 13 | 
 14 | #Game 1
 15 | game1 <-htmlParse("http://www.brooksbaseball.net/pfxVB/tabdel_expanded.php?pitchSel=519242&game=gid_2016_10_02_minmlb_chamlb_1/&s_type=3&h_size=700&v_size=500")
 16 | game1.tab <-readHTMLTable(game1, stringsAsFactors=FALSE)
 17 | game1.df <- game1.tab[[1]]
 18 | 
 19 | #Game 2
 20 | game2 <-htmlParse("http://www.brooksbaseball.net/pfxVB/tabdel_expanded.php?pitchSel=519242&game=gid_2016_09_27_tbamlb_chamlb_1/&s_type=3&h_size=700&v_size=500")
 21 | game2.tab <-readHTMLTable(game2, stringsAsFactors=FALSE)
 22 | game2.df <- game2.tab[[1]]
 23 | 
 24 | #Game 3
 25 | game3 <-htmlParse("http://www.brooksbaseball.net/pfxVB/tabdel_expanded.php?pitchSel=519242&game=gid_2016_09_21_chamlb_phimlb_1/&s_type=3&h_size=700&v_size=500")
 26 | game3.tab <-readHTMLTable(game3, stringsAsFactors=FALSE)
 27 | game3.df <- game3.tab[[1]]
 28 | 
 29 | #Game 4
 30 | game4 <-htmlParse("http://www.brooksbaseball.net/pfxVB/tabdel_expanded.php?pitchSel=519242&game=gid_2016_09_16_chamlb_kcamlb_1/&s_type=3&h_size=700&v_size=500")
 31 | game4.tab <-readHTMLTable(game4, stringsAsFactors=FALSE)
 32 | game4.df <- game4.tab[[1]]
 33 | 
 34 | #Game 5
 35 | game5 <-htmlParse("http://www.brooksbaseball.net/pfxVB/tabdel_expanded.php?pitchSel=519242&game=gid_2016_09_11_kcamlb_chamlb_1/&s_type=3&h_size=700&v_size=500")
 36 | game5.tab <-readHTMLTable(game5, stringsAsFactors=FALSE)
 37 | game5.df <- game5.tab[[1]]
 38 | 
 39 | #Combine all the dataframes
 40 | sale <- rbind(game1.df, game2.df, game3.df, game4.df, game5.df)
 41 | 
 42 | #All the columns were read in as characters, which isn't what we want
 43 | #Let's write the file to a CSV and read it back in, which should fix the issue
 44 | write.csv(sale, "sale.csv")
 45 | sale <- read.csv("sale.csv")
 46 | 
 47 | #Alright, it looks like everything read in correctly
 48 | 
 49 | #Let's create a pie chart of Sale's different pitches
 50 | ggplot(sale, aes(x = factor(1), fill = factor(mlbam_pitch_name))) +
 51 |   geom_bar(width = 1) + coord_polar(theta = "y") + 
 52 |   ggtitle("Pitches Thrown in Final 5 Games of 2016") + ylab(" ") +
 53 |   xlab(" ") + scale_y_continuous(breaks = sale$mlbam_pitch_names, 
 54 |                                  labels=sale$mlbam_pitch_name)
 55 | 
 56 | #Let's run multiple latent class models to see how they perform
 57 | #We'll use the following variables in the model:
 58 | #spin, pfx_x, pfx_z, vx0, vy0, vz0, ax, ay, az, start_speed
 59 | #https://fastballs.wordpress.com/2007/08/02/glossary-of-the-gameday-pitch-fields/
 60 | 
 61 | #First, though, we need to convert the numeric data into 
 62 | #categorical data; let's split the variables into quartiles
 63 | sale.sub <- sale[c("spin", "pfx_x", "pfx_z", "vx0", "vy0", "vz0", 
 64 |                    "ax", "ay", "az", "start_speed")]
 65 | 
 66 | quartile <- function(x) {
 67 |   ntile(x, 4)
 68 | }
 69 | 
 70 | sale.sub <- apply(sale.sub, 2, quartile)
 71 | sale.sub <- data.frame(sale.sub)
 72 | sale.sub <- data.frame(sapply(sale.sub, as.factor))
 73 | summary(sale.sub)
 74 | 
 75 | #Sale throws three pitches, so we're most interested in the lc3 model
 76 | f <- cbind(spin, pfx_x, pfx_z, vx0, vy0, vz0, ax, ay, az, start_speed)~1
 77 | set.seed(200)
 78 | lc2 <- poLCA(f, sale.sub, nclass=2, graph = TRUE)
 79 | lc3 <- poLCA(f, sale.sub, nclass=3, graph = TRUE)
 80 | lc4 <- poLCA(f, sale.sub, nclass=4, graph = TRUE)
 81 | 
 82 | #Since we know Sale throws three pitches, it doesn't make sense to
 83 | #run more models, though the lc4 had a better AIC than the lc3
 84 | #Let's dive deeper in the lc3 results
 85 | 
 86 | #Look at predictions and probabilities for each observation
 87 | probs <- lc3$posterior
 88 | head(probs)
 89 | 
 90 | preds <- lc3$predclass
 91 | head(preds)
 92 | 
 93 | #Create a dataframe of predictions and probabilities assigned to each observation
 94 | prediction_frame <- data.frame(preds, probs)
 95 | 
 96 | #Create a data frame of the the original numeric data and the declared pitch type
 97 | sale.original <- sale[c("spin", "pfx_x", "pfx_z", "vx0", "vy0", "vz0", 
 98 |                    "ax", "ay", "az", "start_speed", "mlbam_pitch_name")]
 99 | 
100 | #Bind the data frames
101 | sale.final <- data.frame(sale.original, prediction_frame)
102 | 
103 | #Clean the names of the columns from the prediction frame
104 | names(sale.final)[12] <- "Predicted_Class"
105 | names(sale.final)[13] <- "Class1_Prob"
106 | names(sale.final)[14] <- "Class2_Prob"
107 | names(sale.final)[15] <- "Class3_Prob"
108 | 
109 | #Change the predicted class to a factor
110 | sale.final$Predicted_Class <- as.factor(sale.final$Predicted_Class)
111 | 
112 | #Get summaries of the data
113 | sale_lca1 <- subset(sale.final, Predicted_Class=="1")
114 | summary(sale_lca1)
115 | 
116 | sale_lca2 <- subset(sale.final, Predicted_Class=="2")
117 | summary(sale_lca2)
118 | 
119 | sale_lca3 <- subset(sale.final, Predicted_Class=="3")
120 | summary(sale_lca3)
121 | 
122 | #Parallel coordinates plot to view the classes
123 | r <- (sale.final$Predicted_Class)
124 | parcoord(sale.final[1:10], col=r)
125 | 


--------------------------------------------------------------------------------
/python_scripts/home_runs.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | from scipy.stats import norm
  4 | import requests
  5 | from bs4 import BeautifulSoup
  6 | from time import sleep
  7 | from sklearn.preprocessing import MinMaxScaler
  8 | 
  9 | 
 10 | def create_aggregate_file(df):
 11 |     grouped = pd.DataFrame(df.groupby('yearID')['HR'].sum())
 12 |     grouped.reset_index(inplace=True)
 13 |     grouped.columns = ['year', 'home_runs']
 14 |     grouped[['year', 'home_runs']] = grouped[['year', 'home_runs']].astype('int')
 15 |     return grouped
 16 | 
 17 | 
 18 | def find_mean_and_variance(df, year, rolling_period, excluded_years):
 19 |     cutoff = year - rolling_period
 20 |     df_lag1 = df.loc[df['year'] == cutoff - 1]
 21 |     df_lag2 = df.loc[df['year'] == cutoff - 2]
 22 | 
 23 |     df = df.loc[(df['year'] >= cutoff) & (df['year'] < year)]
 24 |     len_1 = len(df)
 25 |     df = df.loc[~df['year'].isin(excluded_years)]
 26 |     len_2 = len(df)
 27 | 
 28 |     diff = len_1 - len_2
 29 |     if diff == 1:
 30 |         df = pd.concat([df_lag1, df])
 31 |     elif diff == 2:
 32 |         df = pd.concat([df_lag2, df])
 33 | 
 34 |     mean = df['home_runs'].mean()
 35 |     var = df['home_runs'].std()
 36 |     return mean, var
 37 | 
 38 | 
 39 | def scrape_2019_home_runs():
 40 |     master_df = pd.DataFrame()
 41 |     for page in range(1, 27):
 42 |         url = 'https://www.foxsports.com/mlb/stats?season=2019&category=BATTING&group=1&sort=7&time=0&pos=0&qual=1&' \
 43 |               'sortOrder=0&splitType=0&page={0}&statID=0'.format(page)
 44 |         page = requests.get(url)
 45 |         soup = BeautifulSoup(page.text)
 46 |         table = soup.findAll('tr')
 47 |         data = ([[td.getText() for td in table[i].findAll('td')] for i in range(len(table))])
 48 |         df = pd.DataFrame(data)
 49 |         master_df = master_df.append(df)
 50 |         sleep(5)
 51 |     master_df.to_csv('2019_hr.csv')
 52 |     return master_df
 53 | 
 54 | 
 55 | def fit_pdf_and_cdf():
 56 |     df_2019 = pd.read_csv('2019_hr.csv')
 57 |     hr_total_2019 = df_2019['8'].sum()
 58 |     batting_df = pd.read_csv('baseballdatabank-2019.2/core/Batting.csv')
 59 |     batting_df = create_aggregate_file(batting_df)
 60 |     df_2019 = pd.DataFrame({'home_runs': [hr_total_2019], 'year': [2019]})
 61 |     batting_df = pd.concat([batting_df, df_2019], axis=0)
 62 |     batting_df.to_csv('full_data_df.csv', index=False)
 63 | 
 64 |     pdf_df = pd.DataFrame()
 65 |     for year in range(1980, 2020):
 66 |         temp_items = find_mean_and_variance(batting_df, year, 20, excluded_years=[1981, 1994])
 67 |         temp_batting_df = (batting_df.loc[batting_df['year'] == year]).reset_index(drop=True)
 68 |         temp_hr_value = temp_batting_df['home_runs'][0]
 69 |         temp_pdf = norm(loc=temp_items[0], scale=temp_items[1]).pdf(temp_hr_value)
 70 |         temp_cdf = norm(loc=temp_items[0], scale=temp_items[1]).cdf(temp_hr_value)
 71 |         temp_pdf_df = pd.DataFrame({
 72 |             'year': [year],
 73 |             'home_runs': [temp_hr_value],
 74 |             'rolling_mean_hr': temp_items[0],
 75 |             'rolling_std': temp_items[1],
 76 |             'pdf': [temp_pdf * 100],
 77 |             'cdf': [temp_cdf]
 78 |         })
 79 |         pdf_df = pdf_df.append(temp_pdf_df)
 80 | 
 81 |     pdf_df['var_from_average'] = abs(pdf_df['home_runs'] - pdf_df['rolling_mean_hr'])
 82 |     pdf_df.to_csv('pdf_df.csv', index=False)
 83 |     return
 84 | 
 85 | 
 86 | def calculate_summary_stats():
 87 |     batting_df = pd.read_csv('baseballdatabank-2019.2/core/Batting.csv')
 88 |     batting_df = batting_df[['yearID', 'HR', 'AB']]
 89 |     batting_df.columns = ['year', 'home_runs', 'at_bats']
 90 | 
 91 |     df_2019 = pd.read_csv('2019_hr.csv')
 92 |     df_2019 = df_2019[['8', '3']]
 93 |     df_2019['year'] = '2019'
 94 |     df_2019.rename(columns={'8': 'home_runs', '3': 'at_bats'}, inplace=True)
 95 |     df_2019.dropna(inplace=True)
 96 | 
 97 |     master_df = pd.concat([batting_df, df_2019], axis=0)
 98 |     master_df.to_csv('all_home_runs.csv', index=False)
 99 | 
100 |     master_df = master_df.loc[master_df['at_bats'] >= 150]
101 |     grouped = master_df.groupby('year').agg({'home_runs': ['mean', 'median', 'std']})
102 |     grouped.reset_index(inplace=True)
103 |     grouped.columns = grouped.columns.droplevel()
104 |     grouped.columns = ['year', 'mean', 'median', 'std']
105 | 
106 |     scalar = MinMaxScaler()
107 |     grouped['mean_scaled'] = scalar.fit_transform(grouped['mean'].values.reshape(-1, 1))
108 |     grouped['median_scaled'] = scalar.fit_transform(grouped['median'].values.reshape(-1, 1))
109 |     grouped['std_scaled'] = scalar.fit_transform(grouped['std'].values.reshape(-1, 1))
110 | 
111 |     q = pd.DataFrame(master_df.groupby('year')['home_runs'].quantile(q=np.linspace(.10, .90, 9)))
112 |     q.reset_index(inplace=True)
113 |     q.columns = ['year', 'quantile', 'home_runs']
114 |     q = q.pivot(index='year', columns='quantile', values='home_runs')
115 |     q.reset_index(inplace=True)
116 | 
117 |     q.columns = ['year', 'quantile_0.1', 'quantile_0.2', 'quantile_0.3', 'quantile_0.4', 'quantile_0.5',
118 |                  'quantile_0.6', 'quantile_0.7', 'quantile_0.8', 'quantile_0.9']
119 | 
120 |     grouped = pd.merge(grouped, q, how='left', on='year')
121 |     grouped.to_csv('yearly_summary.csv', index=False)
122 |     return
123 | 
124 | 
125 | if __name__ == "__main__":
126 |     scrape_2019_home_runs()
127 |     calculate_summary_stats()
128 |     fit_pdf_and_cdf()
129 | 


--------------------------------------------------------------------------------
/r_scripts/smoltz_text_analysis.r:
--------------------------------------------------------------------------------
  1 | # HoF text mining
  2 | # Load libraries
  3 | library(quanteda)
  4 | library(tm)
  5 | library(data.table)
  6 | library(topicmodels)
  7 | library(ggplot2)
  8 | 
  9 | #############Text Analysis#############
 10 | # Read in data
 11 | smoltz <- read.csv('smoltz_speech.csv')
 12 | 
 13 | #Convert the tweet text into a corpus, the object R needs for text analysis
 14 | masterCorpus <- Corpus(VectorSource(smoltz$text))
 15 | masterCorpus
 16 | 
 17 | #Remove punctuation and numbers from the text; convert all text to lower case
 18 | masterCorpus <- tm_map(masterCorpus, removePunctuation)
 19 | masterCorpus <- tm_map(masterCorpus, removeNumbers)
 20 | masterCorpus <- tm_map(masterCorpus, tolower)
 21 | 
 22 | #Remove common stop words as well as bespoke list of words
 23 | masterCorpus <- tm_map(masterCorpus, removeWords, stopwords("english"))
 24 | masterCorpus <- tm_map(masterCorpus, removeWords, c("how", "why", "be", "here", "there", "via", 
 25 |                                                     "amp", "there", "will", "can", "see", "new", 
 26 |                                                     "sap", "help", "find", "get", "make", "watch", 
 27 |                                                     "take", "learn", "need", "one", "now", "just",
 28 |                                                     "like", "cant", "got", "much", "say", "way",
 29 |                                                     "going", "dont", "said", "ever", "doesnt",
 30 |                                                     "come", "since","wont","saying","didnt",
 31 |                                                     "every", "hes","youre", "still", "ive", 
 32 |                                                     "use", "even", "u", "ut"))
 33 | 
 34 | #Stem the document to have R read items like "analyze" and "analyzed" as one term
 35 | #Stripe white space created by removed words and treat the corpus as plain text
 36 | masterCorpus <- tm_map(masterCorpus, stemDocument)
 37 | masterCorpus <- tm_map(masterCorpus, stripWhitespace)
 38 | masterCorpus <- tm_map(masterCorpus, PlainTextDocument)
 39 | 
 40 | #create document term matrix, which tells us which words were used in each document (i.e. a tweet)
 41 | dtm <- DocumentTermMatrix(masterCorpus)
 42 | dtm
 43 | 
 44 | #Sum the number of times each word was used
 45 | freq <- colSums(as.matrix(dtm))
 46 | 
 47 | #Order the number of times by how often they were used
 48 | ord <- order(freq)
 49 | freq[tail(ord)]
 50 | 
 51 | #Convert the list of words to a dataframe, which will be easier to work with
 52 | wf <- data.frame(word=names(freq), freq=freq) 
 53 | write.csv(wf, file = "smoltz_unigrams.csv")
 54 | 
 55 | # Create chart of most used words
 56 | p <- ggplot(subset(wf, freq > 10), aes(word, freq, fill = "blue"))    
 57 | p <- p + geom_bar(stat="identity") + ggtitle('Smoltz Most Used Words')   
 58 | p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))+
 59 |   theme(plot.title = element_text(hjust = 0.5)) + theme(legend.position = "none")   
 60 | p   
 61 | 
 62 | #Create a Quanteda corpus from the TM corpus
 63 | smoltz$text <- as.character(smoltz$text)
 64 | corpus <- corpus(smoltz$text)
 65 | 
 66 | #Create a document term matrix of bi-grams, groups of two words, and sort the results by popularity
 67 | dfm.bi <- dfm(corpus, ignoredFeatures = stopwords("english"), stem = TRUE, ngrams = 2, verbose = FALSE)
 68 | dfm.bi.freq <- colSums(dfm.bi)
 69 | dfm.bi.freq <- sort(dfm.bi.freq, decreasing=TRUE) 
 70 | 
 71 | #Take out bi-grams used fewer than ten times
 72 | dfm.bi.freq.prune <- as.numeric()
 73 | for (i in 1:length(dfm.bi.freq)) { 
 74 |   if (dfm.bi.freq[i] > 2) {
 75 |     dfm.bi.freq.prune  <- c(dfm.bi.freq.prune, dfm.bi.freq[i]) }
 76 | }
 77 | 
 78 | #Convert results to a data frame
 79 | bigrams <- data.frame(dfm.bi.freq.prune)
 80 | 
 81 | #Change rownames to a column and rename columns
 82 | setDT(bigrams, keep.rownames = TRUE)[]
 83 | names(bigrams)[1] <- "word"
 84 | names(bigrams)[2] <- "freq"
 85 | 
 86 | write.csv(bigrams, file = "smoltz_bigrams.csv")
 87 | 
 88 | p <- ggplot(subset(bigrams, freq > 3), aes(word, freq, fill = "blue"))    
 89 | p <- p + geom_bar(stat="identity") + ggtitle('Smoltz Most Used Bigramss')   
 90 | p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))+
 91 |   theme(plot.title = element_text(hjust = 0.5)) + theme(legend.position = "none")   
 92 | p 
 93 | 
 94 | #Remove sparse terms from the document term matrix
 95 | dtms <- removeSparseTerms(dtm, 0.98)  
 96 | dtms
 97 | 
 98 | #Cluster words that often appear together
 99 | d <- dist(t(dtms), method="euclidian")   
100 | kfit <- kmeans(d, 10)   
101 | clusplot(as.matrix(d), kfit$cluster, color=T, shade=T, labels=2, lines=0) 
102 | clusters <- data.frame(kfit$cluster)
103 | print(clusters)
104 | 
105 | #Create a document term matrix for topic modeling
106 | dfm.uni <- dfm(corpus, ignoredFeatures = stopwords("english"), stem = TRUE, verbose = FALSE)
107 | 
108 | #Run LDA topic model
109 | burnin <- 4000
110 | iter <- 2000
111 | thin <- 500
112 | seed <-list(2003,5,63,100001,765)
113 | nstart <- 5
114 | best <- TRUE
115 | k <- 5
116 | 
117 | ldaOut <-LDA(dtm,k, method='Gibbs', control=list(nstart=nstart, 
118 |                                                  seed = seed, 
119 |                                                  best=best, 
120 |                                                  burnin = burnin, 
121 |                                                  iter = iter, thin=thin))
122 | 
123 | ldaOut.topics <- as.matrix(terms(ldaOut, 6))
124 | 
125 | 


--------------------------------------------------------------------------------
/r_scripts/multidimensional_scaling_2016_teams.r:
--------------------------------------------------------------------------------
  1 | #Multidimensional scaling and hierarchical clustering on 2016 offensive statistics
  2 | 
  3 | #Load libraries
  4 | library(XML)
  5 | library(plyr)
  6 | library(dplyr)
  7 | library(stringr)
  8 | library(Hmisc)
  9 | library(MASS)
 10 | library(cluster)
 11 | 
 12 | #Scrape a dataset that includes each team's abbreviation
 13 | #We'll isolate the names and put them in a list, which will be used to scrape
 14 | #the data we really want
 15 | abbreviations <- htmlParse("http://www.baseball-reference.com/leagues/MLB/2016.shtml")
 16 | abbreviations.tab <- readHTMLTable(abbreviations, stringsAsFactors=FALSE)
 17 | abbreviations.df <- abbreviations.tab[[2]]
 18 | 
 19 | #Create list of teams
 20 | teams <- list(abbreviations.df$Tm)
 21 | teams <- sapply(teams, "[", c(1:30))
 22 | 
 23 | #Let's now scrape per-game offensive statistics for each team
 24 | fetch_offense <- function(team) {
 25 |   url <- paste0("http://www.baseball-reference.com/teams/tgl.cgi?team=", team, "&t=b&year=2016.com")
 26 |   data <- readHTMLTable(url, stringsAsFactors = FALSE)
 27 |   data <- data[[1]]
 28 |   data$team <- team
 29 |   data
 30 | }
 31 | 
 32 | #I get a script out of bounds error when I run a loop; looks like we may have to manually 
 33 | #insert each team
 34 | ARI_offense <- ldply("ARI", fetch_offense, .progress="text")
 35 | ATL_offense <- ldply("ATL", fetch_offense, .progress="text")
 36 | BAL_offense <- ldply("BAL", fetch_offense, .progress="text")
 37 | CHW_offense <- ldply("CHW", fetch_offense, .progress="text")
 38 | CIN_offense <- ldply("CIN", fetch_offense, .progress="text")
 39 | COL_offense <- ldply("COL", fetch_offense, .progress="text")
 40 | DET_offense <- ldply("DET", fetch_offense, .progress="text")
 41 | HOU_offense <- ldply("HOU", fetch_offense, .progress="text")
 42 | KCR_offense <- ldply("KCR", fetch_offense, .progress="text")
 43 | LAA_offense <- ldply("LAA", fetch_offense, .progress="text")
 44 | MIA_offense <- ldply("MIA", fetch_offense, .progress="text")
 45 | MIL_offense <- ldply("MIL", fetch_offense, .progress="text")
 46 | MIN_offense <- ldply("MIN", fetch_offense, .progress="text")
 47 | NYM_offense <- ldply("NYM", fetch_offense, .progress="text")
 48 | NYY_offense <- ldply("NYY", fetch_offense, .progress="text")
 49 | OAK_offense <- ldply("OAK", fetch_offense, .progress="text")
 50 | PHI_offense <- ldply("PHI", fetch_offense, .progress="text")
 51 | PIT_offense <- ldply("PIT", fetch_offense, .progress="text")
 52 | SDP_offense <- ldply("SDP", fetch_offense, .progress="text")
 53 | SEA_offense <- ldply("SEA", fetch_offense, .progress="text")
 54 | SFG_offense <- ldply("SFG", fetch_offense, .progress="text")
 55 | STL_offense <- ldply("STL", fetch_offense, .progress="text")
 56 | TBR_offense <- ldply("TBR", fetch_offense, .progress="text")
 57 | TOR_offense <- ldply("TOR", fetch_offense, .progress="text")
 58 | 
 59 | #The scraper did not work on the following teams for some reason,
 60 | #so I created a slightly different scraper for just these ones
 61 | fetch_offense2 <- function(team) {
 62 |   url <- paste0("http://www.baseball-reference.com/teams/tgl.cgi?team=", team, "&t=b&year=2016.com")
 63 |   data <- readHTMLTable(url, stringsAsFactors = FALSE)
 64 |   data <- data[[2]]
 65 |   data$team <- team
 66 |   data
 67 | }
 68 | 
 69 | TEX_offense <- ldply("TEX", fetch_offense2, .progress="text")
 70 | WSN_offense <- ldply("WSN", fetch_offense2, .progress="text")
 71 | LAD_offense <- ldply("LAD", fetch_offense2, .progress="text")
 72 | CLE_offense <- ldply("CLE", fetch_offense2, .progress="text")
 73 | BOS_offense <- ldply("BOS", fetch_offense2, .progress="text")
 74 | CHC_offense <- ldply("CHC", fetch_offense2, .progress="text")
 75 | 
 76 | #Bind the data frames
 77 | offense <- rbind(ARI_offense, ATL_offense, BAL_offense, BOS_offense, CHC_offense,
 78 |                             CHW_offense, CIN_offense, CLE_offense, COL_offense, DET_offense, HOU_offense, KCR_offense,
 79 |                             LAA_offense, LAD_offense, MIA_offense, MIL_offense, MIN_offense, NYM_offense, NYY_offense,
 80 |                             OAK_offense, PHI_offense, PIT_offense, SDP_offense, SEA_offense, SFG_offense, STL_offense,
 81 |                             TBR_offense, TEX_offense, TOR_offense, WSN_offense)
 82 | 
 83 | #Remove monthly headers in the dataframe
 84 | offense <- offense[!grepl("PA", offense$Opp),]
 85 | 
 86 | #Select only the columns we need and convert to the correct data type
 87 | offense_num <- subset(offense, select = c(7:24, 29))
 88 | offense_cat <- offense[33]
 89 | 
 90 | offense_cat$team <- as.factor(offense_cat$team)
 91 | offense_num <- data.frame(sapply(offense_num, as.numeric))
 92 | 
 93 | offense_sub <- cbind(offense_cat, offense_num)
 94 | 
 95 | #Aggregate the stats by team
 96 | totals <- aggregate(. ~ team, offense_sub, sum)
 97 | 
 98 | #Calculate, BA, OBP, and SLG for each team
 99 | totals$BA <- totals$H / totals$AB
100 | 
101 | totals$OBP <- (totals$H + totals$BB + totals$HBP) / (totals$AB + totals$BB + totals$HBP + 
102 |                                                        totals$SF)
103 | 
104 | totals$X1B <- totals$H - (totals$X2B + totals$X3B + totals$HR)
105 | totals$SLG <- (totals$X1B + totals$X2B * 2 + totals$X3B * 3 + totals$HR * 4) / totals$AB
106 | 
107 | #Apply multidimensional scaling to the data
108 | rownames(totals) <- totals[, 1]
109 | totals <- totals[2:24]
110 | totals <- scale(totals)
111 |   
112 | team.dist <- dist(totals)
113 | team.mds <- cmdscale(team.dist)
114 | 
115 | plot(team.mds, type = "n")
116 | text(team.mds, row.names(team.mds))
117 | 
118 | #Apply hierarchical clustering to the data
119 | set.seed(100)
120 | teams_hclust <- totals
121 | dm = dist(teams_hclust,method="euclidean")
122 | hclust_teams <- hclust(dm, method="complete")
123 | plot(hclust_teams)
124 | 


--------------------------------------------------------------------------------
/r_scripts/historical_team_clustering.r:
--------------------------------------------------------------------------------
  1 | #Read in and inspect data
  2 | library(Lahman) #https://cran.r-project.org/web/packages/Lahman/Lahman.pdf 
  3 | data(Teams) #http://rpackages.ianhowson.com/rforge/Lahman/man/Teams.html 
  4 | summary(Teams)
  5 | 
  6 | teams_subset <- Teams[c(7, 15:23, 27:28, 30:38)]
  7 | 
  8 | #Isolate selected numeric variables
  9 | #We'll also create a reference data frame, that will just have team name and year
 10 | #The reference set will be handy later
 11 | teams_subset <- Teams[c(7, 15:23, 27:28, 30:32, 34:38)] 
 12 | teams_reference <- Teams[c(1, 4, 7, 15:23, 27:28, 30:32, 33:38)] 
 13 | 
 14 | #Remove teams with missing values
 15 | teams_subset <- na.omit(teams_subset)
 16 | teams_reference <- na.omit(teams_reference)
 17 | 
 18 | #On the reference set, drop everything but year and franchise IDs
 19 | teams_reference <- teams_reference[c(1:2)]
 20 | 
 21 | #Make each variable per-game rather than an aggregate
 22 | teams_final <- sweep(teams_subset,1,unlist(teams_subset[,1]),"/")
 23 | summary(teams_final)
 24 | 
 25 | #Drop the games column
 26 | teams_final <- teams_final[(-1)]
 27 | 
 28 | #Create a visual of correlations among variables
 29 | library(corrplot)
 30 | library(gplots)
 31 | 
 32 | correlations <- cor(teams_final)
 33 | correlations <- round(correlations, digits=2)
 34 | 
 35 | corrplot(correlations)
 36 | corrplot(correlations, method="shade", shade.col=NA, tl.col="black")
 37 | 
 38 | #Look at relationships between selected variables
 39 | #To avoid over-plotting, we'll use the hexbin package
 40 | library(hexbin)
 41 | library(ggplot2)
 42 | p <- ggplot(teams_final, aes(x=E, y=RA))
 43 | p + stat_binhex() +
 44 |   scale_fill_gradient(low="lightblue", high="red") +
 45 |   ggtitle("Relationship Between Errors Per Game \n and Runs Allowed Per Game")
 46 | 
 47 | p1 <- ggplot(teams_final, aes(x=HR, y=R))
 48 | p1 + stat_binhex() +
 49 |   scale_fill_gradient(low="lightblue", high="red") +
 50 |   ggtitle("Relationship Between HR Per Game \n and Runs Per Game")
 51 | 
 52 | #Develop parallel coordinates plot of variables
 53 | library(MASS)
 54 | library(colorRamps)
 55 | 
 56 | c <- blue2red(100)
 57 | r <- cut(teams_final$SHO, 100)
 58 | parcoord(teams_final, col=c[as.numeric(r)])
 59 | 
 60 | h <- cut(teams_final$HR, 100)
 61 | parcoord(teams_final, col=c[as.numeric(h)])
 62 | 
 63 | #Conduct k-means cluster
 64 | teams_scaled <- scale(teams_final)
 65 | wss <- (nrow(teams_scaled)-1)*sum(apply(teams_scaled,2,var))
 66 | for (i in 2:15) wss[i] <- sum(kmeans(teams_scaled,
 67 |                                      centers=i)$withinss)
 68 | plot(1:15, wss, type="b", xlab="Number of Clusters",
 69 |      ylab="Within groups sum of squares", main = "Elbow Plot for No. of Clusters")
 70 | 
 71 | set.seed(500)
 72 | fit1 <- kmeans(teams_final, 6, nstart=25) 
 73 | 
 74 | library(cluster)
 75 | set.seed(500)
 76 | clusplot(teams_scaled, fit1$cluster, color=TRUE, shade=TRUE,
 77 |          labels=2, lines=0, main = "PCA Plot of K-Means Cluster")
 78 | 
 79 | teams_final <- data.frame(teams_final, fit1$cluster)
 80 | cluster1 <- teams_final[which(teams_final$fit1.cluster=='1'),]
 81 | cluster2 <- teams_final[which(teams_final$fit1.cluster=='2'),]
 82 | cluster3 <- teams_final[which(teams_final$fit1.cluster=='3'),]
 83 | cluster4 <- teams_final[which(teams_final$fit1.cluster=='4'),]
 84 | cluster5 <- teams_final[which(teams_final$fit1.cluster=='5'),]
 85 | cluster6 <- teams_final[which(teams_final$fit1.cluster=='6'),]
 86 | 
 87 | summary(cluster1)
 88 | summary(cluster2)
 89 | summary(cluster3)
 90 | summary(cluster4)
 91 | summary(cluster5)
 92 | summary(cluster6)
 93 | 
 94 | #Merge clusters with reference dataset of teams and years
 95 | teams_reference <- data.frame(teams_reference, fit1$cluster)
 96 | teams1 <- teams_reference[which(teams_reference$fit1.cluster=='1'),]
 97 | teams2 <- teams_reference[which(teams_reference$fit1.cluster=='2'),]
 98 | teams3 <- teams_reference[which(teams_reference$fit1.cluster=='3'),]
 99 | teams4 <- teams_reference[which(teams_reference$fit1.cluster=='4'),]
100 | teams5 <- teams_reference[which(teams_reference$fit1.cluster=='5'),]
101 | teams6 <- teams_reference[which(teams_reference$fit1.cluster=='6'),]
102 | team_clusters <- rbind(teams1, teams2, teams3, teams4, teams5, teams6)
103 | write.csv(team_clusters, file = "Historical Team Clustering Results.csv")
104 | 
105 | print(teams1)
106 | teams1$yearID <- as.factor(as.integer(teams1$yearID))
107 | summary(teams1, 50)
108 | 
109 | print(teams2)
110 | teams2$yearID <- as.factor(as.integer(teams2$yearID))
111 | summary(teams2, 50)
112 | 
113 | print(teams3)
114 | teams3$yearID <- as.factor(as.integer(teams3$yearID))
115 | summary(teams3, 50)
116 | 
117 | print(teams4)
118 | teams4$yearID <- as.factor(as.integer(teams4$yearID))
119 | summary(teams4, 50)
120 | 
121 | print(teams5)
122 | teams5$yearID <- as.factor(as.integer(teams5$yearID))
123 | summary(teams5, 50)
124 | 
125 | print(teams6)
126 | teams6$yearID <- as.factor(as.integer(teams6$yearID))
127 | summary(teams6, 50)
128 | 
129 | #Conduct hierarchical cluster
130 | library(cluster)
131 | set.seed(100)
132 | teams_hclust <- teams_scaled
133 | dm = dist(teams_hclust,method="euclidean")
134 | hclust_teams <- hclust(dm, method="complete")
135 | plot(hclust_teams)
136 | 
137 | plot(cut(as.dendrogram(hclust_teams), h=8)$lower[[2]])
138 | teams_hclust[c(307, 324), ]
139 | teams_reference[c(307, 324), ]
140 | teams_hclust[c(307, 304), ]
141 | teams_reference[c(307, 304), ]
142 | 
143 | plot(cut(as.dendrogram(hclust_teams), h=4)$lower[[30]])
144 | teams_hclust[c(768, 944), ]
145 | teams_reference[c(768, 944), ]
146 | teams_hclust[c(944, 771), ]
147 | teams_reference[c(944, 771), ]
148 | 
149 | #Conduct PCA on the data
150 | library("factoextra")
151 | library("FactoMineR")
152 | teams_final2 <- teams_final[c(-20:-21)]
153 | 
154 | teams_pca <- prcomp(teams_final2, scale = TRUE)
155 | summary(teams_pca)
156 | fviz_screeplot(teams_pca, ncp=10)
157 | 
158 | pca.var <- get_pca_var(teams_pca)
159 | pca.var
160 | pca.var$contrib
161 | pca.var$coord
162 | 
163 | fviz_contrib(teams_pca, choice = "var", axes = 1)
164 | fviz_contrib(teams_pca, choice = "var", axes = 2)
165 | 
166 | fviz_pca_var(teams_pca)
167 | fviz_pca_var(teams_pca, col.var="contrib")
168 | 
169 | 
170 | 
171 | 
172 |  
173 | 
174 | 
175 | 
176 | 
177 | 
178 | 
179 | 
180 | 
181 | 
182 | 


--------------------------------------------------------------------------------
/r_scripts/hosmer_statcast_analysis.r:
--------------------------------------------------------------------------------
  1 | ##Read in CSV and inspect data
  2 | hosmer <- read.csv("hosmer_stats_updated.csv")
  3 | summary(hosmer)
  4 | 
  5 | ##Bar charts of selected variables
  6 | library(ggplot2)
  7 | qplot(factor(pitch_type), data=hosmer, geom="bar", fill=factor(pitch_type)) +
  8 |   ggtitle("Types of Pitches Hit in Play")
  9 | 
 10 | qplot(factor(events), data=hosmer, geom="bar", fill=factor(events)) +
 11 |   ggtitle("Result of Balls in Play") + coord_flip()
 12 | 
 13 | ##Pitch type analysis
 14 | #Let's run a chi-squared test to see if hit location depends on pitch type
 15 | table1 <- table(hosmer$pitch_type, hosmer$hit_location)
 16 | table1
 17 | chisq.test(table1)
 18 | 
 19 | ##Let's look at hit distance, speed, and angle
 20 | #First, let's inspect means and standard deviations by pitch type for hit distance, speed, and angle
 21 | tapply(hosmer$hit_distance_sc, hosmer$pitch_type, mean)
 22 | tapply(hosmer$hit_distance_sc, hosmer$pitch_type, sd)
 23 | tapply(hosmer$hit_speed, hosmer$pitch_type, mean)
 24 | tapply(hosmer$hit_speed, hosmer$pitch_type, sd)
 25 | tapply(hosmer$hit_angle, hosmer$pitch_type, mean)
 26 | tapply(hosmer$hit_angle, hosmer$pitch_type, sd)
 27 | 
 28 | #Plot lowess lines to explore the relationship between variables
 29 | ggplot(hosmer, aes(x=hit_speed, y=hit_angle)) +
 30 |   geom_point() +    
 31 |   geom_smooth() + ggtitle("Relationship Between Hit Angle and Hit Speed")
 32 | 
 33 | ggplot(hosmer, aes(x=hit_angle, y=hit_distance_sc)) +
 34 |   geom_point(shape=19) +    
 35 |   geom_smooth() + ggtitle("Relationship Between Hit Distance and Hit Angle")
 36 | 
 37 | ggplot(hosmer, aes(x=hit_speed, y=hit_distance_sc)) +
 38 |   geom_point(shape=19) +    
 39 |   geom_smooth() + ggtitle("Relationship Between Hit Distance and Hit Speed")
 40 | 
 41 | #Let's look at different styles of density plots for the result of balls put in play
 42 | ggplot(hosmer, aes(hit_speed, colour = description)) +
 43 |   geom_density() + ggtitle("Density by Result of Hit")
 44 | 
 45 | ggplot(hosmer, aes(hit_speed, fill = description)) +
 46 |   geom_density(position="stack") + ggtitle("Density by Result of Hit")
 47 | 
 48 | ggplot(hosmer, aes(hit_speed, fill = description)) +
 49 |   geom_density(position="fill") + ggtitle("Density by Result of Hit")
 50 | 
 51 | #Viloin plot of hit speed by type of pitch
 52 | g<-ggplot(hosmer, aes(x=pitch_type, y=hit_speed))
 53 | g + geom_violin(alpha=0.5, color="gray")+geom_jitter(alpha=0.5, aes(color=pitch_type),
 54 |                                                      position = position_jitter(width = 0.1))+ 
 55 |   coord_flip() + ggtitle("Hit Speed by Pitch Type")
 56 | 
 57 | ##Segment pitch break length into quartiles and see how Hosmer handles pitches with greater break
 58 | library(data.table)
 59 | setDT(hosmer)
 60 | hosmer[,quartile:=cut(break_length,
 61 |                       breaks=quantile(break_length,probs=seq(0,1,by=1/4)),
 62 |                       labels=1:4,right=F)]
 63 | 
 64 | #Hit speed histograms faceted by break length
 65 | ggplot(hosmer, aes(hit_speed, fill = quartile)) +
 66 |   geom_histogram(binwidth = 10) + facet_wrap(~ quartile) +
 67 |   ggtitle("Hit Speed by Quartile of Pitch Break Length")
 68 | 
 69 | #Let's look at some specific scenarios
 70 | aggregate(hit_distance_sc ~ pitch_type + inning, data = hosmer, mean)
 71 | aggregate(hit_distance_sc ~ pitch_type + inning, data = hosmer, length)
 72 | 
 73 | hosmer$outs_when_up <- as.factor(as.numeric(hosmer$outs_when_up))
 74 | aggregate(hit_distance_sc ~ pitch_type + inning + outs_when_up, data = hosmer, mean)
 75 | aggregate(hit_distance_sc ~ pitch_type + inning + outs_when_up, data = hosmer, length)
 76 | 
 77 | #Let's look at the impact of the count
 78 | hosmer$balls <- as.factor(as.integer(hosmer$balls))
 79 | hosmer$strikes <- as.factor(as.integer(hosmer$strikes))
 80 | 
 81 | ggplot(hosmer, aes(hit_speed, fill = balls)) +
 82 |   geom_density(position="fill") + ggtitle("Hit Speed by Number of Balls")
 83 | 
 84 | ggplot(hosmer, aes(hit_speed, fill = strikes)) +
 85 |   geom_density(position="fill") + ggtitle("Hit Speed by Number of Strikes")
 86 | 
 87 | ggplot(hosmer, aes(hit_angle, fill = balls)) +
 88 |   geom_density(position="fill") + ggtitle("Hit Angle by Number of Balls")
 89 | 
 90 | ggplot(hosmer, aes(hit_angle, fill = strikes)) +
 91 |   geom_density(position="fill") + ggtitle("Hit Angle by Number of Strikes")
 92 | 
 93 | ggplot(hosmer, aes(hit_distance_sc, fill = balls)) +
 94 |   geom_density(position="fill") + ggtitle("Hit Distance by Number of Balls")
 95 | 
 96 | ggplot(hosmer, aes(hit_distance_sc, fill = strikes)) +
 97 |   geom_density(position="fill") + ggtitle("Hit Distance by Number of Strikes")
 98 | 
 99 | aggregate(hit_distance_sc ~ strikes + inning + outs_when_up, data = hosmer, mean)
100 | aggregate(hit_distance_sc ~ strikes + inning + outs_when_up, data = hosmer, length)
101 | 
102 | #Lastly, let's do a k-means cluster of hit distance, speed, and angle
103 | #Start by selecting the desired columns and scaling the data
104 | library(dplyr)
105 | hosmer1 <- subset(hosmer, select = c(54, 55, 56))
106 | 
107 | hosmer2 <- scale(hosmer1)
108 | head(hosmer2)
109 | 
110 | #Elbow plot to determine the number of clusters
111 | wss <- (nrow(hosmer2)-1)*sum(apply(hosmer2,2,var))
112 | for (i in 2:15) wss[i] <- sum(kmeans(hosmer2,
113 |                                      centers=i)$withinss)
114 | plot(1:15, wss, type="b", xlab="Number of Clusters",
115 |      ylab="Within groups sum of squares", main = "Elbow Plot for No. of Clusters")
116 | 
117 | #K-Means cluster with k=3
118 | set.seed(600)
119 | fit1 <- kmeans(hosmer2, 3) 
120 | hosmer3 <- data.frame(hosmer1, fit1$cluster) 
121 | hosmer <- data.frame(hosmer, fit1$cluster) 
122 | head(hosmer3, 5)
123 | 
124 | #Look at appended clusters in full and stripped down datasets
125 | cluster1 <- hosmer3[which(hosmer3$fit1.cluster=='1'),]
126 | cluster2 <- hosmer3[which(hosmer3$fit1.cluster=='2'),]   
127 | cluster3 <- hosmer3[which(hosmer3$fit1.cluster=='3'),]
128 | summary(cluster1)
129 | summary(cluster2)
130 | summary(cluster3)
131 | 
132 | cluster1a <- hosmer[which(hosmer$fit1.cluster=='1'),]
133 | cluster2a <- hosmer[which(hosmer$fit1.cluster=='2'),]   
134 | cluster3a <- hosmer[which(hosmer$fit1.cluster=='3'),]
135 | summary(cluster1a)
136 | summary(cluster2a)
137 | summary(cluster3a)
138 | 
139 | #PCA plot of clusters
140 | library(cluster)
141 | set.seed(500)
142 | clusplot(hosmer2, fit1$cluster, color=TRUE, shade=TRUE,
143 |          labels=2, lines=0, main = "PCA Plot of K-Means Cluster")
144 | 


--------------------------------------------------------------------------------
/python_scripts/pitching_markov_chain.py:
--------------------------------------------------------------------------------
  1 | # Citation: http://iacs-courses.seas.harvard.edu/courses/am207/blog/lecture-18.html
  2 | 
  3 | # Library imports
  4 | import pandas as pd
  5 | import random
  6 | import csv
  7 | 
  8 | 
  9 | def run_pitching_markov_chain():
 10 | 
 11 |     # Read in data
 12 |     df = pd.read_csv('scherzer_pitches.csv')
 13 | 
 14 |     # Change pitch names
 15 |     df.rename(columns={'15': 'pitch'}, inplace=True)
 16 | 
 17 |     pitch_dict = {'CH': 'CH', 'CU': 'CU', 'FA': 'OT',
 18 |                   'FC': 'OT', 'FF': 'FF', 'FT': 'OT',
 19 |                   'IN': 'OT', 'PO': 'OT', 'SL': 'SL',
 20 |                   'UN': 'OT'}
 21 | 
 22 |     df['pitch'] = df['pitch'].map(pitch_dict)
 23 | 
 24 |     # Transition Matrix
 25 |     transitions = {}
 26 |     row_sums = {}
 27 | 
 28 |     for line in open('scherzer_pitch_sequences.csv'):
 29 |         s, e = line.rstrip().split(',')
 30 |         transitions[(s, e)] = transitions.get((s, e), 0.) + 1
 31 |         row_sums[s] = row_sums.get(s, 0.) + 1
 32 | 
 33 |     for k, v in transitions.iteritems():
 34 |         s, e = k
 35 |         transitions[k] = v / row_sums[s]
 36 | 
 37 |     with open('scherzer_transitions.csv', 'wb') as f:
 38 |         w = csv.DictWriter(f, transitions.keys())
 39 |         w.writeheader()
 40 |         w.writerow(transitions)
 41 | 
 42 |     # Emission probability calculations
 43 |     def calculate_emission_probabilities(df):
 44 |         df.rename(columns={'19': 'strikes'}, inplace=True)
 45 |         df.rename(columns={'20': 'balls'}, inplace=True)
 46 | 
 47 |         df['strikes'] = df['strikes'].astype('str')
 48 |         df['balls'] = df['balls'].astype('str')
 49 |         df['count'] = df['balls'] + '-' + df['strikes']
 50 | 
 51 |         pitch_totals = df['pitch'].groupby(df['pitch']).count()
 52 |         pitch_totals = pd.DataFrame(pitch_totals)
 53 |         pitch_totals.rename(columns={'pitch': 'pitch_total'}, inplace=True)
 54 |         pitch_totals.reset_index(inplace = True)
 55 | 
 56 |         pitches_in_counts = df['pitch'].groupby([df['count'], df['pitch']]).count()
 57 |         pitches_in_counts = pd.DataFrame(pitches_in_counts)
 58 |         pitches_in_counts.rename(columns={'pitch': 'pitch_situations'}, inplace=True)
 59 |         pitches_in_counts.reset_index(inplace = True)
 60 | 
 61 |         pitches_in_counts = pd.merge(pitches_in_counts, pitch_totals,
 62 |                                      how = 'inner', on = 'pitch')
 63 | 
 64 |         pitches_in_counts['pitch_percentage'] = pitches_in_counts['pitch_situations'] /\
 65 |         pitches_in_counts['pitch_total']
 66 | 
 67 |         return pitches_in_counts
 68 | 
 69 |     pitches_in_counts = calculate_emission_probabilities(df)
 70 | 
 71 |     count_dict = {'0.0-0.0': "'0-0'", '0.0-1.0': "'0-1'", '0.0-2.0': "'0-2'", '1.0-0.0': "'1-0'",
 72 |                      '1.0-1.0': "'1-1'", '1.0-2.0': "'1-2'", '2.0-0.0': "'2-0'", '2.0-1.0': "'2-1'",
 73 |                      '2.0-2.0': "'2-2'", '3.0-0.0': "'3-0'", '3.0-1.0': "'3-1'", '3.0-2.0': "'3-2'"}
 74 | 
 75 |     pitches_in_counts['count'] = pitches_in_counts['count'].map(count_dict)
 76 |     pitches_in_counts.to_csv('pitches_in_counts.csv', index = False)
 77 | 
 78 |     # Set up states and probabilities
 79 |     states = ('Fourseam', 'Change', 'Slider', 'Curve', 'Other')
 80 | 
 81 |     observations = ('0-0', '0-1', '0-2', '1-0', '1-1', '1-2', '2-0', '2-1',
 82 |                     '2-2', '3-0', '3-1', '3-2')
 83 | 
 84 |     start_probability = {'Fourseam': 0.50, 'Change': 0.20, 'Slider': 0.20,
 85 |                          'Curve': 0.05, 'Other': 0.05}
 86 | 
 87 |     transition_probability = {
 88 |        'Fourseam' : {'Fourseam': 0.57, 'Change': 0.19, 'Slider': 0.17, 
 89 |                      'Curve': 0.05, 'Other': 0.02},
 90 | 
 91 |        'Change' : {'Fourseam': 0.61, 'Change': 0.21, 'Slider': 0.12, 
 92 |                    'Curve': 0.04, 'Other': 0.02},
 93 | 
 94 |        'Slider' : {'Fourseam': 0.58, 'Change': 0.09, 'Slider': 0.27,
 95 |                    'Curve': 0.03, 'Other': 0.03},
 96 | 
 97 |        'Curve' : {'Fourseam': 0.61, 'Change': 0.21, 'Slider': 0.09,
 98 |                   'Curve': 0.08, 'Other': 0.01},
 99 | 
100 |        'Other' : {'Fourseam': 0.34, 'Change': 0.18, 'Slider': 0.15, 
101 |                   'Curve': 0.27, 'Other': 0.06},
102 |        }
103 | 
104 |     emission_probability = {
105 |        'Fourseam' : {'0-0': 0.27, '0-1': 0.13, '0-2': 0.07, '1-0': 0.09,
106 |                      '1-1': 0.10, '1-2': 0.10, '2-0': 0.04, '2-1': 0.05,
107 |                      '2-2': 0.08, '3-0': 0.01, '3-1': 0.02, '3-2': 0.05},
108 | 
109 |        'Change' : {'0-0': 0.17, '0-1': 0.12, '0-2': 0.07, '1-0': 0.11,
110 |                      '1-1': 0.12, '1-2': 0.15, '2-0': 0.02, '2-1': 0.05,
111 |                      '2-2': 0.13, '3-0': 0.0, '3-1': 0.0, '3-2': 0.05}, 
112 | 
113 |        'Slider' : {'0-0': 0.25, '0-1': 0.16, '0-2': 0.10, '1-0': 0.08,
114 |                      '1-1': 0.10, '1-2': 0.14, '2-0': 0.0, '2-1': 0.03,
115 |                      '2-2': 0.10, '3-0': 0.0, '3-1': 0.0, '3-2': 0.03},
116 | 
117 |        'Curve' : {'0-0': 0.34, '0-1': 0.18, '0-2': 0.10, '1-0': 0.05,
118 |                      '1-1': 0.10, '1-2': 0.12, '2-0': 0.0, '2-1': 0.01,
119 |                      '2-2': 0.09, '3-0': 0.0, '3-1': 0.0, '3-2': 0.02},
120 | 
121 |        'Other' : {'0-0': 0.25, '0-1': 0.14, '0-2': 0.07, '1-0': 0.10,
122 |                      '1-1': 0.10, '1-2': 0.09, '2-0': 0.03, '2-1': 0.05,
123 |                      '2-2': 0.08, '3-0': 0.02, '3-1': 0.02, '3-2': 0.04}
124 |        }
125 | 
126 |     # A HMM is created from the above matices for 100 of Scherzer's pitches
127 |     # Both hidden and visible states are generated
128 |     N = 100
129 |     hidden = []
130 |     visible = []
131 |     
132 |     if random.random() < start_probability[states[0]]:
133 |         hidden.append(states[0])
134 |     else:
135 |         hidden.append(states[1])
136 | 
137 |     for i in xrange(N):
138 |         current_state = hidden[i]
139 |         if random.random() < transition_probability[current_state][states[0]]:
140 |             hidden.append(states[0])
141 |         else:
142 |             hidden.append(states[1])
143 |         r = random.random()
144 |         prev = 0
145 |         for observation in observations:
146 |             prev += emission_probability[current_state][observation]
147 |             if r < prev:
148 |                 visible.append(observation)
149 |                 break
150 |                 
151 |     hidden.pop()
152 | 
153 |     # Run the Viterbi algorithm
154 |     def viterbi(obs, states, start_p, trans_p, emit_p):
155 |         V = [{}]
156 |         path = {}
157 | 
158 |         for y in states:
159 |             V[0][y] = start_p[y] * emit_p[y][obs[0]]
160 |             path[y] = [y]
161 | 
162 |         for t in range(1, len(obs)):
163 |             V.append({})
164 |             newpath = {}
165 | 
166 |             for y in states:
167 |                 (prob, state) = max((V[t-1][y0] * trans_p[y0][y] * emit_p[y][obs[t]], y0) for y0 in states)
168 |                 V[t][y] = prob
169 |                 newpath[y] = path[state] + [y]
170 | 
171 |             path = newpath
172 | 
173 |         (prob, state) = max((V[t][y], y) for y in states)
174 |         return (prob, path[state])
175 | 
176 |     # Input the generated markov model
177 |     def example_model():
178 |         return viterbi(visible,
179 |                        states,
180 |                        start_probability,
181 |                        transition_probability,
182 |                        emission_probability)
183 | 
184 |     (prob, p_hidden) = example_model()
185 | 
186 |     # Assess accuracy of the model
187 |     wrong= 0
188 |     for i in range(len(hidden)):
189 |         if hidden[i] != p_hidden[i]:
190 |             wrong = wrong + 1
191 |     print "accuracy: " + str(1-float(wrong)/N)
192 |     return
193 |   
194 |   
195 |  if __name__ == "__main__":
196 |     run_pitching_markov_chain()
197 | 


--------------------------------------------------------------------------------
/python_scripts/pitcher_similarity.py:
--------------------------------------------------------------------------------
  1 | # Citation: Programming Collective Intelligence by Toby Segaran
  2 | # Import Libraries
  3 | import os
  4 | 
  5 | import pandas as pd
  6 | import pymysql.cursors
  7 | from math import sqrt
  8 | from sklearn.preprocessing import MinMaxScaler
  9 | 
 10 | 
 11 | ## Connect to the database
 12 | connection = pymysql.connect(host='localhost',
 13 |                              user='root',
 14 |                              password='xxxxx',
 15 |                              db='xxxxx',
 16 |                              charset='utf8mb4',
 17 |                              cursorclass=pymysql.cursors.DictCursor)
 18 | 
 19 | 
 20 | def ingest_data():
 21 |     pitchers_query = '''
 22 |     select
 23 |     concat(master.nameFirst, ' ', master.nameLast) as "Name",
 24 |     pitching.yearID as "Year",
 25 |     pitching.W as "Wins",
 26 |     pitching.L as "Losses",
 27 |     pitching.G as "Appearances",
 28 |     pitching.GS as "Games_Started",
 29 |     pitching.CG as "Complete_Games",
 30 |     pitching.SHO as "Shutouts",
 31 |     pitching.SV as "Saves",
 32 |     pitching.IPouts as "Outs_Recorded",
 33 |     pitching.H as "Hits_Surrendered",
 34 |     pitching.ER as "Earned_Runs",
 35 |     pitching.HR as "Home_Runs_Surrendered",
 36 |     pitching.BB as "Walks_Surrendered",
 37 |     pitching.SO as "Strikeouts",
 38 |     pitching.BAOpp as "Opponent_Batting_Average",
 39 |     pitching.ERA as "ERA",
 40 |     pitching.R as "Runs_Surrendered"
 41 | 
 42 |     from pitching
 43 |     inner join master on pitching.playerID = master.playerID
 44 | 
 45 |     where pitching.YearID >= 1900;'''
 46 | 
 47 |     pitchers = pd.read_sql(pitchers_query, connection)
 48 |     return pitchers
 49 | 
 50 | def clean_data(pitchers):
 51 |     pitchers = pitchers.dropna()    
 52 |     
 53 |     pitchers['Year'] = pitchers['Year'].astype('str')
 54 |     pitchers['Player_and_Year'] = pitchers['Name'] + ' ' + pitchers['Year']
 55 |     
 56 |     pitchers['Decisions'] = pitchers['Wins'] + pitchers['Losses']
 57 |     pitchers['Wins_Over_Decisions'] = pitchers['Wins'] / pitchers['Decisions']
 58 |     pitchers['Wins_Over_Starts'] = pitchers['Wins'] / pitchers['Games_Started']
 59 |     
 60 |     pitchers['Relief_Appearances'] = pitchers['Appearances']\
 61 |     - pitchers['Games_Started'] 
 62 |     
 63 |     pitchers['Shutout_Percentage'] = pitchers['Shutouts']\
 64 |     / pitchers['Games_Started']
 65 |     
 66 |     pitchers['Outs_Recorded_Per_Appearance'] = pitchers['Outs_Recorded']\
 67 |     /pitchers['Appearances']
 68 |     
 69 |     pitchers['Hits_Allowed_Per_Appearance'] = pitchers['Hits_Surrendered']\
 70 |     /pitchers['Appearances']
 71 |     
 72 |     pitchers['Earned_Runs_Per_Appearance'] = pitchers['Earned_Runs']\
 73 |     /pitchers['Appearances']
 74 |     
 75 |     pitchers['Runs_Per_Appearance'] = pitchers['Runs_Surrendered']\
 76 |     /pitchers['Appearances']
 77 |     
 78 |     pitchers['Home_Runs_Per_Appearance'] = pitchers['Home_Runs_Surrendered']\
 79 |     /pitchers['Appearances']
 80 |     
 81 |     pitchers['Walks_Per_Appearance'] = pitchers['Walks_Surrendered']\
 82 |     /pitchers['Appearances']
 83 |     
 84 |     pitchers['Strikeouts_Per_Appearance'] = pitchers['Strikeouts']\
 85 |     /pitchers['Appearances']
 86 |     
 87 |     pitchers = pitchers[['Player_and_Year', 'Decisions', 'Wins_Over_Decisions',
 88 |                          'Wins_Over_Starts', 'Relief_Appearances',
 89 |                          'Shutout_Percentage', 'Outs_Recorded_Per_Appearance',
 90 |                          'Hits_Allowed_Per_Appearance', 'Earned_Runs_Per_Appearance',
 91 |                          'Runs_Per_Appearance', 'Home_Runs_Per_Appearance',
 92 |                          'Walks_Per_Appearance', 'Strikeouts_Per_Appearance',
 93 |                          'ERA']]
 94 |                          
 95 |     pitchers = pitchers.fillna(value=0)
 96 |     pitchers['Wins_Over_Starts'] = pitchers['Wins_Over_Starts'].astype('str')
 97 |     pitchers['Wins_Over_Starts'] = pitchers['Wins_Over_Starts'].str.replace('inf', '0')
 98 |     pitchers['Wins_Over_Starts'] = pitchers['Wins_Over_Starts'].astype('float')
 99 |     pitchers['Decisions'] = pitchers['Decisions'].astype('int')
100 |     pitchers['Relief_Appearances'] = pitchers['Relief_Appearances'].astype('int')
101 |                          
102 |     return pitchers
103 |     
104 |     
105 | # Scale data
106 | def scale_data(pitchers):
107 |     num_data = pitchers[['Decisions', 'Wins_Over_Decisions',
108 |                          'Wins_Over_Starts', 'Relief_Appearances',
109 |                          'Shutout_Percentage', 'Outs_Recorded_Per_Appearance',
110 |                          'Hits_Allowed_Per_Appearance', 'Earned_Runs_Per_Appearance',
111 |                          'Runs_Per_Appearance', 'Home_Runs_Per_Appearance',
112 |                          'Walks_Per_Appearance', 'Strikeouts_Per_Appearance',
113 |                          'ERA']]
114 |                          
115 |     scaler = MinMaxScaler()
116 |     scaler.fit(num_data)
117 |     num_data = scaler.transform(num_data)
118 |     num_data = pd.DataFrame(num_data)
119 | 
120 |     num_data.columns = ['Decisions', 'Wins_Over_Decisions',
121 |                          'Wins_Over_Starts', 'Relief_Appearances',
122 |                          'Shutout_Percentage', 'Outs_Recorded_Per_Appearance',
123 |                          'Hits_Allowed_Per_Appearance', 'Earned_Runs_Per_Appearance',
124 |                          'Runs_Per_Appearance', 'Home_Runs_Per_Appearance',
125 |                          'Walks_Per_Appearance', 'Strikeouts_Per_Appearance',
126 |                          'ERA']
127 |                          
128 |     pitchers = pitchers[['Player_and_Year']]
129 |     
130 |     pitchers = pd.merge(pitchers, num_data, how='inner', left_index=True,
131 |                         right_index=True)
132 |     
133 |     return pitchers
134 |     
135 | 
136 | # Create dictionary of pitchers
137 | def create_dictionary(pitchers):
138 |     pitchers_melted = pd.melt(pitchers, id_vars=['Player_and_Year'], 
139 |                    value_vars=['Decisions', 'Wins_Over_Decisions',
140 |                              'Wins_Over_Starts', 'Relief_Appearances',
141 |                              'Shutout_Percentage', 'Outs_Recorded_Per_Appearance',
142 |                              'Hits_Allowed_Per_Appearance', 'Earned_Runs_Per_Appearance',
143 |                              'Runs_Per_Appearance', 'Home_Runs_Per_Appearance',
144 |                              'Walks_Per_Appearance', 'Strikeouts_Per_Appearance',
145 |                              'ERA'])
146 | 
147 | 
148 |     player_dictionary = pitchers_melted.groupby('Player_and_Year').apply(lambda x: x.set_index\
149 |     ('variable')['value'].to_dict()).to_dict()
150 |     
151 |     return player_dictionary
152 | 
153 | 
154 | # Euclidean Distance Function
155 | def sim_distance(atts, p1, p2):
156 |     si = {}
157 |     for item in atts[p1]:
158 |         if item in atts[p2]:
159 |             si[item] = 1
160 |             
161 |     if len(si) == 0:
162 |         return 0
163 | 
164 |     sum_of_squares = sum([pow(atts[p1][item] - atts[p2][item], 2) for item in
165 |                          atts[p1] if item in atts[p2]])
166 |                          
167 |     return 1 / (1 + sqrt(sum_of_squares))
168 | 
169 |   
170 | # Get top matches
171 | def top_matches(atts, person, n=15, similarity=sim_distance):
172 | 
173 |     scores = [(similarity(atts, person, other), other) for other in atts
174 |               if other != person]
175 |     scores.sort()
176 |     scores.reverse()
177 |     return scores[0:n]
178 | 
179 |   
180 | # Run the similarity analysis
181 | def get_top_matches(player_and_year):
182 |     df = top_matches(player_dictionary, player_and_year)
183 |     df = pd.DataFrame(df)
184 |     df.columns = ['Similarity', 'Pitcher_and_Year']
185 |     return df
186 |   
187 | if __name__ == "__main__:
188 |     pitchers = ingest_data()
189 |     pitchers = clean_data(pitchers)
190 |     pitchers = scale_data(pitchers)
191 |     player_dictionary = create_dictionary(pitchers)
192 |   
193 |     pedro2000 = get_top_matches('Pedro Martinez 2000')
194 |     clemens1997 = get_top_matches('Roger Clemens 1997')
195 |     johnson2002 = get_top_matches('Randy Johnson 2002')
196 |     greinke2009 = get_top_matches('Zack Greinke 2009')
197 |     maddux1992 = get_top_matches('Greg Maddux 1992')
198 |     schilling2001 = get_top_matches('Curt Schilling 2001')
199 |     rivera2004 = get_top_matches('Mariano Rivera 2004')
200 |     gagne2003 = get_top_matches('Eric Gagne 2003')
201 | 
202 |     pedro2000.to_csv('pedro2000.csv', index=False)
203 |     clemens1997.to_csv('clemens1997.csv', index=False)
204 |     johnson2002.to_csv('johnson2002.csv', index=False)
205 |     greinke2009.to_csv('greinke2009.csv', index=False)
206 |     maddux1992.to_csv('maddux1992.csv', index=False)
207 |     schilling2001.to_csv('schilling2001.csv', index=False)
208 |     rivera2004.to_csv('rivera2004.csv', index=False)
209 |     gagne2003.to_csv('gagne2003.csv', index=False)
210 | 
211 | 


--------------------------------------------------------------------------------
/r_scripts/mlb_attendance_analysis.r:
--------------------------------------------------------------------------------
  1 | #Identify links from which to scrape the data
  2 | #We'll scrape data from 1995-2015; I want the data to be post-strike
  3 | #attendance: http://www.baseball-reference.com/leagues/MLB/1990-misc.shtml
  4 | #standings: http://www.baseball-reference.com/leagues/MLB/1990-standings.shtml
  5 | #pitching: http://www.baseball-reference.com/leagues/MLB/1990-standard-pitching.shtml
  6 | #fielding: http://www.baseball-reference.com/leagues/MLB/1990-standard-fielding.shtml
  7 | #batting: http://www.baseball-reference.com/leagues/MLB/1990-standard-batting.shtml
  8 | 
  9 | #Load libraries
 10 | library(XML)
 11 | library(ggplot2)
 12 | library(plyr)
 13 | library(dplyr)
 14 | library(car)
 15 | library(data.table)
 16 | library(stringr)
 17 | library(alluvial)
 18 | library (glmnet)
 19 | 
 20 | #Scrape attendance data
 21 | fetch_attendance <- function(year) {
 22 |   url <- paste0("http://www.baseball-reference.com/leagues/MLB/", year, "-misc.shtml")
 23 |   data <- readHTMLTable(url, stringsAsFactors = FALSE)
 24 |   data <- data[[1]]
 25 |   data$year <- year
 26 |   data
 27 | }
 28 | 
 29 | attendance <- ldply(1995:2015, fetch_attendance, .progress="text")
 30 | 
 31 | #Scrape standings data
 32 | fetch_standings <- function(year1) {
 33 |   url1 <- paste0("http://www.baseball-reference.com/leagues/MLB/", year1, "-standings.shtml")
 34 |   data1 <- readHTMLTable(url1, stringsAsFactors = FALSE)
 35 |   data1 <- data1[[2]]
 36 |   data1$year1 <- year1
 37 |   data1
 38 | }
 39 | 
 40 | standings <- ldply(1995:2015, fetch_standings, .progress="text")
 41 | 
 42 | #Scrape pitching data
 43 | fetch_pitching <- function(year2) {
 44 |   url2 <- paste0("http://www.baseball-reference.com/leagues/MLB/", year2, "-standard-pitching.shtml")
 45 |   data2 <- readHTMLTable(url2, stringsAsFactors = FALSE)
 46 |   data2 <- data2[[1]]
 47 |   data2$year2 <- year2
 48 |   data2
 49 | }
 50 | 
 51 | pitching <- ldply(1995:2015, fetch_pitching, .progress="text")
 52 | 
 53 | #Scrape fielding data
 54 | fetch_fielding <- function(year3) {
 55 |   url3 <- paste0("http://www.baseball-reference.com/leagues/MLB/", year3, "-standard-fielding.shtml")
 56 |   data3 <- readHTMLTable(url3, stringsAsFactors = FALSE)
 57 |   data3 <- data3[[1]]
 58 |   data3$year3 <- year3
 59 |   data3
 60 | }
 61 | 
 62 | fielding <- ldply(1995:2015, fetch_fielding, .progress="text")
 63 | 
 64 | #Scrape batting data
 65 | fetch_batting <- function(year4) {
 66 |   url4 <- paste0("http://www.baseball-reference.com/leagues/MLB/", year4, "-standard-batting.shtml")
 67 |   data4 <- readHTMLTable(url4, stringsAsFactors = FALSE)
 68 |   data4 <- data4[[1]]
 69 |   data4$year4 <- year4
 70 |   data4
 71 | }
 72 | 
 73 | batting <- ldply(1995:2015, fetch_batting, .progress="text")
 74 | 
 75 | #Now that we've scraped the data, we need to munge the data frames
 76 | #We'll merge the data frames on team name and year
 77 | #First, though, we need to clean up the year columns
 78 | 
 79 | #Change column names
 80 | names(standings)[24]<-"year"
 81 | names(pitching)[37]<-"year"
 82 | names(fielding)[17]<-"year"
 83 | names(batting)[30]<-"year"
 84 | 
 85 | #Merge the five data frames on team name and year
 86 | #We can only merge two data frames at a time
 87 | #So we'll have to repeat the process a few times
 88 | teams_data <- merge(standings, attendance, by=c("Tm", "year"))
 89 | teams_data2 <- merge(teams_data, batting, by=c("Tm", "year"))
 90 | teams_data3 <- merge(teams_data2, fielding, by=c("Tm", "year"))
 91 | teams_data4 <- merge(teams_data3, pitching, by=c("Tm", "year"))
 92 | 
 93 | #It's possible that team names have changed over time
 94 | #Let's inspect the data frame to see if that's the case
 95 | #A "correct" team should have 21 records
 96 | teams_counts <- aggregate(year ~ Tm, data = teams_data4, length)
 97 | print(teams_counts)
 98 | teams_counts[order(teams_counts$year),]
 99 | 
100 | #It looks like we might have issues with 9 records
101 | #ARI came into the NL in 1998, so their data is fine as is
102 | #Tampa Bay should also only have 18 seasons of data
103 | 
104 | #Duplicated columns will prevent us from running the next commands,
105 | #so let's delete those here
106 | teams_data4 <- teams_data4[c(-71,-92)]
107 | 
108 | #Combine MON and WSN
109 | teams_data5 <- mutate(teams_data4, Tm = recode(Tm, "'MON'='WSN'"))
110 | 
111 | #Combine FLA and MIA
112 | teams_data5 <- mutate(teams_data5, Tm = recode(Tm, "'FLA'='MIA'"))
113 | 
114 | #Combine TBD and TBR 
115 | teams_data5 <- mutate(teams_data5, Tm = recode(Tm, "'TBD'='TBR'"))
116 | 
117 | #Combine CAL, ANA, and LAA
118 | teams_data5 <- mutate(teams_data5, Tm = recode(Tm, "'CAL'='LAA'"))
119 | teams_data5 <- mutate(teams_data5, Tm = recode(Tm, "'ANA'='LAA'"))
120 | 
121 | #Check the data frame to see if all now looks OK
122 | teams_counts1 <- aggregate(year ~ Tm, data = teams_data5, length)
123 | print(teams_counts1)
124 | 
125 | #Let's inspect the data to see if it all looks good
126 | str(teams_data5)
127 | 
128 | #We need to remove the commas from the attendance column, since R will 
129 | #have trouble reading it
130 | remove_commas <- function(x) {
131 |   x <- str_replace_all(x, ",", "") 
132 | }
133 | 
134 | teams_data5$Attendance <- remove_commas(teams_data5$Attendance)
135 | 
136 | #All the colums have been read as characters
137 | #Let's identify the numeric columns and make them numeric
138 | #Let's also rename the data frame while we're at it
139 | columns <- subset(teams_data5, select = c(5:13, 25:33, 40:81, 84:117))
140 | teams <- data.frame(sapply(columns, as.numeric))
141 | teams_other <- subset(teams_data5, select = c(-5:-13, -25:-33, -40:-81, -84:-117))
142 | teams_final <- data.frame(teams, teams_other)
143 | 
144 | #Let's add a column to the data frame that identifies which quartile
145 | #each team falls under in terms of wins
146 | setDT(teams_final)
147 | teams_final[,wins_quartile:=cut(W.x,
148 |                       breaks=quantile(W.x,probs=seq(0,1,by=1/4)),
149 |                       labels=1:4,right=F)]
150 | 
151 | str(teams_final$wins_quartile)
152 | 
153 | #I want to create rad viz plots but found it quite cumbersome in R
154 | #So, let's write CSVs of the data we want to visualize and import to
155 | #Python, which makes rad viz pretty easy using Pandas
156 | offensive_stats <- subset(teams_final, select = c(118,25,26,29,31,34))
157 | write.csv(offensive_stats, file = "offensive_stats.csv")
158 | 
159 | pitching_stats <- subset(teams_final, select = c(118,56,67,79,81))
160 | write.csv(pitching_stats, file = "pitching_stats.csv")
161 | 
162 | #Develop cleveland dot plots
163 | #Wins by Season
164 | ggplot(teams_final)+ geom_point(aes(x=Tm, y=W.x), colour = "blue") +
165 |   coord_flip() + ggtitle("Single Season Win Totals") + 
166 |   xlab("Team") + ylab("Wins")
167 | 
168 | #Total Wins
169 | wins_total <- aggregate(W.x ~ Tm, data = teams_final, sum) 
170 | wins_total1$Tm <-factor(wins_total$Tm, levels=wins_total[order(wins_total$W.x), "Tm"])
171 | 
172 | ggplot(wins_total1)+ geom_point(aes(x=Tm, y=W.x), colour = "blue") +
173 |   coord_flip() + ggtitle("Win Totals from 1995-2015") + 
174 |   xlab("Team") + ylab("Wins")
175 | 
176 | #Total Home Runs
177 | hr_total <- aggregate(HR.x ~ Tm, data = teams_final, sum) 
178 | hr_total$Tm <-factor(hr_total$Tm, levels=hr_total[order(hr_total$HR.x), "Tm"])
179 | 
180 | ggplot(hr_total)+ geom_point(aes(x=Tm, y=HR.x), colour = "blue") +
181 |   coord_flip() + ggtitle("Home Run Totals from 1995-2015") + 
182 |   xlab("Team") + ylab("Home Runs")
183 | 
184 | #Total Stolen Bases
185 | sb_total <- aggregate(SB ~ Tm, data = teams_final, sum) 
186 | sb_total$Tm <-factor(sb_total$Tm, levels=sb_total[order(sb_total$SB), "Tm"])
187 | 
188 | ggplot(sb_total)+ geom_point(aes(x=Tm, y=SB), colour = "blue") +
189 |   coord_flip() + ggtitle("Stolen Base Totals from 1995-2015") + 
190 |   xlab("Team") + ylab("Stolen Bases")
191 | 
192 | #Total Errors
193 | e_total <- aggregate(E ~ Tm, data = teams_final, sum) 
194 | e_total$Tm <-factor(e_total$Tm, levels=e_total[order(e_total$E), "Tm"])
195 | 
196 | ggplot(e_total)+ geom_point(aes(x=Tm, y=E), colour = "blue") +
197 |   coord_flip() + ggtitle("Error Totals from 1995-2015") + 
198 |   xlab("Team") + ylab("Errors")
199 | 
200 | #Total Earned Runs
201 | er_total <- aggregate(ER ~ Tm, data = teams_final, sum) 
202 | er_total$Tm <-factor(er_total$Tm, levels=er_total[order(er_total$ER), "Tm"])
203 | 
204 | ggplot(er_total)+ geom_point(aes(x=Tm, y=ER), colour = "blue") +
205 |   coord_flip() + ggtitle("Earned Run Totals from 1995-2015") + 
206 |   xlab("Team") + ylab("Earned Runs")
207 | 
208 | #Alluvial plot
209 | wins_over_time <- subset(teams_final, select = c(95,96,2))
210 | selected_teams <- filter(wins_over_time, Tm == "NYY" | Tm == "ATL" | Tm == "STL" 
211 |                          | Tm == "SFG" | Tm == "BOS" | Tm == "TEX")
212 | 
213 | alluvial_ts(selected_teams, title = "Wins over Time")
214 | 
215 | #Now that we've inspected the data, let's do some prediction
216 | #What seems to impact attendance?
217 | #Let's start with subsetting our data frame to only include the columns we
218 | #propose are predictive
219 | teams_subset <- subset(teams_final, select = c(10,2,12,13,17,25,34,56,73,77,95))
220 | summary(teams_subset)
221 | teams_subset$Tm <- as.factor(as.character(teams_subset$Tm))
222 | 
223 | #Let's also sum the attendance column for context
224 | sum(teams_subset$Attendance)
225 | #1,492,344,734
226 | 
227 | #Split data into training and test sets
228 | set.seed(10)
229 | train = sample(1: nrow(x), nrow(x)/2)
230 | test = (-train )
231 | y.test = y[test]
232 | 
233 | #Ordinary least squares
234 | #I'm interested in seeing what the coefficients will look like if we feed the 
235 | #model the entire dataset
236 | pairs(teams_subset)
237 | ols_model <- lm(Attendance ~ 0 + W.x + BatAge.x + PAge.x + X.A.S +
238 |                   R.y + SO.x + E + SV + ER + Tm, data=teams_subset)
239 | plot(ols_model)
240 | summary(ols_model)
241 | vif(ols_model)
242 | 
243 | #Looks like we have some issues with multi-collinearity, so let's pivot to
244 | #ridge and lasso
245 | 
246 | #Ridge regression
247 | #Create matrices needed for the glmnet package
248 | x <- model.matrix (Attendance ~.,teams_subset )[,-1]
249 | y=teams_subset$Attendance
250 | 
251 | #Run the ridge regression model
252 | ridge_model = glmnet(x[train,], y[train], alpha = 0)
253 | 
254 | #Use cross-validation to determine the best value for lambda
255 | cv.out = cv.glmnet(x[train,], y[train], alpha = 0)
256 | plot(cv.out)
257 | bestlam = cv.out$lambda.min
258 | bestlam
259 | 
260 | #Run the ridge regression on the full dataset using the optimal lambda value
261 | #View the coefficients of the model
262 | out = glmnet(x,y,alpha =0)
263 | ridge.coef = predict(out, type ="coefficients", s=bestlam)
264 | ridge.coef
265 | 
266 | #Lasso regression
267 | #Follow the same process as above, except we need to set alpha = 1
268 | 
269 | #Run the lasso regression model
270 | lasso_model = glmnet(x[train,], y[train], alpha = 1)
271 | 
272 | #Use cross-validation to determine the best lambda
273 | cv.out1 = cv.glmnet(x[train,], y[train], alpha = 1)
274 | plot(cv.out1)
275 | bestlam1 = cv.out$lambda.min
276 | bestlam1
277 | 
278 | #Run the lasso regression on the full dataset using the optimal lambda value
279 | #View the coefficients of the model
280 | out1 = glmnet(x,y,alpha = 1)
281 | lasso.coef = predict(out1, type ="coefficients", s=bestlam1)
282 | lasso.coef
283 | 
284 | #Lastly, let's look at the MSE of each model
285 | #MSE of the OLS model
286 | #OLS is simply the same as setting lambda equal to zero
287 | ols_pred = predict(ridge.mod ,s=0, newx=x[test,], exact=T)
288 | ols_mse <- mean((ols_pred -y.test)^2)
289 | print(ols_mse)
290 | sqrt(ols_mse)
291 | #MSE: 197,876,169,581
292 | #RMSE:444,832
293 | 
294 | #MSE of the ridge regression model
295 | ridge.pred = predict(ridge_model,s=bestlam ,newx=x[test ,])
296 | ridge_mse <- mean((ridge.pred-y.test)^2)
297 | print(ridge_mse)
298 | sqrt(ridge_mse)
299 | #MSE: 189,415,000,000
300 | #RMSE: 435,218
301 | 
302 | #MSE of the lasso regression model
303 | lasso.pred = predict(lasso_model,s=bestlam ,newx=x[test ,])
304 | lasso_mse <- mean((lasso.pred-y.test)^2)
305 | print(lasso_mse)
306 | sqrt(lasso_mse)
307 | #MSE: 220,407,311,134
308 | #RMSE: 469,475
309 | 
310 | 
311 | 
312 | 


--------------------------------------------------------------------------------
/r_scripts/rookie_all_star_predictions.Rmd:
--------------------------------------------------------------------------------
  1 | # Rookie All-Star Modeling
  2 | The goal of this project is twofold: 1) predict if a player will become an all-star based on their rookie offensive stats and 2) determine clusters of rookies.
  3 | 
  4 | At the end of the day, this is a fairly limited analysis, though we can see some interesting
  5 | patterns in the data. 
  6 | 
  7 | ```{r message=FALSE}
  8 | options(warn=-1)
  9 | setwd("C:/Users/Micah/Desktop/applied_data_mining")
 10 | set.seed(19)
 11 | 
 12 | library(ggplot2)
 13 | library(lattice)
 14 | library(caret)
 15 | library(pROC)
 16 | library(plyr)
 17 | library(rpart)
 18 | library(rattle)
 19 | library(cluster)
 20 | library(data.table)
 21 | library(MASS)
 22 | library(colorRamps)
 23 | library(nFactors)
 24 | library(gplots)
 25 | library(RColorBrewer)
 26 | library(semPlot)
 27 | library(waffle)
 28 | library(extrafont)
 29 | 
 30 | font_import()
 31 | 
 32 | 
 33 | ```
 34 | 
 35 | # Cleaning Functions
 36 | ```{r}
 37 | subset_to_rookie_year <- function(df){
 38 |   # Define rookie season as first yearID in which player had more than 100 ABs.
 39 |   df[, 'yearID'] <- sapply(df[, 'yearID'], as.numeric)
 40 |   eligible_df <- df[ which(df$AB > 100), ]
 41 |   rookie_df <- aggregate(eligible_df$yearID, by=list(eligible_df$playerID), min)
 42 |   colnames(rookie_df) <- c('playerID', 'yearID')
 43 |   df <- merge(df, rookie_df, by=c('playerID', 'yearID'))
 44 |   df <- df[!duplicated(df$playerID),]
 45 |   # For simplicity, remove small number of players with 100+ ABs for two teams 
 46 |   # in their rookie season. 
 47 |   df <- df[ which(df$AB > 100), ]
 48 |   return(df)
 49 | }
 50 | 
 51 | 
 52 | # Only use the last few decades of players.
 53 | # Do not use players who are too recent - they may still become all-stars. 
 54 | subset_to_between_1970_and_2010 <- function(df){
 55 |   df <- df[ which(df$yearID >= 1970 & df$yearID <= 2010), ]
 56 |   return(df)
 57 | }
 58 | 
 59 | 
 60 | count_all_star_appearances <- function(all_stars, batting){
 61 |   all_star_temp <- all_stars[,c('playerID', 'yearID')]
 62 |   all_star_temp$rookie_all_star_appearance <- 'yes'
 63 |   batting <- merge(batting, all_star_temp, by=c('playerID', 'yearID'), all.x=TRUE)
 64 |   batting$rookie_all_star_appearance[is.na(batting$rookie_all_star_appearance)] <- 'no'
 65 |   
 66 |   batting$rookie_id <- 'yes'
 67 |   all_stars_non_rookie <- merge(all_stars, batting, by=c('playerID', 'yearID'), all.x=TRUE)
 68 |   all_stars_non_rookie$rookie_id[is.na(all_stars_non_rookie$rookie_id)] <- 'no'
 69 |   all_stars_non_rookie <- all_stars_non_rookie[ which(all_stars_non_rookie$rookie_id == 'no'), ]
 70 |   
 71 |   all_stars_non_rookie <- all_stars_non_rookie[c('playerID')]
 72 |   all_stars_non_rookie <- as.data.frame(table(all_stars_non_rookie))
 73 |   colnames(all_stars_non_rookie) <- c('playerID', 'all_star')
 74 |   
 75 |   merged_df <- merge(batting, all_stars_non_rookie, by='playerID', all.x=TRUE)
 76 |   merged_df$all_star[merged_df$all_star > 0] <- "yes"
 77 |   merged_df$all_star[merged_df$all_star != 'yes'] <- "no"
 78 |   merged_df$all_star[is.na(merged_df$all_star)] <- 'no'
 79 |   return(merged_df)
 80 | }
 81 | 
 82 | 
 83 | create_name_to_id_mapping <- function(df){
 84 |   df$playerName <- paste(df$nameFirst, ' ', df$nameLast)
 85 |   df <- df[c('playerID', 'playerName')]
 86 |   return(df)
 87 | }
 88 | 
 89 | 
 90 | calculate_slg_obp_obp_and_avg <- function(df){
 91 |   df[is.na(df)] <- 0
 92 |   df$avg <- df$H / df$AB
 93 |   df$obp <- (df$H + df$BB + df$HBP) / (df$AB + df$BB + df$HBP + df$SF)
 94 |   df$slg <- (df$H + (df$X2B + df$X3B + df$HR) + (df$X2B * 2) + 
 95 |               (df$X3B * 3 + df$HR * 4)) / df$AB
 96 |   df[is.na(df)] <- 0
 97 |   return(df)
 98 | }
 99 | 
100 | 
101 | select_columns_for_modeling <- function(df){
102 |   df <- subset(df, select=c(G, AB, R, H, X2B, X3B, HR, RBI, SB, BB, SO, avg, obp, slg, 
103 |                             all_star, playerID))
104 |   return(df)
105 | }
106 | 
107 | 
108 | drop_player_id <- function(df){
109 |   drop <- c('playerID')
110 |   df <- df[ , !(names(df) %in% drop)]
111 |   return(df)
112 | }
113 | 
114 | ```
115 | 
116 | ## Exploration Functions
117 | ```{r}
118 | count_factor_occurrences_by_target <- function(df, feature, target, title){
119 |   print(ggplot(df, aes_string(feature, fill = target)) +
120 |           geom_bar() + ggtitle(title))
121 | }
122 | 
123 | 
124 | make_histogram_by_target <- function(df, feature, target, title, bins){
125 |   print(ggplot(df, aes_string(feature, fill = target)) +
126 |           geom_histogram(binwidth = bins) + ggtitle(title))
127 | }
128 | 
129 | 
130 | make_parallel_coordinates <- function(df, feature, cuts){
131 |   c <- blue2red(cuts)
132 |   r <- cut(feature, cuts)
133 |   parcoord(df, col=c[as.numeric(r)])
134 | }
135 | 
136 | ```
137 | 
138 | ## Factor Analysis Functions
139 | ```{r}
140 | make_scree_table_for_factor_analysis <- function(df){
141 |   nScree(df)
142 | }
143 | 
144 | 
145 | get_eigenvalues <- function(df){
146 |   eigen(cor(df))
147 | }
148 | 
149 | 
150 | build_factor_analysis_model <- function(df, n_factors){
151 |   fa <- factanal(df, factors = n_factors, lower = 0.01)
152 |   print(fa)
153 |   return(fa)
154 | }
155 | 
156 | 
157 | make_factor_analysis_heatmp <- function(fa){
158 |   heatmap.2(fa$loadings, col = brewer.pal(9, "Greens"), trace = "none",
159 |             key = FALSE, dend = 'none', Colv = FALSE, cexCol = 1.2,
160 |             main = "Factor Loadings")
161 | }
162 | 
163 | 
164 | make_factor_analysis_sem_plot <- function(fa){
165 |   semPaths(fa, what = "est", residuals = FALSE, cut = 0.4,
166 |            posCol = c("white", "darkgreen"), 
167 |            negCol = c("white", "red"),
168 |            edge.label.cex = 0.60, nCharNodes = 7)
169 | }
170 | 
171 | ```
172 | 
173 | ## Supervised Machine Learning Functions
174 | ```{r}
175 | train_random_forest <- function(train_df, target){
176 |   control <- trainControl(method="repeatedcv", number=3, repeats=3, classProbs=TRUE)
177 |   mtry <- c(sqrt(ncol(train_df)), log2(ncol(train_df)))
178 |   grid <- expand.grid(.mtry=mtry)
179 |   formula <- as.formula(paste(target, "~ ."))
180 |   
181 |   model <- train(formula, 
182 |                  data=train_df, 
183 |                  preProcess=c("center", "scale"),
184 |                  method="rf", 
185 |                  metric="ROC",
186 |                  trControl=control, 
187 |                  tuneGrid=grid,
188 |                  allowParallel=TRUE,
189 |                  num.threads=4)
190 |   return(model)
191 | }
192 | 
193 | 
194 | train_log_reg <- function(train_df, target){
195 |   control <- trainControl(method="repeatedcv", number=10, repeats=3, classProbs=TRUE)
196 |   grid <- expand.grid(parameter=c(0.001, 0.01, 0.1, 1,10, 100))
197 |   formula <- as.formula(paste(target, "~ ."))
198 |   
199 |   model <- train(formula, 
200 |                  data=train_df, 
201 |                  preProcess=c("center", "scale"),
202 |                  method="glm", 
203 |                  family="binomial", 
204 |                  metric="ROC",
205 |                  trControl=control, 
206 |                  tuneGrid=grid)
207 |   
208 |   return(model)
209 | }
210 | 
211 | 
212 | train_decision_tree <- function(train_df, target){
213 |   control <- trainControl(method="repeatedcv", number=10, repeats=3, classProbs=TRUE)
214 |   grid <- expand.grid(.maxdepth=c(3, 5, 7, 10))
215 |   formula <- as.formula(paste(target, "~ ."))
216 |   
217 |   model <- train(formula, 
218 |                  data=train_df, 
219 |                  preProcess=c("center", "scale"),
220 |                  method="rpart2", 
221 |                  metric="ROC",
222 |                  trControl=control, 
223 |                  tuneGrid=grid)
224 |   return(model)
225 | }
226 | 
227 | 
228 | train_gradient_boosting <- function(train_df, target){
229 |   control <- trainControl(method="repeatedcv", number=10, repeats=3, classProbs=TRUE)
230 |   grid <- expand.grid(interaction.depth = c(1, 3, 5), 
231 |               n.trees = c(50, 100, 150), 
232 |               shrinkage = 0.1,
233 |               n.minobsinnode = 20)
234 |   
235 |   formula <- as.formula(paste(target, "~ ."))
236 |   
237 |   model <- train(formula, 
238 |                  data=train_df, 
239 |                  preProcess=c("center", "scale"),
240 |                  method="gbm", 
241 |                  metric="ROC",
242 |                  verbose=F,
243 |                  trControl=control, 
244 |                  tuneGrid=grid)
245 |   return(model)
246 | }
247 | 
248 | 
249 | plot_decision_tree <- function(df, target, depth){
250 |   formula <- as.formula(paste(target, "~ ."))
251 |   set.seed(19)
252 |   tree <- rpart(formula, method="class", maxdepth=depth, data=df)
253 |   printcp(tree)
254 |   print(tree)
255 |   fancyRpartPlot(tree)
256 | }
257 | 
258 | 
259 | plot_model <- function(model){
260 |   plot(model)
261 | }
262 | 
263 | 
264 | print_grid_search_results <- function(model){
265 |   model$bestTune
266 |   results <- model$results
267 |   results 
268 | }
269 | 
270 | 
271 | print_confusion_matrix <- function(model, df, target){
272 |   predictions <- predict(model, df)
273 |   con_matrix <- confusionMatrix(predictions, target, positive = 'yes')   
274 |   con_matrix
275 | }
276 | 
277 | 
278 | get_roc_auc <- function(model, df, target){
279 |   probabilities <- predict(model, df, type="prob")
280 |   
281 |   ROC <- roc(predictor=probabilities$yes,
282 |              response=target)
283 |   print(ROC$auc)
284 |   plot(ROC, main="ROC")
285 |   return(ROC)
286 | }
287 | 
288 | 
289 | get_variable_importances <- function(model){
290 |   varImp(model)
291 | }
292 | 
293 | ```
294 | 
295 | ## Unsupervised Machine Learning Functions
296 | ```{r}
297 | scale_dataframe <- function(df){
298 |   df[, -c(3)] <- scale(df[, -c(3)])
299 |   df <- data.frame(df)
300 |   return(df)
301 | }
302 | 
303 | 
304 | plot_within_cluster_sum_of_squares <- function(df, title){
305 |   wss <- (nrow(df)-1) * sum(apply(df, 2, var))
306 |   for (i in 2:15) wss[i] <- sum(kmeans(df, centers=i)$withinss)
307 |   plot(1:15, wss, type="b", xlab="Number of Clusters",
308 |        ylab="Within groups sum of squares", main = paste(title,' elbow plot'))
309 | }
310 | 
311 | 
312 | train_k_means_model <- function(df, k){
313 |   set.seed(19)
314 |   model <- kmeans(df, k, nstart=25)
315 |   return(model)
316 | }
317 | 
318 | 
319 | plot_k_means_model <- function(model, df, title){
320 |   clusplot(df, model$cluster, color=TRUE, shade=TRUE,
321 |            labels=2, lines=0, main = paste(title,' PCA Plot of K-Means'))
322 | }
323 | 
324 | 
325 | create_hclust_and_plot <- function(df){
326 |   set.seed(19)
327 |   dm = dist(df,method="euclidean")
328 |   hclust_model <- hclust(dm, method="complete")
329 |   plot(hclust_model)
330 |   return(hclust_model)
331 | }
332 | 
333 | 
334 | summarize_clusters <- function(df){
335 |   cluster1 <- df[which(df$k_means.cluster=='1'),]
336 |   cluster2 <- df[which(df$k_means.cluster=='2'),]
337 |   cluster3 <- df[which(df$k_means.cluster=='3'),]
338 |   
339 |   print('cluster 1 summary')
340 |   print(summary(cluster1))
341 |   print('cluster 2 summary')
342 |   print(summary(cluster2))
343 |   print('cluster 3 summary')
344 |   print(summary(cluster3))
345 | }
346 | 
347 | ```
348 | 
349 | ## Execution
350 | 
351 | ### Read in data
352 | ```{r}
353 | all_star_df <- read.csv('data/AllstarFull.csv')
354 | batting_df <- read.csv('data/Batting.csv')
355 | people_df <- read.csv('data/People.csv')
356 | 
357 | ```
358 | 
359 | ### Data cleaning
360 | ```{r}
361 | batting_df <- subset_to_rookie_year(batting_df)
362 | batting_df <- subset_to_between_1970_and_2010(batting_df)
363 | batting_df <- count_all_star_appearances(all_star_df, batting_df)
364 | batting_df <- calculate_slg_obp_obp_and_avg(batting_df)
365 | batting_df <- select_columns_for_modeling(batting_df)
366 | batting_df_copy <- batting_df
367 | batting_df <- drop_player_id(batting_df)
368 | 
369 | ```
370 | 
371 | ### Data Exploration
372 | ```{r}
373 | agg_cols_for_hist <- c('G', 'H', 'X2B', 'HR', 'RBI', 'SB')
374 | for (column in agg_cols_for_hist){
375 |   make_histogram_by_target(batting_df, column, 'all_star', 
376 |                            paste(column,' histogram by all star'), 10)
377 | }
378 | 
379 | rate_cols_for_hist <- c('avg', 'obp', 'slg')
380 | for (column in rate_cols_for_hist){
381 |   make_histogram_by_target(batting_df, column, 'all_star', 
382 |                            paste(column,' histogram by all star'), .1)
383 | }
384 | 
385 | # home runs paralell coordinates
386 | make_parallel_coordinates(batting_df[1:14], batting_df$HR, 20)
387 | # obp paralell coordinates
388 | make_parallel_coordinates(batting_df[1:14], batting_df$obp, 20)
389 | # hits paralell coordinates
390 | make_parallel_coordinates(batting_df[1:14], batting_df$H, 20)
391 | # slg paralell coordinates
392 | make_parallel_coordinates(batting_df[1:14], batting_df$slg, 20)
393 | 
394 | ```
395 | 
396 | ## Factor Analysis
397 | ```{r}
398 | make_scree_table_for_factor_analysis(batting_df[1:14])
399 | get_eigenvalues(batting_df[1:14])
400 | batting_factor_analysis <- build_factor_analysis_model(batting_df[1:14], 3)
401 | make_factor_analysis_heatmp(batting_factor_analysis)
402 | make_factor_analysis_sem_plot(batting_factor_analysis)
403 | 
404 | ```
405 | 
406 | ### Classification Models
407 | ### Class Imbalance
408 | ```{r}
409 | # This is not perfectly to scale but close enough to be useful. 
410 | waffle(c(all_star = 52, non_all_star = 233), rows = 19, 
411 |        title = "Target Distribution")
412 | 
413 | ```
414 | 
415 | 
416 | #### Train-Test Splits
417 | ```{r}
418 | partition <- createDataPartition(batting_df$all_star, p = 0.7, list=FALSE)
419 | train_df <- batting_df[partition, ]
420 | test_df <- batting_df[-partition, ]
421 | 
422 | ```
423 | 
424 | #### Decision Tree
425 | ```{r}
426 | decision_tree <- train_decision_tree(train_df, 'all_star')
427 | plot_model(decision_tree)
428 | print_grid_search_results(decision_tree)
429 | print_confusion_matrix(decision_tree, test_df, test_df$all_star)
430 | tree_roc <- get_roc_auc(decision_tree, test_df, test_df$all_star)
431 | plot_decision_tree(train_df, 'all_star', 3)
432 | 
433 | ```
434 | 
435 | #### Random Forest
436 | ```{r message=FALSE}
437 | random_forest <- train_random_forest(train_df, 'all_star')
438 | plot_model(random_forest)
439 | print_grid_search_results(random_forest)
440 | print_confusion_matrix(random_forest, test_df, test_df$all_star)
441 | forest_roc <- get_roc_auc(random_forest, test_df, test_df$all_star)
442 | get_variable_importances(random_forest)
443 | 
444 | ```
445 | 
446 | #### Gradient Boosting
447 | ```{r message=FALSE}
448 | gradient_boosting <- train_gradient_boosting(train_df, 'all_star')
449 | plot_model(gradient_boosting)
450 | print_grid_search_results(gradient_boosting)
451 | print_confusion_matrix(gradient_boosting, test_df, test_df$all_star)
452 | gb_roc <- get_roc_auc(gradient_boosting, test_df, test_df$all_star)
453 | get_variable_importances(gradient_boosting)
454 | 
455 | ```
456 | 
457 | #### Logistic Regression
458 | ```{r}
459 | log_reg <- train_log_reg(train_df, 'all_star')
460 | print_grid_search_results(log_reg)
461 | print_confusion_matrix(log_reg, test_df, test_df$all_star)
462 | lr_roc <- get_roc_auc(log_reg, test_df, test_df$all_star)
463 | 
464 | ```
465 | 
466 | ## Clustering
467 | ### Data Preparation
468 | ```{r}
469 | rownames(batting_df_copy) <- batting_df_copy$playerID
470 | batting_df_copy <- subset(batting_df_copy, select=-c(playerID, all_star))
471 | batting_df_scaled <- scale_dataframe(batting_df_copy)
472 | 
473 | ```
474 | 
475 | ### K-Means Clustering
476 | ```{r}
477 | plot_within_cluster_sum_of_squares(batting_df_scaled, 'rookie batting data')
478 | k_means <- train_k_means_model(batting_df_scaled, 3)
479 | plot_k_means_model(k_means, batting_df_scaled, 'rookie batting')
480 | batting_df_copy <- data.frame(batting_df_copy, k_means$cluster)
481 | summarize_clusters(batting_df_copy)
482 | batting_df_copy$k_means.cluster <- as.factor(batting_df_copy$k_means.cluster)
483 | 
484 | agg_summary_cols <- c('G', 'AB', 'R', 'H', 'X2B', 'X3B', 'HR', 'RBI', 'SB')
485 | for (column in agg_summary_cols){
486 |   make_histogram_by_target(batting_df_copy, column, 'k_means.cluster', 
487 |                            paste(column,' histogram by cluster'), 10)
488 | }
489 | 
490 | rate_summary_cols <- c('avg', 'obp', 'slg')
491 | for (column in rate_summary_cols){
492 |   make_histogram_by_target(batting_df_copy, column, 'k_means.cluster', 
493 |                            paste(column,' histogram by cluster'), .1)
494 | }
495 | 
496 | ```
497 | 
498 | ### Heirarchical Clustering
499 | ```{r}
500 | hclust_model <- create_hclust_and_plot(batting_df_scaled)
501 | plot(cut(as.dendrogram(hclust_model), h=8)$lower[[4]])
502 | plot(cut(as.dendrogram(hclust_model), h=6)$lower[[15]])
503 | 
504 | ```
505 | 
506 | 


--------------------------------------------------------------------------------
/r_scripts/association_rules_2016_games.r:
--------------------------------------------------------------------------------
  1 | ##Load libraries##
  2 | library(XML)
  3 | library(ggplot2)
  4 | library(arules)
  5 | library(arulesViz)
  6 | library(plyr)
  7 | library(dplyr)
  8 | library(stringr)
  9 | library(matrixStats)
 10 | library(data.table)
 11 | library(Hmisc)
 12 | library(gridExtra)
 13 | library(knitr)
 14 | library(rgl)
 15 | 
 16 | ##Read in and inspect data##
 17 | 
 18 | #Scrape a dataset that includes each team's abbreviation
 19 | #We'll isolate the names and put them in a list, which will be used to scrape
 20 | #the data we really want
 21 | abbreviations <- htmlParse("http://www.baseball-reference.com/leagues/MLB/2016.shtml")
 22 | abbreviations.tab <- readHTMLTable(abbreviations, stringsAsFactors=FALSE)
 23 | abbreviations.df <- abbreviations.tab[[2]]
 24 | 
 25 | #Create list of teams
 26 | teams <- list(abbreviations.df$Tm)
 27 | teams <- sapply(teams, "[", c(1:30))
 28 | 
 29 | #Let's now scrape per-game offensive statistics for each team
 30 | fetch_offense <- function(team) {
 31 |   url <- paste0("http://www.baseball-reference.com/teams/tgl.cgi?team=", team, "&t=b&year=2016.com")
 32 |   data <- readHTMLTable(url, stringsAsFactors = FALSE)
 33 |   data <- data[[1]]
 34 |   data$team <- team
 35 |   data
 36 | }
 37 | 
 38 | offensive_stats <- ldply("teams", fetch_offense, .progress="text")
 39 | 
 40 | #Hmmmm, I'm getting a subscript out of bounds error
 41 | #Let's try a loop and see if that works
 42 | results <- data.frame()
 43 | for (i in "teams") {
 44 |   results <- c(results, fetch_offense(i))
 45 |   }
 46 | 
 47 | #Still getting the same error; looks like we may have to manually 
 48 | #insert each team
 49 | ARI_offense <- ldply("ARI", fetch_offense, .progress="text")
 50 | ATL_offense <- ldply("ATL", fetch_offense, .progress="text")
 51 | BAL_offense <- ldply("BAL", fetch_offense, .progress="text")
 52 | BOS_offense <- ldply("BOS", fetch_offense, .progress="text")
 53 | CHC_offense <- ldply("CHC", fetch_offense, .progress="text")
 54 | CHW_offense <- ldply("CHW", fetch_offense, .progress="text")
 55 | CIN_offense <- ldply("CIN", fetch_offense, .progress="text")
 56 | CLE_offense <- ldply("CLE", fetch_offense, .progress="text")
 57 | COL_offense <- ldply("COL", fetch_offense, .progress="text")
 58 | DET_offense <- ldply("DET", fetch_offense, .progress="text")
 59 | HOU_offense <- ldply("HOU", fetch_offense, .progress="text")
 60 | KCR_offense <- ldply("KCR", fetch_offense, .progress="text")
 61 | LAA_offense <- ldply("LAA", fetch_offense, .progress="text")
 62 | LAD_offense <- ldply("LAD", fetch_offense, .progress="text")
 63 | MIA_offense <- ldply("MIA", fetch_offense, .progress="text")
 64 | MIL_offense <- ldply("MIL", fetch_offense, .progress="text")
 65 | MIN_offense <- ldply("MIN", fetch_offense, .progress="text")
 66 | NYM_offense <- ldply("NYM", fetch_offense, .progress="text")
 67 | NYY_offense <- ldply("NYY", fetch_offense, .progress="text")
 68 | OAK_offense <- ldply("OAK", fetch_offense, .progress="text")
 69 | PHI_offense <- ldply("PHI", fetch_offense, .progress="text")
 70 | PIT_offense <- ldply("PIT", fetch_offense, .progress="text")
 71 | SDP_offense <- ldply("SDP", fetch_offense, .progress="text")
 72 | SEA_offense <- ldply("SEA", fetch_offense, .progress="text")
 73 | SFG_offense <- ldply("SFG", fetch_offense, .progress="text")
 74 | STL_offense <- ldply("STL", fetch_offense, .progress="text")
 75 | TBR_offense <- ldply("TBR", fetch_offense, .progress="text")
 76 | TEX_offense <- ldply("TEX", fetch_offense, .progress="text")
 77 | TOR_offense <- ldply("TOR", fetch_offense, .progress="text")
 78 | WSN_offense <- ldply("WSN", fetch_offense, .progress="text")
 79 | 
 80 | #Bind the data frames
 81 | offensive_complete <- rbind(ARI_offense, ATL_offense, BAL_offense, BOS_offense, CHC_offense,
 82 | CHW_offense, CIN_offense, CLE_offense, COL_offense, DET_offense, HOU_offense, KCR_offense,
 83 | LAA_offense, LAD_offense, MIA_offense, MIL_offense, MIN_offense, NYM_offense, NYY_offense,
 84 | OAK_offense, PHI_offense, PIT_offense, SDP_offense, SEA_offense, SFG_offense, STL_offense,
 85 | TBR_offense, TEX_offense, TOR_offense, WSN_offense)
 86 | 
 87 | #OK, now that we have offensive stats, let's scrape pitching stats
 88 | fetch_pitching <- function(team) {
 89 |   url <- paste0("http://www.baseball-reference.com/teams/tgl.cgi?team=", team, "&t=p&year=2016.com")
 90 |   data <- readHTMLTable(url, stringsAsFactors = FALSE)
 91 |   data <- data[[1]]
 92 |   data$team <- team
 93 |   data
 94 | }
 95 | 
 96 | pitching_stats <- ldply("teams", fetch_pitching, .progress="text")
 97 | 
 98 | #Still getting a subscript out of bounds error, so we'll have to 
 99 | #manually insert each team
100 | ARI_pitching <- ldply("ARI", fetch_pitching, .progress="text")
101 | ATL_pitching <- ldply("ATL", fetch_pitching, .progress="text")
102 | BAL_pitching <- ldply("BAL", fetch_pitching, .progress="text")
103 | BOS_pitching <- ldply("BOS", fetch_pitching, .progress="text")
104 | CHC_pitching <- ldply("CHC", fetch_pitching, .progress="text")
105 | CHW_pitching <- ldply("CHW", fetch_pitching, .progress="text")
106 | CIN_pitching <- ldply("CIN", fetch_pitching, .progress="text")
107 | CLE_pitching <- ldply("CLE", fetch_pitching, .progress="text")
108 | COL_pitching <- ldply("COL", fetch_pitching, .progress="text")
109 | DET_pitching <- ldply("DET", fetch_pitching, .progress="text")
110 | HOU_pitching <- ldply("HOU", fetch_pitching, .progress="text")
111 | KCR_pitching <- ldply("KCR", fetch_pitching, .progress="text")
112 | LAA_pitching <- ldply("LAA", fetch_pitching, .progress="text")
113 | LAD_pitching <- ldply("LAD", fetch_pitching, .progress="text")
114 | MIA_pitching <- ldply("MIA", fetch_pitching, .progress="text")
115 | MIL_pitching <- ldply("MIL", fetch_pitching, .progress="text")
116 | MIN_pitching <- ldply("MIN", fetch_pitching, .progress="text")
117 | NYM_pitching <- ldply("NYM", fetch_pitching, .progress="text")
118 | NYY_pitching <- ldply("NYY", fetch_pitching, .progress="text")
119 | OAK_pitching <- ldply("OAK", fetch_pitching, .progress="text")
120 | PHI_pitching <- ldply("PHI", fetch_pitching, .progress="text")
121 | PIT_pitching <- ldply("PIT", fetch_pitching, .progress="text")
122 | SDP_pitching <- ldply("SDP", fetch_pitching, .progress="text")
123 | SEA_pitching <- ldply("SEA", fetch_pitching, .progress="text")
124 | SFG_pitching <- ldply("SFG", fetch_pitching, .progress="text")
125 | STL_pitching <- ldply("STL", fetch_pitching, .progress="text")
126 | TBR_pitching <- ldply("TBR", fetch_pitching, .progress="text")
127 | TEX_pitching <- ldply("TEX", fetch_pitching, .progress="text")
128 | TOR_pitching <- ldply("TOR", fetch_pitching, .progress="text")
129 | WSN_pitching <- ldply("WSN", fetch_pitching, .progress="text")
130 | 
131 | #Bind the data frames
132 | pitching_complete <- rbind(ARI_pitching, ATL_pitching, BAL_pitching, BOS_pitching, CHC_pitching,
133 | CHW_pitching, CIN_pitching, CLE_pitching, COL_pitching, DET_pitching, HOU_pitching, KCR_pitching,
134 | LAA_pitching, LAD_pitching, MIA_pitching, MIL_pitching, MIN_pitching, NYM_pitching, NYY_pitching,
135 | OAK_pitching, PHI_pitching, PIT_pitching, SDP_pitching, SEA_pitching, SFG_pitching, STL_pitching,
136 | TBR_pitching, TEX_pitching, TOR_pitching, WSN_pitching)
137 | 
138 | #The pitching data was read in an odd wa, so let's fix that
139 | fix_date <- function(x) {
140 |   x <- str_replace_all(x, "Â", "") 
141 | }
142 | 
143 | pitching_complete$Date <- fix_date(pitching_complete$Date)
144 | 
145 | #Merge data frames on team and Gtm, which is essentially a unique ID 
146 | #for each game
147 | game_logs <- merge(offensive_complete, pitching_complete, by=c("Gtm", "team"))
148 | 
149 | #Drop the first 750 rows, which are essentially the headings
150 | game_logs <- game_logs[-c(1:750), ] 
151 | 
152 | #All of the columns were read in as characters
153 | #Let's convert each column to the correct data type
154 | #We'll drop a few columns in the process
155 | cat_columns <- subset(game_logs, select = c(1:6, 32:33, 65))
156 | game_logs1 <- data.frame(sapply(cat_columns, as.factor))
157 | summary(game_logs1)
158 | 
159 | num_columns <- subset(game_logs, select = c(8:31, 39:64))
160 | game_logs2 <- data.frame(sapply(num_columns, as.numeric))
161 | 
162 | game_logs3 <- subset(game_logs, select = c(7, 66))
163 | 
164 | games_final <- data.frame(game_logs1, game_logs2, game_logs3)
165 | summary(games_final)
166 | 
167 | write.csv(games_final, file = "2016_game_logs.csv")
168 | 
169 | ##Create exploratory visualizations##
170 | #Look at distributions of key variables
171 | 
172 | #Runs scored
173 | p1 <- ggplot(games_final, aes(R.x)) +
174 |   geom_density() + ggtitle("Distribution of Runs Scored") + xlab("Runs Scored") +
175 |   ylab(" ")
176 | 
177 | #Stolen Bases
178 | p2 <- ggplot(games_final, aes(SB.x)) +
179 |   geom_density() + ggtitle("Distribution of Stolen Bases") + xlab("Stolen Bases") +
180 |   ylab(" ")
181 | 
182 | #Earned Runs
183 | p3 <- ggplot(games_final, aes(ER)) +
184 |   geom_density() + ggtitle("Distribution of Earned Runs") + xlab("Earned Runs") +
185 |   ylab(" ")
186 | 
187 | #Pitches
188 | p4 <- ggplot(games_final, aes(Pit)) +
189 |   geom_density() + ggtitle("Distribution of Pitches") + xlab("Pitches") +
190 |   ylab(" ")
191 | 
192 | grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)
193 | 
194 | #Make 3D scatter plots of runs, hits, and home runs
195 | interleave <- function(v1, v2) as.vector(rbind(v1,v2))
196 | 
197 | plot3d(games_final$R.x, games_final$HR.x, games_final$H.x, xlab = "Runs",
198 | ylab = "Home Runs", zlab = "Hits", type = "s", size = 0.75, lit = FALSE)
199 | 
200 | segments3d(interleave(games_final$R.x, games_final$R.x),
201 |            interleave(games_final$HR.x, games_final$HR.x),
202 |            interleave(games_final$H.x, min(games_final$H.x)),
203 |            alpha = 0.4, col = "blue")
204 | 
205 | #Create color-coded scatter plot of pitches thrown and walks issued
206 | p5 <- ggplot(games_final, aes(x=Pit, y=BB.y, colour=R.y)) +
207 |   geom_point(size=3) + scale_color_gradientn(colours = c("darkred",
208 |   "orange", "yellow")) + xlab("Pitches Thrown") + ylab("Walks Issued") +
209 |   ggtitle("Relationship between Walks and Pitches \n Colored by Runs Allowed")
210 | 
211 | p5
212 | 
213 | #Let's dive into some stats by umpire
214 | umps <- table(games_final$Umpire)
215 | umps <- as.data.frame(umps)
216 | mean(umps$Freq)
217 | sd(umps$Freq)
218 | 
219 | #Inspect hits by umpire
220 | mean_hits <- aggregate(H.x ~ Umpire, data = games_final, mean)
221 | sd_hits <- aggregate(H.x ~ Umpire, data = games_final, sd)
222 | ump_hits <- merge(mean_hits, sd_hits, by = "Umpire")
223 | names(ump_hits)[2]<-"Mean"
224 | names(ump_hits)[3]<-"Standard Deviation"
225 | ump_hits$Umpire <- sub("^$", "Unknown", ump_hits$Umpire)
226 | ump_hits1 <- head(ump_hits[order(-ump_hits$Mean),], 15)
227 | 
228 | data.m <- melt(ump_hits1, id.vars='Umpire')
229 | ggplot(data.m, aes(Umpire, value)) +   
230 |   geom_bar(aes(fill = variable), position = "dodge", stat="identity") + 
231 |   theme(axis.text.x = element_text(angle = 60, hjust = 1)) + xlab("") +
232 |   ggtitle("Mean and Standard Deviations for Hits \n in Games Segmented by Home Plate Umpire")
233 |   
234 | #Inspect runs by umpire
235 | mean_runs <- aggregate(R.x ~ Umpire, data = games_final, mean)
236 | sd_runs <- aggregate(R.x ~ Umpire, data = games_final, sd)
237 | ump_runs <- merge(mean_runs, sd_runs, by = "Umpire")
238 | names(ump_runs)[2]<-"Mean"
239 | names(ump_runs)[3]<-"Standard Deviation"
240 | ump_runs$Umpire <- sub("^$", "Unknown", ump_runs$Umpire)
241 | ump_runs1 <- head(ump_runs[order(-ump_runs$Mean),], 15)
242 | 
243 | data.m <- melt(ump_runs1, id.vars='Umpire')
244 | ggplot(data.m, aes(Umpire, value)) +   
245 |   geom_bar(aes(fill = variable), position = "dodge", stat="identity") + 
246 |   theme(axis.text.x = element_text(angle = 60, hjust = 1)) + xlab("") +
247 |   ggtitle("Mean and Standard Deviations for Runs \n in Games Segmented by Home Plate Umpire")
248 | 
249 | #Inspect walks by umpire
250 | mean_walks <- aggregate(BB.y ~ Umpire, data = games_final, mean)
251 | sd_walks <- aggregate(BB.y ~ Umpire, data = games_final, sd)
252 | ump_walks <- merge(mean_walks, sd_walks, by = "Umpire")
253 | names(ump_walks)[2]<-"Mean"
254 | names(ump_walks)[3]<-"Standard Deviation"
255 | ump_walks$Umpire <- sub("^$", "Unknown", ump_walks$Umpire)
256 | ump_walks1 <- head(ump_walks[order(-ump_walks$Mean),], 15)
257 | 
258 | data.m <- melt(ump_walks1, id.vars='Umpire')
259 | ggplot(data.m, aes(Umpire, value)) +   
260 |   geom_bar(aes(fill = variable), position = "dodge", stat="identity") + 
261 |   theme(axis.text.x = element_text(angle = 60, hjust = 1)) + xlab("") +
262 |   ggtitle("Mean and Standard Deviations for Walks \n in Games Segmented by Home Plate Umpire")
263 | 
264 | #Strikeouts by Umpire
265 | mean_strikeouts <- aggregate(SO.y ~ Umpire, data = games_final, mean)
266 | sd_strikeouts <- aggregate(SO.x ~ Umpire, data = games_final, sd)
267 | ump_strikeouts <- merge(mean_strikeouts, sd_strikeouts, by = "Umpire")
268 | names(ump_strikeouts)[2]<-"Mean"
269 | names(ump_strikeouts)[3]<-"Standard Deviation"
270 | ump_strikeouts$Umpire <- sub("^$", "Unknown", ump_strikeouts$Umpire)
271 | ump_strikeouts1 <- head(ump_strikeouts[order(-ump_strikeouts$Mean),], 15)
272 | 
273 | data.m <- melt(ump_strikeouts1, id.vars='Umpire')
274 | ggplot(data.m, aes(Umpire, value)) +   
275 |   geom_bar(aes(fill = variable), position = "dodge", stat="identity") + 
276 |   theme(axis.text.x = element_text(angle = 60, hjust = 1)) + xlab("") +
277 |   ggtitle("Mean and Standard Deviations for Strikeouts \n in Games Segmented by Home Plate Umpire")
278 | 
279 | #Pitches by Umpire
280 | mean_pitches <- aggregate(Pit ~ Umpire, data = games_final, mean)
281 | sd_pitches <- aggregate(Pit ~ Umpire, data = games_final, sd)
282 | ump_pitches <- merge(mean_pitches, sd_pitches, by = "Umpire")
283 | names(ump_pitches)[2]<-"Mean"
284 | names(ump_pitches)[3]<-"Standard Deviation"
285 | ump_pitches$Umpire <- sub("^$", "Unknown", ump_pitches$Umpire)
286 | ump_pitches1 <- head(ump_pitches[order(-ump_pitches$Mean),], 15)
287 | 
288 | data.m <- melt(ump_pitches1, id.vars='Umpire')
289 | ggplot(data.m, aes(Umpire, value)) +   
290 |   geom_bar(aes(fill = variable), position = "dodge", stat="identity") + 
291 |   theme(axis.text.x = element_text(angle = 60, hjust = 1)) + xlab("") +
292 |   ggtitle("Mean and Standard Deviations for Pitches \n in Games Segmented by Home Plate Umpire")
293 | 
294 | #Strikes by Umpire
295 | mean_strikes <- aggregate(Str ~ Umpire, data = games_final, mean)
296 | sd_strikes <- aggregate(Str ~ Umpire, data = games_final, sd)
297 | ump_strikes <- merge(mean_strikes, sd_strikes, by = "Umpire")
298 | names(ump_strikes)[2]<-"Mean"
299 | names(ump_strikes)[3]<-"Standard Deviation"
300 | ump_strikes$Umpire <- sub("^$", "Unknown", ump_strikes$Umpire)
301 | ump_strikes1 <- head(ump_strikes[order(-ump_strikes$Mean),], 15)
302 | 
303 | data.m <- melt(ump_strikes1, id.vars='Umpire')
304 | ggplot(data.m, aes(Umpire, value)) +   
305 |   geom_bar(aes(fill = variable), position = "dodge", stat="identity") + 
306 |   theme(axis.text.x = element_text(angle = 60, hjust = 1)) + xlab("") +
307 |   ggtitle("Mean and Standard Deviations for Strikes \n in Games Segmented by Home Plate Umpire")
308 | 
309 | #Let's see what variables are most correlated
310 | correlations <- cor(games_final[10:59])
311 | write.csv(correlations, file = "game_stat_correlations.csv")
312 | 
313 | ##Cut each variable into quartiles
314 | game_num <- games_final[10:59]
315 | 
316 | quartile3 <- function(x) {
317 |   ntile(x, 4)
318 | }
319 | 
320 | cut_data <- apply(game_num, 2, quartile3)
321 | cut_data <- data.frame(cut_data)
322 | cut_data <- data.frame(sapply(cut_data, as.factor))
323 | summary(cut_data)
324 | 
325 | ##Conduct association rules mining##
326 | 
327 | #Tell R to treat the data as transactions
328 | game.trans <- as(cut_data, "transactions") 
329 | summary(game.trans)
330 | 
331 | #Calculate, summarize, and plot rules
332 | seg.rules <- apriori(game.trans, parameter = list(support=0.1, conf=0.3, target="rules"))
333 | summary(seg.rules)
334 | plot(seg.rules)
335 | 
336 | #Inspect 50 rules with the highest lift
337 | seg.hi <- head(sort(seg.rules, by = "lift"), 50)
338 | inspect(seg.hi)
339 | plot(seg.hi, method = "graph", control = list(type="items"))
340 | 
341 | #Let's drop a few variables that probably should not be in the analysis
342 | myvars <- names(cut_data) %in% c("BA", "OBP", "SLG", "OPS", "IR", "IS")
343 | cut_data1 <- cut_data[!myvars]
344 | 
345 | ##Re-conduct association rules mining##
346 | #Tell R to treat the data as transactions
347 | game.trans1 <- as(cut_data1, "transactions") 
348 | summary(game.trans1)
349 | 
350 | #Calculate, summarize, and plot rules
351 | seg.rules1 <- apriori(game.trans1, parameter = list(support=0.1, conf=0.3, target="rules"))
352 | summary(seg.rules1)
353 | plot(seg.rules1)
354 | 
355 | #Inspect 50 rules with the highest lift
356 | seg.hi1 <- head(sort(seg.rules1, by = "lift"), 500)
357 | inspect(seg.hi1)
358 | plot(seg.hi1, method = "graph", control = list(type="items"))
359 | 


--------------------------------------------------------------------------------
/python_scripts/match_up_simulations.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | 
  4 | 
  5 | def run_match_up_simulation(fastball_surrender_prob, curve_surrender_prob, change_surrender_prob, fastball_hit_prob,
  6 |                             curve_hit_prob, change_hit_prob, zero_zero_pitch_probs, zero_zero_hit_surrender_prob,
  7 |                             zero_zero_hit_prob, one_zero_pitch_probs, one_zero_hit_surrender_prob, one_zero_hit_prob,
  8 |                             zero_one_pitch_probs, zero_one_hit_surrender_prob, zero_one_hit_prob, one_one_pitch_probs,
  9 |                             one_one_hit_surrender_prob, one_one_hit_prob, two_zero_pitch_probs,
 10 |                             two_zero_hit_surrender_prob, two_zero_hit_prob, zero_two_pitch_probs,
 11 |                             zero_two_hit_surrender_prob, zero_two_hit_prob, one_two_pitch_probs,
 12 |                             one_two_hit_surrender_prob, one_two_hit_prob, two_one_pitch_probs,
 13 |                             two_one_hit_surrender_prob, two_one_hit_prob, three_zero_pitch_probs,
 14 |                             three_zero_hit_surrender_prob, three_zero_hit_prob, two_two_pitch_probs,
 15 |                             two_two_hit_surrender_prob, two_two_hit_prob, three_one_pitch_probs,
 16 |                             three_one_hit_surrender_prob, three_one_hit_prob, three_two_pitch_probs,
 17 |                             three_two_hit_surrender_prob, three_two_hit_prob, zero_zero_swing_prob,
 18 |                             one_zero_swing_prob, zero_one_swing_prob, one_one_swing_prob,
 19 |                             two_zero_swing_prob, zero_two_swing_prob, one_two_swing_prob,
 20 |                             two_one_swing_prob, three_zero_swing_prob, two_two_swing_prob,
 21 |                             three_one_swing_prob, three_two_swing_prob, swing_produces_out):
 22 | 
 23 |     def run_pitch_simulation(count, count_swing_prob, pitch_probs, count_surrender_prob, fastball_surrender_prob,
 24 |                              count_hit_prob, fastball_hit_prob, curve_surrender_prob, curve_hit_prob,
 25 |                              change_surrender_prob, change_hit_prob, df):
 26 | 
 27 |         pitch = np.random.choice(a=['fastball', 'curve', 'change'], p=pitch_probs)
 28 |         swing = np.random.choice(a=['yes', 'no'], p=[count_swing_prob, 1 - count_swing_prob])
 29 | 
 30 |         if pitch == 'fastball':
 31 |             hit_prob = (count_surrender_prob + fastball_surrender_prob + count_hit_prob + fastball_hit_prob) / 4
 32 |             if swing == 'yes':
 33 |                 outcome = np.random.choice(a=['hit', 'no_hit'], p=[hit_prob, 1 - hit_prob])
 34 |             elif swing == 'no':
 35 |                 outcome = 'no_hit'
 36 |             df = df.append(pd.DataFrame({'count': count, 'pitch': 'fastball', 'swing': swing, 'result': [outcome]}))
 37 | 
 38 |         elif pitch == 'curve':
 39 |             hit_prob = (count_surrender_prob + curve_surrender_prob + count_hit_prob + curve_hit_prob) / 4
 40 |             if swing == 'yes':
 41 |                 outcome = np.random.choice(a=['hit', 'no_hit'], p=[hit_prob, 1 - hit_prob])
 42 |             elif swing == 'no':
 43 |                 outcome = 'no_hit'
 44 |             df = df.append(pd.DataFrame({'count': count, 'pitch': 'curve', 'swing': swing, 'result': [outcome]}))
 45 | 
 46 |         elif pitch == 'change':
 47 |             hit_prob = (count_surrender_prob + change_surrender_prob + count_hit_prob + change_hit_prob) / 4
 48 |             if swing == 'yes':
 49 |                 outcome = np.random.choice(a=['hit', 'no_hit'], p=[hit_prob, 1 - hit_prob])
 50 |             elif swing == 'no':
 51 |                 outcome = 'no_hit'
 52 |             df = df.append(pd.DataFrame({'count': count, 'pitch': 'change', 'swing': swing, 'result': [outcome]}))
 53 | 
 54 |         return df
 55 | 
 56 |     at_bat_results = pd.DataFrame()
 57 | 
 58 |     at_bat_results = run_pitch_simulation(count='0-0',
 59 |                                           count_swing_prob=zero_zero_swing_prob,
 60 |                                           pitch_probs=zero_zero_pitch_probs,
 61 |                                           count_surrender_prob=zero_zero_hit_surrender_prob,
 62 |                                           fastball_surrender_prob=fastball_surrender_prob,
 63 |                                           count_hit_prob=zero_zero_hit_prob,
 64 |                                           fastball_hit_prob=fastball_hit_prob,
 65 |                                           curve_surrender_prob=curve_surrender_prob,
 66 |                                           curve_hit_prob=curve_hit_prob,
 67 |                                           change_surrender_prob=change_surrender_prob,
 68 |                                           change_hit_prob=change_hit_prob,
 69 |                                           df=at_bat_results)
 70 | 
 71 |     last_row_df = at_bat_results.tail(1)
 72 | 
 73 |     if last_row_df['swing'].any() == 'yes':
 74 |         end_of_at_bat = np.random.choice(a=['yes', 'no'], p=[swing_produces_out, 1 - swing_produces_out])
 75 |     else:
 76 |         end_of_at_bat = 'no'
 77 | 
 78 |     if at_bat_results['result'].any() == 'no_hit' and end_of_at_bat == 'no':
 79 |         new_count = np.random.choice(a=['0-1', '1-0'], p=[0.50, 0.50])
 80 | 
 81 |         if new_count == '0-1':
 82 |             at_bat_results = run_pitch_simulation(count='0-1',
 83 |                                                   count_swing_prob=zero_one_swing_prob,
 84 |                                                   pitch_probs=zero_one_pitch_probs,
 85 |                                                   count_surrender_prob=zero_one_hit_surrender_prob,
 86 |                                                   fastball_surrender_prob=fastball_surrender_prob,
 87 |                                                   count_hit_prob=zero_one_hit_prob,
 88 |                                                   fastball_hit_prob=fastball_hit_prob,
 89 |                                                   curve_surrender_prob=curve_surrender_prob,
 90 |                                                   curve_hit_prob=curve_hit_prob,
 91 |                                                   change_surrender_prob=change_surrender_prob,
 92 |                                                   change_hit_prob=change_hit_prob,
 93 |                                                   df=at_bat_results)
 94 | 
 95 |         elif new_count == '1-0':
 96 |             at_bat_results = run_pitch_simulation(count='1-0',
 97 |                                                   count_swing_prob=one_zero_swing_prob,
 98 |                                                   pitch_probs=one_zero_pitch_probs,
 99 |                                                   count_surrender_prob=one_zero_hit_surrender_prob,
100 |                                                   fastball_surrender_prob=fastball_surrender_prob,
101 |                                                   count_hit_prob=one_zero_hit_prob,
102 |                                                   fastball_hit_prob=fastball_hit_prob,
103 |                                                   curve_surrender_prob=curve_surrender_prob,
104 |                                                   curve_hit_prob=curve_hit_prob,
105 |                                                   change_surrender_prob=change_surrender_prob,
106 |                                                   change_hit_prob=change_hit_prob,
107 |                                                   df=at_bat_results)
108 | 
109 |         last_row_df = at_bat_results.tail(1)
110 | 
111 |         if last_row_df['swing'].any() == 'yes':
112 |             end_of_at_bat = np.random.choice(a=['yes', 'no'], p=[swing_produces_out, 1 - swing_produces_out])
113 |         else:
114 |             end_of_at_bat = 'no'
115 | 
116 |         if last_row_df['result'].any() == 'no_hit' and last_row_df['count'].any() == '1-0' and end_of_at_bat == 'no':
117 |             new_count = np.random.choice(a=['1-1', '2-0'], p=[0.50, 0.50])
118 | 
119 |         elif last_row_df['result'].any() == 'no_hit' and last_row_df['count'].any() == '0-1':
120 |             new_count = np.random.choice(a=['1-1', '0-2'], p=[0.50, 0.50])
121 | 
122 |         if new_count == '1-1':
123 |             at_bat_results = run_pitch_simulation(count='1-1',
124 |                                                   count_swing_prob=one_one_swing_prob,
125 |                                                   pitch_probs=one_one_pitch_probs,
126 |                                                   count_surrender_prob=one_one_hit_surrender_prob,
127 |                                                   fastball_surrender_prob=fastball_surrender_prob,
128 |                                                   count_hit_prob=one_one_hit_prob,
129 |                                                   fastball_hit_prob=fastball_hit_prob,
130 |                                                   curve_surrender_prob=curve_surrender_prob,
131 |                                                   curve_hit_prob=curve_hit_prob,
132 |                                                   change_surrender_prob=change_surrender_prob,
133 |                                                   change_hit_prob=change_hit_prob,
134 |                                                   df=at_bat_results)
135 | 
136 |         elif new_count == '2-0':
137 |             at_bat_results = run_pitch_simulation(count='2-0',
138 |                                                   count_swing_prob=two_zero_swing_prob,
139 |                                                   pitch_probs=two_zero_pitch_probs,
140 |                                                   count_surrender_prob=two_zero_hit_surrender_prob,
141 |                                                   fastball_surrender_prob=fastball_surrender_prob,
142 |                                                   count_hit_prob=two_zero_hit_prob,
143 |                                                   fastball_hit_prob=fastball_hit_prob,
144 |                                                   curve_surrender_prob=curve_surrender_prob,
145 |                                                   curve_hit_prob=curve_hit_prob,
146 |                                                   change_surrender_prob=change_surrender_prob,
147 |                                                   change_hit_prob=change_hit_prob,
148 |                                                   df=at_bat_results)
149 | 
150 |         elif new_count == '0-2':
151 |             at_bat_results = run_pitch_simulation(count='0-2',
152 |                                                   count_swing_prob=zero_two_swing_prob,
153 |                                                   pitch_probs=zero_two_pitch_probs,
154 |                                                   count_surrender_prob=zero_two_hit_surrender_prob,
155 |                                                   fastball_surrender_prob=fastball_surrender_prob,
156 |                                                   count_hit_prob=zero_two_hit_prob,
157 |                                                   fastball_hit_prob=fastball_hit_prob,
158 |                                                   curve_surrender_prob=curve_surrender_prob,
159 |                                                   curve_hit_prob=curve_hit_prob,
160 |                                                   change_surrender_prob=change_surrender_prob,
161 |                                                   change_hit_prob=change_hit_prob,
162 |                                                   df=at_bat_results)
163 | 
164 |         last_row_df = at_bat_results.tail(1)
165 | 
166 |         if last_row_df['swing'].any() == 'yes':
167 |             end_of_at_bat = np.random.choice(a=['yes', 'no'], p=[swing_produces_out, 1 - swing_produces_out])
168 |         else:
169 |             end_of_at_bat = 'no'
170 | 
171 |         if last_row_df['result'].any() == 'no_hit' and last_row_df['count'].any() == '1-1' and end_of_at_bat == 'no':
172 |             new_count = np.random.choice(a=['1-2', '2-1'], p=[0.50, 0.50])
173 | 
174 |         elif last_row_df['result'].any() == 'no_hit' and last_row_df['count'].any() == '2-0':
175 |             new_count = np.random.choice(a=['2-1', '3-0'], p=[0.50, 0.50])
176 | 
177 |         elif last_row_df['result'].any() == 'no_hit' and last_row_df['count'].any() == '0-2':
178 |             new_count = '1-2'
179 | 
180 |         if new_count == '1-2':
181 |             at_bat_results = run_pitch_simulation(count='1-2',
182 |                                                   count_swing_prob=one_two_swing_prob,
183 |                                                   pitch_probs=one_two_pitch_probs,
184 |                                                   count_surrender_prob=one_two_hit_surrender_prob,
185 |                                                   fastball_surrender_prob=fastball_surrender_prob,
186 |                                                   count_hit_prob=one_two_hit_prob,
187 |                                                   fastball_hit_prob=fastball_hit_prob,
188 |                                                   curve_surrender_prob=curve_surrender_prob,
189 |                                                   curve_hit_prob=curve_hit_prob,
190 |                                                   change_surrender_prob=change_surrender_prob,
191 |                                                   change_hit_prob=change_hit_prob,
192 |                                                   df=at_bat_results)
193 | 
194 |         if new_count == '2-1':
195 |             at_bat_results = run_pitch_simulation(count='2-1',
196 |                                                   count_swing_prob=two_one_swing_prob,
197 |                                                   pitch_probs=two_one_pitch_probs,
198 |                                                   count_surrender_prob=two_one_hit_surrender_prob,
199 |                                                   fastball_surrender_prob=fastball_surrender_prob,
200 |                                                   count_hit_prob=two_one_hit_prob,
201 |                                                   fastball_hit_prob=fastball_hit_prob,
202 |                                                   curve_surrender_prob=curve_surrender_prob,
203 |                                                   curve_hit_prob=curve_hit_prob,
204 |                                                   change_surrender_prob=change_surrender_prob,
205 |                                                   change_hit_prob=change_hit_prob,
206 |                                                   df=at_bat_results)
207 | 
208 |         if new_count == '3-0':
209 |             at_bat_results = run_pitch_simulation(count='3-0',
210 |                                                   count_swing_prob=three_zero_swing_prob,
211 |                                                   pitch_probs=three_zero_pitch_probs,
212 |                                                   count_surrender_prob=three_zero_hit_surrender_prob,
213 |                                                   fastball_surrender_prob=fastball_surrender_prob,
214 |                                                   count_hit_prob=three_zero_hit_prob,
215 |                                                   fastball_hit_prob=fastball_hit_prob,
216 |                                                   curve_surrender_prob=curve_surrender_prob,
217 |                                                   curve_hit_prob=curve_hit_prob,
218 |                                                   change_surrender_prob=change_surrender_prob,
219 |                                                   change_hit_prob=change_hit_prob,
220 |                                                   df=at_bat_results)
221 | 
222 |         last_row_df = at_bat_results.tail(1)
223 | 
224 |         if last_row_df['swing'].any() == 'yes':
225 |             end_of_at_bat = np.random.choice(a=['yes', 'no'], p=[swing_produces_out, 1 - swing_produces_out])
226 |         else:
227 |             end_of_at_bat = 'no'
228 | 
229 |         if last_row_df['result'].any() == 'no_hit' and last_row_df['count'].any() == '1-2' and end_of_at_bat == 'no':
230 |             new_count = '2-2'
231 | 
232 |         elif last_row_df['result'].any() == 'no_hit' and last_row_df['count'].any() == '2-1':
233 |             new_count = np.random.choice(a=['2-2', '3-1'], p=[0.50, 0.50])
234 | 
235 |         elif last_row_df['result'].any() == 'no_hit' and last_row_df['count'].any() == '3-0':
236 |             new_count = '3-1'
237 | 
238 |         if new_count == '2-2':
239 |             at_bat_results = run_pitch_simulation(count='2-2',
240 |                                                   count_swing_prob=two_two_swing_prob,
241 |                                                   pitch_probs=two_two_pitch_probs,
242 |                                                   count_surrender_prob=two_two_hit_surrender_prob,
243 |                                                   fastball_surrender_prob=fastball_surrender_prob,
244 |                                                   count_hit_prob=two_two_hit_prob,
245 |                                                   fastball_hit_prob=fastball_hit_prob,
246 |                                                   curve_surrender_prob=curve_surrender_prob,
247 |                                                   curve_hit_prob=curve_hit_prob,
248 |                                                   change_surrender_prob=change_surrender_prob,
249 |                                                   change_hit_prob=change_hit_prob,
250 |                                                   df=at_bat_results)
251 | 
252 |         if new_count == '3-1':
253 |             at_bat_results = run_pitch_simulation(count='3-1',
254 |                                                   count_swing_prob=three_one_swing_prob,
255 |                                                   pitch_probs=three_one_pitch_probs,
256 |                                                   count_surrender_prob=three_one_hit_surrender_prob,
257 |                                                   fastball_surrender_prob=fastball_surrender_prob,
258 |                                                   count_hit_prob=three_one_hit_prob,
259 |                                                   fastball_hit_prob=fastball_hit_prob,
260 |                                                   curve_surrender_prob=curve_surrender_prob,
261 |                                                   curve_hit_prob=curve_hit_prob,
262 |                                                   change_surrender_prob=change_surrender_prob,
263 |                                                   change_hit_prob=change_hit_prob,
264 |                                                   df=at_bat_results)
265 | 
266 |         last_row_df = at_bat_results.tail(1)
267 | 
268 |         if last_row_df['swing'].any() == 'yes':
269 |             end_of_at_bat = np.random.choice(a=['yes', 'no'], p=[swing_produces_out, 1 - swing_produces_out])
270 |         else:
271 |             end_of_at_bat = 'no'
272 | 
273 |         if last_row_df['result'].any() == 'no_hit' and end_of_at_bat == 'no':
274 |             at_bat_results = run_pitch_simulation(count='3-2',
275 |                                                   count_swing_prob=three_two_swing_prob,
276 |                                                   pitch_probs=three_two_pitch_probs,
277 |                                                   count_surrender_prob=three_two_hit_surrender_prob,
278 |                                                   fastball_surrender_prob=fastball_surrender_prob,
279 |                                                   count_hit_prob=three_two_hit_prob,
280 |                                                   fastball_hit_prob=fastball_hit_prob,
281 |                                                   curve_surrender_prob=curve_surrender_prob,
282 |                                                   curve_hit_prob=curve_hit_prob,
283 |                                                   change_surrender_prob=change_surrender_prob,
284 |                                                   change_hit_prob=change_hit_prob,
285 |                                                   df=at_bat_results)
286 | 
287 |     return at_bat_results
288 | 
289 | 
290 | if __name__ == "__main__":
291 |     fastball_surrender_prob = 0.240
292 |     curve_surrender_prob = 0.260
293 |     change_surrender_prob = 0.250
294 | 
295 |     fastball_hit_prob = 0.300
296 |     curve_hit_prob = 0.230
297 |     change_hit_prob = 0.270
298 | 
299 |     zero_zero_pitch_probs = [0.60, 0.25, 0.15]
300 |     zero_zero_hit_surrender_prob = 0.250
301 |     zero_zero_hit_prob = 0.260
302 |     zero_zero_swing_prob = 0.250
303 | 
304 |     one_zero_pitch_probs = [0.65, 0.20, 0.15]
305 |     one_zero_hit_surrender_prob = 0.255
306 |     one_zero_hit_prob = 0.265
307 |     one_zero_swing_prob = 0.40
308 | 
309 |     zero_one_pitch_probs = [0.55, 0.25, 0.20]
310 |     zero_one_hit_surrender_prob = 0.245
311 |     zero_one_hit_prob = 0.255
312 |     zero_one_swing_prob = 0.450
313 | 
314 |     one_one_pitch_probs = [0.60, 0.20, 0.20]
315 |     one_one_hit_surrender_prob = 0.250
316 |     one_one_hit_prob = 0.260
317 |     one_one_swing_prob = 0.50
318 | 
319 |     two_zero_pitch_probs = [0.75, 0.20, 0.05]
320 |     two_zero_hit_surrender_prob = 0.275
321 |     two_zero_hit_prob = 0.285
322 |     two_zero_swing_prob = 0.40
323 | 
324 |     zero_two_pitch_probs = [0.45, 0.35, 0.20]
325 |     zero_two_hit_surrender_prob = 0.205
326 |     zero_two_hit_prob = 0.220
327 |     zero_two_swing_prob = 0.50
328 | 
329 |     one_two_pitch_probs = [0.55, 0.20, 0.25]
330 |     one_two_hit_surrender_prob = 0.215
331 |     one_two_hit_prob = 0.230
332 |     one_two_swing_prob = 0.60
333 | 
334 |     two_one_pitch_probs = [0.65, 0.20, 0.15]
335 |     two_one_hit_surrender_prob = 0.270
336 |     two_one_hit_prob = 0.280
337 |     two_one_swing_prob = 0.60
338 | 
339 |     three_zero_pitch_probs = [0.90, 0.05, 0.05]
340 |     three_zero_hit_surrender_prob = 0.275
341 |     three_zero_hit_prob = 0.285
342 |     three_zero_swing_prob = 0.10
343 | 
344 |     two_two_pitch_probs = [0.65, 0.20, 0.15]
345 |     two_two_hit_surrender_prob = 0.230
346 |     two_two_hit_prob = 0.245
347 |     two_two_swing_prob = 0.65
348 | 
349 |     three_one_pitch_probs = [0.75, 0.20, 0.05]
350 |     three_one_hit_surrender_prob = 0.275
351 |     three_one_hit_prob = 0.285
352 |     three_one_swing_prob = 0.55
353 | 
354 |     three_two_pitch_probs = [0.70, 0.15, 0.15]
355 |     three_two_hit_surrender_prob = 0.260
356 |     three_two_hit_prob = 0.265
357 |     three_two_swing_prob = 0.75
358 | 
359 |     swing_produces_out = 0.80
360 | 
361 |     at_bat_results = pd.DataFrame()
362 | 
363 |     simulation_runs = 100
364 |     counter = 0
365 |     while counter < simulation_runs:
366 |         temp_df = run_match_up_simulation(fastball_surrender_prob, curve_surrender_prob, change_surrender_prob,
367 |                                           fastball_hit_prob, curve_hit_prob, change_hit_prob, zero_zero_pitch_probs,
368 |                                           zero_zero_hit_surrender_prob, zero_zero_hit_prob, one_zero_pitch_probs,
369 |                                           one_zero_hit_surrender_prob, one_zero_hit_prob, zero_one_pitch_probs,
370 |                                           zero_one_hit_surrender_prob, zero_one_hit_prob, one_one_pitch_probs,
371 |                                           one_one_hit_surrender_prob, one_one_hit_prob, two_zero_pitch_probs,
372 |                                           two_zero_hit_surrender_prob, two_zero_hit_prob, zero_two_pitch_probs,
373 |                                           zero_two_hit_surrender_prob, zero_two_hit_prob, one_two_pitch_probs,
374 |                                           one_two_hit_surrender_prob, one_two_hit_prob, two_one_pitch_probs,
375 |                                           two_one_hit_surrender_prob, two_one_hit_prob, three_zero_pitch_probs,
376 |                                           three_zero_hit_surrender_prob, three_zero_hit_prob, two_two_pitch_probs,
377 |                                           two_two_hit_surrender_prob, two_two_hit_prob, three_one_pitch_probs,
378 |                                           three_one_hit_surrender_prob, three_one_hit_prob, three_two_pitch_probs,
379 |                                           three_two_hit_surrender_prob, three_two_hit_prob, zero_zero_swing_prob,
380 |                                           one_zero_swing_prob, zero_one_swing_prob, one_one_swing_prob,
381 |                                           two_zero_swing_prob, zero_two_swing_prob, one_two_swing_prob,
382 |                                           two_one_swing_prob, three_zero_swing_prob, two_two_swing_prob,
383 |                                           three_one_swing_prob, three_two_swing_prob, swing_produces_out)
384 | 
385 |         at_bat_results = at_bat_results.append(temp_df)
386 |         counter += 1
387 | 
388 |     at_bat_results.to_csv('simulation_results.csv', index=False)
389 | 


--------------------------------------------------------------------------------
/csv_outputs/historical_team_clustering_results.csv:
--------------------------------------------------------------------------------
   1 | yearID,franchID,cluster
   2 | 1946,BOS,1
   3 | 1946,CLE,1
   4 | 1946,DET,1
   5 | 1946,NYY,1
   6 | 1947,NYY,1
   7 | 1948,LAD,1
   8 | 1950,CHC,1
   9 | 1950,CLE,1
  10 | 1950,SFG,1
  11 | 1951,CLE,1
  12 | 1952,BOS,1
  13 | 1952,LAD,1
  14 | 1952,ATL,1
  15 | 1952,CHC,1
  16 | 1952,CIN,1
  17 | 1952,CLE,1
  18 | 1952,DET,1
  19 | 1952,SFG,1
  20 | 1952,NYY,1
  21 | 1952,PIT,1
  22 | 1953,LAD,1
  23 | 1953,CLE,1
  24 | 1953,ATL,1
  25 | 1954,BAL,1
  26 | 1954,LAD,1
  27 | 1954,CLE,1
  28 | 1954,ATL,1
  29 | 1954,SFG,1
  30 | 1954,NYY,1
  31 | 1954,MIN,1
  32 | 1955,BAL,1
  33 | 1955,BOS,1
  34 | 1955,LAD,1
  35 | 1955,CHW,1
  36 | 1955,CHC,1
  37 | 1955,CLE,1
  38 | 1955,ATL,1
  39 | 1955,SFG,1
  40 | 1955,NYY,1
  41 | 1955,PHI,1
  42 | 1955,PIT,1
  43 | 1955,STL,1
  44 | 1956,BAL,1
  45 | 1956,LAD,1
  46 | 1956,CHW,1
  47 | 1956,CHC,1
  48 | 1956,CIN,1
  49 | 1956,CLE,1
  50 | 1956,OAK,1
  51 | 1956,ATL,1
  52 | 1956,SFG,1
  53 | 1956,NYY,1
  54 | 1956,PHI,1
  55 | 1956,PIT,1
  56 | 1956,STL,1
  57 | 1956,MIN,1
  58 | 1957,BAL,1
  59 | 1957,BOS,1
  60 | 1957,LAD,1
  61 | 1957,CHW,1
  62 | 1957,CHC,1
  63 | 1957,CLE,1
  64 | 1957,DET,1
  65 | 1957,OAK,1
  66 | 1957,ATL,1
  67 | 1957,SFG,1
  68 | 1957,NYY,1
  69 | 1957,PHI,1
  70 | 1957,PIT,1
  71 | 1957,STL,1
  72 | 1957,MIN,1
  73 | 1958,BAL,1
  74 | 1958,BOS,1
  75 | 1958,CHW,1
  76 | 1958,CHC,1
  77 | 1958,CIN,1
  78 | 1958,CLE,1
  79 | 1958,DET,1
  80 | 1958,OAK,1
  81 | 1958,LAD,1
  82 | 1958,ATL,1
  83 | 1958,NYY,1
  84 | 1958,PHI,1
  85 | 1958,PIT,1
  86 | 1958,SFG,1
  87 | 1958,STL,1
  88 | 1958,MIN,1
  89 | 1959,BAL,1
  90 | 1959,BOS,1
  91 | 1959,CHW,1
  92 | 1959,CHC,1
  93 | 1959,CIN,1
  94 | 1959,CLE,1
  95 | 1959,DET,1
  96 | 1959,OAK,1
  97 | 1959,ATL,1
  98 | 1959,NYY,1
  99 | 1959,PHI,1
 100 | 1959,PIT,1
 101 | 1959,SFG,1
 102 | 1959,STL,1
 103 | 1959,MIN,1
 104 | 1960,BAL,1
 105 | 1960,BOS,1
 106 | 1960,CHW,1
 107 | 1960,CHC,1
 108 | 1960,CIN,1
 109 | 1960,CLE,1
 110 | 1960,DET,1
 111 | 1960,OAK,1
 112 | 1960,LAD,1
 113 | 1960,ATL,1
 114 | 1960,NYY,1
 115 | 1960,PHI,1
 116 | 1960,PIT,1
 117 | 1960,SFG,1
 118 | 1960,STL,1
 119 | 1960,MIN,1
 120 | 1961,BAL,1
 121 | 1961,BOS,1
 122 | 1961,CHW,1
 123 | 1961,CIN,1
 124 | 1961,CLE,1
 125 | 1961,DET,1
 126 | 1961,OAK,1
 127 | 1961,MIN,1
 128 | 1961,ATL,1
 129 | 1961,NYY,1
 130 | 1961,PHI,1
 131 | 1961,PIT,1
 132 | 1961,SFG,1
 133 | 1961,STL,1
 134 | 1961,TEX,1
 135 | 1962,BAL,1
 136 | 1962,BOS,1
 137 | 1962,CHW,1
 138 | 1962,CHC,1
 139 | 1962,CIN,1
 140 | 1962,CLE,1
 141 | 1962,DET,1
 142 | 1962,HOU,1
 143 | 1962,OAK,1
 144 | 1962,ANA,1
 145 | 1962,LAD,1
 146 | 1962,MIN,1
 147 | 1962,ATL,1
 148 | 1962,NYY,1
 149 | 1962,NYM,1
 150 | 1962,PHI,1
 151 | 1962,PIT,1
 152 | 1962,SFG,1
 153 | 1962,STL,1
 154 | 1962,TEX,1
 155 | 1963,BAL,1
 156 | 1963,CHW,1
 157 | 1963,CHC,1
 158 | 1963,CIN,1
 159 | 1963,DET,1
 160 | 1963,HOU,1
 161 | 1963,OAK,1
 162 | 1963,ANA,1
 163 | 1963,LAD,1
 164 | 1963,MIN,1
 165 | 1963,ATL,1
 166 | 1963,NYY,1
 167 | 1963,NYM,1
 168 | 1963,PHI,1
 169 | 1963,PIT,1
 170 | 1963,SFG,1
 171 | 1963,STL,1
 172 | 1963,TEX,1
 173 | 1964,BAL,1
 174 | 1964,CHW,1
 175 | 1964,CHC,1
 176 | 1964,CIN,1
 177 | 1964,DET,1
 178 | 1964,HOU,1
 179 | 1964,ANA,1
 180 | 1964,LAD,1
 181 | 1964,ATL,1
 182 | 1964,NYY,1
 183 | 1964,NYM,1
 184 | 1964,PHI,1
 185 | 1964,PIT,1
 186 | 1964,SFG,1
 187 | 1964,STL,1
 188 | 1964,TEX,1
 189 | 1965,BAL,1
 190 | 1965,ANA,1
 191 | 1965,CHW,1
 192 | 1965,CHC,1
 193 | 1965,CLE,1
 194 | 1965,DET,1
 195 | 1965,HOU,1
 196 | 1965,OAK,1
 197 | 1965,LAD,1
 198 | 1965,MIN,1
 199 | 1965,ATL,1
 200 | 1965,NYY,1
 201 | 1965,NYM,1
 202 | 1965,PIT,1
 203 | 1965,SFG,1
 204 | 1965,STL,1
 205 | 1965,TEX,1
 206 | 1966,ATL,1
 207 | 1966,BAL,1
 208 | 1966,BOS,1
 209 | 1966,ANA,1
 210 | 1966,CHW,1
 211 | 1966,CLE,1
 212 | 1966,HOU,1
 213 | 1966,OAK,1
 214 | 1966,LAD,1
 215 | 1966,MIN,1
 216 | 1966,NYY,1
 217 | 1966,NYM,1
 218 | 1966,PHI,1
 219 | 1966,PIT,1
 220 | 1966,SFG,1
 221 | 1966,STL,1
 222 | 1966,TEX,1
 223 | 1967,ATL,1
 224 | 1967,BAL,1
 225 | 1967,BOS,1
 226 | 1967,ANA,1
 227 | 1967,CHW,1
 228 | 1967,CHC,1
 229 | 1967,CIN,1
 230 | 1967,CLE,1
 231 | 1967,DET,1
 232 | 1967,HOU,1
 233 | 1967,OAK,1
 234 | 1967,LAD,1
 235 | 1967,MIN,1
 236 | 1967,NYY,1
 237 | 1967,NYM,1
 238 | 1967,PHI,1
 239 | 1967,PIT,1
 240 | 1967,SFG,1
 241 | 1967,STL,1
 242 | 1967,TEX,1
 243 | 1968,ATL,1
 244 | 1968,BAL,1
 245 | 1968,BOS,1
 246 | 1968,ANA,1
 247 | 1968,CHW,1
 248 | 1968,CHC,1
 249 | 1968,CIN,1
 250 | 1968,CLE,1
 251 | 1968,DET,1
 252 | 1968,HOU,1
 253 | 1968,LAD,1
 254 | 1968,MIN,1
 255 | 1968,NYY,1
 256 | 1968,NYM,1
 257 | 1968,OAK,1
 258 | 1968,PHI,1
 259 | 1968,PIT,1
 260 | 1968,SFG,1
 261 | 1968,STL,1
 262 | 1968,TEX,1
 263 | 1969,ATL,1
 264 | 1969,BAL,1
 265 | 1969,BOS,1
 266 | 1969,ANA,1
 267 | 1969,CHW,1
 268 | 1969,CHC,1
 269 | 1969,CLE,1
 270 | 1969,DET,1
 271 | 1969,KCR,1
 272 | 1969,LAD,1
 273 | 1969,MIN,1
 274 | 1969,WSN,1
 275 | 1969,NYY,1
 276 | 1969,NYM,1
 277 | 1969,OAK,1
 278 | 1969,SDP,1
 279 | 1969,MIL,1
 280 | 1969,SFG,1
 281 | 1969,STL,1
 282 | 1969,TEX,1
 283 | 1970,ATL,1
 284 | 1970,BAL,1
 285 | 1970,BOS,1
 286 | 1970,ANA,1
 287 | 1970,CHW,1
 288 | 1970,CHC,1
 289 | 1970,CIN,1
 290 | 1970,CLE,1
 291 | 1970,DET,1
 292 | 1970,HOU,1
 293 | 1970,KCR,1
 294 | 1970,LAD,1
 295 | 1970,MIN,1
 296 | 1970,MIL,1
 297 | 1970,WSN,1
 298 | 1970,NYY,1
 299 | 1970,OAK,1
 300 | 1970,PIT,1
 301 | 1970,TEX,1
 302 | 1971,ATL,1
 303 | 1971,BAL,1
 304 | 1971,BOS,1
 305 | 1971,ANA,1
 306 | 1971,CHW,1
 307 | 1971,CHC,1
 308 | 1971,CIN,1
 309 | 1971,CLE,1
 310 | 1971,DET,1
 311 | 1971,HOU,1
 312 | 1971,KCR,1
 313 | 1971,LAD,1
 314 | 1971,MIN,1
 315 | 1971,MIL,1
 316 | 1971,WSN,1
 317 | 1971,NYY,1
 318 | 1971,NYM,1
 319 | 1971,OAK,1
 320 | 1971,PHI,1
 321 | 1971,PIT,1
 322 | 1971,SDP,1
 323 | 1971,SFG,1
 324 | 1971,STL,1
 325 | 1971,TEX,1
 326 | 1972,ATL,1
 327 | 1972,BAL,1
 328 | 1972,BOS,1
 329 | 1972,ANA,1
 330 | 1972,CHW,1
 331 | 1972,CHC,1
 332 | 1972,CIN,1
 333 | 1972,CLE,1
 334 | 1972,DET,1
 335 | 1972,KCR,1
 336 | 1972,LAD,1
 337 | 1972,MIN,1
 338 | 1972,MIL,1
 339 | 1972,WSN,1
 340 | 1972,NYM,1
 341 | 1972,OAK,1
 342 | 1972,PHI,1
 343 | 1972,PIT,1
 344 | 1972,SDP,1
 345 | 1972,SFG,1
 346 | 1972,STL,1
 347 | 1972,TEX,1
 348 | 1973,ATL,1
 349 | 1973,BAL,1
 350 | 1973,BOS,1
 351 | 1973,ANA,1
 352 | 1973,CHW,1
 353 | 1973,CHC,1
 354 | 1973,CIN,1
 355 | 1973,CLE,1
 356 | 1973,DET,1
 357 | 1973,HOU,1
 358 | 1973,KCR,1
 359 | 1973,LAD,1
 360 | 1973,MIN,1
 361 | 1973,MIL,1
 362 | 1973,WSN,1
 363 | 1973,NYY,1
 364 | 1973,NYM,1
 365 | 1973,OAK,1
 366 | 1973,PHI,1
 367 | 1973,PIT,1
 368 | 1973,SDP,1
 369 | 1973,SFG,1
 370 | 1973,STL,1
 371 | 1973,TEX,1
 372 | 1974,ATL,1
 373 | 1974,BAL,1
 374 | 1974,BOS,1
 375 | 1974,ANA,1
 376 | 1974,CHW,1
 377 | 1974,CHC,1
 378 | 1974,CIN,1
 379 | 1974,CLE,1
 380 | 1974,DET,1
 381 | 1974,HOU,1
 382 | 1974,KCR,1
 383 | 1974,LAD,1
 384 | 1974,MIN,1
 385 | 1974,MIL,1
 386 | 1974,WSN,1
 387 | 1974,NYY,1
 388 | 1974,NYM,1
 389 | 1974,OAK,1
 390 | 1974,PHI,1
 391 | 1974,PIT,1
 392 | 1974,SDP,1
 393 | 1974,SFG,1
 394 | 1974,STL,1
 395 | 1974,TEX,1
 396 | 1975,ATL,1
 397 | 1975,BAL,1
 398 | 1975,BOS,1
 399 | 1975,ANA,1
 400 | 1975,CHW,1
 401 | 1975,CHC,1
 402 | 1975,CIN,1
 403 | 1975,CLE,1
 404 | 1975,DET,1
 405 | 1975,HOU,1
 406 | 1975,KCR,1
 407 | 1975,LAD,1
 408 | 1975,MIN,1
 409 | 1975,MIL,1
 410 | 1975,WSN,1
 411 | 1975,NYY,1
 412 | 1975,NYM,1
 413 | 1975,OAK,1
 414 | 1975,PHI,1
 415 | 1975,PIT,1
 416 | 1975,SDP,1
 417 | 1975,SFG,1
 418 | 1975,STL,1
 419 | 1975,TEX,1
 420 | 1976,ATL,1
 421 | 1976,BAL,1
 422 | 1976,BOS,1
 423 | 1976,ANA,1
 424 | 1976,CHW,1
 425 | 1976,CHC,1
 426 | 1976,CIN,1
 427 | 1976,CLE,1
 428 | 1976,DET,1
 429 | 1976,HOU,1
 430 | 1976,LAD,1
 431 | 1976,MIN,1
 432 | 1976,MIL,1
 433 | 1976,WSN,1
 434 | 1976,NYM,1
 435 | 1976,OAK,1
 436 | 1976,PHI,1
 437 | 1976,PIT,1
 438 | 1976,SFG,1
 439 | 1976,STL,1
 440 | 1976,TEX,1
 441 | 1977,BAL,1
 442 | 1977,BOS,1
 443 | 1977,ANA,1
 444 | 1977,CHC,1
 445 | 1977,CIN,1
 446 | 1977,CLE,1
 447 | 1977,DET,1
 448 | 1977,HOU,1
 449 | 1977,KCR,1
 450 | 1977,LAD,1
 451 | 1977,MIL,1
 452 | 1977,WSN,1
 453 | 1977,NYY,1
 454 | 1977,NYM,1
 455 | 1977,OAK,1
 456 | 1977,PHI,1
 457 | 1977,PIT,1
 458 | 1977,SEA,1
 459 | 1977,SFG,1
 460 | 1977,STL,1
 461 | 1977,TEX,1
 462 | 1977,TOR,1
 463 | 1978,ATL,1
 464 | 1978,BAL,1
 465 | 1978,BOS,1
 466 | 1978,ANA,1
 467 | 1978,CHW,1
 468 | 1978,CHC,1
 469 | 1978,CIN,1
 470 | 1978,CLE,1
 471 | 1978,DET,1
 472 | 1978,HOU,1
 473 | 1978,LAD,1
 474 | 1978,MIN,1
 475 | 1978,MIL,1
 476 | 1978,WSN,1
 477 | 1978,NYY,1
 478 | 1978,NYM,1
 479 | 1978,OAK,1
 480 | 1978,PHI,1
 481 | 1978,PIT,1
 482 | 1978,SDP,1
 483 | 1978,SEA,1
 484 | 1978,SFG,1
 485 | 1978,STL,1
 486 | 1978,TEX,1
 487 | 1978,TOR,1
 488 | 1979,ATL,1
 489 | 1979,BAL,1
 490 | 1979,ANA,1
 491 | 1979,CHC,1
 492 | 1979,CIN,1
 493 | 1979,CLE,1
 494 | 1979,DET,1
 495 | 1979,HOU,1
 496 | 1979,LAD,1
 497 | 1979,WSN,1
 498 | 1979,NYY,1
 499 | 1979,NYM,1
 500 | 1979,OAK,1
 501 | 1979,PHI,1
 502 | 1979,PIT,1
 503 | 1979,SDP,1
 504 | 1979,SFG,1
 505 | 1979,STL,1
 506 | 1979,TEX,1
 507 | 1980,ATL,1
 508 | 1980,BAL,1
 509 | 1980,ANA,1
 510 | 1980,CHW,1
 511 | 1980,CHC,1
 512 | 1980,CIN,1
 513 | 1980,DET,1
 514 | 1980,HOU,1
 515 | 1980,LAD,1
 516 | 1980,MIN,1
 517 | 1980,WSN,1
 518 | 1980,NYY,1
 519 | 1980,NYM,1
 520 | 1980,OAK,1
 521 | 1980,PHI,1
 522 | 1980,PIT,1
 523 | 1980,SDP,1
 524 | 1980,SEA,1
 525 | 1980,SFG,1
 526 | 1980,STL,1
 527 | 1980,TOR,1
 528 | 1981,ATL,1
 529 | 1981,BAL,1
 530 | 1981,BOS,1
 531 | 1981,ANA,1
 532 | 1981,CHW,1
 533 | 1981,CHC,1
 534 | 1981,CIN,1
 535 | 1981,CLE,1
 536 | 1981,DET,1
 537 | 1981,HOU,1
 538 | 1981,LAD,1
 539 | 1981,MIN,1
 540 | 1981,MIL,1
 541 | 1981,WSN,1
 542 | 1981,NYY,1
 543 | 1981,NYM,1
 544 | 1981,OAK,1
 545 | 1981,PHI,1
 546 | 1981,PIT,1
 547 | 1981,SDP,1
 548 | 1981,SEA,1
 549 | 1981,SFG,1
 550 | 1981,STL,1
 551 | 1981,TEX,1
 552 | 1981,TOR,1
 553 | 1982,ATL,1
 554 | 1982,BAL,1
 555 | 1982,BOS,1
 556 | 1982,ANA,1
 557 | 1982,CHW,1
 558 | 1982,CHC,1
 559 | 1982,CIN,1
 560 | 1982,CLE,1
 561 | 1982,DET,1
 562 | 1982,HOU,1
 563 | 1982,LAD,1
 564 | 1982,MIN,1
 565 | 1982,WSN,1
 566 | 1982,NYY,1
 567 | 1982,NYM,1
 568 | 1982,OAK,1
 569 | 1982,PHI,1
 570 | 1982,PIT,1
 571 | 1982,SDP,1
 572 | 1982,SEA,1
 573 | 1982,SFG,1
 574 | 1982,STL,1
 575 | 1982,TEX,1
 576 | 1982,TOR,1
 577 | 1983,ATL,1
 578 | 1983,BAL,1
 579 | 1983,BOS,1
 580 | 1983,CHW,1
 581 | 1983,CHC,1
 582 | 1983,CIN,1
 583 | 1983,CLE,1
 584 | 1983,DET,1
 585 | 1983,HOU,1
 586 | 1983,LAD,1
 587 | 1983,MIN,1
 588 | 1983,WSN,1
 589 | 1983,NYY,1
 590 | 1983,NYM,1
 591 | 1983,OAK,1
 592 | 1983,PHI,1
 593 | 1983,PIT,1
 594 | 1983,SDP,1
 595 | 1983,SEA,1
 596 | 1983,SFG,1
 597 | 1983,STL,1
 598 | 1983,TEX,1
 599 | 1983,TOR,1
 600 | 1984,ATL,1
 601 | 1984,BAL,1
 602 | 1984,BOS,1
 603 | 1984,ANA,1
 604 | 1984,CHW,1
 605 | 1984,CHC,1
 606 | 1984,CIN,1
 607 | 1984,CLE,1
 608 | 1984,DET,1
 609 | 1984,HOU,1
 610 | 1984,KCR,1
 611 | 1984,LAD,1
 612 | 1984,MIN,1
 613 | 1984,MIL,1
 614 | 1984,WSN,1
 615 | 1984,NYY,1
 616 | 1984,NYM,1
 617 | 1984,OAK,1
 618 | 1984,PIT,1
 619 | 1984,SDP,1
 620 | 1984,SEA,1
 621 | 1984,STL,1
 622 | 1984,TEX,1
 623 | 1984,TOR,1
 624 | 1985,ATL,1
 625 | 1985,BAL,1
 626 | 1985,BOS,1
 627 | 1985,ANA,1
 628 | 1985,CHW,1
 629 | 1985,CHC,1
 630 | 1985,CIN,1
 631 | 1985,CLE,1
 632 | 1985,DET,1
 633 | 1985,HOU,1
 634 | 1985,KCR,1
 635 | 1985,LAD,1
 636 | 1985,MIN,1
 637 | 1985,MIL,1
 638 | 1985,WSN,1
 639 | 1985,NYY,1
 640 | 1985,NYM,1
 641 | 1985,OAK,1
 642 | 1985,PHI,1
 643 | 1985,PIT,1
 644 | 1985,SDP,1
 645 | 1985,SEA,1
 646 | 1985,SFG,1
 647 | 1985,STL,1
 648 | 1985,TEX,1
 649 | 1985,TOR,1
 650 | 1986,ATL,1
 651 | 1986,BAL,1
 652 | 1986,BOS,1
 653 | 1986,ANA,1
 654 | 1986,CHW,1
 655 | 1986,CIN,1
 656 | 1986,CLE,1
 657 | 1986,DET,1
 658 | 1986,HOU,1
 659 | 1986,KCR,1
 660 | 1986,LAD,1
 661 | 1986,MIL,1
 662 | 1986,NYY,1
 663 | 1986,NYM,1
 664 | 1986,OAK,1
 665 | 1986,PIT,1
 666 | 1986,SDP,1
 667 | 1986,SFG,1
 668 | 1986,STL,1
 669 | 1987,ATL,1
 670 | 1987,CHW,1
 671 | 1987,KCR,1
 672 | 1987,LAD,1
 673 | 1987,PIT,1
 674 | 1987,SDP,1
 675 | 1987,SEA,1
 676 | 1987,STL,1
 677 | 1988,ATL,1
 678 | 1988,BAL,1
 679 | 1988,BOS,1
 680 | 1988,ANA,1
 681 | 1988,CHW,1
 682 | 1988,CHC,1
 683 | 1988,CIN,1
 684 | 1988,CLE,1
 685 | 1988,DET,1
 686 | 1988,HOU,1
 687 | 1988,KCR,1
 688 | 1988,LAD,1
 689 | 1988,MIN,1
 690 | 1988,MIL,1
 691 | 1988,WSN,1
 692 | 1988,NYM,1
 693 | 1988,OAK,1
 694 | 1988,PHI,1
 695 | 1988,PIT,1
 696 | 1988,SDP,1
 697 | 1988,SEA,1
 698 | 1988,SFG,1
 699 | 1988,STL,1
 700 | 1988,TEX,1
 701 | 1988,TOR,1
 702 | 1989,ATL,1
 703 | 1989,BAL,1
 704 | 1989,BOS,1
 705 | 1989,ANA,1
 706 | 1989,CHW,1
 707 | 1989,CHC,1
 708 | 1989,CIN,1
 709 | 1989,CLE,1
 710 | 1989,DET,1
 711 | 1989,HOU,1
 712 | 1989,KCR,1
 713 | 1989,LAD,1
 714 | 1989,MIN,1
 715 | 1989,MIL,1
 716 | 1989,WSN,1
 717 | 1989,NYY,1
 718 | 1989,NYM,1
 719 | 1989,OAK,1
 720 | 1989,PHI,1
 721 | 1989,PIT,1
 722 | 1989,SDP,1
 723 | 1989,SEA,1
 724 | 1989,SFG,1
 725 | 1989,STL,1
 726 | 1989,TOR,1
 727 | 1990,BAL,1
 728 | 1990,BOS,1
 729 | 1990,ANA,1
 730 | 1990,CHW,1
 731 | 1990,CHC,1
 732 | 1990,CIN,1
 733 | 1990,CLE,1
 734 | 1990,DET,1
 735 | 1990,HOU,1
 736 | 1990,KCR,1
 737 | 1990,LAD,1
 738 | 1990,MIN,1
 739 | 1990,MIL,1
 740 | 1990,WSN,1
 741 | 1990,NYY,1
 742 | 1990,OAK,1
 743 | 1990,PHI,1
 744 | 1990,PIT,1
 745 | 1990,SDP,1
 746 | 1990,SEA,1
 747 | 1990,SFG,1
 748 | 1990,STL,1
 749 | 1990,TOR,1
 750 | 1991,ATL,1
 751 | 1991,BOS,1
 752 | 1991,ANA,1
 753 | 1991,CHW,1
 754 | 1991,CHC,1
 755 | 1991,CLE,1
 756 | 1991,LAD,1
 757 | 1991,MIN,1
 758 | 1991,MIL,1
 759 | 1991,WSN,1
 760 | 1991,NYY,1
 761 | 1991,NYM,1
 762 | 1991,OAK,1
 763 | 1991,PHI,1
 764 | 1991,PIT,1
 765 | 1991,SDP,1
 766 | 1991,SEA,1
 767 | 1991,SFG,1
 768 | 1991,STL,1
 769 | 1991,TOR,1
 770 | 1992,ATL,1
 771 | 1992,BAL,1
 772 | 1992,BOS,1
 773 | 1992,ANA,1
 774 | 1992,CHW,1
 775 | 1992,CHC,1
 776 | 1992,CIN,1
 777 | 1992,CLE,1
 778 | 1992,DET,1
 779 | 1992,HOU,1
 780 | 1992,KCR,1
 781 | 1992,LAD,1
 782 | 1992,MIN,1
 783 | 1992,MIL,1
 784 | 1992,WSN,1
 785 | 1992,NYY,1
 786 | 1992,NYM,1
 787 | 1992,OAK,1
 788 | 1992,PHI,1
 789 | 1992,PIT,1
 790 | 1992,SDP,1
 791 | 1992,SEA,1
 792 | 1992,SFG,1
 793 | 1992,STL,1
 794 | 1992,TOR,1
 795 | 1993,ATL,1
 796 | 1993,BAL,1
 797 | 1993,BOS,1
 798 | 1993,ANA,1
 799 | 1993,CHW,1
 800 | 1993,CHC,1
 801 | 1993,FLA,1
 802 | 1993,HOU,1
 803 | 1993,KCR,1
 804 | 1993,LAD,1
 805 | 1993,MIN,1
 806 | 1993,MIL,1
 807 | 1993,WSN,1
 808 | 1993,NYM,1
 809 | 1993,SFG,1
 810 | 1993,STL,1
 811 | 1994,SFG,1
 812 | 1995,KCR,1
 813 | 1995,STL,1
 814 | 2002,ANA,1
 815 | 2002,DET,1
 816 | 2002,SFG,1
 817 | 2002,STL,1
 818 | 2003,ANA,1
 819 | 2003,NYM,1
 820 | 2003,OAK,1
 821 | 2005,MIN,1
 822 | 2005,OAK,1
 823 | 2005,SEA,1
 824 | 2005,SFG,1
 825 | 2005,STL,1
 826 | 1871,BNA,2
 827 | 1871,CNA,2
 828 | 1871,CFC,2
 829 | 1871,KEK,2
 830 | 1871,NNA,2
 831 | 1871,PNA,2
 832 | 1871,ROK,2
 833 | 1871,TRO,2
 834 | 1871,OLY,2
 835 | 1872,BLC,2
 836 | 1872,ECK,2
 837 | 1872,BRA,2
 838 | 1872,BNA,2
 839 | 1872,CFC,2
 840 | 1872,MAN,2
 841 | 1872,NNA,2
 842 | 1872,PNA,2
 843 | 1872,TRO,2
 844 | 1872,OLY,2
 845 | 1872,NAT,2
 846 | 1873,BLC,2
 847 | 1873,MAR,2
 848 | 1873,BRA,2
 849 | 1873,BNA,2
 850 | 1873,RES,2
 851 | 1873,NNA,2
 852 | 1873,PNA,2
 853 | 1873,PWS,2
 854 | 1873,WBL,2
 855 | 1898,LAD,3
 856 | 1898,ATL,3
 857 | 1898,STL,3
 858 | 1899,ATL,3
 859 | 1901,LAD,3
 860 | 1901,CIN,3
 861 | 1902,BOS,3
 862 | 1902,CLE,3
 863 | 1911,ATL,3
 864 | 1912,ATL,3
 865 | 1912,STL,3
 866 | 1919,PHI,3
 867 | 1920,BOS,3
 868 | 1920,CHW,3
 869 | 1920,CLE,3
 870 | 1920,DET,3
 871 | 1920,NYY,3
 872 | 1920,OAK,3
 873 | 1920,PHI,3
 874 | 1920,BAL,3
 875 | 1920,STL,3
 876 | 1920,MIN,3
 877 | 1921,BOS,3
 878 | 1921,LAD,3
 879 | 1921,ATL,3
 880 | 1921,CHW,3
 881 | 1921,CHC,3
 882 | 1921,CLE,3
 883 | 1921,DET,3
 884 | 1921,SFG,3
 885 | 1921,NYY,3
 886 | 1921,OAK,3
 887 | 1921,PHI,3
 888 | 1921,PIT,3
 889 | 1921,BAL,3
 890 | 1921,STL,3
 891 | 1921,MIN,3
 892 | 1922,BOS,3
 893 | 1922,LAD,3
 894 | 1922,ATL,3
 895 | 1922,CHW,3
 896 | 1922,CHC,3
 897 | 1922,CIN,3
 898 | 1922,CLE,3
 899 | 1922,DET,3
 900 | 1922,SFG,3
 901 | 1922,NYY,3
 902 | 1922,OAK,3
 903 | 1922,PHI,3
 904 | 1922,PIT,3
 905 | 1922,BAL,3
 906 | 1922,STL,3
 907 | 1922,MIN,3
 908 | 1923,BOS,3
 909 | 1923,LAD,3
 910 | 1923,ATL,3
 911 | 1923,CHW,3
 912 | 1923,CHC,3
 913 | 1923,CIN,3
 914 | 1923,CLE,3
 915 | 1923,DET,3
 916 | 1923,SFG,3
 917 | 1923,NYY,3
 918 | 1923,OAK,3
 919 | 1923,PHI,3
 920 | 1923,PIT,3
 921 | 1923,BAL,3
 922 | 1923,STL,3
 923 | 1923,MIN,3
 924 | 1924,BOS,3
 925 | 1924,LAD,3
 926 | 1924,ATL,3
 927 | 1924,CHW,3
 928 | 1924,CHC,3
 929 | 1924,CIN,3
 930 | 1924,CLE,3
 931 | 1924,DET,3
 932 | 1924,SFG,3
 933 | 1924,NYY,3
 934 | 1924,OAK,3
 935 | 1924,PHI,3
 936 | 1924,PIT,3
 937 | 1924,BAL,3
 938 | 1924,STL,3
 939 | 1924,MIN,3
 940 | 1925,BOS,3
 941 | 1925,LAD,3
 942 | 1925,ATL,3
 943 | 1925,CHW,3
 944 | 1925,CHC,3
 945 | 1925,CIN,3
 946 | 1925,CLE,3
 947 | 1925,DET,3
 948 | 1925,SFG,3
 949 | 1925,NYY,3
 950 | 1925,OAK,3
 951 | 1925,PHI,3
 952 | 1925,PIT,3
 953 | 1925,BAL,3
 954 | 1925,STL,3
 955 | 1925,MIN,3
 956 | 1926,BOS,3
 957 | 1926,ATL,3
 958 | 1926,CHW,3
 959 | 1926,CIN,3
 960 | 1926,CLE,3
 961 | 1926,DET,3
 962 | 1926,SFG,3
 963 | 1926,NYY,3
 964 | 1926,PHI,3
 965 | 1926,PIT,3
 966 | 1926,BAL,3
 967 | 1926,STL,3
 968 | 1926,MIN,3
 969 | 1927,BOS,3
 970 | 1927,ATL,3
 971 | 1927,CHW,3
 972 | 1927,CHC,3
 973 | 1927,CIN,3
 974 | 1927,CLE,3
 975 | 1927,DET,3
 976 | 1927,SFG,3
 977 | 1927,NYY,3
 978 | 1927,OAK,3
 979 | 1927,PHI,3
 980 | 1927,PIT,3
 981 | 1927,BAL,3
 982 | 1927,STL,3
 983 | 1927,MIN,3
 984 | 1928,BOS,3
 985 | 1928,ATL,3
 986 | 1928,CHW,3
 987 | 1928,CHC,3
 988 | 1928,CIN,3
 989 | 1928,CLE,3
 990 | 1928,DET,3
 991 | 1928,SFG,3
 992 | 1928,NYY,3
 993 | 1928,OAK,3
 994 | 1928,PHI,3
 995 | 1928,PIT,3
 996 | 1928,BAL,3
 997 | 1928,STL,3
 998 | 1928,MIN,3
 999 | 1929,BOS,3
1000 | 1929,LAD,3
1001 | 1929,ATL,3
1002 | 1929,CHW,3
1003 | 1929,CHC,3
1004 | 1929,CIN,3
1005 | 1929,CLE,3
1006 | 1929,DET,3
1007 | 1929,SFG,3
1008 | 1929,NYY,3
1009 | 1929,OAK,3
1010 | 1929,PHI,3
1011 | 1929,PIT,3
1012 | 1929,BAL,3
1013 | 1929,STL,3
1014 | 1929,MIN,3
1015 | 1930,BOS,3
1016 | 1930,LAD,3
1017 | 1930,ATL,3
1018 | 1930,CHW,3
1019 | 1930,CHC,3
1020 | 1930,CIN,3
1021 | 1930,CLE,3
1022 | 1930,DET,3
1023 | 1930,SFG,3
1024 | 1930,NYY,3
1025 | 1930,OAK,3
1026 | 1930,PIT,3
1027 | 1930,BAL,3
1028 | 1930,STL,3
1029 | 1930,MIN,3
1030 | 1931,BOS,3
1031 | 1931,LAD,3
1032 | 1931,CHW,3
1033 | 1931,CHC,3
1034 | 1931,CIN,3
1035 | 1931,CLE,3
1036 | 1931,DET,3
1037 | 1931,SFG,3
1038 | 1931,NYY,3
1039 | 1931,OAK,3
1040 | 1931,PHI,3
1041 | 1931,PIT,3
1042 | 1931,BAL,3
1043 | 1931,STL,3
1044 | 1931,MIN,3
1045 | 1932,BOS,3
1046 | 1932,LAD,3
1047 | 1932,ATL,3
1048 | 1932,CHW,3
1049 | 1932,CHC,3
1050 | 1932,CIN,3
1051 | 1932,CLE,3
1052 | 1932,DET,3
1053 | 1932,SFG,3
1054 | 1932,NYY,3
1055 | 1932,OAK,3
1056 | 1932,PHI,3
1057 | 1932,PIT,3
1058 | 1932,BAL,3
1059 | 1932,STL,3
1060 | 1932,MIN,3
1061 | 1933,BOS,3
1062 | 1933,LAD,3
1063 | 1933,CHW,3
1064 | 1933,CLE,3
1065 | 1933,DET,3
1066 | 1933,NYY,3
1067 | 1933,OAK,3
1068 | 1933,PHI,3
1069 | 1933,PIT,3
1070 | 1933,BAL,3
1071 | 1933,STL,3
1072 | 1933,MIN,3
1073 | 1934,BOS,3
1074 | 1934,LAD,3
1075 | 1934,ATL,3
1076 | 1934,CHW,3
1077 | 1934,CHC,3
1078 | 1934,CIN,3
1079 | 1934,CLE,3
1080 | 1934,DET,3
1081 | 1934,SFG,3
1082 | 1934,NYY,3
1083 | 1934,OAK,3
1084 | 1934,PHI,3
1085 | 1934,PIT,3
1086 | 1934,BAL,3
1087 | 1934,STL,3
1088 | 1934,MIN,3
1089 | 1935,BOS,3
1090 | 1935,LAD,3
1091 | 1935,ATL,3
1092 | 1935,CHW,3
1093 | 1935,CHC,3
1094 | 1935,CIN,3
1095 | 1935,CLE,3
1096 | 1935,DET,3
1097 | 1935,SFG,3
1098 | 1935,NYY,3
1099 | 1935,OAK,3
1100 | 1935,PHI,3
1101 | 1935,PIT,3
1102 | 1935,BAL,3
1103 | 1935,STL,3
1104 | 1935,MIN,3
1105 | 1936,BOS,3
1106 | 1936,LAD,3
1107 | 1936,ATL,3
1108 | 1936,CHW,3
1109 | 1936,CHC,3
1110 | 1936,CIN,3
1111 | 1936,CLE,3
1112 | 1936,DET,3
1113 | 1936,SFG,3
1114 | 1936,NYY,3
1115 | 1936,OAK,3
1116 | 1936,PHI,3
1117 | 1936,PIT,3
1118 | 1936,BAL,3
1119 | 1936,STL,3
1120 | 1936,MIN,3
1121 | 1937,BOS,3
1122 | 1937,LAD,3
1123 | 1937,CHW,3
1124 | 1937,CHC,3
1125 | 1937,CLE,3
1126 | 1937,DET,3
1127 | 1937,SFG,3
1128 | 1937,NYY,3
1129 | 1937,OAK,3
1130 | 1937,PHI,3
1131 | 1937,PIT,3
1132 | 1937,BAL,3
1133 | 1937,STL,3
1134 | 1937,MIN,3
1135 | 1938,BOS,3
1136 | 1938,LAD,3
1137 | 1938,CHW,3
1138 | 1938,CHC,3
1139 | 1938,CIN,3
1140 | 1938,CLE,3
1141 | 1938,DET,3
1142 | 1938,SFG,3
1143 | 1938,NYY,3
1144 | 1938,OAK,3
1145 | 1938,PHI,3
1146 | 1938,PIT,3
1147 | 1938,BAL,3
1148 | 1938,STL,3
1149 | 1938,MIN,3
1150 | 1939,BOS,3
1151 | 1939,LAD,3
1152 | 1939,ATL,3
1153 | 1939,CHW,3
1154 | 1939,CHC,3
1155 | 1939,CIN,3
1156 | 1939,CLE,3
1157 | 1939,DET,3
1158 | 1939,SFG,3
1159 | 1939,NYY,3
1160 | 1939,OAK,3
1161 | 1939,PHI,3
1162 | 1939,PIT,3
1163 | 1939,BAL,3
1164 | 1939,STL,3
1165 | 1939,MIN,3
1166 | 1940,BOS,3
1167 | 1940,LAD,3
1168 | 1940,ATL,3
1169 | 1940,CHW,3
1170 | 1940,CHC,3
1171 | 1940,CLE,3
1172 | 1940,DET,3
1173 | 1940,SFG,3
1174 | 1940,NYY,3
1175 | 1940,OAK,3
1176 | 1940,PIT,3
1177 | 1940,BAL,3
1178 | 1940,STL,3
1179 | 1940,MIN,3
1180 | 1941,BOS,3
1181 | 1941,LAD,3
1182 | 1941,ATL,3
1183 | 1941,CHW,3
1184 | 1941,CLE,3
1185 | 1941,DET,3
1186 | 1941,SFG,3
1187 | 1941,NYY,3
1188 | 1941,OAK,3
1189 | 1941,PHI,3
1190 | 1941,PIT,3
1191 | 1941,BAL,3
1192 | 1941,STL,3
1193 | 1941,MIN,3
1194 | 1942,BOS,3
1195 | 1942,CHC,3
1196 | 1942,OAK,3
1197 | 1942,BAL,3
1198 | 1942,MIN,3
1199 | 1943,LAD,3
1200 | 1943,SFG,3
1201 | 1944,BOS,3
1202 | 1944,LAD,3
1203 | 1944,CHC,3
1204 | 1944,CLE,3
1205 | 1944,SFG,3
1206 | 1944,NYY,3
1207 | 1944,PIT,3
1208 | 1945,BOS,3
1209 | 1945,LAD,3
1210 | 1945,ATL,3
1211 | 1945,CIN,3
1212 | 1945,SFG,3
1213 | 1945,PHI,3
1214 | 1945,PIT,3
1215 | 1945,STL,3
1216 | 1946,BAL,3
1217 | 1946,MIN,3
1218 | 1947,BOS,3
1219 | 1947,LAD,3
1220 | 1947,ATL,3
1221 | 1947,CHC,3
1222 | 1947,CIN,3
1223 | 1947,SFG,3
1224 | 1947,PIT,3
1225 | 1947,STL,3
1226 | 1948,BOS,3
1227 | 1948,ATL,3
1228 | 1948,CHW,3
1229 | 1948,CHC,3
1230 | 1948,CIN,3
1231 | 1948,CLE,3
1232 | 1948,DET,3
1233 | 1948,SFG,3
1234 | 1948,NYY,3
1235 | 1948,OAK,3
1236 | 1948,PHI,3
1237 | 1948,PIT,3
1238 | 1948,BAL,3
1239 | 1948,STL,3
1240 | 1948,MIN,3
1241 | 1949,BOS,3
1242 | 1949,LAD,3
1243 | 1949,ATL,3
1244 | 1949,CHW,3
1245 | 1949,CHC,3
1246 | 1949,CIN,3
1247 | 1949,DET,3
1248 | 1949,SFG,3
1249 | 1949,NYY,3
1250 | 1949,OAK,3
1251 | 1949,PHI,3
1252 | 1949,PIT,3
1253 | 1949,BAL,3
1254 | 1949,STL,3
1255 | 1949,MIN,3
1256 | 1950,BOS,3
1257 | 1950,LAD,3
1258 | 1950,ATL,3
1259 | 1950,CHW,3
1260 | 1950,CIN,3
1261 | 1950,DET,3
1262 | 1950,NYY,3
1263 | 1950,OAK,3
1264 | 1950,PHI,3
1265 | 1950,PIT,3
1266 | 1950,BAL,3
1267 | 1950,STL,3
1268 | 1950,MIN,3
1269 | 1951,BOS,3
1270 | 1951,LAD,3
1271 | 1951,ATL,3
1272 | 1951,CHW,3
1273 | 1951,CHC,3
1274 | 1951,DET,3
1275 | 1951,SFG,3
1276 | 1951,OAK,3
1277 | 1951,PHI,3
1278 | 1951,PIT,3
1279 | 1951,BAL,3
1280 | 1951,STL,3
1281 | 1951,MIN,3
1282 | 1952,OAK,3
1283 | 1952,BAL,3
1284 | 1953,BOS,3
1285 | 1953,CHC,3
1286 | 1953,CIN,3
1287 | 1953,DET,3
1288 | 1953,SFG,3
1289 | 1953,NYY,3
1290 | 1953,OAK,3
1291 | 1953,PHI,3
1292 | 1953,PIT,3
1293 | 1953,BAL,3
1294 | 1953,STL,3
1295 | 1954,BOS,3
1296 | 1954,CHC,3
1297 | 1954,CIN,3
1298 | 1954,OAK,3
1299 | 1954,PIT,3
1300 | 1954,STL,3
1301 | 1955,CIN,3
1302 | 1955,DET,3
1303 | 1955,OAK,3
1304 | 1955,MIN,3
1305 | 1956,BOS,3
1306 | 1956,DET,3
1307 | 1957,CIN,3
1308 | 1977,CHW,3
1309 | 1977,MIN,3
1310 | 1979,BOS,3
1311 | 1979,CHW,3
1312 | 1979,KCR,3
1313 | 1979,MIN,3
1314 | 1979,MIL,3
1315 | 1979,SEA,3
1316 | 1979,TOR,3
1317 | 1980,BOS,3
1318 | 1980,CLE,3
1319 | 1980,KCR,3
1320 | 1980,MIL,3
1321 | 1980,TEX,3
1322 | 1982,KCR,3
1323 | 1982,MIL,3
1324 | 1983,ANA,3
1325 | 1983,KCR,3
1326 | 1983,MIL,3
1327 | 1886,LAD,4
1328 | 1886,ATL,4
1329 | 1886,CHC,4
1330 | 1886,CIN,4
1331 | 1886,DTN,4
1332 | 1886,KCN,4
1333 | 1886,LOU,4
1334 | 1886,PHA,4
1335 | 1886,STL,4
1336 | 1887,BLO,4
1337 | 1887,LAD,4
1338 | 1887,ATL,4
1339 | 1887,CHC,4
1340 | 1887,CLV,4
1341 | 1887,CIN,4
1342 | 1887,DTN,4
1343 | 1887,IND,4
1344 | 1887,LOU,4
1345 | 1887,SFG,4
1346 | 1887,NYP,4
1347 | 1887,PHA,4
1348 | 1887,PHI,4
1349 | 1887,PIT,4
1350 | 1887,STL,4
1351 | 1887,WNL,4
1352 | 1888,BLO,4
1353 | 1888,CLV,4
1354 | 1888,CIN,4
1355 | 1888,DTN,4
1356 | 1888,IND,4
1357 | 1888,KCC,4
1358 | 1888,LOU,4
1359 | 1888,PHA,4
1360 | 1889,BLO,4
1361 | 1889,LAD,4
1362 | 1889,ATL,4
1363 | 1889,CHC,4
1364 | 1889,CLV,4
1365 | 1889,CLS,4
1366 | 1889,CIN,4
1367 | 1889,IND,4
1368 | 1889,KCC,4
1369 | 1889,LOU,4
1370 | 1889,SFG,4
1371 | 1889,PHA,4
1372 | 1889,PHI,4
1373 | 1889,PIT,4
1374 | 1889,STL,4
1375 | 1889,WNL,4
1376 | 1890,BFB,4
1377 | 1890,BRG,4
1378 | 1890,LAD,4
1379 | 1890,BWW,4
1380 | 1890,ATL,4
1381 | 1890,BRS,4
1382 | 1890,CHC,4
1383 | 1890,CHP,4
1384 | 1890,CIN,4
1385 | 1890,CLV,4
1386 | 1890,CLI,4
1387 | 1890,LOU,4
1388 | 1890,SFG,4
1389 | 1890,NYI,4
1390 | 1890,PHA,4
1391 | 1890,PHI,4
1392 | 1890,PHQ,4
1393 | 1890,PIT,4
1394 | 1890,PBB,4
1395 | 1890,ROC,4
1396 | 1890,STL,4
1397 | 1890,SYS,4
1398 | 1890,TLM,4
1399 | 1891,BLO,4
1400 | 1891,LAD,4
1401 | 1891,BRS,4
1402 | 1891,ATL,4
1403 | 1891,CHC,4
1404 | 1891,CIN,4
1405 | 1891,CLV,4
1406 | 1891,CLS,4
1407 | 1891,CKK,4
1408 | 1891,LOU,4
1409 | 1891,MLA,4
1410 | 1891,SFG,4
1411 | 1891,PHQ,4
1412 | 1891,PHI,4
1413 | 1891,PIT,4
1414 | 1891,STL,4
1415 | 1891,WAS,4
1416 | 1892,BLO,4
1417 | 1892,LAD,4
1418 | 1892,ATL,4
1419 | 1892,SFG,4
1420 | 1892,PHI,4
1421 | 1892,PIT,4
1422 | 1892,STL,4
1423 | 1892,WAS,4
1424 | 1893,BLO,4
1425 | 1893,LAD,4
1426 | 1893,ATL,4
1427 | 1893,CHC,4
1428 | 1893,CIN,4
1429 | 1893,CLV,4
1430 | 1893,LOU,4
1431 | 1893,SFG,4
1432 | 1893,PHI,4
1433 | 1893,PIT,4
1434 | 1893,STL,4
1435 | 1893,WAS,4
1436 | 1894,BLO,4
1437 | 1894,LAD,4
1438 | 1894,ATL,4
1439 | 1894,CHC,4
1440 | 1894,CIN,4
1441 | 1894,CLV,4
1442 | 1894,LOU,4
1443 | 1894,SFG,4
1444 | 1894,PHI,4
1445 | 1894,PIT,4
1446 | 1894,STL,4
1447 | 1894,WAS,4
1448 | 1895,BLO,4
1449 | 1895,LAD,4
1450 | 1895,ATL,4
1451 | 1895,CHC,4
1452 | 1895,CIN,4
1453 | 1895,CLV,4
1454 | 1895,LOU,4
1455 | 1895,SFG,4
1456 | 1895,PHI,4
1457 | 1895,PIT,4
1458 | 1895,STL,4
1459 | 1895,WAS,4
1460 | 1896,BLO,4
1461 | 1896,LAD,4
1462 | 1896,ATL,4
1463 | 1896,CHC,4
1464 | 1896,CIN,4
1465 | 1896,CLV,4
1466 | 1896,LOU,4
1467 | 1896,SFG,4
1468 | 1896,PHI,4
1469 | 1896,PIT,4
1470 | 1896,STL,4
1471 | 1896,WAS,4
1472 | 1897,BLO,4
1473 | 1897,LAD,4
1474 | 1897,ATL,4
1475 | 1897,CHC,4
1476 | 1897,CIN,4
1477 | 1897,CLV,4
1478 | 1897,LOU,4
1479 | 1897,SFG,4
1480 | 1897,PHI,4
1481 | 1897,PIT,4
1482 | 1897,STL,4
1483 | 1897,WAS,4
1484 | 1898,BLO,4
1485 | 1898,CHC,4
1486 | 1898,CIN,4
1487 | 1898,LOU,4
1488 | 1898,SFG,4
1489 | 1898,PHI,4
1490 | 1898,WAS,4
1491 | 1899,BLO,4
1492 | 1899,LAD,4
1493 | 1899,CHC,4
1494 | 1899,CIN,4
1495 | 1899,CLV,4
1496 | 1899,LOU,4
1497 | 1899,SFG,4
1498 | 1899,PHI,4
1499 | 1899,PIT,4
1500 | 1899,STL,4
1501 | 1899,WAS,4
1502 | 1900,LAD,4
1503 | 1900,ATL,4
1504 | 1900,CIN,4
1505 | 1900,SFG,4
1506 | 1900,PHI,4
1507 | 1900,STL,4
1508 | 1901,NYY,4
1509 | 1901,BOS,4
1510 | 1901,CHW,4
1511 | 1901,CLE,4
1512 | 1901,DET,4
1513 | 1901,BAL,4
1514 | 1901,OAK,4
1515 | 1901,STL,4
1516 | 1901,MIN,4
1517 | 1902,NYY,4
1518 | 1902,OAK,4
1519 | 1902,MIN,4
1520 | 1930,PHI,4
1521 | 1959,LAD,5
1522 | 1961,CHC,5
1523 | 1961,ANA,5
1524 | 1961,LAD,5
1525 | 1963,BOS,5
1526 | 1963,CLE,5
1527 | 1964,BOS,5
1528 | 1964,CLE,5
1529 | 1964,OAK,5
1530 | 1964,MIN,5
1531 | 1965,BOS,5
1532 | 1965,CIN,5
1533 | 1965,PHI,5
1534 | 1966,CHC,5
1535 | 1966,CIN,5
1536 | 1966,DET,5
1537 | 1969,CIN,5
1538 | 1969,HOU,5
1539 | 1969,PHI,5
1540 | 1969,PIT,5
1541 | 1970,NYM,5
1542 | 1970,PHI,5
1543 | 1970,SDP,5
1544 | 1970,SFG,5
1545 | 1970,STL,5
1546 | 1972,HOU,5
1547 | 1977,ATL,5
1548 | 1977,SDP,5
1549 | 1984,PHI,5
1550 | 1984,SFG,5
1551 | 1986,CHC,5
1552 | 1986,MIN,5
1553 | 1986,WSN,5
1554 | 1986,PHI,5
1555 | 1986,SEA,5
1556 | 1986,TEX,5
1557 | 1986,TOR,5
1558 | 1987,BAL,5
1559 | 1987,BOS,5
1560 | 1987,ANA,5
1561 | 1987,CHC,5
1562 | 1987,CIN,5
1563 | 1987,CLE,5
1564 | 1987,DET,5
1565 | 1987,HOU,5
1566 | 1987,MIN,5
1567 | 1987,MIL,5
1568 | 1987,WSN,5
1569 | 1987,NYY,5
1570 | 1987,NYM,5
1571 | 1987,OAK,5
1572 | 1987,PHI,5
1573 | 1987,SFG,5
1574 | 1987,TEX,5
1575 | 1987,TOR,5
1576 | 1988,NYY,5
1577 | 1989,TEX,5
1578 | 1990,ATL,5
1579 | 1990,NYM,5
1580 | 1990,TEX,5
1581 | 1991,BAL,5
1582 | 1991,CIN,5
1583 | 1991,DET,5
1584 | 1991,HOU,5
1585 | 1991,KCR,5
1586 | 1991,TEX,5
1587 | 1992,TEX,5
1588 | 1993,CIN,5
1589 | 1993,CLE,5
1590 | 1993,COL,5
1591 | 1993,DET,5
1592 | 1993,NYY,5
1593 | 1993,OAK,5
1594 | 1993,PHI,5
1595 | 1993,PIT,5
1596 | 1993,SDP,5
1597 | 1993,SEA,5
1598 | 1993,TEX,5
1599 | 1993,TOR,5
1600 | 1994,ATL,5
1601 | 1994,BAL,5
1602 | 1994,BOS,5
1603 | 1994,ANA,5
1604 | 1994,CHW,5
1605 | 1994,CHC,5
1606 | 1994,CIN,5
1607 | 1994,CLE,5
1608 | 1994,COL,5
1609 | 1994,DET,5
1610 | 1994,FLA,5
1611 | 1994,HOU,5
1612 | 1994,KCR,5
1613 | 1994,LAD,5
1614 | 1994,MIN,5
1615 | 1994,MIL,5
1616 | 1994,WSN,5
1617 | 1994,NYY,5
1618 | 1994,NYM,5
1619 | 1994,OAK,5
1620 | 1994,PHI,5
1621 | 1994,PIT,5
1622 | 1994,SDP,5
1623 | 1994,SEA,5
1624 | 1994,STL,5
1625 | 1994,TEX,5
1626 | 1994,TOR,5
1627 | 1995,ATL,5
1628 | 1995,BAL,5
1629 | 1995,BOS,5
1630 | 1995,ANA,5
1631 | 1995,CHW,5
1632 | 1995,CHC,5
1633 | 1995,CIN,5
1634 | 1995,CLE,5
1635 | 1995,COL,5
1636 | 1995,DET,5
1637 | 1995,FLA,5
1638 | 1995,HOU,5
1639 | 1995,LAD,5
1640 | 1995,MIN,5
1641 | 1995,MIL,5
1642 | 1995,WSN,5
1643 | 1995,NYY,5
1644 | 1995,NYM,5
1645 | 1995,OAK,5
1646 | 1995,PHI,5
1647 | 1995,PIT,5
1648 | 1995,SDP,5
1649 | 1995,SEA,5
1650 | 1995,SFG,5
1651 | 1995,TEX,5
1652 | 1995,TOR,5
1653 | 1996,ATL,5
1654 | 1996,BAL,5
1655 | 1996,BOS,5
1656 | 1996,ANA,5
1657 | 1996,CHW,5
1658 | 1996,CHC,5
1659 | 1996,CIN,5
1660 | 1996,CLE,5
1661 | 1996,COL,5
1662 | 1996,DET,5
1663 | 1996,FLA,5
1664 | 1996,HOU,5
1665 | 1996,KCR,5
1666 | 1996,LAD,5
1667 | 1996,MIN,5
1668 | 1996,MIL,5
1669 | 1996,WSN,5
1670 | 1996,NYY,5
1671 | 1996,NYM,5
1672 | 1996,OAK,5
1673 | 1996,PHI,5
1674 | 1996,PIT,5
1675 | 1996,SDP,5
1676 | 1996,SEA,5
1677 | 1996,SFG,5
1678 | 1996,STL,5
1679 | 1996,TEX,5
1680 | 1996,TOR,5
1681 | 1997,ANA,5
1682 | 1997,ATL,5
1683 | 1997,BAL,5
1684 | 1997,BOS,5
1685 | 1997,CHW,5
1686 | 1997,CHC,5
1687 | 1997,CIN,5
1688 | 1997,CLE,5
1689 | 1997,COL,5
1690 | 1997,DET,5
1691 | 1997,FLA,5
1692 | 1997,HOU,5
1693 | 1997,KCR,5
1694 | 1997,LAD,5
1695 | 1997,MIN,5
1696 | 1997,MIL,5
1697 | 1997,WSN,5
1698 | 1997,NYY,5
1699 | 1997,NYM,5
1700 | 1997,OAK,5
1701 | 1997,PHI,5
1702 | 1997,PIT,5
1703 | 1997,SDP,5
1704 | 1997,SEA,5
1705 | 1997,SFG,5
1706 | 1997,STL,5
1707 | 1997,TEX,5
1708 | 1997,TOR,5
1709 | 1998,ANA,5
1710 | 1998,ARI,5
1711 | 1998,ATL,5
1712 | 1998,BAL,5
1713 | 1998,BOS,5
1714 | 1998,CHW,5
1715 | 1998,CHC,5
1716 | 1998,CIN,5
1717 | 1998,CLE,5
1718 | 1998,COL,5
1719 | 1998,DET,5
1720 | 1998,FLA,5
1721 | 1998,HOU,5
1722 | 1998,KCR,5
1723 | 1998,LAD,5
1724 | 1998,MIL,5
1725 | 1998,MIN,5
1726 | 1998,WSN,5
1727 | 1998,NYY,5
1728 | 1998,NYM,5
1729 | 1998,OAK,5
1730 | 1998,PHI,5
1731 | 1998,PIT,5
1732 | 1998,SDP,5
1733 | 1998,SEA,5
1734 | 1998,SFG,5
1735 | 1998,STL,5
1736 | 1998,TBD,5
1737 | 1998,TEX,5
1738 | 1998,TOR,5
1739 | 1999,ANA,5
1740 | 1999,ARI,5
1741 | 1999,ATL,5
1742 | 1999,BAL,5
1743 | 1999,BOS,5
1744 | 1999,CHW,5
1745 | 1999,CHC,5
1746 | 1999,CIN,5
1747 | 1999,CLE,5
1748 | 1999,COL,5
1749 | 1999,DET,5
1750 | 1999,FLA,5
1751 | 1999,HOU,5
1752 | 1999,KCR,5
1753 | 1999,LAD,5
1754 | 1999,MIL,5
1755 | 1999,MIN,5
1756 | 1999,WSN,5
1757 | 1999,NYY,5
1758 | 1999,NYM,5
1759 | 1999,OAK,5
1760 | 1999,PHI,5
1761 | 1999,PIT,5
1762 | 1999,SDP,5
1763 | 1999,SEA,5
1764 | 1999,SFG,5
1765 | 1999,STL,5
1766 | 1999,TBD,5
1767 | 1999,TEX,5
1768 | 1999,TOR,5
1769 | 2000,ANA,5
1770 | 2000,ARI,5
1771 | 2000,ATL,5
1772 | 2000,BAL,5
1773 | 2000,BOS,5
1774 | 2000,CHW,5
1775 | 2000,CHC,5
1776 | 2000,CIN,5
1777 | 2000,CLE,5
1778 | 2000,COL,5
1779 | 2000,DET,5
1780 | 2000,FLA,5
1781 | 2000,HOU,5
1782 | 2000,KCR,5
1783 | 2000,LAD,5
1784 | 2000,MIL,5
1785 | 2000,MIN,5
1786 | 2000,WSN,5
1787 | 2000,NYY,5
1788 | 2000,NYM,5
1789 | 2000,OAK,5
1790 | 2000,PHI,5
1791 | 2000,PIT,5
1792 | 2000,SDP,5
1793 | 2000,SEA,5
1794 | 2000,SFG,5
1795 | 2000,STL,5
1796 | 2000,TBD,5
1797 | 2000,TEX,5
1798 | 2000,TOR,5
1799 | 2001,ANA,5
1800 | 2001,ARI,5
1801 | 2001,ATL,5
1802 | 2001,BAL,5
1803 | 2001,BOS,5
1804 | 2001,CHW,5
1805 | 2001,CHC,5
1806 | 2001,CIN,5
1807 | 2001,CLE,5
1808 | 2001,COL,5
1809 | 2001,DET,5
1810 | 2001,FLA,5
1811 | 2001,HOU,5
1812 | 2001,KCR,5
1813 | 2001,LAD,5
1814 | 2001,MIL,5
1815 | 2001,MIN,5
1816 | 2001,WSN,5
1817 | 2001,NYY,5
1818 | 2001,NYM,5
1819 | 2001,OAK,5
1820 | 2001,PHI,5
1821 | 2001,PIT,5
1822 | 2001,SDP,5
1823 | 2001,SEA,5
1824 | 2001,SFG,5
1825 | 2001,STL,5
1826 | 2001,TBD,5
1827 | 2001,TEX,5
1828 | 2001,TOR,5
1829 | 2002,ARI,5
1830 | 2002,ATL,5
1831 | 2002,BAL,5
1832 | 2002,BOS,5
1833 | 2002,CHW,5
1834 | 2002,CHC,5
1835 | 2002,CIN,5
1836 | 2002,CLE,5
1837 | 2002,COL,5
1838 | 2002,FLA,5
1839 | 2002,HOU,5
1840 | 2002,KCR,5
1841 | 2002,LAD,5
1842 | 2002,MIL,5
1843 | 2002,MIN,5
1844 | 2002,WSN,5
1845 | 2002,NYY,5
1846 | 2002,NYM,5
1847 | 2002,OAK,5
1848 | 2002,PHI,5
1849 | 2002,PIT,5
1850 | 2002,SDP,5
1851 | 2002,SEA,5
1852 | 2002,TBD,5
1853 | 2002,TEX,5
1854 | 2002,TOR,5
1855 | 2003,ARI,5
1856 | 2003,ATL,5
1857 | 2003,BAL,5
1858 | 2003,BOS,5
1859 | 2003,CHW,5
1860 | 2003,CHC,5
1861 | 2003,CIN,5
1862 | 2003,CLE,5
1863 | 2003,COL,5
1864 | 2003,DET,5
1865 | 2003,FLA,5
1866 | 2003,HOU,5
1867 | 2003,KCR,5
1868 | 2003,LAD,5
1869 | 2003,MIL,5
1870 | 2003,MIN,5
1871 | 2003,WSN,5
1872 | 2003,NYY,5
1873 | 2003,PHI,5
1874 | 2003,PIT,5
1875 | 2003,SDP,5
1876 | 2003,SEA,5
1877 | 2003,SFG,5
1878 | 2003,STL,5
1879 | 2003,TBD,5
1880 | 2003,TEX,5
1881 | 2003,TOR,5
1882 | 2004,ANA,5
1883 | 2004,ARI,5
1884 | 2004,ATL,5
1885 | 2004,BAL,5
1886 | 2004,BOS,5
1887 | 2004,CHW,5
1888 | 2004,CHC,5
1889 | 2004,CIN,5
1890 | 2004,CLE,5
1891 | 2004,COL,5
1892 | 2004,DET,5
1893 | 2004,FLA,5
1894 | 2004,HOU,5
1895 | 2004,KCR,5
1896 | 2004,LAD,5
1897 | 2004,MIL,5
1898 | 2004,MIN,5
1899 | 2004,WSN,5
1900 | 2004,NYY,5
1901 | 2004,NYM,5
1902 | 2004,OAK,5
1903 | 2004,PHI,5
1904 | 2004,PIT,5
1905 | 2004,SDP,5
1906 | 2004,SEA,5
1907 | 2004,SFG,5
1908 | 2004,STL,5
1909 | 2004,TBD,5
1910 | 2004,TEX,5
1911 | 2004,TOR,5
1912 | 2005,ARI,5
1913 | 2005,ATL,5
1914 | 2005,BAL,5
1915 | 2005,BOS,5
1916 | 2005,CHW,5
1917 | 2005,CHC,5
1918 | 2005,CIN,5
1919 | 2005,CLE,5
1920 | 2005,COL,5
1921 | 2005,DET,5
1922 | 2005,FLA,5
1923 | 2005,HOU,5
1924 | 2005,KCR,5
1925 | 2005,ANA,5
1926 | 2005,LAD,5
1927 | 2005,MIL,5
1928 | 2005,NYY,5
1929 | 2005,NYM,5
1930 | 2005,PHI,5
1931 | 2005,PIT,5
1932 | 2005,SDP,5
1933 | 2005,TBD,5
1934 | 2005,TEX,5
1935 | 2005,TOR,5
1936 | 2005,WSN,5
1937 | 2006,ARI,5
1938 | 2006,ATL,5
1939 | 2006,BAL,5
1940 | 2006,BOS,5
1941 | 2006,CHW,5
1942 | 2006,CHC,5
1943 | 2006,CIN,5
1944 | 2006,CLE,5
1945 | 2006,COL,5
1946 | 2006,DET,5
1947 | 2006,FLA,5
1948 | 2006,HOU,5
1949 | 2006,KCR,5
1950 | 2006,ANA,5
1951 | 2006,LAD,5
1952 | 2006,MIL,5
1953 | 2006,MIN,5
1954 | 2006,NYY,5
1955 | 2006,NYM,5
1956 | 2006,OAK,5
1957 | 2006,PHI,5
1958 | 2006,PIT,5
1959 | 2006,SDP,5
1960 | 2006,SEA,5
1961 | 2006,SFG,5
1962 | 2006,STL,5
1963 | 2006,TBD,5
1964 | 2006,TEX,5
1965 | 2006,TOR,5
1966 | 2006,WSN,5
1967 | 2007,ARI,5
1968 | 2007,ATL,5
1969 | 2007,BAL,5
1970 | 2007,BOS,5
1971 | 2007,CHW,5
1972 | 2007,CHC,5
1973 | 2007,CIN,5
1974 | 2007,CLE,5
1975 | 2007,COL,5
1976 | 2007,DET,5
1977 | 2007,FLA,5
1978 | 2007,HOU,5
1979 | 2007,KCR,5
1980 | 2007,ANA,5
1981 | 2007,LAD,5
1982 | 2007,MIL,5
1983 | 2007,MIN,5
1984 | 2007,NYY,5
1985 | 2007,NYM,5
1986 | 2007,OAK,5
1987 | 2007,PHI,5
1988 | 2007,PIT,5
1989 | 2007,SDP,5
1990 | 2007,SEA,5
1991 | 2007,SFG,5
1992 | 2007,STL,5
1993 | 2007,TBD,5
1994 | 2007,TEX,5
1995 | 2007,TOR,5
1996 | 2007,WSN,5
1997 | 2008,ARI,5
1998 | 2008,ATL,5
1999 | 2008,BAL,5
2000 | 2008,BOS,5
2001 | 2008,CHW,5
2002 | 2008,CHC,5
2003 | 2008,CIN,5
2004 | 2008,CLE,5
2005 | 2008,COL,5
2006 | 2008,DET,5
2007 | 2008,FLA,5
2008 | 2008,HOU,5
2009 | 2008,KCR,5
2010 | 2008,ANA,5
2011 | 2008,LAD,5
2012 | 2008,MIL,5
2013 | 2008,MIN,5
2014 | 2008,NYY,5
2015 | 2008,NYM,5
2016 | 2008,OAK,5
2017 | 2008,PHI,5
2018 | 2008,PIT,5
2019 | 2008,SDP,5
2020 | 2008,SEA,5
2021 | 2008,SFG,5
2022 | 2008,STL,5
2023 | 2008,TBD,5
2024 | 2008,TEX,5
2025 | 2008,TOR,5
2026 | 2008,WSN,5
2027 | 2009,ARI,5
2028 | 2009,ATL,5
2029 | 2009,BAL,5
2030 | 2009,BOS,5
2031 | 2009,CHW,5
2032 | 2009,CHC,5
2033 | 2009,CIN,5
2034 | 2009,CLE,5
2035 | 2009,COL,5
2036 | 2009,DET,5
2037 | 2009,FLA,5
2038 | 2009,HOU,5
2039 | 2009,KCR,5
2040 | 2009,ANA,5
2041 | 2009,LAD,5
2042 | 2009,MIL,5
2043 | 2009,MIN,5
2044 | 2009,NYY,5
2045 | 2009,NYM,5
2046 | 2009,OAK,5
2047 | 2009,PHI,5
2048 | 2009,PIT,5
2049 | 2009,SDP,5
2050 | 2009,SEA,5
2051 | 2009,SFG,5
2052 | 2009,STL,5
2053 | 2009,TBD,5
2054 | 2009,TEX,5
2055 | 2009,TOR,5
2056 | 2009,WSN,5
2057 | 2010,ARI,5
2058 | 2010,ATL,5
2059 | 2010,BAL,5
2060 | 2010,BOS,5
2061 | 2010,CHW,5
2062 | 2010,CHC,5
2063 | 2010,CIN,5
2064 | 2010,CLE,5
2065 | 2010,COL,5
2066 | 2010,DET,5
2067 | 2010,FLA,5
2068 | 2010,HOU,5
2069 | 2010,KCR,5
2070 | 2010,ANA,5
2071 | 2010,LAD,5
2072 | 2010,MIL,5
2073 | 2010,MIN,5
2074 | 2010,NYY,5
2075 | 2010,NYM,5
2076 | 2010,OAK,5
2077 | 2010,PHI,5
2078 | 2010,PIT,5
2079 | 2010,SDP,5
2080 | 2010,SEA,5
2081 | 2010,SFG,5
2082 | 2010,STL,5
2083 | 2010,TBD,5
2084 | 2010,TEX,5
2085 | 2010,TOR,5
2086 | 2010,WSN,5
2087 | 2011,ARI,5
2088 | 2011,ATL,5
2089 | 2011,BAL,5
2090 | 2011,BOS,5
2091 | 2011,CHW,5
2092 | 2011,CHC,5
2093 | 2011,CIN,5
2094 | 2011,CLE,5
2095 | 2011,COL,5
2096 | 2011,DET,5
2097 | 2011,FLA,5
2098 | 2011,HOU,5
2099 | 2011,KCR,5
2100 | 2011,ANA,5
2101 | 2011,LAD,5
2102 | 2011,MIL,5
2103 | 2011,MIN,5
2104 | 2011,NYY,5
2105 | 2011,NYM,5
2106 | 2011,OAK,5
2107 | 2011,PHI,5
2108 | 2011,PIT,5
2109 | 2011,SDP,5
2110 | 2011,SEA,5
2111 | 2011,SFG,5
2112 | 2011,STL,5
2113 | 2011,TBD,5
2114 | 2011,TEX,5
2115 | 2011,TOR,5
2116 | 2011,WSN,5
2117 | 2012,ARI,5
2118 | 2012,ATL,5
2119 | 2012,BAL,5
2120 | 2012,BOS,5
2121 | 2012,CHW,5
2122 | 2012,CHC,5
2123 | 2012,CIN,5
2124 | 2012,CLE,5
2125 | 2012,COL,5
2126 | 2012,DET,5
2127 | 2012,HOU,5
2128 | 2012,KCR,5
2129 | 2012,ANA,5
2130 | 2012,LAD,5
2131 | 2012,FLA,5
2132 | 2012,MIL,5
2133 | 2012,MIN,5
2134 | 2012,NYY,5
2135 | 2012,NYM,5
2136 | 2012,OAK,5
2137 | 2012,PHI,5
2138 | 2012,PIT,5
2139 | 2012,SDP,5
2140 | 2012,SEA,5
2141 | 2012,SFG,5
2142 | 2012,STL,5
2143 | 2012,TBD,5
2144 | 2012,TEX,5
2145 | 2012,TOR,5
2146 | 2012,WSN,5
2147 | 2013,ARI,5
2148 | 2013,ATL,5
2149 | 2013,BAL,5
2150 | 2013,BOS,5
2151 | 2013,CHW,5
2152 | 2013,CHC,5
2153 | 2013,CIN,5
2154 | 2013,CLE,5
2155 | 2013,COL,5
2156 | 2013,DET,5
2157 | 2013,HOU,5
2158 | 2013,KCR,5
2159 | 2013,ANA,5
2160 | 2013,LAD,5
2161 | 2013,FLA,5
2162 | 2013,MIL,5
2163 | 2013,MIN,5
2164 | 2013,NYY,5
2165 | 2013,NYM,5
2166 | 2013,OAK,5
2167 | 2013,PHI,5
2168 | 2013,PIT,5
2169 | 2013,SDP,5
2170 | 2013,SEA,5
2171 | 2013,SFG,5
2172 | 2013,STL,5
2173 | 2013,TBD,5
2174 | 2013,TEX,5
2175 | 2013,TOR,5
2176 | 2013,WSN,5
2177 | 2014,ARI,5
2178 | 2014,ATL,5
2179 | 2014,BAL,5
2180 | 2014,BOS,5
2181 | 2014,CHW,5
2182 | 2014,CHC,5
2183 | 2014,CIN,5
2184 | 2014,CLE,5
2185 | 2014,COL,5
2186 | 2014,DET,5
2187 | 2014,HOU,5
2188 | 2014,KCR,5
2189 | 2014,ANA,5
2190 | 2014,LAD,5
2191 | 2014,FLA,5
2192 | 2014,MIL,5
2193 | 2014,MIN,5
2194 | 2014,NYY,5
2195 | 2014,NYM,5
2196 | 2014,OAK,5
2197 | 2014,PHI,5
2198 | 2014,PIT,5
2199 | 2014,SDP,5
2200 | 2014,SEA,5
2201 | 2014,SFG,5
2202 | 2014,STL,5
2203 | 2014,TBD,5
2204 | 2014,TEX,5
2205 | 2014,TOR,5
2206 | 2014,WSN,5
2207 | 1886,BLO,6
2208 | 1886,SFG,6
2209 | 1886,NYP,6
2210 | 1886,PHI,6
2211 | 1886,PIT,6
2212 | 1886,SLM,6
2213 | 1886,WNL,6
2214 | 1888,LAD,6
2215 | 1888,ATL,6
2216 | 1888,CHC,6
2217 | 1888,SFG,6
2218 | 1888,PHI,6
2219 | 1888,PIT,6
2220 | 1888,STL,6
2221 | 1888,WNL,6
2222 | 1890,BLO,6
2223 | 1890,CLS,6
2224 | 1892,CHC,6
2225 | 1892,CIN,6
2226 | 1892,CLV,6
2227 | 1892,LOU,6
2228 | 1898,CLV,6
2229 | 1898,PIT,6
2230 | 1900,CHC,6
2231 | 1900,PIT,6
2232 | 1901,ATL,6
2233 | 1901,CHC,6
2234 | 1901,SFG,6
2235 | 1901,PHI,6
2236 | 1901,PIT,6
2237 | 1902,LAD,6
2238 | 1902,ATL,6
2239 | 1902,CHW,6
2240 | 1902,CHC,6
2241 | 1902,CIN,6
2242 | 1902,DET,6
2243 | 1902,SFG,6
2244 | 1902,PHI,6
2245 | 1902,PIT,6
2246 | 1902,BAL,6
2247 | 1902,STL,6
2248 | 1903,BOS,6
2249 | 1903,CHW,6
2250 | 1903,CLE,6
2251 | 1903,DET,6
2252 | 1903,NYY,6
2253 | 1903,OAK,6
2254 | 1903,BAL,6
2255 | 1903,MIN,6
2256 | 1904,BOS,6
2257 | 1904,CHW,6
2258 | 1904,CLE,6
2259 | 1904,DET,6
2260 | 1904,NYY,6
2261 | 1904,OAK,6
2262 | 1904,BAL,6
2263 | 1904,MIN,6
2264 | 1910,LAD,6
2265 | 1910,ATL,6
2266 | 1910,CHC,6
2267 | 1910,CIN,6
2268 | 1910,SFG,6
2269 | 1910,PHI,6
2270 | 1910,PIT,6
2271 | 1910,STL,6
2272 | 1911,LAD,6
2273 | 1911,CHC,6
2274 | 1911,CIN,6
2275 | 1911,SFG,6
2276 | 1911,PHI,6
2277 | 1911,PIT,6
2278 | 1911,STL,6
2279 | 1912,LAD,6
2280 | 1912,CHC,6
2281 | 1912,CIN,6
2282 | 1912,SFG,6
2283 | 1912,PHI,6
2284 | 1912,PIT,6
2285 | 1913,BOS,6
2286 | 1913,LAD,6
2287 | 1913,ATL,6
2288 | 1913,CHW,6
2289 | 1913,CHC,6
2290 | 1913,CIN,6
2291 | 1913,CLE,6
2292 | 1913,DET,6
2293 | 1913,SFG,6
2294 | 1913,NYY,6
2295 | 1913,OAK,6
2296 | 1913,PHI,6
2297 | 1913,PIT,6
2298 | 1913,BAL,6
2299 | 1913,STL,6
2300 | 1913,MIN,6
2301 | 1914,BLT,6
2302 | 1914,BOS,6
2303 | 1914,BTT,6
2304 | 1914,LAD,6
2305 | 1914,ATL,6
2306 | 1914,BFL,6
2307 | 1914,CHW,6
2308 | 1914,CHH,6
2309 | 1914,CHC,6
2310 | 1914,CIN,6
2311 | 1914,CLE,6
2312 | 1914,DET,6
2313 | 1914,NEW,6
2314 | 1914,KCP,6
2315 | 1914,SFG,6
2316 | 1914,NYY,6
2317 | 1914,OAK,6
2318 | 1914,PHI,6
2319 | 1914,PIT,6
2320 | 1914,PBS,6
2321 | 1914,BAL,6
2322 | 1914,SLI,6
2323 | 1914,STL,6
2324 | 1914,MIN,6
2325 | 1915,BLT,6
2326 | 1915,BOS,6
2327 | 1915,BTT,6
2328 | 1915,LAD,6
2329 | 1915,ATL,6
2330 | 1915,BFL,6
2331 | 1915,CHW,6
2332 | 1915,CHH,6
2333 | 1915,CHC,6
2334 | 1915,CIN,6
2335 | 1915,CLE,6
2336 | 1915,DET,6
2337 | 1915,KCP,6
2338 | 1915,NEW,6
2339 | 1915,SFG,6
2340 | 1915,NYY,6
2341 | 1915,OAK,6
2342 | 1915,PHI,6
2343 | 1915,PIT,6
2344 | 1915,PBS,6
2345 | 1915,BAL,6
2346 | 1915,SLI,6
2347 | 1915,STL,6
2348 | 1915,MIN,6
2349 | 1916,BOS,6
2350 | 1916,LAD,6
2351 | 1916,ATL,6
2352 | 1916,CHW,6
2353 | 1916,CHC,6
2354 | 1916,CIN,6
2355 | 1916,CLE,6
2356 | 1916,DET,6
2357 | 1916,SFG,6
2358 | 1916,NYY,6
2359 | 1916,OAK,6
2360 | 1916,PHI,6
2361 | 1916,PIT,6
2362 | 1916,BAL,6
2363 | 1916,STL,6
2364 | 1916,MIN,6
2365 | 1917,BOS,6
2366 | 1917,LAD,6
2367 | 1917,ATL,6
2368 | 1917,CHW,6
2369 | 1917,CHC,6
2370 | 1917,CIN,6
2371 | 1917,CLE,6
2372 | 1917,DET,6
2373 | 1917,SFG,6
2374 | 1917,NYY,6
2375 | 1917,OAK,6
2376 | 1917,PHI,6
2377 | 1917,PIT,6
2378 | 1917,BAL,6
2379 | 1917,STL,6
2380 | 1917,MIN,6
2381 | 1918,BOS,6
2382 | 1918,LAD,6
2383 | 1918,ATL,6
2384 | 1918,CHW,6
2385 | 1918,CHC,6
2386 | 1918,CIN,6
2387 | 1918,CLE,6
2388 | 1918,DET,6
2389 | 1918,SFG,6
2390 | 1918,NYY,6
2391 | 1918,OAK,6
2392 | 1918,PHI,6
2393 | 1918,PIT,6
2394 | 1918,BAL,6
2395 | 1918,STL,6
2396 | 1918,MIN,6
2397 | 1919,BOS,6
2398 | 1919,LAD,6
2399 | 1919,ATL,6
2400 | 1919,CHW,6
2401 | 1919,CHC,6
2402 | 1919,CIN,6
2403 | 1919,CLE,6
2404 | 1919,DET,6
2405 | 1919,SFG,6
2406 | 1919,NYY,6
2407 | 1919,OAK,6
2408 | 1919,PIT,6
2409 | 1919,BAL,6
2410 | 1919,STL,6
2411 | 1919,MIN,6
2412 | 1920,LAD,6
2413 | 1920,ATL,6
2414 | 1920,CHC,6
2415 | 1920,CIN,6
2416 | 1920,SFG,6
2417 | 1920,PIT,6
2418 | 1921,CIN,6
2419 | 1926,LAD,6
2420 | 1926,CHC,6
2421 | 1926,OAK,6
2422 | 1927,LAD,6
2423 | 1928,LAD,6
2424 | 1931,ATL,6
2425 | 1933,ATL,6
2426 | 1933,CHC,6
2427 | 1933,CIN,6
2428 | 1933,SFG,6
2429 | 1937,ATL,6
2430 | 1937,CIN,6
2431 | 1938,ATL,6
2432 | 1940,CIN,6
2433 | 1940,PHI,6
2434 | 1941,CHC,6
2435 | 1941,CIN,6
2436 | 1942,LAD,6
2437 | 1942,ATL,6
2438 | 1942,CHW,6
2439 | 1942,CIN,6
2440 | 1942,CLE,6
2441 | 1942,DET,6
2442 | 1942,SFG,6
2443 | 1942,NYY,6
2444 | 1942,PHI,6
2445 | 1942,PIT,6
2446 | 1942,STL,6
2447 | 1943,BOS,6
2448 | 1943,ATL,6
2449 | 1943,CHW,6
2450 | 1943,CHC,6
2451 | 1943,CIN,6
2452 | 1943,CLE,6
2453 | 1943,DET,6
2454 | 1943,NYY,6
2455 | 1943,OAK,6
2456 | 1943,PHI,6
2457 | 1943,PIT,6
2458 | 1943,BAL,6
2459 | 1943,STL,6
2460 | 1943,MIN,6
2461 | 1944,ATL,6
2462 | 1944,CHW,6
2463 | 1944,CIN,6
2464 | 1944,DET,6
2465 | 1944,OAK,6
2466 | 1944,PHI,6
2467 | 1944,BAL,6
2468 | 1944,STL,6
2469 | 1944,MIN,6
2470 | 1945,CHW,6
2471 | 1945,CHC,6
2472 | 1945,CLE,6
2473 | 1945,DET,6
2474 | 1945,NYY,6
2475 | 1945,OAK,6
2476 | 1945,BAL,6
2477 | 1945,MIN,6
2478 | 1946,LAD,6
2479 | 1946,ATL,6
2480 | 1946,CHW,6
2481 | 1946,CHC,6
2482 | 1946,CIN,6
2483 | 1946,SFG,6
2484 | 1946,OAK,6
2485 | 1946,PHI,6
2486 | 1946,PIT,6
2487 | 1946,STL,6
2488 | 1947,CHW,6
2489 | 1947,CLE,6
2490 | 1947,DET,6
2491 | 1947,OAK,6
2492 | 1947,PHI,6
2493 | 1947,BAL,6
2494 | 1947,MIN,6
2495 | 1949,CLE,6
2496 | 1951,CIN,6
2497 | 1951,NYY,6
2498 | 1952,CHW,6
2499 | 1952,PHI,6
2500 | 1952,STL,6
2501 | 1952,MIN,6
2502 | 1953,CHW,6
2503 | 1953,MIN,6
2504 | 1954,CHW,6
2505 | 1954,DET,6
2506 | 1954,PHI,6
2507 | 1972,NYY,6
2508 | 1976,KCR,6
2509 | 1976,NYY,6
2510 | 1976,SDP,6
2511 | 1978,KCR,6
2512 | 1981,KCR,6
2513 | 


--------------------------------------------------------------------------------