├── download_scripts ├── __init__.py ├── tests │ ├── __init__.py │ ├── test_func.py │ └── test_data.py ├── pytest.ini ├── run_all_scripts.py ├── retrosheet_download.py ├── conftest.py ├── lahman_download.py ├── retrosheet_datadictionary.py ├── README.md ├── retrosheet_parse.py ├── postgres_load_data.py ├── lahman_wrangle.py ├── retrosheet_collect.py ├── data_helper.py └── retrosheet_wrangle.py ├── data ├── retrosheet │ ├── nb_data │ │ ├── pf_types.csv │ │ ├── fangraphs_types.csv │ │ ├── pf.csv │ │ └── fangraphs.csv │ ├── event_types.csv │ ├── player_game_types.csv │ ├── game_types.csv │ ├── cwdaily_datadictionary.txt │ └── cwgame_datadictionary.txt └── lahman │ └── readme2017.txt ├── .gitignore ├── LICENSE ├── MLB_Data_Details.md ├── baseball_jupyter_nb └── README.md ├── RetrosheetParsers.md ├── MLB_Data_Overview.md └── README.md /download_scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /download_scripts/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /download_scripts/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | slow: marks tests as slow 4 | -------------------------------------------------------------------------------- /data/retrosheet/nb_data/pf_types.csv: -------------------------------------------------------------------------------- 1 | index,dtypes 2 | team_id,object 3 | year,int64 4 | pf,float64 5 | pf_half,float64 6 | name,object 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | data/lahman/raw 3 | data/lahman/wrangled 4 | data/retrosheet/raw 5 | data/retrosheet/parsed 6 | data/retrosheet/collected 7 | data/retrosheet/wrangled 8 | *.log 9 | test_data 10 | tmp* 11 | __pycache__ 12 | *checkpoint.ipynb 13 | -------------------------------------------------------------------------------- /data/retrosheet/nb_data/fangraphs_types.csv: -------------------------------------------------------------------------------- 1 | index,dtypes 2 | Season,int64 3 | Team,object 4 | Basic (5yr),int64 5 | 3yr,int64 6 | 1yr,int64 7 | 1B,int64 8 | 2B,int64 9 | 3B,int64 10 | HR,int64 11 | SO,int64 12 | BB,int64 13 | GB,int64 14 | FB,int64 15 | LD,int64 16 | IFFB,int64 17 | FIP,int64 18 | -------------------------------------------------------------------------------- /data/retrosheet/event_types.csv: -------------------------------------------------------------------------------- 1 | index,dtypes 2 | game_id,object 3 | inn_ct,uint8 4 | home_half,uint8 5 | away_score_ct,uint8 6 | home_score_ct,uint8 7 | bat_id,object 8 | pit_id,object 9 | event_tx,object 10 | h_cd,uint8 11 | outs,uint8 12 | e,uint8 13 | event_id,uint8 14 | team_id,object 15 | opponent_team_id,object 16 | inn_runs_ct,uint8 17 | start_bases_cd,uint8 18 | end_bases_cd,uint8 19 | r,uint8 20 | fate_runs_ct,uint8 21 | ab,bool 22 | sh,bool 23 | sf,bool 24 | dp,bool 25 | tp,bool 26 | wp,bool 27 | pb,bool 28 | inn_end,bool 29 | pa,bool 30 | bat_safe_err,bool 31 | so,bool 32 | sb,uint8 33 | cs,uint8 34 | bk,bool 35 | ibb,bool 36 | bb,bool 37 | hbp,bool 38 | xi,bool 39 | single,bool 40 | double,bool 41 | triple,bool 42 | hr,bool 43 | h,bool 44 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019,2020 by Stephen Diehl 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MLB_Data_Details.md: -------------------------------------------------------------------------------- 1 | ## MLB Data Details 2 | 3 | ### Lahman Data Source 4 | 5 | The most recent data will be downloaded from: https://github.com/chadwickbureau/baseballdatabank/archive/master.zip 6 | 7 | **Lahman Data License** from https://github.com/chadwickbureau/baseballdatabank readme.txt 8 | 9 | ``` 10 | This work is licensed under a Creative Commons Attribution-ShareAlike 11 | 3.0 Unported License. For details see: 12 | http://creativecommons.org/licenses/by-sa/3.0/ 13 | ``` 14 | 15 | #### Lahman Data Dictionary 16 | 17 | The most recent data dictionary is: http://www.seanlahman.com/files/database/readme2017.txt It is applicable to the 2019 Lahman data. 18 | 19 | This file is also copied to this repo at: `data/lahman/readme2017.txt` 20 | 21 | ### Retrosheet Data Source 22 | 23 | The play-by-play data will be downloaded from: https://github.com/chadwickbureau/retrosheet/archive/master.zip 24 | 25 | The retrosheet_download script will put these in: `../data/retrosheet/raw` 26 | 27 | **Retrosheet Data License** from https://www.retrosheet.org/notice.txt 28 | 29 | ``` 30 | The information used here was obtained free of 31 | charge from and is copyrighted by Retrosheet. Interested 32 | parties may contact Retrosheet at "www.retrosheet.org". 
33 | ``` 34 | -------------------------------------------------------------------------------- /download_scripts/run_all_scripts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Run all scripts""" 4 | 5 | __author__ = 'Stephen Diehl' 6 | 7 | import argparse 8 | import sys 9 | import subprocess 10 | 11 | 12 | def get_parser(): 13 | """Args Description""" 14 | 15 | # current_year = datetime.datetime.today().year 16 | parser = argparse.ArgumentParser( 17 | description=__doc__, 18 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 19 | 20 | parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data') 21 | parser.add_argument("--start-year", type=int, help="start year", default='1955') 22 | parser.add_argument("--end-year", type=int, help="end year", default='2019') 23 | 24 | return parser 25 | 26 | 27 | def run_cmd(cmd): 28 | proc = subprocess.Popen(cmd, stdout=subprocess.PIPE) 29 | for line in proc.stdout: 30 | sys.stdout.buffer.write(line) 31 | sys.stdout.buffer.flush() 32 | 33 | 34 | def main(): 35 | parser = get_parser() 36 | args = parser.parse_args() 37 | data_dir = f'--data-dir={args.data_dir}' 38 | start_year = f'--start-year={args.start_year}' 39 | end_year = f'--end-year={args.end_year}' 40 | 41 | print('Running lahman_download:') 42 | cmd = ['./lahman_download.py', '-v', '--log=INFO', data_dir] 43 | run_cmd(cmd) 44 | 45 | print('Running lahman_wrangle:') 46 | cmd = ['./lahman_wrangle.py', '-v', '--log=INFO', data_dir] 47 | run_cmd(cmd) 48 | 49 | print('Running retrosheet_download:') 50 | cmd = ['./retrosheet_download.py', '-v', '--log=INFO', data_dir] 51 | run_cmd(cmd) 52 | 53 | print('Running retrosheet_parse:') 54 | cmd = ['./retrosheet_parse.py', '-v', '--log=INFO', '--run-cwevent', data_dir, start_year, end_year] 55 | run_cmd(cmd) 56 | 57 | print('Running retrosheet_collect:') 58 | cmd = ['./retrosheet_collect.py', '-v', '--log=INFO', '--use-datatypes', data_dir] 59 | run_cmd(cmd) 60 | 61 | print('Running retrosheet_wrangle:') 62 | cmd = ['./retrosheet_wrangle.py', '-v', '--log=INFO', data_dir] 63 | run_cmd(cmd) 64 | 65 | print('Running pytest:') 66 | cmd = ['pytest', '-v', data_dir] 67 | run_cmd(cmd) 68 | print('All scripts have run.') 69 | 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /download_scripts/tests/test_func.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | 5 | __author__ = 'Stephen Diehl' 6 | 7 | from .. 
import data_helper as dh 8 | 9 | 10 | def test_python_version(): 11 | assert sys.version_info.major == 3 12 | assert sys.version_info.minor >= 7 13 | 14 | 15 | def test_data_dir(data_dir): 16 | # if this does not pass either data_dir was passed incorrectly or 17 | # pytest was not run from the download_scripts directory 18 | assert data_dir.is_dir() 19 | 20 | 21 | def test_optimize_df(): 22 | # 16 columns but only 3 data types chosen by Pandas 0.25.x 23 | df = pd.DataFrame(dh.get_dtype_range()) 24 | assert len(df.columns) == 16 25 | assert df.dtypes.nunique() == 3 26 | 27 | # optimize the data types (modifies df inplace) 28 | dh.optimize_df_dtypes(df) 29 | assert len(df.columns) == 16 30 | assert df.dtypes.nunique() == 16 31 | 32 | # Pandas will silently convert to float if int value is too large 33 | # Verify that no implicit conversion took place 34 | assert (df.dtypes.values == df.columns.values).all() 35 | 36 | 37 | def test_rw_with_types(data_dir): 38 | dtype_range = dh.get_dtype_range() 39 | df = pd.DataFrame(dtype_range) 40 | dtypes_orig = df.dtypes 41 | 42 | dh.optimize_df_dtypes(df) 43 | dh.to_csv_with_types(df, data_dir / 'tmp.csv.gz') 44 | df = dh.from_csv_with_types(data_dir / 'tmp.csv.gz') 45 | 46 | assert (df.dtypes == list(dtype_range.keys())).all() 47 | assert not (df.dtypes == dtypes_orig).all() 48 | 49 | assert (data_dir / 'tmp.csv.gz').is_file() 50 | assert (data_dir / 'tmp_types.csv').is_file() 51 | os.remove(data_dir / 'tmp.csv.gz') 52 | os.remove(data_dir / 'tmp_types.csv') 53 | 54 | 55 | def test_sum_stats_for_dups(): 56 | data = {'pkey1': [1, 2, 3, 3, 4, 5, 5, 5], 57 | 'pkey2': [2, 3, 4, 4, 5, 6, 6, 6], 58 | 'stat1': [1, 2, 3, 4, 5, 6, 7, 8], 59 | 'stat2': [1, 1, 1, 1, 1, 1, 1, 1], 60 | 'misc': ['a', 'b', 'c1', 'c2', 'd', 'e1', 'e2', 'e3']} 61 | df = pd.DataFrame(data) 62 | 63 | df = dh.sum_stats_for_dups(df, ['pkey1', 'pkey2'], ['stat1', 'stat2']) 64 | 65 | chk = {'pkey1': [1, 2, 3, 4, 5], 66 | 'pkey2': [2, 3, 4, 5, 6], 67 | 'stat1': [1, 2, 7, 5, 21], 68 | 'stat2': [1, 1, 2, 1, 3], 69 | 'misc': ['a', 'b', 'c1', 'd', 'e1']} 70 | df_chk = pd.DataFrame(chk) 71 | 72 | assert df.equals(df_chk) 73 | -------------------------------------------------------------------------------- /baseball_jupyter_nb/README.md: -------------------------------------------------------------------------------- 1 | ## Jupyter Notebooks 2 | 3 | The Jupyter Notebooks: 4 | 5 | * with a CSV suffix use CSV files as the data source 6 | * with a SQL suffix use SQL to perform the same analysis 7 | 8 | The links below will display the notebook using nbviewer: 9 | 10 | - [01_Intro_CSV](https://nbviewer.jupyter.org/github/sdiehl28/baseball-analytics/blob/master/baseball_jupyter_nb/01_Intro_CSV.ipynb) 11 | - how has game length increased over the years? 12 | - how has pitcher count increased over the years? 13 | - what is the relationship between pitcher count and game length? 14 | - how many more runs are scored in games for which the DH is used? 15 | - [01_Intro_SQL](https://nbviewer.jupyter.org/github/sdiehl28/baseball-analytics/blob/master/baseball_jupyter_nb/01_Intro_SQL.ipynb) 16 | - same as above, but using SQL as much as possible 17 | - [02_Data_Consistency_CSV](https://nbviewer.jupyter.org/github/sdiehl28/baseball-analytics/blob/master/baseball_jupyter_nb/02_Data_Consistency_CSV.ipynb) 18 | - Compare Retrosheet stats aggregated to season level, to Lahman stats. 19 | - Compare individual stats aggregated to team level, to team stats, for both Retrosheet and Lahman.
20 | - Compare batting stats to pitching-allowed stats, for both Retrosheet and Lahman. 21 | - [03a_ParkFactor_CSV](https://nbviewer.jupyter.org/github/sdiehl28/baseball-analytics/blob/master/baseball_jupyter_nb/03a_ParkFactor_CSV.ipynb) 22 | - Compute the Park Factor, for all teams for several years, accounting for home games not played in home park. 23 | - Web scrape FanGraphs and compare. 24 | - It is shown that ESPN and FanGraphs included Boston's runs scored in London, as part of the Fenway Park runs, thereby mistakenly increasing the Park Factor for Fenway Park. 25 | - [03b_ParkFactor_CSV](https://nbviewer.jupyter.org/github/sdiehl28/baseball-analytics/blob/master/baseball_jupyter_nb/03b_ParkFactor_CSV.ipynb) 26 | - Compute Park Factor, for all teams for several years, accounting for each team's road schedule. 27 | - It is shown that the road schedule can significantly impact the home park factor for a couple of teams each year. 28 | - [04_LinearWeights_CSV](https://nbviewer.jupyter.org/github/sdiehl28/baseball-analytics/blob/master/baseball_jupyter_nb/04_LinearWeights_CSV.ipynb) 29 | - Model runs per half-inning using Linear Regression. 30 | - The coefficient for single, double, triple, home run and other plays is determined. 31 | - The model accounts for 78% of the variance of runs scored per half-inning. -------------------------------------------------------------------------------- /data/retrosheet/player_game_types.csv: -------------------------------------------------------------------------------- 1 | index,dtypes 2 | game_id,object 3 | game_dt,uint32 4 | game_ct,uint8 5 | appear_dt,uint32 6 | team_id,object 7 | player_id,object 8 | slot_ct,uint8 9 | seq_ct,uint8 10 | home_fl,uint8 11 | opponent_id,object 12 | park_id,object 13 | b_g,uint8 14 | b_pa,uint8 15 | b_ab,uint8 16 | b_r,uint8 17 | b_h,uint8 18 | b_tb,uint8 19 | b_2b,uint8 20 | b_3b,uint8 21 | b_hr,uint8 22 | b_hr4,uint8 23 | b_rbi,uint8 24 | b_bb,uint8 25 | b_ibb,uint8 26 | b_so,uint8 27 | b_gdp,uint8 28 | b_hp,uint8 29 | b_sh,uint8 30 | b_sf,uint8 31 | b_sb,uint8 32 | b_cs,uint8 33 | b_xi,uint8 34 | b_g_dh,uint8 35 | b_g_ph,uint8 36 | b_g_pr,uint8 37 | p_g,uint8 38 | p_gs,uint8 39 | p_cg,uint8 40 | p_sho,uint8 41 | p_gf,uint8 42 | p_w,uint8 43 | p_l,uint8 44 | p_sv,uint8 45 | p_out,uint8 46 | p_tbf,uint8 47 | p_ab,uint8 48 | p_r,uint8 49 | p_er,uint8 50 | p_h,uint8 51 | p_tb,uint8 52 | p_2b,uint8 53 | p_3b,uint8 54 | p_hr,uint8 55 | p_hr4,uint8 56 | p_bb,uint8 57 | p_ibb,uint8 58 | p_so,uint8 59 | p_gdp,uint8 60 | p_hp,uint8 61 | p_sh,uint8 62 | p_sf,uint8 63 | p_xi,uint8 64 | p_wp,uint8 65 | p_bk,uint8 66 | p_ir,uint8 67 | p_irs,uint8 68 | p_go,uint8 69 | p_ao,uint8 70 | p_pitch,UInt8 71 | p_strike,UInt8 72 | f_p_g,uint8 73 | f_p_gs,uint8 74 | f_p_out,uint8 75 | f_p_tc,uint8 76 | f_p_po,uint8 77 | f_p_a,uint8 78 | f_p_e,uint8 79 | f_p_dp,uint8 80 | f_p_tp,uint8 81 | f_c_g,uint8 82 | f_c_gs,uint8 83 | f_c_out,uint8 84 | f_c_tc,uint8 85 | f_c_po,uint8 86 | f_c_a,uint8 87 | f_c_e,uint8 88 | f_c_dp,uint8 89 | f_c_tp,uint8 90 | f_c_pb,uint8 91 | f_c_xi,uint8 92 | f_1b_g,uint8 93 | f_1b_gs,uint8 94 | f_1b_out,uint8 95 | f_1b_tc,uint8 96 | f_1b_po,uint8 97 | f_1b_a,uint8 98 | f_1b_e,uint8 99 | f_1b_dp,uint8 100 | f_1b_tp,uint8 101 | f_2b_g,uint8 102 | f_2b_gs,uint8 103 | f_2b_out,uint8 104 | f_2b_tc,uint8 105 | f_2b_po,uint8 106 | f_2b_a,uint8 107 | f_2b_e,uint8 108 | f_2b_dp,uint8 109 | f_2b_tp,uint8 110 | f_3b_g,uint8 111 | f_3b_gs,uint8 112 | f_3b_out,uint8 113 | f_3b_tc,uint8 114 | f_3b_po,uint8 115 | f_3b_a,uint8 
116 | f_3b_e,uint8 117 | f_3b_dp,uint8 118 | f_3b_tp,uint8 119 | f_ss_g,uint8 120 | f_ss_gs,uint8 121 | f_ss_out,uint8 122 | f_ss_tc,uint8 123 | f_ss_po,uint8 124 | f_ss_a,uint8 125 | f_ss_e,uint8 126 | f_ss_dp,uint8 127 | f_ss_tp,uint8 128 | f_lf_g,uint8 129 | f_lf_gs,uint8 130 | f_lf_out,uint8 131 | f_lf_tc,uint8 132 | f_lf_po,uint8 133 | f_lf_a,uint8 134 | f_lf_e,uint8 135 | f_lf_dp,uint8 136 | f_lf_tp,uint8 137 | f_cf_g,uint8 138 | f_cf_gs,uint8 139 | f_cf_out,uint8 140 | f_cf_tc,uint8 141 | f_cf_po,uint8 142 | f_cf_a,uint8 143 | f_cf_e,uint8 144 | f_cf_dp,uint8 145 | f_cf_tp,uint8 146 | f_rf_g,uint8 147 | f_rf_gs,uint8 148 | f_rf_out,uint8 149 | f_rf_tc,uint8 150 | f_rf_po,uint8 151 | f_rf_a,uint8 152 | f_rf_e,uint8 153 | f_rf_dp,uint8 154 | f_rf_tp,uint8 155 | -------------------------------------------------------------------------------- /RetrosheetParsers.md: -------------------------------------------------------------------------------- 1 | ## Retrosheet Parsers: Source and Executables 2 | 3 | The open source parsers created by Dr. T. L. Turocy to parse the Retrosheet play-by-play data are excellent. 4 | 5 | These parsers must be installed and on the path for the Python scripts to make use of them. 6 | 7 | Parser Description: http://chadwick.sourceforge.net/doc/cwtools.html 8 | Parser Executables and Source: https://sourceforge.net/projects/chadwick/ 9 | 10 | At the time of this writing, version 0.7.2 is the latest version. Executable versions of the parsers are available for Windows. Source code is available for Linux and MacOS. See [How to Build Retrosheet Parsers on Linux](#how-to-build-retrosheet-parsers-on-linux). 11 | 12 | ## Retrosheet Parsers 13 | 14 | Three parsers are used: 15 | 16 | * cwevent - creates one record for each single, double, error, stolen base, hit by pitch, balk, etc. 17 | * cwdaily - similar to a box score in which each player's stats for a game are created 18 | * cwgame - similar to a line score in which each team's stats for a game are created 19 | 20 | In more detail: 21 | 22 | * cwevent 23 | * was missing about 10 fields that cwgame creates and that are useful for analysis. These 10 fields were added by using regular expressions to parse the event_tx field. These 10 fields were then aggregated to the game level and verified to be 100% consistent with the output of cwgame. 24 | * cwdaily 25 | * the output of cwdaily is split into 3 csv files: batting, pitching and fielding. This is how Lahman structures their data as well. 26 | * cwgame 27 | * the output of cwgame is split into 2 csv files: team_game and game. The data in game is specific to the game as a whole, such as which park it was played in. The data in team_game is specific to a team for that game. 28 | 29 | All possible fields are extracted from the cwdaily and cwgame parsers. Both parsers are run automatically by the retrosheet_parse.py script. 30 | 31 | The cwevent parser creates a great many rows. As such, a default subset of the fields is selected. This parser is optionally run by the retrosheet_parse.py script. 32 | 33 | ## Retrosheet Parsers License 34 | 35 | From https://github.com/chadwickbureau/chadwick README 36 | 37 | ``` 38 | This is Chadwick, a library and toolset for baseball play-by-play 39 | and statistics. 40 | 41 | Chadwick is Open Source software, distributed under the terms of the 42 | GNU General Public License (GPL).
43 | ``` 44 | 45 | Parser Description: http://chadwick.sourceforge.net/doc/cwtools.html 46 | Parser Executables and Source: https://sourceforge.net/projects/chadwick/ 47 | 48 | ## How to Build Retrosheet Parsers on Linux 49 | 50 | If you do not already have a build environment: 51 | 52 | 1. sudo apt install gcc 53 | 2. sudo apt install build-essential 54 | 55 | cd to the source directory: 56 | 57 | 1. ./configure 58 | 2. make 59 | 3. sudo make install 60 | 61 | Result 62 | 63 | 1. The cw command line tools will be installed in /usr/local/bin. 64 | 2. The cw library will be installed in /usr/local/lib. 65 | 66 | To allow the command line tools to find the shared libraries, add the following to your .bashrc and then: source .bashrc 67 | `export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib` -------------------------------------------------------------------------------- /download_scripts/retrosheet_download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Download and Unzip Retrosheet Data to {data_dir}/retrosheet/raw 4 | 5 | Will not download data if it has already been downloaded. 6 | """ 7 | 8 | __author__ = 'Stephen Diehl' 9 | 10 | import os 11 | import shutil 12 | import argparse 13 | import requests 14 | from pathlib import Path 15 | import zipfile 16 | import logging 17 | import sys 18 | 19 | logger = logging.getLogger(__name__) 20 | logger.setLevel(logging.DEBUG) 21 | 22 | 23 | def get_parser(): 24 | """Args Description""" 25 | 26 | parser = argparse.ArgumentParser( 27 | description=__doc__, 28 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 29 | 30 | parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data') 31 | parser.add_argument("-v", "--verbose", help="verbose output", action="store_true") 32 | parser.add_argument("--log", dest="log_level", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 33 | help="Set the logging level") 34 | 35 | return parser 36 | 37 | 38 | def mk_dirs(data_dir): 39 | """Make data directories""" 40 | p_retrosheet_raw = data_dir / 'retrosheet/raw' 41 | p_retrosheet_wrangled = data_dir / 'retrosheet/wrangled' 42 | 43 | # create directories from these path objects 44 | p_retrosheet_raw.mkdir(parents=True, exist_ok=True) 45 | p_retrosheet_wrangled.mkdir(parents=True, exist_ok=True) 46 | 47 | 48 | def download_data(raw_dir): 49 | """download and unzip retrosheet event files""" 50 | 51 | os.chdir(raw_dir) 52 | 53 | # download most recent Retrosheet data 54 | # most recent data is from chadwickbureau on github. 
55 | zip_filename = 'retrosheet-master.zip' 56 | 57 | if not Path(zip_filename).is_file(): 58 | logger.info('Downloading >200 MB of Data ...') 59 | 60 | url = 'https://github.com/chadwickbureau/retrosheet/archive/master.zip' 61 | r = requests.get(url) 62 | r.raise_for_status() 63 | with open(zip_filename, 'wb') as f: 64 | f.write(r.content) 65 | 66 | # unzip it 67 | with zipfile.ZipFile(zip_filename, "r") as zip_ref: 68 | zip_ref.extractall('.') 69 | 70 | 71 | def reorg_files(raw_dir): 72 | """move the unzipped files to the raw directory and remove the extract directory""" 73 | os.chdir(raw_dir) 74 | 75 | unzip_dir = raw_dir / 'retrosheet-master' 76 | 77 | if unzip_dir.exists(): 78 | # move the subdirectories up one directory 79 | for dir in os.listdir(unzip_dir): 80 | shutil.move(unzip_dir.joinpath(dir).as_posix(), '.') 81 | 82 | # rm the extract directory 83 | shutil.rmtree('retrosheet-master') 84 | 85 | 86 | def main(): 87 | """Download Retrosheet Event Files 88 | """ 89 | parser = get_parser() 90 | args = parser.parse_args() 91 | 92 | if args.log_level: 93 | fh = logging.FileHandler('download.log') 94 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 95 | fh.setFormatter(formatter) 96 | fh.setLevel(args.log_level) 97 | logger.addHandler(fh) 98 | 99 | if args.verbose: 100 | # send INFO level logging to stdout 101 | sh = logging.StreamHandler(sys.stdout) 102 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 103 | sh.setFormatter(formatter) 104 | sh.setLevel(logging.INFO) 105 | logger.addHandler(sh) 106 | 107 | data_dir = Path(args.data_dir) 108 | mk_dirs(data_dir) 109 | 110 | raw_dir = (data_dir / 'retrosheet/raw').resolve() 111 | download_data(raw_dir) 112 | reorg_files(raw_dir) 113 | 114 | 115 | if __name__ == '__main__': 116 | main() 117 | -------------------------------------------------------------------------------- /data/retrosheet/game_types.csv: -------------------------------------------------------------------------------- 1 | index,dtypes 2 | game_id,object 3 | game_dt,uint32 4 | game_ct,uint8 5 | game_dy,object 6 | start_game_tm,uint16 7 | dh_fl,object 8 | daynight_park_cd,object 9 | away_team_id,object 10 | home_team_id,object 11 | park_id,object 12 | away_start_pit_id,object 13 | home_start_pit_id,object 14 | base4_ump_id,object 15 | base1_ump_id,object 16 | base2_ump_id,object 17 | base3_ump_id,object 18 | attend_park_ct,int32 19 | scorer_record_id,object 20 | translator_record_id,object 21 | inputter_record_id,object 22 | input_record_ts,object 23 | method_record_cd,uint8 24 | pitches_record_cd,uint8 25 | temp_park_ct,int8 26 | wind_direction_park_cd,uint8 27 | wind_speed_park_ct,int8 28 | field_park_cd,uint8 29 | precip_park_cd,uint8 30 | sky_park_cd,uint8 31 | minutes_game_ct,uint16 32 | inn_ct,uint8 33 | away_score_ct,uint8 34 | home_score_ct,uint8 35 | away_hits_ct,uint8 36 | home_hits_ct,uint8 37 | away_err_ct,uint8 38 | home_err_ct,uint8 39 | away_lob_ct,uint8 40 | home_lob_ct,uint8 41 | win_pit_id,object 42 | lose_pit_id,object 43 | save_pit_id,object 44 | gwrbi_bat_id,object 45 | away_lineup1_bat_id,object 46 | away_lineup1_fld_cd,uint8 47 | away_lineup2_bat_id,object 48 | away_lineup2_fld_cd,uint8 49 | away_lineup3_bat_id,object 50 | away_lineup3_fld_cd,uint8 51 | away_lineup4_bat_id,object 52 | away_lineup4_fld_cd,uint8 53 | away_lineup5_bat_id,object 54 | away_lineup5_fld_cd,uint8 55 | away_lineup6_bat_id,object 56 | away_lineup6_fld_cd,uint8 57 | away_lineup7_bat_id,object 58 | 
away_lineup7_fld_cd,uint8 59 | away_lineup8_bat_id,object 60 | away_lineup8_fld_cd,uint8 61 | away_lineup9_bat_id,object 62 | away_lineup9_fld_cd,uint8 63 | home_lineup1_bat_id,object 64 | home_lineup1_fld_cd,uint8 65 | home_lineup2_bat_id,object 66 | home_lineup2_fld_cd,uint8 67 | home_lineup3_bat_id,object 68 | home_lineup3_fld_cd,uint8 69 | home_lineup4_bat_id,object 70 | home_lineup4_fld_cd,uint8 71 | home_lineup5_bat_id,object 72 | home_lineup5_fld_cd,uint8 73 | home_lineup6_bat_id,object 74 | home_lineup6_fld_cd,uint8 75 | home_lineup7_bat_id,object 76 | home_lineup7_fld_cd,uint8 77 | home_lineup8_bat_id,object 78 | home_lineup8_fld_cd,uint8 79 | home_lineup9_bat_id,object 80 | home_lineup9_fld_cd,uint8 81 | away_finish_pit_id,object 82 | home_finish_pit_id,object 83 | away_team_league_id,object 84 | home_team_league_id,object 85 | outs_ct,uint8 86 | away_line_tx,object 87 | home_line_tx,object 88 | away_ab_ct,uint8 89 | away_2b_ct,uint8 90 | away_3b_ct,uint8 91 | away_hr_ct,uint8 92 | away_bi_ct,uint8 93 | away_sh_ct,uint8 94 | away_sf_ct,uint8 95 | away_hp_ct,uint8 96 | away_bb_ct,uint8 97 | away_ibb_ct,uint8 98 | away_so_ct,uint8 99 | away_sb_ct,uint8 100 | away_cs_ct,uint8 101 | away_gdp_ct,uint8 102 | away_xi_ct,uint8 103 | away_pitcher_ct,uint8 104 | away_er_ct,uint8 105 | away_ter_ct,uint8 106 | away_wp_ct,uint8 107 | away_bk_ct,uint8 108 | away_po_ct,uint8 109 | away_a_ct,uint8 110 | away_pb_ct,uint8 111 | away_dp_ct,uint8 112 | away_tp_ct,uint8 113 | home_ab_ct,uint8 114 | home_2b_ct,uint8 115 | home_3b_ct,uint8 116 | home_hr_ct,uint8 117 | home_bi_ct,uint8 118 | home_sh_ct,uint8 119 | home_sf_ct,uint8 120 | home_hp_ct,uint8 121 | home_bb_ct,uint8 122 | home_ibb_ct,uint8 123 | home_so_ct,uint8 124 | home_sb_ct,uint8 125 | home_cs_ct,uint8 126 | home_gdp_ct,uint8 127 | home_xi_ct,uint8 128 | home_pitcher_ct,uint8 129 | home_er_ct,uint8 130 | home_ter_ct,uint8 131 | home_wp_ct,uint8 132 | home_bk_ct,uint8 133 | home_po_ct,uint8 134 | home_a_ct,uint8 135 | home_pb_ct,uint8 136 | home_dp_ct,uint8 137 | home_tp_ct,uint8 138 | win_pit_name_tx,object 139 | lose_pit_name_tx,object 140 | save_pit_name_tx,object 141 | goahead_rbi_id,object 142 | goahead_rbi_name_tx,object 143 | away_lineup1_bat_name_tx,object 144 | away_lineup2_bat_name_tx,object 145 | away_lineup3_bat_name_tx,object 146 | away_lineup4_bat_name_tx,object 147 | away_lineup5_bat_name_tx,object 148 | away_lineup6_bat_name_tx,object 149 | away_lineup7_bat_name_tx,object 150 | away_lineup8_bat_name_tx,object 151 | away_lineup9_bat_name_tx,object 152 | home_lineup1_bat_name_tx,object 153 | home_lineup2_bat_name_tx,object 154 | home_lineup3_bat_name_tx,object 155 | home_lineup4_bat_name_tx,object 156 | home_lineup5_bat_name_tx,object 157 | home_lineup6_bat_name_tx,object 158 | home_lineup7_bat_name_tx,object 159 | home_lineup8_bat_name_tx,object 160 | home_lineup9_bat_name_tx,object 161 | -------------------------------------------------------------------------------- /download_scripts/conftest.py: -------------------------------------------------------------------------------- 1 | """Fixtures for Data Consistency Testing 2 | 3 | Data Consistency Testing is for the year 1974 through 2019 inclusive. 4 | """ 5 | import pytest 6 | from pathlib import Path 7 | from . 
import data_helper as dh 8 | 9 | 10 | def pytest_addoption(parser): 11 | parser.addoption( 12 | "--data-dir", action='store', default="../data", type=str, help="baseball data directory" 13 | ) 14 | parser.addoption( 15 | "--runslow", action="store_true", default=False, help="run slow tests" 16 | ) 17 | 18 | 19 | def pytest_collection_modifyitems(config, items): 20 | if config.getoption("--runslow"): 21 | # --runslow given in cli: do not skip slow tests 22 | return 23 | 24 | skip_slow = pytest.mark.skip(reason="need --runslow option to run") 25 | for item in items: 26 | if "slow" in item.keywords: 27 | item.add_marker(skip_slow) 28 | 29 | 30 | @pytest.fixture(scope='session') 31 | def data_dir(request): 32 | return Path(request.config.getoption("--data-dir")) 33 | 34 | 35 | # @pytest.fixture(scope='session') 36 | # def player_game(data_dir): 37 | # # depending upon the amount of data, it could take 30 seconds to decompress player_game.csv.gz 38 | # filename = data_dir / 'retrosheet' / 'collected' / 'player_game.csv.gz' 39 | # player_game = dh.from_csv_with_types(filename) 40 | # return player_game 41 | 42 | 43 | @pytest.fixture(scope='session') 44 | def team_game(data_dir): 45 | filename = data_dir / 'retrosheet' / 'wrangled' / 'team_game.csv.gz' 46 | team_game = dh.from_csv_with_types(filename) 47 | team_game = team_game.query('1974 <= game_start.dt.year <= 2019') 48 | return team_game 49 | 50 | 51 | @pytest.fixture(scope='session') 52 | def game(data_dir): 53 | filename = data_dir / 'retrosheet' / 'wrangled' / 'game.csv.gz' 54 | game = dh.from_csv_with_types(filename) 55 | game = game.query('1974 <= game_start.dt.year <= 2019') 56 | return game 57 | 58 | 59 | @pytest.fixture(scope='session') 60 | def batting(data_dir): 61 | filename = data_dir / 'retrosheet' / 'wrangled' / 'batting.csv.gz' 62 | batting = dh.from_csv_with_types(filename) 63 | batting = batting.query('1974 <= game_start.dt.year <= 2019') 64 | return batting 65 | 66 | 67 | @pytest.fixture(scope='session') 68 | def pitching(data_dir): 69 | filename = data_dir / 'retrosheet' / 'wrangled' / 'pitching.csv.gz' 70 | pitching = dh.from_csv_with_types(filename) 71 | pitching = pitching.query('1974 <= game_start.dt.year <= 2019') 72 | return pitching 73 | 74 | 75 | @pytest.fixture(scope='session') 76 | def fielding(data_dir): 77 | filename = data_dir / 'retrosheet' / 'wrangled' / 'fielding.csv.gz' 78 | fielding = dh.from_csv_with_types(filename) 79 | fielding = fielding.query('1974 <= game_start.dt.year <= 2019') 80 | return fielding 81 | 82 | 83 | @pytest.fixture(scope='session') 84 | def lahman_batting(data_dir): 85 | filename = data_dir / 'lahman' / 'wrangled' / 'batting.csv' 86 | batting = dh.from_csv_with_types(filename) 87 | batting = batting.query('1974 <= year <= 2019') 88 | return batting 89 | 90 | 91 | @pytest.fixture(scope='session') 92 | def lahman_pitching(data_dir): 93 | filename = data_dir / 'lahman' / 'wrangled' / 'pitching.csv' 94 | pitching = dh.from_csv_with_types(filename) 95 | pitching = pitching.query('1974 <= year <= 2019') 96 | return pitching 97 | 98 | 99 | @pytest.fixture(scope='session') 100 | def lahman_fielding(data_dir): 101 | filename = data_dir / 'lahman' / 'wrangled' / 'fielding.csv' 102 | fielding = dh.from_csv_with_types(filename) 103 | fielding = fielding.query('1974 <= year <= 2019') 104 | return fielding 105 | 106 | 107 | @pytest.fixture(scope='session') 108 | def lahman_teams(data_dir): 109 | filename = data_dir / 'lahman' / 'wrangled' / 'teams.csv' 110 | teams = dh.from_csv_with_types(filename) 
111 | teams = teams.query('1974 <= year <= 2019') 112 | return teams 113 | 114 | 115 | @pytest.fixture(scope='session') 116 | def lahman_people(data_dir): 117 | filename = data_dir / 'lahman' / 'wrangled' / 'people.csv' 118 | return dh.from_csv_with_types(filename) 119 | 120 | 121 | @pytest.fixture(scope='session') 122 | def event(data_dir): 123 | filename = data_dir / 'retrosheet' / 'wrangled' / 'event.csv.gz' 124 | return dh.from_csv_with_types(filename) 125 | -------------------------------------------------------------------------------- /download_scripts/lahman_download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Download and Unzip Lahman Data to {data_dir}/lahman/raw 4 | 5 | Will not download data if it has already been downloaded. 6 | """ 7 | 8 | __author__ = 'Stephen Diehl' 9 | 10 | import os 11 | import shutil 12 | import argparse 13 | import requests 14 | from pathlib import Path 15 | import zipfile 16 | import logging 17 | import sys 18 | 19 | logger = logging.getLogger(__name__) 20 | logger.setLevel(logging.DEBUG) 21 | 22 | 23 | def get_parser(): 24 | """Args Description""" 25 | 26 | parser = argparse.ArgumentParser( 27 | description=__doc__, 28 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 29 | 30 | parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data') 31 | parser.add_argument("-v", "--verbose", help="verbose output", action="store_true") 32 | parser.add_argument("--log", dest="log_level", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 33 | help="Set the logging level") 34 | 35 | return parser 36 | 37 | 38 | def mk_dirs(data_dir): 39 | """Make data directories""" 40 | p_lahman = Path(data_dir) / 'lahman' 41 | p_lahman_raw = p_lahman / 'raw' 42 | p_lahman_wrangled = p_lahman / 'wrangled' 43 | 44 | # create directories from these path objects 45 | p_lahman_raw.mkdir(parents=True, exist_ok=True) 46 | p_lahman_wrangled.mkdir(parents=True, exist_ok=True) 47 | 48 | msg = " ".join(os.listdir(p_lahman)) 49 | logger.info(f'{p_lahman} contents: {msg}') 50 | 51 | 52 | def download_data(raw_dir): 53 | """download and unzip Lahman zip file""" 54 | os.chdir(raw_dir) 55 | 56 | # download most recent data dictionary (accurate for 2019) 57 | url = 'http://www.seanlahman.com/files/database/readme2017.txt' 58 | dd_filename = '../readme2017.txt' 59 | if not Path(dd_filename).is_file(): 60 | r = requests.get(url) 61 | r.raise_for_status() 62 | with open(dd_filename, 'wb') as f: 63 | f.write(r.content) 64 | 65 | # download most recent Lahman data 66 | # most recent data is not from www.seanlahman.com. It is from chadwickbureau on github. 
67 | zip_filename = 'baseballdatabank-master.zip' 68 | 69 | if not Path(zip_filename).is_file(): 70 | logger.info('Downloading Data ...') 71 | 72 | url = 'https://github.com/chadwickbureau/baseballdatabank/archive/master.zip' 73 | r = requests.get(url) 74 | r.raise_for_status() 75 | with open(zip_filename, 'wb') as f: 76 | f.write(r.content) 77 | 78 | # unzip it 79 | with zipfile.ZipFile(zip_filename, "r") as zip_ref: 80 | zip_ref.extractall('.') 81 | 82 | 83 | def reorg_files(raw_dir): 84 | """move the unzipped files to the raw directory and remove the extract directory""" 85 | os.chdir(raw_dir) 86 | 87 | if not Path('People.csv').is_file(): 88 | unzip_dir = raw_dir / 'baseballdatabank-master' / 'core' 89 | 90 | # move the unzipped csv files to the current working directory 91 | for root, dirs, files in os.walk(unzip_dir): 92 | for file in files: 93 | shutil.move(root + '/' + file, '.') 94 | 95 | # rm the extract directory 96 | shutil.rmtree('baseballdatabank-master') 97 | 98 | msg = '\n'.join(os.listdir('.')) 99 | logger.info(f'{raw_dir} contents:\n {msg}') 100 | 101 | 102 | def main(): 103 | """Download and Unzip Lahman Data to {data_dir}/lahman/raw""" 104 | parser = get_parser() 105 | args = parser.parse_args() 106 | 107 | if args.log_level: 108 | fh = logging.FileHandler('download.log') 109 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 110 | fh.setFormatter(formatter) 111 | fh.setLevel(args.log_level) 112 | logger.addHandler(fh) 113 | 114 | if args.verbose: 115 | # send INFO level logging to stdout 116 | sh = logging.StreamHandler(sys.stdout) 117 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 118 | sh.setFormatter(formatter) 119 | sh.setLevel(logging.INFO) 120 | logger.addHandler(sh) 121 | 122 | data_dir = Path(args.data_dir) 123 | mk_dirs(data_dir) 124 | 125 | raw_dir = data_dir / 'lahman/raw' 126 | raw_dir = raw_dir.resolve() 127 | download_data(raw_dir) 128 | reorg_files(raw_dir) 129 | 130 | logger.info('Finished') 131 | 132 | 133 | if __name__ == '__main__': 134 | main() 135 | -------------------------------------------------------------------------------- /download_scripts/retrosheet_datadictionary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Use the Retrosheet parsers to generate their Data Dictionaries.""" 4 | 5 | __author__ = 'Stephen Diehl' 6 | 7 | import csv 8 | import subprocess 9 | import os 10 | from pathlib import Path 11 | import io 12 | import re 13 | import argparse 14 | 15 | 16 | def get_parser(): 17 | """Args Description""" 18 | 19 | # current_year = datetime.datetime.today().year 20 | parser = argparse.ArgumentParser( 21 | description=__doc__, 22 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 23 | 24 | parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data') 25 | parser.add_argument("-v", "--verbose", help="verbose output", action="store_true") 26 | 27 | return parser 28 | 29 | 30 | def check_for_retrosheet_parsers(): 31 | """Check that parsers can be executed.""" 32 | p1 = subprocess.run(['cwdaily', '-h'], shell=False, capture_output=True) 33 | if p1.returncode != 0: 34 | raise FileNotFoundError('could not execute cwdaily') 35 | 36 | p1 = subprocess.run(['cwgame', '-h'], shell=False, capture_output=True) 37 | if p1.returncode != 0: 38 | raise FileNotFoundError('could not execute cwgame') 39 | 40 | 41 | def get_cwdaily_values(description): 42 | """Get cwdaily field 
descriptions""" 43 | cwdaily_values = [] 44 | for line in io.StringIO(description): 45 | 46 | # if the line starts with a number 47 | if re.match(r'^\d+', line): 48 | tmp = line.rstrip()[8:] 49 | 50 | if ':' in tmp: 51 | key, value = tmp.split(':') 52 | value = value.strip() 53 | else: 54 | value = tmp.strip() 55 | cwdaily_values.append(value) 56 | 57 | return cwdaily_values 58 | 59 | 60 | def get_cwgame_values(description): 61 | """get cwgame field descriptions""" 62 | cwgame_values = [] 63 | for line in io.StringIO(description): 64 | 65 | # if the line starts with a number 66 | if re.match(r'^\d+', line): 67 | tmp = line.rstrip()[8:] 68 | 69 | if ':' in tmp: 70 | key, value = tmp.split(':') 71 | value = value.strip() 72 | else: 73 | value = tmp.strip() 74 | cwgame_values.append(value) 75 | 76 | return cwgame_values 77 | 78 | 79 | def main(): 80 | """Generate Data Dictionary from cwdaily and cwgame parsers""" 81 | check_for_retrosheet_parsers() 82 | 83 | parser = get_parser() 84 | args = parser.parse_args() 85 | 86 | p_data = Path(args.data_dir).resolve() 87 | p_data_raw = p_data.joinpath('retrosheet/raw') 88 | os.chdir(p_data_raw) 89 | 90 | # TODO allow this to work with any event file found in raw directory 91 | if not Path('2019LAN.EVN').is_file(): 92 | raise FileNotFoundError('retrosheet data must be downloaded first') 93 | 94 | args = ['cwdaily', '-f', '0-153', '-n', '-y', '2019', '2019LAN.EVN'] 95 | result = subprocess.run(args, shell=False, text=True, capture_output=True) 96 | 97 | # get header row 98 | cwdaily_keys = next(csv.reader(io.StringIO(result.stdout))) 99 | 100 | args = ['cwgame', '-f', '0-83', '-x', '0-94', '-n', '-y', '2019', '2019LAN.EVN'] 101 | result = subprocess.run(args, shell=False, text=True, capture_output=True) 102 | 103 | # get header row 104 | cwgame_keys = next(csv.reader(io.StringIO(result.stdout))) 105 | 106 | args = ['cwdaily', '-f', '0-153', '-d'] 107 | result = subprocess.run(args, shell=False, text=True, capture_output=True) 108 | 109 | # stderr not stdout 110 | cwdaily_values = get_cwdaily_values(result.stderr) 111 | 112 | args = ['cwgame', '-f', '0-83', '-x', '0-94', '-d'] 113 | result = subprocess.run(args, shell=False, text=True, capture_output=True) 114 | 115 | # stderr not stdout 116 | cwgame_values = get_cwgame_values(result.stderr) 117 | 118 | assert len(cwdaily_keys) == len(cwdaily_values) 119 | assert len(cwgame_keys) == len(cwgame_values) 120 | cwdaily_dict = dict(zip(cwdaily_keys, cwdaily_values)) 121 | cwgame_dict = dict(zip(cwgame_keys, cwgame_values)) 122 | 123 | p_retrosheet = p_data.joinpath('retrosheet') 124 | os.chdir(p_retrosheet) 125 | with open('cwdaily_datadictionary.txt', 'w') as fh: 126 | for key, value in cwdaily_dict.items(): 127 | fh.write(f'{key} = {value}\n') 128 | 129 | with open('cwgame_datadictionary.txt', 'w') as fh: 130 | for key, value in cwgame_dict.items(): 131 | fh.write(f'{key} = {value}\n') 132 | 133 | 134 | if __name__ == '__main__': 135 | main() 136 | -------------------------------------------------------------------------------- /data/retrosheet/cwdaily_datadictionary.txt: -------------------------------------------------------------------------------- 1 | GAME_ID = game id 2 | GAME_DT = date 3 | GAME_CT = game number (0 = no double header) 4 | APPEAR_DT = apperance date 5 | TEAM_ID = team id 6 | PLAYER_ID = player id 7 | SLOT_CT = player slot in batting order 8 | SEQ_CT = sequence in batting order slot 9 | HOME_FL = home flag 10 | OPPONENT_ID = opponent id 11 | PARK_ID = park id 12 | B_G = games played 13 | 
B_PA = plate appearances 14 | B_AB = at bats 15 | B_R = runs 16 | B_H = hits 17 | B_TB = total bases 18 | B_2B = doubles 19 | B_3B = triples 20 | B_HR = home runs 21 | B_HR4 = grand slams 22 | B_RBI = runs batted in 23 | B_GW = game winning RBI 24 | B_BB = walks 25 | B_IBB = intentional walks 26 | B_SO = strikeouts 27 | B_GDP = grounded into DP 28 | B_HP = hit by pitch 29 | B_SH = sacrifice hits 30 | B_SF = sacrifice flies 31 | B_SB = stolen bases 32 | B_CS = caught stealing 33 | B_XI = reached on interference 34 | B_G_DH = games as DH 35 | B_G_PH = games as PH 36 | B_G_PR = games as PR 37 | P_G = games pitched 38 | P_GS = games started 39 | P_CG = complete games 40 | P_SHO = shutouts 41 | P_GF = games finished 42 | P_W = wins 43 | P_L = losses 44 | P_SV = saves 45 | P_OUT = outs recorded (innings pitched times 3) 46 | P_TBF = batters faced 47 | P_AB = at bats 48 | P_R = runs allowed 49 | P_ER = earned runs allowed 50 | P_H = hits allowed 51 | P_TB = total bases allowed 52 | P_2B = doubles allowed 53 | P_3B = triples allowed 54 | P_HR = home runs allowed 55 | P_HR4 = grand slams allowed 56 | P_BB = walks allowed 57 | P_IBB = intentional walks allowed 58 | P_SO = strikeouts 59 | P_GDP = grounded into double play 60 | P_HP = hit batsmen 61 | P_SH = sacrifice hits against 62 | P_SF = sacrifice flies against 63 | P_XI = reached on interference 64 | P_WP = wild pitches 65 | P_BK = balks 66 | P_IR = inherited runners 67 | P_IRS = inherited runners scored 68 | P_GO = ground outs 69 | P_AO = air outs 70 | P_PITCH = pitches 71 | P_STRIKE = strikes 72 | F_P_G = games at P 73 | F_P_GS = games started at P 74 | F_P_OUT = outs recorded at P (innings fielded times 3) 75 | F_P_TC = total chances at P 76 | F_P_PO = putouts at P 77 | F_P_A = assists at P 78 | F_P_E = errors at P 79 | F_P_DP = double plays at P 80 | F_P_TP = triple plays at P 81 | F_C_G = games at C 82 | F_C_GS = games started at C 83 | F_C_OUT = outs recorded at C (innings fielded times 3) 84 | F_C_TC = total chances at C 85 | F_C_PO = putouts at C 86 | F_C_A = assists at C 87 | F_C_E = errors at C 88 | F_C_DP = double plays at C 89 | F_C_TP = triple plays at C 90 | F_C_PB = passed balls at C 91 | F_C_XI = catcher's interference at C 92 | F_1B_G = games at 1B 93 | F_1B_GS = games started at 1B 94 | F_1B_OUT = outs recorded at 1B (innings fielded times 3) 95 | F_1B_TC = total chances at 1B 96 | F_1B_PO = putouts at 1B 97 | F_1B_A = assists at 1B 98 | F_1B_E = errors at 1B 99 | F_1B_DP = double plays at 1B 100 | F_1B_TP = triple plays at 1B 101 | F_2B_G = games at 2B 102 | F_2B_GS = games started at 2B 103 | F_2B_OUT = outs recorded at 2B (innings fielded times 3) 104 | F_2B_TC = total chances at 2B 105 | F_2B_PO = putouts at 2B 106 | F_2B_A = assists at 2B 107 | F_2B_E = errors at 2B 108 | F_2B_DP = double plays at 2B 109 | F_2B_TP = triple plays at 2B 110 | F_3B_G = games at 3B 111 | F_3B_GS = games started at 3B 112 | F_3B_OUT = outs recorded at 3B (innings fielded times 3) 113 | F_3B_TC = total chances at 3B 114 | F_3B_PO = putouts at 3B 115 | F_3B_A = assists at 3B 116 | F_3B_E = errors at 3B 117 | F_3B_DP = double plays at 3B 118 | F_3B_TP = triple plays at 3B 119 | F_SS_G = games at SS 120 | F_SS_GS = games started at SS 121 | F_SS_OUT = outs recorded at SS (innings fielded times 3) 122 | F_SS_TC = total chances at SS 123 | F_SS_PO = putouts at SS 124 | F_SS_A = assists at SS 125 | F_SS_E = errors at SS 126 | F_SS_DP = double plays at SS 127 | F_SS_TP = triple plays at SS 128 | F_LF_G = games at LF 129 | F_LF_GS = games started at LF 
130 | F_LF_OUT = outs recorded at LF (innings fielded times 3) 131 | F_LF_TC = total chances at LF 132 | F_LF_PO = putouts at LF 133 | F_LF_A = assists at LF 134 | F_LF_E = errors at LF 135 | F_LF_DP = double plays at LF 136 | F_LF_TP = triple plays at LF 137 | F_CF_G = games at CF 138 | F_CF_GS = games started at CF 139 | F_CF_OUT = outs recorded at CF (innings fielded times 3) 140 | F_CF_TC = total chances at CF 141 | F_CF_PO = putouts at CF 142 | F_CF_A = assists at CF 143 | F_CF_E = errors at CF 144 | F_CF_DP = double plays at CF 145 | F_CF_TP = triple plays at CF 146 | F_RF_G = games at RF 147 | F_RF_GS = games started at RF 148 | F_RF_OUT = outs recorded at RF (innings fielded times 3) 149 | F_RF_TC = total chances at RF 150 | F_RF_PO = putouts at RF 151 | F_RF_A = assists at RF 152 | F_RF_E = errors at RF 153 | F_RF_DP = double plays at RF 154 | F_RF_TP = triple plays at RF 155 | -------------------------------------------------------------------------------- /data/retrosheet/nb_data/pf.csv: -------------------------------------------------------------------------------- 1 | team_id,year,pf,pf_half,name 2 | ANA,2015,0.86,0.93,Angels 3 | ANA,2016,0.91,0.96,Angels 4 | ANA,2017,0.95,0.97,Angels 5 | ANA,2018,0.97,0.98,Angels 6 | ANA,2019,1.01,1.0,Angels 7 | ARI,2015,1.06,1.03,Diamondbacks 8 | ARI,2016,1.22,1.11,Diamondbacks 9 | ARI,2017,1.2,1.1,Diamondbacks 10 | ARI,2018,1.06,1.03,Diamondbacks 11 | ARI,2019,0.98,0.99,Diamondbacks 12 | ATL,2015,0.94,0.97,Braves 13 | ATL,2016,1.06,1.03,Braves 14 | ATL,2017,0.98,0.99,Braves 15 | ATL,2018,1.12,1.06,Braves 16 | ATL,2019,1.0,1.0,Braves 17 | BAL,2015,1.21,1.1,Orioles 18 | BAL,2016,0.95,0.98,Orioles 19 | BAL,2017,1.03,1.01,Orioles 20 | BAL,2018,0.98,0.99,Orioles 21 | BAL,2019,1.09,1.04,Orioles 22 | BOS,2015,1.19,1.1,Red Sox 23 | BOS,2016,1.2,1.1,Red Sox 24 | BOS,2017,1.03,1.01,Red Sox 25 | BOS,2018,1.08,1.04,Red Sox 26 | BOS,2019,1.03,1.01,Red Sox 27 | CHA,2015,0.9,0.95,White Sox 28 | CHA,2016,0.93,0.96,White Sox 29 | CHA,2017,1.0,1.0,White Sox 30 | CHA,2018,0.94,0.97,White Sox 31 | CHA,2019,0.97,0.98,White Sox 32 | CHN,2015,0.95,0.98,Cubs 33 | CHN,2016,0.87,0.94,Cubs 34 | CHN,2017,1.13,1.07,Cubs 35 | CHN,2018,1.08,1.04,Cubs 36 | CHN,2019,0.93,0.97,Cubs 37 | CIN,2015,1.12,1.06,Reds 38 | CIN,2016,0.99,0.99,Reds 39 | CIN,2017,1.02,1.01,Reds 40 | CIN,2018,1.13,1.06,Reds 41 | CIN,2019,1.03,1.02,Reds 42 | CLE,2015,1.26,1.13,Indians 43 | CLE,2016,1.21,1.1,Indians 44 | CLE,2017,0.97,0.99,Indians 45 | CLE,2018,1.12,1.06,Indians 46 | CLE,2019,0.97,0.99,Indians 47 | COL,2015,1.44,1.22,Rockies 48 | COL,2016,1.37,1.18,Rockies 49 | COL,2017,1.33,1.17,Rockies 50 | COL,2018,1.27,1.14,Rockies 51 | COL,2019,1.39,1.2,Rockies 52 | DET,2015,0.9,0.95,Tigers 53 | DET,2016,1.02,1.01,Tigers 54 | DET,2017,1.17,1.08,Tigers 55 | DET,2018,0.95,0.97,Tigers 56 | DET,2019,1.11,1.05,Tigers 57 | HOU,2015,0.93,0.96,Astros 58 | HOU,2016,0.81,0.9,Astros 59 | HOU,2017,0.82,0.91,Astros 60 | HOU,2018,0.99,0.99,Astros 61 | HOU,2019,1.1,1.05,Astros 62 | KCA,2015,1.02,1.01,Royals 63 | KCA,2016,1.17,1.09,Royals 64 | KCA,2017,0.93,0.96,Royals 65 | KCA,2018,1.06,1.03,Royals 66 | KCA,2019,1.07,1.04,Royals 67 | LAN,2015,0.92,0.96,Dodgers 68 | LAN,2016,0.81,0.91,Dodgers 69 | LAN,2017,0.97,0.99,Dodgers 70 | LAN,2018,0.86,0.93,Dodgers 71 | LAN,2019,0.9,0.95,Dodgers 72 | MIA,2015,0.95,0.98,Marlins 73 | MIA,2016,0.83,0.92,Marlins 74 | MIA,2017,0.85,0.93,Marlins 75 | MIA,2018,0.75,0.87,Marlins 76 | MIA,2019,1.09,1.04,Marlins 77 | MIL,2015,1.1,1.05,Brewers 78 | MIL,2016,0.97,0.99,Brewers 79 | 
MIL,2017,1.08,1.04,Brewers 80 | MIL,2018,1.01,1.01,Brewers 81 | MIL,2019,0.98,0.99,Brewers 82 | MIN,2015,0.99,1.0,Twins 83 | MIN,2016,1.04,1.02,Twins 84 | MIN,2017,1.1,1.05,Twins 85 | MIN,2018,1.02,1.01,Twins 86 | MIN,2019,0.98,0.99,Twins 87 | NYA,2015,1.02,1.01,Yankees 88 | NYA,2016,1.04,1.02,Yankees 89 | NYA,2017,1.0,1.0,Yankees 90 | NYA,2018,1.13,1.06,Yankees 91 | NYA,2019,0.84,0.92,Yankees 92 | NYN,2015,0.87,0.94,Mets 93 | NYN,2016,0.99,0.99,Mets 94 | NYN,2017,0.86,0.93,Mets 95 | NYN,2018,0.73,0.87,Mets 96 | NYN,2019,0.89,0.95,Mets 97 | OAK,2015,0.94,0.97,Athletics 98 | OAK,2016,0.83,0.91,Athletics 99 | OAK,2017,1.1,1.05,Athletics 100 | OAK,2018,0.84,0.92,Athletics 101 | OAK,2019,0.89,0.94,Athletics 102 | PHI,2015,1.04,1.02,Phillies 103 | PHI,2016,0.84,0.92,Phillies 104 | PHI,2017,1.07,1.04,Phillies 105 | PHI,2018,1.04,1.02,Phillies 106 | PHI,2019,1.05,1.02,Phillies 107 | PIT,2015,0.93,0.97,Pirates 108 | PIT,2016,1.01,1.0,Pirates 109 | PIT,2017,0.95,0.97,Pirates 110 | PIT,2018,0.88,0.94,Pirates 111 | PIT,2019,1.0,1.0,Pirates 112 | SDN,2015,0.93,0.97,Padres 113 | SDN,2016,1.01,1.01,Padres 114 | SDN,2017,0.83,0.91,Padres 115 | SDN,2018,1.04,1.02,Padres 116 | SDN,2019,0.86,0.93,Padres 117 | SEA,2015,0.88,0.94,Mariners 118 | SEA,2016,0.94,0.97,Mariners 119 | SEA,2017,0.92,0.96,Mariners 120 | SEA,2018,0.85,0.92,Mariners 121 | SEA,2019,0.95,0.98,Mariners 122 | SFN,2015,0.85,0.92,Giants 123 | SFN,2016,1.01,1.01,Giants 124 | SFN,2017,0.85,0.92,Giants 125 | SFN,2018,1.01,1.01,Giants 126 | SFN,2019,0.8,0.9,Giants 127 | SLN,2015,0.93,0.97,Cardinals 128 | SLN,2016,0.92,0.96,Cardinals 129 | SLN,2017,0.89,0.94,Cardinals 130 | SLN,2018,0.93,0.96,Cardinals 131 | SLN,2019,0.92,0.96,Cardinals 132 | TBA,2015,0.96,0.98,Rays 133 | TBA,2016,0.89,0.94,Rays 134 | TBA,2017,0.92,0.96,Rays 135 | TBA,2018,0.93,0.96,Rays 136 | TBA,2019,0.89,0.95,Rays 137 | TEX,2015,1.14,1.07,Rangers 138 | TEX,2016,1.16,1.08,Rangers 139 | TEX,2017,1.22,1.11,Rangers 140 | TEX,2018,1.35,1.18,Rangers 141 | TEX,2019,1.24,1.12,Rangers 142 | TOR,2015,0.91,0.95,Blue Jays 143 | TOR,2016,1.16,1.08,Blue Jays 144 | TOR,2017,0.95,0.97,Blue Jays 145 | TOR,2018,0.96,0.98,Blue Jays 146 | TOR,2019,1.03,1.02,Blue Jays 147 | WAS,2015,1.0,1.0,Nationals 148 | WAS,2016,0.96,0.98,Nationals 149 | WAS,2017,1.06,1.03,Nationals 150 | WAS,2018,1.13,1.07,Nationals 151 | WAS,2019,1.1,1.05,Nationals 152 | -------------------------------------------------------------------------------- /download_scripts/README.md: -------------------------------------------------------------------------------- 1 | # Data Preparation Scripts for Baseball Analytics 2 | 3 | These scripts download, parse and wrangle the Lahman and Retrosheet data. 4 | 5 | An optional script creates Postgres tables with appropriate primary key constraints and loads the csv files into these tables. 6 | 7 | All scripts should be run from the download_scripts directory. 
8 | 9 | For all scripts: 10 | 11 | * --help for help 12 | * -v for verbose: logs to stdout 13 | * --log INFO: appends to download.log file (at the INFO level) 14 | * --data-dir ../data: specifies the data directory (default is ../data) 15 | 16 | Scripts with example command line arguments: 17 | 18 | * **./run_all_scripts.py** --start-year=1974 --end-year=2019 19 | * convenience script to run all scripts with -v --log=INFO 20 | * default data directory is ../data 21 | * all data is downloaded but only the years specified are parsed and wrangled 22 | * **./lahman_download.py** -v --log=INFO 23 | * downloads all the lahman data and unzips it to `../data/lahman/raw` 24 | 25 | * **./lahman_wrangle.py** -v --log=INFO 26 | * converts field names to snake_case 27 | * performs custom parsing of dates 28 | * drops fielding columns that have more than 90% missing values 29 | * optimizes data types 30 | * persists with optimized data types to `../data/lahman/wrangled` 31 | * **./retrosheet_download.py** -v --log=INFO 32 | * downloads the retrosheet data and unzips it to `../data/retrosheet/raw` 33 | * **./retrosheet_parse.py** -v --log=INFO --start-year=1974 --end-year=2019 34 | * parses data in `data/retrosheet/raw` for the specified years 35 | * cwdaily and cwgame are always run 36 | * use '--run-cwevent' to run the cwevent parser as well 37 | * use '--cwevent-fields' to specify your own set of fields using the cwevent syntax 38 | * for example, to specify all fields use: --cwevent-fields='-f 0-96 -x 0-62' 39 | * **./retrosheet_collect.py** -v --log=INFO --use-datatypes 40 | * with --use-datatypes option 41 | * uses the precomputed optimized data types: `data/retrosheet/*_types.csv` 42 | * this can save several Gigs of RAM, if data goes back to the 1950s or earlier 43 | * without --use-datatypes option 44 | * will compute and save the optimized data types 45 | * may require more than 16 Gig of RAM, if data goes back to the 1950s or earlier 46 | * collects the results into one DataFrame for cwdaily and one DataFrame for cwgame 47 | * if there are cwevent files, it will collect these into a single DataFrame as well 48 | * if there are cwevent files, it will add the following new fields to make play-by-play analysis easier: so, sb, cs, bk, bb, ibb, hbp, xi, single, double, triple, hr 49 | * converts the field names to lower case 50 | * drops columns that have more than 99% missing values 51 | * persists the results to `../data/retrosheet/collected` 52 | * the csv files are compressed using gzip 53 | * **./retrosheet_datadictionary.py** 54 | * this is an optional script which produces the data dictionary for the cwdaily and cwgame parsers 55 | * the results of running this script are published in this github repo at `data/retrosheet` as cwdaily_datadictionary.txt and cwgame_datadictionary.txt 56 | * **./retrosheet_wrangle.py** -v --log=INFO 57 | * data cleanup for non-unique primary key (player_id, game_id) 58 | * between 1948 and 2019 there is only one duplicate primary key 59 | * custom parsing of game start time 60 | * restructure cwdaily output to create batting/pitching/fielding csv files that have a row only if the player has a non-zero batting/pitching/fielding statistic for that game 61 | * restructure cwgame output to create stats per team per game (team_game.csv) and stats per game (game.csv) 62 | * the csv files are compressed using gzip 63 | * **./postgres_load_data.py** -v --log=INFO 64 | * optional script to: 65 | * create tables with optimized data types 66 | * create primary and foreign key
constraints 67 | * load data into tables 68 | * the baseball database must have already been created 69 | * connect string: f'postgresql://{db_user}:{db_pass}@localhost:5432/baseball' 70 | 71 | ### Performing Data Validation 72 | 73 | pytest is used to automate the running of more than 50 data integrity and data consistency tests. 74 | 75 | Running pytest: 76 | 77 | * recommend: 'pytest -v' 78 | * must be run from the `download_scripts` directory 79 | * must be run after the scripts which download and parse the data have been run 80 | * accepts custom option: --data-dir= 81 | 82 | If you like, you may spot check the data using [Baseball Reference](https://www.baseball-reference.com/). Baseball Reference uses the Retrosheet data. The box score for a game can be constructed from the game_id using: 83 | `'https://www.baseball-reference.com/boxes/' + game_id.str[:3] + '/' + game_id + '.shtml'` 84 | For example, to verify that there are two entries for Chris Young for game_id = BOS201708250, the url is: 85 | https://www.baseball-reference.com/boxes/BOS/BOS201708250.shtml 86 | 87 | ### Rerunning the Scripts 88 | 89 | It is rarely necessary to re-download the data. Minor tweaks are continually being made to Lahman and Retrosheet for very old data, but recent data is usually accurate and complete the first time it is made available. 90 | 91 | The data is not updated during the season. It is added to both Lahman and Retrosheet around late December. For example, all of the 2019 regular and post-season data for both Lahman and Retrosheet became available in late December 2019. 92 | 93 | To rerun the scripts, it is only necessary to remove the data from data directories other than the raw data directories. -------------------------------------------------------------------------------- /MLB_Data_Overview.md: -------------------------------------------------------------------------------- 1 | ## MLB Data Overview 2 | 3 | ### Tidy Data Definition 4 | 5 | Data is [tidy](https://en.wikipedia.org/wiki/Tidy_data) if: 6 | 7 | 1. Each variable forms a column. 8 | 2. Each observation forms a row. 9 | 3. Each type of observational unit forms a table or csv file. 10 | 11 | The above is nearly identical to the database term "3rd normal form". Arguably the last rule above is not required for data analysis, but it saves space and helps to ensure data consistency. 12 | 13 | The benefit of making the data tidy is that data analysis is much easier. 14 | 15 | ### Lahman Overview 16 | 17 | The Lahman data is tidy. The description of these csv files is in the `data/lahman` directory and is called readme2017.txt. It was copied from the Lahman website and it is accurate for 2018 and 2019 as well. 18 | 19 | A description of the data might be called a "data dictionary", a "code book", or simply a "readme.txt". 20 | 21 | As of December 2019, Lahman has data through the end of the 2019 season. 22 | 23 | ### Retrosheet Overview 24 | 25 | The Retrosheet data is not tidy, nor is it in csv format; rather, it is in a custom text format. Reading this format is most easily done using the open-source parsers by Dr. T. L. Turocy, which convert the Retrosheet text files into csv files with a header row. 26 | 27 | As of December 2019, Retrosheet has data through the 2019 season. 28 | 29 | ### Field Names 30 | 31 | The field names in both datasets are based on standard baseball abbreviations. See for example https://en.wikipedia.org/wiki/Baseball_statistics.
86 |
87 | ### Rerunning the Scripts
88 |
89 | It is rarely necessary to re-download the data. Minor tweaks are continually being made to Lahman and Retrosheet for very old data, but recent data is usually accurate and complete the first time it is made available.
90 |
91 | The data is not updated during the season. It is added to both Lahman and Retrosheet around late December. For example, all of the 2019 regular and post-season data for both Lahman and Retrosheet became available in late December 2019.
92 |
93 | To rerun the scripts, it is only necessary to remove the data from the data directories other than the raw data directories.
--------------------------------------------------------------------------------
/MLB_Data_Overview.md:
--------------------------------------------------------------------------------
1 | ## MLB Data Overview
2 |
3 | ### Tidy Data Definition
4 |
5 | Data is [tidy](https://en.wikipedia.org/wiki/Tidy_data) if:
6 |
7 | 1. Each variable forms a column.
8 | 2. Each observation forms a row.
9 | 3. Each type of observational unit forms a table or csv file.
10 |
11 | The above is nearly identical to the database term "3rd normal form". Arguably the last rule above is not required for data analysis, but it saves space and helps to ensure data consistency.
12 |
13 | The benefit of making the data tidy is that data analysis is much easier.
14 |
15 | ### Lahman Overview
16 |
17 | The Lahman data is tidy. The description of these csv files is in the `data/lahman` directory and is called readme2017.txt. It was copied from the Lahman website and it is accurate for 2018 and 2019 as well.
18 |
19 | A description of the data might be called a "data dictionary", a "code book", or simply a "readme.txt".
20 |
21 | As of December 2019, Lahman has data through the end of the 2019 season.
22 |
23 | ### Retrosheet Overview
24 |
25 | The Retrosheet data is neither tidy nor in csv format; rather, it is in a custom text format. Reading this format is most easily done using the open-source parsers by Dr. T. L. Turocy, which convert the Retrosheet text files into csv files with a header row.
26 |
27 | As of December 2019, Retrosheet has data through the 2019 season.
28 |
29 | ### Field Names
30 |
31 | The field names in both datasets are based on standard baseball abbreviations. See for example https://en.wikipedia.org/wiki/Baseball_statistics.
32 |
33 | The field names have been changed as little as possible to remain familiar. Field name changes include:
34 |
35 | * columns in different csv files with the same meaning now have the same column name
36 | * CamelCase is converted to snake_case
37 | * '2B' and '3B' are changed to 'double' and 'triple' to make them valid identifiers
38 | * Retrosheet's 'gdp' is changed to 'gidp' to match Lahman
39 | * Retrosheet's 'hp' is changed to 'hbp' to match Lahman
40 |
41 | ### CSV Files Created
42 |
43 | After data wrangling, the following csv files exist:
44 |
45 | **Lahman**
46 |
47 | * Stats per Player per Year:
48 |   * batting.csv
49 |   * pitching.csv
50 |   * fielding.csv
51 | * Postseason Stats per Round per Player per Year:
52 |   * battingpost.csv
53 |   * pitchingpost.csv
54 |   * fieldingpost.csv
55 | * Stats per Team per Year:
56 |   * teams.csv -- contains team_id for both Lahman and Retrosheet
57 | * Other:
58 |   * people.csv -- contains player_id for Lahman, Retrosheet and Baseball-Reference
59 |   * salaries.csv
60 |   * parks.csv
61 |   * more to be added soon ...
62 |
63 |
64 | **Retrosheet**
65 |
66 | * Stats per Event:
67 |   * event.csv.gz
68 | * Stats per Player per Game:
69 |   * batting.csv.gz
70 |   * pitching.csv.gz
71 |   * fielding.csv.gz
72 | * Stats per Team per Game:
73 |   * team_game.csv.gz
74 | * Stats per Game:
75 |   * game.csv.gz
76 | * Postseason stats: to be added soon ...
77 |
78 | A script to create Postgres tables with appropriate primary key constraints, and to load each of the above csv files into these tables, is provided.
79 |
80 | ### Unique Identifiers (Primary Keys)
81 |
82 | When performing data analysis, it is essential to know which field(s) uniquely identify a row in a csv file (or table). It turns out that cwgame generates the equivalent of two entries for the same "box score" exactly once since 1948. These two entries were summed appropriately so that the expected unique identifiers work properly.
83 |
84 | Not having unique identifiers greatly complicates data analysis.
85 |
86 | ### Data Types
87 |
88 | There are several reasons to pay close attention to the data types used by Pandas and/or Postgres:
89 |
90 | * the data type provides information about the field
91 | * the data type helps to ensure correct code
92 | * using the smallest appropriate data type saves memory and database storage
93 |
94 | For example, the default data type for an integer in Pandas is 'int64', and yet the maximum number of hits in a game can be saved in just 8 bits with a 'uint8'. Pandas nullable integer data types are also used.
95 |
96 | The data type optimizations per column per csv file are persisted to disk by writing a corresponding csv file with the suffix _types.csv. I have written Python functions which then read the csv back into a dataframe using the optimized persisted data types.
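A minimal round-trip sketch using the helpers in `download_scripts/data_helper.py` (the file name and column names here are made up for illustration):

```python
import pandas as pd
import data_helper as dh  # from the download_scripts directory

df = pd.DataFrame({'team_id': ['BOS', 'NYA'],
                   'hr': pd.array([2, 0], dtype='UInt8')})

# writes demo.csv plus demo_types.csv alongside it
dh.to_csv_with_types(df, 'demo.csv')

# reads demo_types.csv first, then loads demo.csv with the persisted dtypes
df2 = dh.from_csv_with_types('demo.csv')
assert df2['hr'].dtype.name == 'UInt8'
```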
97 |
98 | ## Data Wrangling
99 |
100 | The scripts which wrangle the Lahman and Retrosheet data will:
101 |
102 | * ensure that the same field name has the same meaning across all csv files for both Lahman and Retrosheet
103 | * ensure that the field names conform to official baseball abbreviations as much as possible
104 |   * with the caveat that all field names must be valid Python identifiers and valid SQL column names
105 | * determine the most efficient data type, for both Pandas and Postgres tables, and persist that data type for each corresponding csv file
106 | * automate the running of 3 Retrosheet parsers and tidy the output
107 | * translate numeric codes into text so they can be understood
108 | * identify the different ways in which missing data is represented and create the appropriate value in Pandas
109 | * translate unusual date and time representations to appropriate date and time Pandas data types
110 | * normalize the data
111 |   * for example, every player does not play every fielding position in every game, and yet that is how the output of the cwdaily parser presents the data. As such, that output is almost all zeros. A better representation is to create a row for each player for each fielding position they actually played in a game (see the sketch below).
112 | * and more ...
113 |
114 | ### Baseball Player Roles
115 |
116 | A baseball player may have several roles during the course of a game, such as batter, pitcher and any of the 9 fielding positions.
117 |
118 | Attribute names for batters and pitchers are the same where it makes sense to do so. For example, if a batter hits a "hr" then the opposing team's pitcher must have given up a "hr".
119 |
120 | All attribute names for the 9 fielding positions are identical, even though passed-ball only applies to the catcher and interference is mostly relevant to the catcher, pitcher and first baseman. This allows for a single csv file for fielding with no null values.
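A minimal sketch of the normalization mentioned above. The column names `f_p_g` and `f_c_g` (games at pitcher and catcher) are illustrative stand-ins for the per-position fielding columns produced by the parser:

```python
import pandas as pd

# hypothetical wide output: one 'games at position' column per position
wide = pd.DataFrame({
    'player_id': ['smithj01', 'jonesb01'],
    'game_id': ['BOS201708250', 'BOS201708250'],
    'f_p_g': [1, 0],
    'f_c_g': [0, 1],
})

# one row per player per position actually played
long = wide.melt(id_vars=['player_id', 'game_id'], var_name='pos', value_name='g')
long['pos'] = long['pos'].str.extract(r'f_(\w+)_g')[0].str.upper()
long = long.query('g > 0')
print(long)
```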
--------------------------------------------------------------------------------
/download_scripts/retrosheet_parse.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """Parse all event files in {data_dir}/retrosheet/raw and put result in {data_dir}/retrosheet/parsed"""
4 |
5 | __author__ = 'Stephen Diehl'
6 |
7 | import argparse
8 | import subprocess
9 | import sys
10 | from pathlib import Path
11 | import os
12 | import glob
13 | import logging
14 |
15 | logger = logging.getLogger(__name__)
16 | logger.setLevel(logging.DEBUG)
17 |
18 |
19 | def get_parser():
20 |     """Args Description"""
21 |
22 |     parser = argparse.ArgumentParser(
23 |         description=__doc__,
24 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
25 |
26 |     parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data')
27 |
28 |     # Some Key MLB Data Dates
29 |     # 1955: sacrifice flies, sacrifice bunts and intentional walks are recorded for the first time
30 |     # 1969: divisional play begins
31 |     # 1974: Retrosheet is missing no games from 1974 to present
32 |     parser.add_argument("--start-year", type=int, help="start year", default=1955)
33 |
34 |     # Retrosheet Data for 2019 became available in December 2019
35 |     parser.add_argument("--end-year", type=int, help="end year", default=2019)
36 |
37 |     parser.add_argument("-v", "--verbose", help="verbose output", action="store_true")
38 |     parser.add_argument("--log", dest="log_level", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
39 |                         help="Set the logging level")
40 |
41 |     parser.add_argument("--run-cwevent", help="run the cwevent parser as well", action="store_true")
42 |     parser.add_argument("--cwevent-fields", type=str, help="cwevent field specification",
43 |                         default='-f 0,2,3,8,9,10,14,29,36-42,44,45,51,96 -x 1,2,5,8,11,13,14,45,50,55')
44 |
45 |     return parser
46 |
47 |
48 | def check_for_retrosheet_parsers():
49 |     """Check that parsers can be executed."""
50 |     p1 = subprocess.run(['cwevent', '-h'], shell=False, capture_output=True)
51 |     if p1.returncode != 0:
52 |         raise FileNotFoundError('could not execute cwevent')
53 |
54 |     p1 = subprocess.run(['cwdaily', '-h'], shell=False, capture_output=True)
55 |     if p1.returncode != 0:
56 |         raise FileNotFoundError('could not execute cwdaily')
57 |
58 |     p1 = subprocess.run(['cwgame', '-h'], shell=False, capture_output=True)
59 |     if p1.returncode != 0:
60 |         raise FileNotFoundError('could not execute cwgame')
61 |
62 |
63 | def parse_event_files(raw_dir, parse_dir, parser, fields, start_year, end_year):
64 |     """Parse raw Retrosheet data"""
65 |     os.chdir(raw_dir)
66 |
67 |     for year in range(start_year, end_year + 1):
68 |         files = sorted(glob.glob(f'{year}*.EV*'))
69 |         first = True
70 |
71 |         cmd = [parser]
72 |         cmd.extend(fields.split(' '))
73 |
74 |         logger.info(f'{parser} parsing {len(files)} teams for {year} ...')
75 |
76 |         for file in files:
77 |             out = f'{parse_dir.as_posix()}/{parser}{year}.csv'
78 |             if first:
79 |                 # print csv header using -n
80 |                 cmd.append('-n')
81 |                 cmd.extend(['-y', str(year)])
82 |
83 |                 cmd_full = cmd + [file]
84 |                 logger.debug(f'{" ".join(cmd_full)}')
85 |
86 |                 # overwrite existing file if it exists
87 |                 with open(out, "w+") as outfile:
88 |                     result = subprocess.run(cmd_full, shell=False, stdout=outfile, stderr=subprocess.DEVNULL)
89 |                 first = False
90 |
91 |                 # don't print csv header for subsequent teams in the same year
92 |                 cmd.remove('-n')
93 |             else:
94 |                 cmd_full = cmd + [file]
95 |                 logger.debug(f'{"
".join(cmd_full)}') 96 | 97 | # append to existing file 98 | with open(out, "a+") as outfile: 99 | result = subprocess.run(cmd_full, shell=False, stdout=outfile, stderr=subprocess.DEVNULL) 100 | 101 | 102 | def main(): 103 | """Parse the data and organize the results. 104 | """ 105 | parser = get_parser() 106 | args = parser.parse_args() 107 | 108 | if args.log_level: 109 | fh = logging.FileHandler('download.log') 110 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 111 | fh.setFormatter(formatter) 112 | fh.setLevel(args.log_level) 113 | logger.addHandler(fh) 114 | 115 | if args.verbose: 116 | # send INFO level logging to stdout 117 | sh = logging.StreamHandler(sys.stdout) 118 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 119 | sh.setFormatter(formatter) 120 | sh.setLevel(logging.INFO) 121 | logger.addHandler(sh) 122 | 123 | if args.start_year > 1974: 124 | logger.warning('data consistency tests require start_year <= 1974') 125 | args.start_year = 1974 126 | 127 | if args.end_year < 2019: 128 | logger.warning('data consistency tests require end-year >= 2019') 129 | args.end_year = 2019 130 | 131 | check_for_retrosheet_parsers() 132 | 133 | p_data = Path(args.data_dir).resolve() 134 | 135 | p_data_raw = p_data.joinpath('retrosheet/raw/event/regular') 136 | p_data_parsed = p_data.joinpath('retrosheet/parsed') 137 | p_data_collected = p_data.joinpath('retrosheet/collected') 138 | 139 | # create directories, if they do not exist 140 | p_data_parsed.mkdir(parents=True, exist_ok=True) 141 | p_data_collected.mkdir(parents=True, exist_ok=True) 142 | 143 | # this selection of fields appears to support most play-by-play analysis 144 | if args.run_cwevent: 145 | if (p_data_parsed / 'cwevent2019.csv').exists(): 146 | logger.info('Skipping cwevent parsing -- already performed') 147 | else: 148 | parse_event_files(p_data_raw, p_data_parsed, 'cwevent', 149 | args.cwevent_fields, args.start_year, args.end_year) 150 | 151 | # request all available fields for cwdaily and cwgame 152 | if (p_data_parsed / 'cwdaily2019.csv').exists(): 153 | logger.info('Skipping cwdaily parser -- already performed') 154 | else: 155 | parse_event_files(p_data_raw, p_data_parsed, 'cwdaily', '-f 0-153', args.start_year, args.end_year) 156 | 157 | if (p_data_parsed / 'cwgame2019.csv').exists(): 158 | logger.info('Skipping cwgame parser -- already performed') 159 | else: 160 | parse_event_files(p_data_raw, p_data_parsed, 'cwgame', '-f 0-83 -x 0-94', args.start_year, args.end_year) 161 | 162 | 163 | if __name__ == '__main__': 164 | main() 165 | -------------------------------------------------------------------------------- /data/retrosheet/cwgame_datadictionary.txt: -------------------------------------------------------------------------------- 1 | GAME_ID = game id 2 | GAME_DT = date 3 | GAME_CT = game number (0 = no double header) 4 | GAME_DY = day of week 5 | START_GAME_TM = start time 6 | DH_FL = DH used flag 7 | DAYNIGHT_PARK_CD = day/night flag 8 | AWAY_TEAM_ID = visiting team 9 | HOME_TEAM_ID = home team 10 | PARK_ID = game site 11 | AWAY_START_PIT_ID = vis. 
starting pitcher 12 | HOME_START_PIT_ID = home starting pitcher 13 | BASE4_UMP_ID = home plate umpire 14 | BASE1_UMP_ID = first base umpire 15 | BASE2_UMP_ID = second base umpire 16 | BASE3_UMP_ID = third base umpire 17 | LF_UMP_ID = left field umpire 18 | RF_UMP_ID = right field umpire 19 | ATTEND_PARK_CT = attendance 20 | SCORER_RECORD_ID = PS scorer 21 | TRANSLATOR_RECORD_ID = translator 22 | INPUTTER_RECORD_ID = inputter 23 | INPUT_RECORD_TS = input time 24 | EDIT_RECORD_TS = edit time 25 | METHOD_RECORD_CD = how scored 26 | PITCHES_RECORD_CD = pitches entered? 27 | TEMP_PARK_CT = temperature 28 | WIND_DIRECTION_PARK_CD = wind direction 29 | WIND_SPEED_PARK_CT = wind speed 30 | FIELD_PARK_CD = field condition 31 | PRECIP_PARK_CD = precipitation 32 | SKY_PARK_CD = sky 33 | MINUTES_GAME_CT = time of game 34 | INN_CT = number of innings 35 | AWAY_SCORE_CT = visitor final score 36 | HOME_SCORE_CT = home final score 37 | AWAY_HITS_CT = visitor hits 38 | HOME_HITS_CT = home hits 39 | AWAY_ERR_CT = visitor errors 40 | HOME_ERR_CT = home errors 41 | AWAY_LOB_CT = visitor left on base 42 | HOME_LOB_CT = home left on base 43 | WIN_PIT_ID = winning pitcher 44 | LOSE_PIT_ID = losing pitcher 45 | SAVE_PIT_ID = save for 46 | GWRBI_BAT_ID = GW RBI 47 | AWAY_LINEUP1_BAT_ID = visitor batter 1 48 | AWAY_LINEUP1_FLD_CD = visitor position 1 49 | AWAY_LINEUP2_BAT_ID = visitor batter 2 50 | AWAY_LINEUP2_FLD_CD = visitor position 2 51 | AWAY_LINEUP3_BAT_ID = visitor batter 3 52 | AWAY_LINEUP3_FLD_CD = visitor position 3 53 | AWAY_LINEUP4_BAT_ID = visitor batter 4 54 | AWAY_LINEUP4_FLD_CD = visitor position 4 55 | AWAY_LINEUP5_BAT_ID = visitor batter 5 56 | AWAY_LINEUP5_FLD_CD = visitor position 5 57 | AWAY_LINEUP6_BAT_ID = visitor batter 6 58 | AWAY_LINEUP6_FLD_CD = visitor position 6 59 | AWAY_LINEUP7_BAT_ID = visitor batter 7 60 | AWAY_LINEUP7_FLD_CD = visitor position 7 61 | AWAY_LINEUP8_BAT_ID = visitor batter 8 62 | AWAY_LINEUP8_FLD_CD = visitor position 8 63 | AWAY_LINEUP9_BAT_ID = visitor batter 9 64 | AWAY_LINEUP9_FLD_CD = visitor position 9 65 | HOME_LINEUP1_BAT_ID = home batter 1 66 | HOME_LINEUP1_FLD_CD = home position 1 67 | HOME_LINEUP2_BAT_ID = home batter 2 68 | HOME_LINEUP2_FLD_CD = home position 2 69 | HOME_LINEUP3_BAT_ID = home batter 3 70 | HOME_LINEUP3_FLD_CD = home position 3 71 | HOME_LINEUP4_BAT_ID = home batter 4 72 | HOME_LINEUP4_FLD_CD = home position 4 73 | HOME_LINEUP5_BAT_ID = home batter 5 74 | HOME_LINEUP5_FLD_CD = home position 5 75 | HOME_LINEUP6_BAT_ID = home batter 6 76 | HOME_LINEUP6_FLD_CD = home position 6 77 | HOME_LINEUP7_BAT_ID = home batter 7 78 | HOME_LINEUP7_FLD_CD = home position 7 79 | HOME_LINEUP8_BAT_ID = home batter 8 80 | HOME_LINEUP8_FLD_CD = home position 8 81 | HOME_LINEUP9_BAT_ID = home batter 9 82 | HOME_LINEUP9_FLD_CD = home position 9 83 | AWAY_FINISH_PIT_ID = visiting finisher (NULL if complete game) 84 | HOME_FINISH_PIT_ID = home finisher (NULL if complete game) 85 | AWAY_TEAM_LEAGUE_ID = visiting team league 86 | HOME_TEAM_LEAGUE_ID = home team league 87 | AWAY_TEAM_GAME_CT = visiting team game number 88 | HOME_TEAM_GAME_CT = home team game number 89 | OUTS_CT = length of game in outs 90 | COMPLETION_TX = information on completion of game 91 | FORFEIT_TX = information on forfeit of game 92 | PROTEST_TX = information on protest of game 93 | AWAY_LINE_TX = visiting team linescore 94 | HOME_LINE_TX = home team linescore 95 | AWAY_AB_CT = visiting team AB 96 | AWAY_2B_CT = visiting team 2B 97 | AWAY_3B_CT = visiting team 3B 98 | AWAY_HR_CT = visiting 
team HR 99 | AWAY_BI_CT = visiting team RBI 100 | AWAY_SH_CT = visiting team SH 101 | AWAY_SF_CT = visiting team SF 102 | AWAY_HP_CT = visiting team HP 103 | AWAY_BB_CT = visiting team BB 104 | AWAY_IBB_CT = visiting team IBB 105 | AWAY_SO_CT = visiting team SO 106 | AWAY_SB_CT = visiting team SB 107 | AWAY_CS_CT = visiting team CS 108 | AWAY_GDP_CT = visiting team GDP 109 | AWAY_XI_CT = visiting team reach on interference 110 | AWAY_PITCHER_CT = number of pitchers used by visiting team 111 | AWAY_ER_CT = visiting team individual ER allowed 112 | AWAY_TER_CT = visiting team team ER allowed 113 | AWAY_WP_CT = visiting team WP 114 | AWAY_BK_CT = visiting team BK 115 | AWAY_PO_CT = visiting team PO 116 | AWAY_A_CT = visiting team A 117 | AWAY_PB_CT = visiting team PB 118 | AWAY_DP_CT = visiting team DP 119 | AWAY_TP_CT = visiting team TP 120 | HOME_AB_CT = home team AB 121 | HOME_2B_CT = home team 2B 122 | HOME_3B_CT = home team 3B 123 | HOME_HR_CT = home team HR 124 | HOME_BI_CT = home team RBI 125 | HOME_SH_CT = home team SH 126 | HOME_SF_CT = home team SF 127 | HOME_HP_CT = home team HP 128 | HOME_BB_CT = home team BB 129 | HOME_IBB_CT = home team IBB 130 | HOME_SO_CT = home team SO 131 | HOME_SB_CT = home team SB 132 | HOME_CS_CT = home team CS 133 | HOME_GDP_CT = home team GDP 134 | HOME_XI_CT = home team reach on interference 135 | HOME_PITCHER_CT = number of pitchers used by home team 136 | HOME_ER_CT = home team individual ER allowed 137 | HOME_TER_CT = home team team ER allowed 138 | HOME_WP_CT = home team WP 139 | HOME_BK_CT = home team BK 140 | HOME_PO_CT = home team PO 141 | HOME_A_CT = home team A 142 | HOME_PB_CT = home team PB 143 | HOME_DP_CT = home team DP 144 | HOME_TP_CT = home team TP 145 | UMP_HOME_NAME_TX = home plate umpire name 146 | UMP_1B_NAME_TX = first base umpire name 147 | UMP_2B_NAME_TX = second base umpire name 148 | UMP_3B_NAME_TX = third base umpire name 149 | UMP_LF_NAME_TX = left field umpire name 150 | UMP_RF_NAME_TX = right field umpire name 151 | AWAY_MANAGER_ID = visitors manager ID 152 | AWAY_MANAGER_NAME_TX = visitors manager name 153 | HOME_MANAGER_ID = home manager ID 154 | HOME_MANAGER_NAME_TX = home manager name 155 | WIN_PIT_NAME_TX = winning pitcher name 156 | LOSE_PIT_NAME_TX = losing pitcher name 157 | SAVE_PIT_NAME_TX = save pitcher name 158 | GOAHEAD_RBI_ID = batter with goahead RBI ID 159 | GOAHEAD_RBI_NAME_TX = batter with goahead RBI 160 | AWAY_LINEUP1_BAT_NAME_TX = visitor batter 1 name 161 | AWAY_LINEUP2_BAT_NAME_TX = visitor batter 2 name 162 | AWAY_LINEUP3_BAT_NAME_TX = visitor batter 3 name 163 | AWAY_LINEUP4_BAT_NAME_TX = visitor batter 4 name 164 | AWAY_LINEUP5_BAT_NAME_TX = visitor batter 5 name 165 | AWAY_LINEUP6_BAT_NAME_TX = visitor batter 6 name 166 | AWAY_LINEUP7_BAT_NAME_TX = visitor batter 7 name 167 | AWAY_LINEUP8_BAT_NAME_TX = visitor batter 8 name 168 | AWAY_LINEUP9_BAT_NAME_TX = visitor batter 9 name 169 | HOME_LINEUP1_BAT_NAME_TX = home batter 1 name 170 | HOME_LINEUP2_BAT_NAME_TX = home batter 2 name 171 | HOME_LINEUP3_BAT_NAME_TX = home batter 3 name 172 | HOME_LINEUP4_BAT_NAME_TX = home batter 4 name 173 | HOME_LINEUP5_BAT_NAME_TX = home batter 5 name 174 | HOME_LINEUP6_BAT_NAME_TX = home batter 6 name 175 | HOME_LINEUP7_BAT_NAME_TX = home batter 7 name 176 | HOME_LINEUP8_BAT_NAME_TX = home batter 8 name 177 | HOME_LINEUP9_BAT_NAME_TX = home batter 9 name 178 | ADD_INFO_TX = additional information 179 | ACQ_INFO_TX = acquisition information 180 | 
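The AWAY_LINE_TX and HOME_LINE_TX linescore fields above are digit strings such as 001001001 (as noted in retrosheet_collect.py, they must be read as text, not numbers). A minimal sketch of expanding one into per-inning runs, assuming the common Retrosheet convention that an inning with ten or more runs is parenthesized:

```python
import re

def expand_linescore(line_tx):
    """Per-inning runs from a linescore string, e.g. '30(10)1' -> [3, 0, 10, 1].

    Non-digit markers (such as an 'x' for an unplayed half-inning) are skipped.
    """
    return [int(tok.strip('()')) for tok in re.findall(r'\(\d+\)|\d', line_tx)]

print(expand_linescore('001001001'))  # [0, 0, 1, 0, 0, 1, 0, 0, 1]
print(expand_linescore('30(10)1'))    # [3, 0, 10, 1]
```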
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Baseball Analytics
2 |
3 | ## Overview
4 |
5 | Scripts are provided which download, parse, and wrangle the Lahman and Retrosheet data to produce a set of tidy csv files that can be analyzed in Python and Pandas, or R. There is also an optional script to load the data into Postgres.
6 |
7 | Examples of data analysis are provided using Python and Pandas in Jupyter Notebooks.
8 |
9 | The value of publishing the scripts and the analysis is that the results are repeatable. The precise data used and the precise data processing are made available for anyone to use, modify and evaluate.
10 |
11 | The value of wrangling the data is that the analysis is much easier and the RAM and storage requirements are much less.
12 |
13 | ## Data Science and Sabermetrics
14 |
15 | [Sabermetrics](https://en.wikipedia.org/wiki/Sabermetrics) was created before the advent of modern software tools for data analysis and fast personal computers. One aim is to create metrics that make it easy for people to quickly grasp how much a baseball player contributes to his team's wins. In data science terminology, this is an example of explanatory modeling.
16 |
17 | Another aim of Sabermetrics is to identify metrics that are likely to be useful in a predictive model. In data science terminology, a baseball domain expert uses feature engineering to create inputs (Sabermetrics) to improve predictive accuracy.
18 |
19 | Data Science, and science in general, must produce results that can be repeated by others. See [Reproducible Research](https://en.wikipedia.org/wiki/Reproducibility#Reproducible_research). A problem with many Sabermetric blog posts is that the results cannot be repeated because the code used to perform the analysis, and the data itself, are not made public.
20 |
21 | The emphasis here is on repeatable data analysis. The scripts to download the data are provided. The data is wrangled to simplify the analysis, and the data wrangling scripts are provided. Over 50 tests are also provided to verify the data wrangling, verify the Retrosheet parsers, and determine how consistent the Retrosheet data is with the Lahman data. These tests can be run with the single command 'pytest'. The data analysis is published in unambiguous code in the form of Jupyter notebooks.
22 |
23 | ### Data Preparation Scripts
24 |
25 | The Python scripts prepare the data for analysis, including running the open-source [Retrosheet Parsers](https://github.com/sdiehl28/baseball-analytics/blob/master/RetrosheetParsers.md). These scripts are at [download_scripts](https://github.com/sdiehl28/baseball-analytics/tree/master/download_scripts).
26 |
27 | ### Data Analysis
28 |
29 | Examples of baseball analysis are presented using Jupyter Notebooks with Python, Pandas and matplotlib/seaborn plots.
30 |
31 | Some initial analysis includes:
32 |
33 | - How many more runs per game are there when the DH is used? Could this difference be due to chance?
34 | - How have game length and pitcher count increased over the years?
35 |   - How is game length related to pitcher count? Could this relationship be due to chance?
36 | - Computing the Park Factor
37 |   - What did ESPN, Fangraphs, and others get wrong about the park factor for Fenway Park in 2019?
38 | - Demonstrate that accounting for each team's road schedule will strongly affect the home park factor, for a few teams each year.
39 | - Compute the game-weighted average Park Factor on the road, for each team, for several years.
40 | - Linear Modeling of Runs per Half Inning
41 |   - How much does a single, double, triple and home run contribute to run scoring per half-inning?
42 |
43 | These Jupyter Notebooks are in this repo at: [Baseball Analysis](https://github.com/sdiehl28/baseball-analytics/tree/master/baseball_jupyter_nb).
44 |
45 | ### Data Validation and Wrangling Validation
46 |
47 | There is no way to know the accuracy of the Retrosheet play-by-play data, but it is assumed to be quite accurate given the large number of volunteers who have worked on it for decades.
48 |
49 | The Lahman data was originally gathered at the season level, independently of the Retrosheet data, and is therefore inconsistent with Retrosheet in some cases. For the last few years it appears the Lahman seasonal data is derived from the Retrosheet data, so there are no new discrepancies. Lahman also includes data not in Retrosheet, such as players' salaries.
50 |
51 | The following data checks can be made:
52 |
53 | * how close is the Retrosheet data to the Lahman data
54 | * how consistent is the data produced by the three Retrosheet parsers with each other
55 | * how consistent is the data in the Lahman tables
56 |
57 | Performing these checks on the wrangled data also verifies that the wrangling (data restructuring) code did not change the data.
58 |
59 | pytest is used to automate more than 50 tests which check more than 100 attributes. The data is checked for all years between 1974 and 2019, as this is the period for which there is no missing Retrosheet data.
60 |
61 | The data consistency tests show that the [Retrosheet parsers](https://github.com/sdiehl28/baseball-analytics/blob/master/RetrosheetParsers.md) are 100% self-consistent. In other words, when the data from one Retrosheet parser is aggregated to the same level as another Retrosheet parser and compared, the results are identical. This shows that there are no errors in the parsers, and no errors in my restructuring of the parser output (see the sketch below).
62 |
63 | The data consistency tests show that the Lahman data is almost 100% self-consistent. For example, when the data in batting is aggregated to the team level and compared with the batting data in teams, the results are almost identical.
64 |
65 | The data consistency tests show that the Retrosheet data, when aggregated and compared with the Lahman data over the period 1974 through 2019, is:
66 |
67 | - for batting stats: within 0.01%
68 | - for pitching stats: within 0.06%
69 | - for fielding stats: within 0.8%
70 |
71 | For a detailed description of many of the data consistency tests, see my Jupyter notebook [Data Consistency](https://nbviewer.jupyter.org/github/sdiehl28/baseball-analytics/blob/master/baseball_jupyter_nb/02_Data_Consistency_CSV.ipynb)
72 |
73 | ### Ongoing
74 |
75 | Additional examples of baseball data analysis are continually being added.
76 |
77 | Retrosheet postseason data will soon be parsed and wrangled. All Retrosheet regular season data has been parsed and wrangled.
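To make the aggregate-and-compare idea above concrete, here is a minimal sketch that checks per-player batting (from cwdaily) against per-team totals (from cwgame). The file paths, and the presence of the game_id/team_id/hr columns in both files, are assumptions based on the wrangled files described in MLB_Data_Overview.md:

```python
import data_helper as dh  # from download_scripts

# per-player-per-game batting, aggregated up to the team-game level
batting = dh.from_csv_with_types('../data/retrosheet/wrangled/batting.csv.gz',
                                 usecols=['game_id', 'team_id', 'hr'])
daily = (batting.groupby(['game_id', 'team_id'])['hr']
         .sum().rename('hr_daily').reset_index())

# the same stat as reported per team per game by cwgame
team_game = dh.from_csv_with_types('../data/retrosheet/wrangled/team_game.csv.gz',
                                   usecols=['game_id', 'team_id', 'hr'])

merged = daily.merge(team_game, on=['game_id', 'team_id'])
print((merged['hr_daily'] == merged['hr']).mean())  # 1.0 means fully consistent
```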
78 |
79 | ## Additional Information
80 |
81 | For more information about the Lahman and Retrosheet data sets and how they were wrangled, see: [MLB Data Overview](https://github.com/sdiehl28/baseball-analytics/blob/master/MLB_Data_Overview.md)
82 |
83 | For the data sources and their licenses see: [MLB Data Details](https://github.com/sdiehl28/baseball-analytics/blob/master/MLB_Data_Details.md)
84 |
85 | ## Development Environment
86 |
87 | Clone the repo: `git clone https://github.com/sdiehl28/baseball-analytics.git`
88 |
89 | Activate your conda environment. If creating a new conda environment, run `conda install anaconda`. If using Postgres, also run `conda install psycopg2`.
90 |
91 | The scripts and Jupyter Notebooks were tested using Python 3.7 and Pandas 1.0.1 in a full [Anaconda](https://www.anaconda.com/distribution/) 2019.10 environment.
92 |
93 | The [open-source parsers](https://sourceforge.net/projects/chadwick/) for Retrosheet must be installed to run the scripts. See: [Retrosheet Parsers](https://github.com/sdiehl28/baseball-analytics/blob/master/RetrosheetParsers.md).
94 |
95 |
--------------------------------------------------------------------------------
/download_scripts/postgres_load_data.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """Load Wrangled data into Postgres"""
4 |
5 | __author__ = 'Stephen Diehl'
6 |
7 | import os
8 | import sys
9 | from pathlib import Path
10 | import argparse
11 | import logging
12 | import csv
13 | from io import StringIO
14 |
15 | from sqlalchemy import create_engine
16 |
17 | import data_helper as dh
18 |
19 | logger = logging.getLogger(__name__)
20 | logger.setLevel(logging.DEBUG)
21 |
22 |
23 | def get_parser():
24 |     """Args Description"""
25 |
26 |     parser = argparse.ArgumentParser(
27 |         description=__doc__,
28 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
29 |
30 |     parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data')
31 |     parser.add_argument("-v", "--verbose", help="verbose output", action="store_true")
32 |     parser.add_argument("--log", dest="log_level", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
33 |                         help="Set the logging level")
34 |
35 |     return parser
36 |
37 |
38 | # This improves df.to_sql() write speed by a couple orders of magnitude!
39 | # This method was copied verbatim from: 40 | # https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-sql-method 41 | # Alternative to_sql() *method* for DBs that support COPY FROM 42 | def psql_insert_copy(table, conn, keys, data_iter): 43 | # gets a DBAPI connection that can provide a cursor 44 | dbapi_conn = conn.connection 45 | with dbapi_conn.cursor() as cur: 46 | s_buf = StringIO() 47 | writer = csv.writer(s_buf) 48 | writer.writerows(data_iter) 49 | s_buf.seek(0) 50 | 51 | columns = ', '.join('"{}"'.format(k) for k in keys) 52 | if table.schema: 53 | table_name = '{}.{}'.format(table.schema, table.name) 54 | else: 55 | table_name = table.name 56 | 57 | sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format( 58 | table_name, columns) 59 | cur.copy_expert(sql=sql, file=s_buf) 60 | 61 | 62 | def create_and_load_table(engine, prefix, filename, pkey=None): 63 | table = prefix + filename.name.split('.')[0] 64 | logger.info(f'{table} loading ...') 65 | 66 | # read with optimized Pandas data types 67 | df = dh.from_csv_with_types(filename) 68 | 69 | # compute optimized database data types 70 | db_dtypes = dh.optimize_db_dtypes(df) 71 | 72 | # drop table and its dependencies (e.g. primary key constraint) 73 | engine.execute(f'DROP TABLE IF EXISTS {table} CASCADE') 74 | df.to_sql(table, engine, index=False, dtype=db_dtypes, method=psql_insert_copy) 75 | 76 | # add primary key constraint 77 | if pkey: 78 | pkeys_str = ', '.join(pkey) 79 | sql = f'ALTER TABLE {table} ADD PRIMARY KEY ({pkeys_str})' 80 | engine.execute(sql) 81 | 82 | # rows added 83 | rs = engine.execute(f'SELECT COUNT(*) from {table}') 84 | result = rs.fetchall() 85 | rows = result[0][0] 86 | 87 | logger.info(f'{table} added with {rows} rows') 88 | 89 | 90 | def load_lahman_tables(engine, data_dir): 91 | lahman_data = data_dir.joinpath('lahman/wrangled') 92 | 93 | create_and_load_table(engine, 'lahman_', lahman_data / 'people.csv', ['player_id']) 94 | sql = 'ALTER TABLE lahman_people ADD CONSTRAINT retro_player_unique UNIQUE (retro_id)' 95 | engine.execute(sql) 96 | 97 | create_and_load_table(engine, 'lahman_', lahman_data / 'batting.csv', 98 | ['player_id', 'year', 'stint']) 99 | create_and_load_table(engine, 'lahman_', lahman_data / 'battingpost.csv', 100 | ['player_id', 'year', 'round']) 101 | create_and_load_table(engine, 'lahman_', lahman_data / 'pitching.csv', 102 | ['player_id', 'year', 'stint']) 103 | create_and_load_table(engine, 'lahman_', lahman_data / 'pitchingpost.csv', 104 | ['player_id', 'year', 'round']) 105 | create_and_load_table(engine, 'lahman_', lahman_data / 'fielding.csv', 106 | ['player_id', 'year', 'stint', 'pos']) 107 | create_and_load_table(engine, 'lahman_', lahman_data / 'fieldingpost.csv', 108 | ['player_id', 'year', 'round', 'pos']) 109 | create_and_load_table(engine, 'lahman_', lahman_data / 'parks.csv', 110 | ['park_key']) 111 | create_and_load_table(engine, 'lahman_', lahman_data / 'salaries.csv', 112 | ['player_id', 'year', 'team_id']) 113 | create_and_load_table(engine, 'lahman_', lahman_data / 'teams.csv', 114 | ['team_id', 'year']) 115 | sql = 'ALTER TABLE lahman_teams ADD CONSTRAINT retro_team_unique UNIQUE (team_id_retro, year)' 116 | engine.execute(sql) 117 | 118 | 119 | def load_retrosheet_tables(engine, data_dir): 120 | retro_data = data_dir.joinpath('retrosheet/wrangled') 121 | 122 | create_and_load_table(engine, 'retro_', retro_data / 'batting.csv.gz', 123 | ['player_id', 'game_id']) 124 | sql = """ALTER TABLE retro_batting 125 | ADD CONSTRAINT batting_player_id 126 | 
FOREIGN KEY(player_id) 127 | REFERENCES lahman_people(retro_id) 128 | """ 129 | engine.execute(sql) 130 | 131 | create_and_load_table(engine, 'retro_', retro_data / 'pitching.csv.gz', 132 | ['player_id', 'game_id']) 133 | sql = """ALTER TABLE retro_pitching 134 | ADD CONSTRAINT pitching_player_id 135 | FOREIGN KEY(player_id) 136 | REFERENCES lahman_people(retro_id) 137 | """ 138 | engine.execute(sql) 139 | 140 | create_and_load_table(engine, 'retro_', retro_data / 'fielding.csv.gz', 141 | ['player_id', 'game_id', 'pos']) 142 | sql = """ALTER TABLE retro_fielding 143 | ADD CONSTRAINT fielding_player_id 144 | FOREIGN KEY(player_id) 145 | REFERENCES lahman_people(retro_id) 146 | """ 147 | engine.execute(sql) 148 | 149 | create_and_load_table(engine, 'retro_', retro_data / 'game.csv.gz', 150 | ['game_id']) 151 | 152 | create_and_load_table(engine, 'retro_', retro_data / 'team_game.csv.gz', 153 | ['team_id', 'game_id']) 154 | 155 | sql = """ALTER TABLE retro_team_game 156 | ADD CONSTRAINT retro_team_id FOREIGN KEY (team_id, year) 157 | REFERENCES lahman_teams (team_id_retro, year) 158 | """ 159 | engine.execute(sql) 160 | 161 | create_and_load_table(engine, 'retro_', retro_data / 'event.csv.gz', 162 | ['game_id', 'event_id']) 163 | 164 | 165 | def main(): 166 | """Load the data in Postgres. 167 | """ 168 | parser = get_parser() 169 | args = parser.parse_args() 170 | 171 | if args.log_level: 172 | fh = logging.FileHandler('download.log') 173 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 174 | fh.setFormatter(formatter) 175 | fh.setLevel(args.log_level) 176 | logger.addHandler(fh) 177 | 178 | if args.verbose: 179 | # send INFO level logging to stdout 180 | sh = logging.StreamHandler(sys.stdout) 181 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 182 | sh.setFormatter(formatter) 183 | sh.setLevel(logging.INFO) 184 | logger.addHandler(sh) 185 | 186 | # Get the user and password from the environment (rather than hardcoding it) 187 | db_user = os.environ.get('DB_USER') 188 | db_pass = os.environ.get('DB_PASS') 189 | 190 | # avoid putting passwords directly in code 191 | connect_str = f'postgresql://{db_user}:{db_pass}@localhost:5432/baseball' 192 | 193 | # for distinction between engine.execute() and engine.connect().execute() see: 194 | # https://stackoverflow.com/questions/34322471/sqlalchemy-engine-connection-and-session-difference#answer-42772654 195 | engine = create_engine(connect_str) 196 | 197 | data_dir = Path('../data') 198 | load_lahman_tables(engine, data_dir) 199 | load_retrosheet_tables(engine, data_dir) 200 | 201 | logger.info('Finished') 202 | 203 | 204 | if __name__ == '__main__': 205 | main() 206 | -------------------------------------------------------------------------------- /download_scripts/lahman_wrangle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Wrangle Lahman Data from {data_dir}/lahman/raw to {data_dir}/lahman/wrangled 4 | 5 | Wrangles: batting, pitching, fielding, people, teams, salaries, parks 6 | """ 7 | 8 | __author__ = 'Stephen Diehl' 9 | 10 | import pandas as pd 11 | 12 | import os 13 | import argparse 14 | from pathlib import Path 15 | import logging 16 | import sys 17 | 18 | import data_helper as dh 19 | 20 | logger = logging.getLogger(__name__) 21 | logger.setLevel(logging.DEBUG) 22 | 23 | 24 | def get_fieldname_mapping(): 25 | """Dictionary of fieldnames to modify.""" 26 | 27 | # It is easier to maintain fieldname 
mappings in a single location 28 | new_names = { 29 | 'playerID': 'player_id', 30 | 'yearID': 'year', 31 | 'teamID': 'team_id', 32 | 'lgID': 'lg_id', 33 | '2B': 'double', 34 | '3B': 'triple', 35 | 'BAOpp': 'ba_opp', 36 | 'IPouts': 'ip_outs', 37 | 'InnOuts': 'inn_outs', 38 | 'franchID': 'franch_id', 39 | 'divID': 'div_id', 40 | 'Ghome': 'g_home', 41 | 'DivWin': 'div_win', 42 | 'WCWin': 'wc_win', 43 | 'LgWin': 'lg_win', 44 | 'WSWin': 'ws_win', 45 | 'teamIDBR': 'team_id_br', 46 | 'teamIDlahman45': 'team_id_lahman45', 47 | 'teamIDretro': 'team_id_retro', 48 | 'birthYear': 'birth_year', 49 | 'birthMonth': 'birth_month', 50 | 'birthDay': 'birth_day', 51 | 'birthCountry': 'birth_country', 52 | 'birthState': 'birth_state', 53 | 'birthCity': 'birth_city', 54 | 'deathYear': 'death_year', 55 | 'deathMonth': 'death_month', 56 | 'deathDay': 'death_day', 57 | 'deathCountry': 'death_country', 58 | 'deathState': 'death_state', 59 | 'deathCity': 'death_city', 60 | 'nameFirst': 'name_first', 61 | 'nameLast': 'name_last', 62 | 'nameGiven': 'name_given', 63 | 'finalGame': 'final_game', 64 | 'retroID': 'retro_id', 65 | 'bbrefID': 'bb_ref_id', 66 | 'park.key': 'park_key', 67 | 'park.name': 'park_name', 68 | 'park.alias': 'park_alias' 69 | } 70 | 71 | return new_names 72 | 73 | 74 | def get_parser(): 75 | """Args Description""" 76 | 77 | parser = argparse.ArgumentParser( 78 | description=__doc__, 79 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 80 | 81 | parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data') 82 | parser.add_argument("-v", "--verbose", help="verbose output", action="store_true") 83 | parser.add_argument("--log", dest="log_level", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 84 | help="Set the logging level") 85 | 86 | return parser 87 | 88 | 89 | def to_date(row, prefix): 90 | """Custom Parsing of birth and death dates""" 91 | y = row[prefix + '_year'] 92 | m = row[prefix + '_month'] 93 | d = row[prefix + '_day'] 94 | 95 | # NaT if year is missing 96 | if pd.isna(y): 97 | return pd.NaT 98 | 99 | # if year present but month missing 100 | if pd.isna(m): 101 | m = 1 102 | 103 | # if year present but day missing 104 | if pd.isna(d): 105 | d = 1 106 | 107 | return pd.to_datetime(f'{int(y)}-{int(m)}-{int(d)}') 108 | 109 | 110 | def wrangle_basic(p_raw, p_wrangled, filename): 111 | """Basic Wrangle: converts fieldnames, optimizes datatypes and persists data 112 | """ 113 | filename_lower = str(filename).lower() 114 | wrangled_file = p_wrangled.joinpath(filename_lower) 115 | 116 | if wrangled_file.exists(): 117 | logger.info(f'Skipping wrangle of {filename} - already performed') 118 | return 119 | 120 | os.chdir(p_raw) 121 | df = pd.read_csv(filename) 122 | 123 | df.rename(columns=get_fieldname_mapping(), inplace=True) 124 | df.columns = df.columns.str.lower() 125 | 126 | # downcast integers and convert float to Int64, if data permits 127 | dh.optimize_df_dtypes(df) 128 | 129 | msg = dh.df_info(df) 130 | logger.info(f'{filename}\n{msg}') 131 | 132 | # persist with optimized datatypes 133 | os.chdir(p_wrangled) 134 | dh.to_csv_with_types(df, wrangled_file) 135 | 136 | 137 | def wrangle_people(p_raw, p_wrangled): 138 | """Custom parsing of dates, converts fieldnames, optimizes datatypes and persists data 139 | """ 140 | if p_wrangled.joinpath('people.csv').exists(): 141 | logger.info('Skipping wrangle of People.csv - already performed') 142 | return 143 | 144 | os.chdir(p_raw) 145 | people = pd.read_csv('People.csv', parse_dates=['debut', 
'finalGame']) 146 | 147 | people.rename(columns=get_fieldname_mapping(), inplace=True) 148 | people.columns = people.columns.str.lower() 149 | 150 | people['birth_date'] = people.apply(lambda x: to_date(x, 'birth'), axis=1) 151 | people['death_date'] = people.apply(lambda x: to_date(x, 'death'), axis=1) 152 | people = people.drop( 153 | ['birth_year', 'birth_month', 'birth_day', 154 | 'death_year', 'death_month', 'death_day'], axis=1) 155 | 156 | msg = dh.df_info(people) 157 | logger.info('people\n{}'.format(msg)) 158 | 159 | # persist as a csv file with data types 160 | os.chdir(p_wrangled) 161 | dh.to_csv_with_types(people, 'people.csv') 162 | 163 | 164 | def wrangle_fielding(p_raw, p_wrangled): 165 | """Drops cols > 90% null, converts fieldnames, optimizes datatypes and persists data 166 | """ 167 | if p_wrangled.joinpath('fielding.csv').exists(): 168 | logger.info('Skipping wrangle of Fielding.csv - already performed') 169 | return 170 | 171 | os.chdir(p_raw) 172 | fielding = pd.read_csv('Fielding.csv') 173 | 174 | fielding.rename(columns=get_fieldname_mapping(), inplace=True) 175 | fielding.columns = fielding.columns.str.lower() 176 | 177 | # drop any column that is more than 90% null 178 | filt = fielding.isna().mean() > 0.90 179 | if filt.any(): 180 | drop_cols = fielding.columns[filt] 181 | logger.warning(f'Cols > 90% missing being dropped: {" ".join(drop_cols)}') 182 | fielding.drop(drop_cols, axis=1, inplace=True) 183 | 184 | dh.optimize_df_dtypes(fielding) 185 | 186 | msg = dh.df_info(fielding) 187 | logger.info('fielding\n{}'.format(msg)) 188 | 189 | # persist 190 | os.chdir(p_wrangled) 191 | dh.to_csv_with_types(fielding, 'fielding.csv') 192 | 193 | 194 | def main(): 195 | """Wrangle the data""" 196 | parser = get_parser() 197 | args = parser.parse_args() 198 | 199 | if args.log_level: 200 | fh = logging.FileHandler('download.log') 201 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 202 | fh.setFormatter(formatter) 203 | fh.setLevel(args.log_level) 204 | logger.addHandler(fh) 205 | 206 | if args.verbose: 207 | # send INFO level logging to stdout 208 | sh = logging.StreamHandler(sys.stdout) 209 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 210 | sh.setFormatter(formatter) 211 | sh.setLevel(logging.INFO) 212 | logger.addHandler(sh) 213 | 214 | p_lahman_raw = Path(args.data_dir).joinpath('lahman/raw').resolve() 215 | p_lahman_wrangled = Path(args.data_dir).joinpath('lahman/wrangled').resolve() 216 | 217 | wrangle_people(p_lahman_raw, p_lahman_wrangled) 218 | wrangle_fielding(p_lahman_raw, p_lahman_wrangled) 219 | 220 | # TODO add fieldname mappings to support other Lahman csv files 221 | wrangle_basic(p_lahman_raw, p_lahman_wrangled, 'Batting.csv') 222 | wrangle_basic(p_lahman_raw, p_lahman_wrangled, 'BattingPost.csv') 223 | wrangle_basic(p_lahman_raw, p_lahman_wrangled, 'FieldingPost.csv') 224 | wrangle_basic(p_lahman_raw, p_lahman_wrangled, 'Pitching.csv') 225 | wrangle_basic(p_lahman_raw, p_lahman_wrangled, 'PitchingPost.csv') 226 | wrangle_basic(p_lahman_raw, p_lahman_wrangled, 'Teams.csv') 227 | wrangle_basic(p_lahman_raw, p_lahman_wrangled, 'Salaries.csv') 228 | wrangle_basic(p_lahman_raw, p_lahman_wrangled, 'Parks.csv') 229 | 230 | 231 | if __name__ == '__main__': 232 | main() 233 | -------------------------------------------------------------------------------- /download_scripts/retrosheet_collect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
python 2 | 3 | """Collect parsed event files""" 4 | 5 | __author__ = 'Stephen Diehl' 6 | 7 | import argparse 8 | import sys 9 | from pathlib import Path 10 | import os 11 | import glob 12 | import pandas as pd 13 | import data_helper as dh 14 | import logging 15 | 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.DEBUG) 18 | 19 | 20 | def get_parser(): 21 | """Args Description""" 22 | 23 | parser = argparse.ArgumentParser( 24 | description=__doc__, 25 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 26 | 27 | parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data') 28 | parser.add_argument("-v", "--verbose", help="verbose output", action="store_true") 29 | parser.add_argument("--use-datatypes", help="use precomputed datatypes", action="store_true") 30 | parser.add_argument("--log", dest="log_level", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 31 | help="Set the logging level") 32 | 33 | return parser 34 | 35 | 36 | def collect_parsed_files(parse_dir, collect_dir, parser, use_datatypes): 37 | """Collect all parsed files and optimize datatypes. 38 | """ 39 | 40 | os.chdir(parse_dir) 41 | # read the augmented files, not the ones created by cwevent 42 | if parser == 'cwevent': 43 | dailyfiles = glob.glob(f'{parser}*_plus.csv') 44 | else: 45 | dailyfiles = glob.glob(f'{parser}*.csv') 46 | dailyfiles.sort() 47 | 48 | logger.info(f'Collecting {len(dailyfiles)} {parser} parsed csv files into single dataframe ...') 49 | 50 | if use_datatypes: 51 | # this can save gigabytes of RAM by using precomputed datatypes 52 | logger.info('Using precomputed data types') 53 | if parser == 'cwdaily': 54 | filename = '../player_game_types.csv' 55 | elif parser == 'cwgame': 56 | filename = '../game_types.csv' 57 | elif parser == 'cwevent': 58 | filename = '../event_types.csv' 59 | else: 60 | raise ValueError(f'Unrecognized parser: {parser}') 61 | 62 | dates, dtypes = dh.read_types(filename) 63 | dtypes = {key.upper(): value for key, value in dtypes.items()} 64 | 65 | df = pd.concat((pd.read_csv(f, parse_dates=dates, dtype=dtypes) for f in dailyfiles), 66 | ignore_index=True, copy=False) 67 | logger.info(f'Optimized Memory Usage: {dh.mem_usage(df)}') 68 | else: 69 | # This could use twice the RAM required to hold the unoptimized DataFrame! 70 | # cwgame parser will output the line score (line_tx) like: 001001001 71 | # but without double quotes around it, so it gets interpreted as a number. 72 | # Specify dtype for line score fields to get around this. 
73 |         df = pd.concat((pd.read_csv(f, dtype={'AWAY_LINE_TX': str, 'HOME_LINE_TX': str})
74 |                         for f in dailyfiles), ignore_index=True, copy=False)
75 |
76 |         logger.info(f'Unoptimized Memory Usage: {dh.mem_usage(df)}')
77 |         logger.info('Optimizing Data Types to reduce memory ...')
78 |
79 |         # for cwdaily, optimize_df_dtypes reduces the size of the dataframe by a factor of 3
80 |         dh.optimize_df_dtypes(df)
81 |         logger.info(f'Optimized Memory Usage: {dh.mem_usage(df)}')
82 |
83 |     # convert column names to lower case
84 |     df.columns = df.columns.str.lower()
85 |
86 |     # drop any column that is more than 99% null
87 |     filt = df.isna().mean() > 0.99
88 |     if filt.any():
89 |         drop_cols = df.columns[filt]
90 |         logger.warning(f'Cols > 99% missing being dropped: {" ".join(drop_cols)}')
91 |         df.drop(drop_cols, axis=1, inplace=True)
92 |
93 |     # persist optimized dataframe
94 |     # gzip chosen over xz because this runs on a client computer and gzip is faster
95 |     logger.info('persisting dataframe using compression - this could take several minutes ...')
96 |     os.chdir(collect_dir)
97 |     if parser == 'cwdaily':
98 |         filename = 'player_game.csv.gz'
99 |     elif parser == 'cwgame':
100 |         filename = 'game.csv.gz'
101 |     elif parser == 'cwevent':  # was wrangled in parser to save RAM, write to wrangled dir
102 |         filename = 'event.csv.gz'
103 |     else:
104 |         raise ValueError(f'Unrecognized parser: {parser}')
105 |
106 |     dh.to_csv_with_types(df, filename)
107 |     logger.info(f'{parser} data persisted')
108 |
109 |
110 | def augment_event_files(p_data_parsed):
111 |     """Add New Play-by-Play Fields
112 |
113 |     cwevent does not produce a boolean or int for the following values:
114 |     'so', 'sb', 'cs', 'bk', 'bb', 'ibb', 'hbp', 'xi', 'single', 'double', 'triple', 'hr'
115 |     Extract these from event_tx and h_cd.
116 |
117 |     The advantage of creating these fields is:
118 |     1) some play-by-play analysis is easier
119 |     2) the new fields can be aggregated to the game level and compared with cwgame to
120 |        verify data consistency
121 |
122 |     This method is in retrosheet_collect.py rather than retrosheet_wrangle.py, because
123 |     many gigabytes of RAM can be saved by collecting csv files that replace the value 'T'
124 |     with the value True (and likewise 'F' with False).
125 | """ 126 | os.chdir(p_data_parsed) 127 | files = p_data_parsed.glob('cwevent????.csv') 128 | for file in sorted(files): 129 | df = pd.read_csv(file) 130 | logger.info(f'Creating Augmented Event File: {file.name.split(".")[0]}_plus.csv') 131 | 132 | # change column names to lowercase 133 | cols = [col.lower() for col in df.columns] 134 | df.columns = cols 135 | 136 | # prepare to remove _fl from flag fields 137 | flag_fields = [col for col in df.columns if col.endswith('_fl')] 138 | new_names = [col[:-3] for col in flag_fields] 139 | 140 | # convert 'T' to True/False 141 | # a bool takes 8 times less memory than the object 'T' 142 | df[new_names] = df[flag_fields].applymap(lambda s: s == 'T') 143 | df.drop(columns=flag_fields, inplace=True) 144 | 145 | # use "better" names 146 | names = {'event_outs_ct': 'outs', 'err_ct': 'e', 'event_runs_ct': 'r', 147 | 'bat_home_id': 'home_half', 'pa_new': 'pa', 'bat_team_id': 'team_id', 148 | 'fld_team_id': 'opponent_team_id'} 149 | df = df.rename(columns=names) 150 | 151 | df['so'] = df['event_tx'].str.contains(r'^K') 152 | df['sb'] = df['event_tx'].str.count('SB') # counts multiple stolen bases on one play 153 | df['cs'] = df['event_tx'].str.count('CS') # counts multiple cs on one play 154 | df['bk'] = df['event_tx'].str.contains('BK') 155 | 156 | # 'I' not preceded by 'D' or 'B' or '/' and not followed by 'N' 157 | df['ibb'] = df['event_tx'].str.contains(r'(? 0 175 | 176 | df.to_csv(f'{file.name.split(".")[0]}_plus.csv', index=False) 177 | 178 | 179 | def main(): 180 | """Collect the CSV files.""" 181 | parser = get_parser() 182 | args = parser.parse_args() 183 | 184 | if args.log_level: 185 | fh = logging.FileHandler('download.log') 186 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 187 | fh.setFormatter(formatter) 188 | fh.setLevel(args.log_level) 189 | logger.addHandler(fh) 190 | 191 | if args.verbose: 192 | # send INFO level logging to stdout 193 | sh = logging.StreamHandler(sys.stdout) 194 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 195 | sh.setFormatter(formatter) 196 | sh.setLevel(logging.INFO) 197 | logger.addHandler(sh) 198 | 199 | p_data = Path(args.data_dir).resolve() 200 | p_data_parsed = p_data.joinpath('retrosheet/parsed') 201 | p_data_collected = p_data.joinpath('retrosheet/collected') 202 | 203 | # create directories, if they do not exist 204 | p_data_parsed.mkdir(parents=True, exist_ok=True) 205 | p_data_collected.mkdir(parents=True, exist_ok=True) 206 | 207 | event_files = list(p_data_parsed.glob('cwevent*.csv')) 208 | if event_files: 209 | if p_data.joinpath('retrosheet', 'collected', 'event.csv.gz').exists(): 210 | logger.info('Skipping cwevent collection -- already performed') 211 | else: 212 | augment_event_files(p_data_parsed) 213 | collect_parsed_files(p_data_parsed, p_data_collected, 'cwevent', args.use_datatypes) 214 | 215 | if p_data.joinpath('retrosheet', 'collected', 'player_game.csv.gz').exists(): 216 | logger.info('Skipping cwdaily collection -- already performed') 217 | else: 218 | collect_parsed_files(p_data_parsed, p_data_collected, 'cwdaily', args.use_datatypes) 219 | 220 | if p_data.joinpath('retrosheet', 'collected', 'game.csv.gz').exists(): 221 | logger.info('Skipping cwgame collection -- already performed') 222 | else: 223 | collect_parsed_files(p_data_parsed, p_data_collected, 'cwgame', args.use_datatypes) 224 | 225 | 226 | if __name__ == '__main__': 227 | main() 228 | 
-------------------------------------------------------------------------------- /data/retrosheet/nb_data/fangraphs.csv: -------------------------------------------------------------------------------- 1 | Season,Team,Basic (5yr),3yr,1yr,1B,2B,3B,HR,SO,BB,GB,FB,LD,IFFB,FIP 2 | 2015,Angels,97,95,93,100,96,88,98,102,97,101,100,98,100,98 3 | 2015,Orioles,101,101,108,101,96,87,106,98,100,101,102,100,100,103 4 | 2015,Red Sox,104,107,109,103,112,103,95,99,99,102,97,103,101,98 5 | 2015,White Sox,99,98,95,98,95,93,105,102,103,98,101,98,105,102 6 | 2015,Indians,102,106,112,101,106,83,102,100,100,101,97,101,92,100 7 | 2015,Tigers,102,99,95,101,99,123,101,96,100,100,104,101,105,102 8 | 2015,Royals,102,103,101,101,105,114,92,97,99,100,101,101,95,98 9 | 2015,Twins,102,102,100,103,103,104,101,98,99,102,101,102,100,101 10 | 2015,Yankees,101,100,101,99,95,83,112,101,100,98,102,98,102,104 11 | 2015,Athletics,98,97,97,99,102,113,94,98,99,100,102,100,105,98 12 | 2015,Mariners,96,95,94,98,96,85,99,103,99,97,102,97,107,98 13 | 2015,Rays,97,98,99,99,94,100,96,103,100,98,100,100,104,98 14 | 2015,Rangers,105,105,107,103,100,109,101,98,104,100,100,103,98,102 15 | 2015,Blue Jays,101,101,96,97,108,97,103,100,99,99,99,100,98,101 16 | 2015,Diamondbacks,105,107,103,100,106,127,104,100,100,101,100,101,93,101 17 | 2015,Braves,99,99,97,100,98,96,94,103,102,98,100,101,99,97 18 | 2015,Cubs,100,96,98,100,100,103,101,101,102,100,99,101,97,100 19 | 2015,Reds,101,101,105,98,100,91,108,104,102,98,100,98,102,102 20 | 2015,Rockies,116,120,120,109,109,132,110,95,102,107,99,107,90,107 21 | 2015,Marlins,97,97,98,99,98,108,90,99,101,101,99,97,95,97 22 | 2015,Astros,97,96,97,98,96,109,102,103,99,100,98,98,100,99 23 | 2015,Dodgers,96,94,96,97,99,77,100,100,94,99,100,96,102,98 24 | 2015,Brewers,102,101,105,99,101,96,108,101,101,99,100,101,97,103 25 | 2015,Nationals,101,100,100,104,102,81,97,99,100,100,102,101,98,99 26 | 2015,Mets,95,95,94,95,96,86,99,102,101,97,101,98,110,100 27 | 2015,Phillies,100,97,102,98,97,95,111,104,101,99,101,97,108,103 28 | 2015,Pirates,98,99,97,102,99,96,92,97,99,102,98,104,99,98 29 | 2015,Cardinals,97,99,97,100,99,96,95,97,98,100,101,101,103,99 30 | 2015,Padres,95,97,97,98,96,96,96,101,100,100,98,99,97,98 31 | 2015,Giants,96,96,93,100,99,122,86,100,101,101,97,98,98,95 32 | 2016,Angels,97,96,96,99,95,84,100,102,98,100,100,98,100,99 33 | 2016,Orioles,100,102,98,102,96,87,105,98,100,101,103,99,101,103 34 | 2016,Red Sox,105,106,109,103,114,104,96,99,99,102,98,103,101,99 35 | 2016,White Sox,99,97,97,99,95,96,103,103,101,99,102,98,102,100 36 | 2016,Indians,104,106,110,102,106,85,102,100,101,102,97,102,92,101 37 | 2016,Tigers,100,101,101,100,98,120,100,96,99,100,104,101,105,101 38 | 2016,Royals,101,101,108,101,106,113,92,97,99,101,100,102,95,98 39 | 2016,Twins,102,102,102,102,103,103,102,98,100,102,100,101,101,102 40 | 2016,Yankees,101,101,102,100,95,83,112,101,101,98,102,100,105,104 41 | 2016,Athletics,98,98,92,99,102,108,93,98,99,100,100,100,101,98 42 | 2016,Mariners,95,96,97,97,94,87,100,103,99,97,102,96,104,98 43 | 2016,Rays,97,96,95,99,94,97,96,103,100,97,99,99,105,97 44 | 2016,Rangers,108,108,107,104,102,120,104,98,103,101,101,104,100,104 45 | 2016,Blue Jays,100,100,107,98,105,102,102,100,99,100,99,98,97,100 46 | 2016,Diamondbacks,105,108,110,100,106,127,104,100,100,101,100,101,93,101 47 | 2016,Braves,99,99,103,100,98,96,94,103,102,98,100,101,99,97 48 | 2016,Cubs,100,99,94,99,99,105,100,100,102,100,99,100,97,100 49 | 2016,Reds,102,102,100,98,100,95,108,104,103,98,100,99,102,103 50 | 
2016,Rockies,116,117,117,109,112,135,111,96,102,106,99,105,89,107 51 | 2016,Marlins,95,94,92,99,96,103,90,99,100,101,99,98,95,97 52 | 2016,Astros,96,93,91,97,96,105,101,102,98,100,97,97,100,99 53 | 2016,Dodgers,96,95,91,96,100,75,101,100,94,99,100,96,104,98 54 | 2016,Brewers,101,102,99,98,100,94,107,102,100,99,100,100,97,102 55 | 2016,Nationals,102,100,98,103,103,83,100,100,101,99,103,100,97,100 56 | 2016,Mets,94,95,99,95,94,86,97,101,100,97,100,97,109,98 57 | 2016,Phillies,99,99,92,98,96,94,109,105,100,99,101,97,109,102 58 | 2016,Pirates,98,98,100,102,101,95,93,97,100,102,98,103,99,99 59 | 2016,Cardinals,98,96,96,101,100,96,95,97,98,101,101,102,102,99 60 | 2016,Padres,97,96,101,99,99,95,96,100,102,101,98,102,98,99 61 | 2016,Giants,97,95,101,101,99,120,86,99,100,101,96,99,98,95 62 | 2017,Angels,98,97,98,99,95,84,100,102,98,100,100,98,100,100 63 | 2017,Orioles,102,99,101,102,96,87,105,98,100,101,103,99,101,104 64 | 2017,Red Sox,105,105,101,103,114,104,96,99,99,102,98,103,101,99 65 | 2017,White Sox,98,98,100,99,95,96,103,103,101,99,102,98,102,101 66 | 2017,Indians,104,105,99,102,106,85,102,100,101,102,97,102,92,100 67 | 2017,Tigers,101,102,108,100,98,120,100,96,99,100,104,101,105,101 68 | 2017,Royals,102,102,97,101,106,113,92,97,99,101,100,102,95,98 69 | 2017,Twins,101,102,104,102,103,103,102,98,100,102,100,101,101,101 70 | 2017,Yankees,100,103,101,100,95,83,112,101,101,98,102,100,105,102 71 | 2017,Athletics,97,96,105,99,102,108,93,98,99,100,100,100,101,98 72 | 2017,Mariners,96,95,96,97,94,87,100,103,99,97,102,96,104,98 73 | 2017,Rays,96,96,95,99,94,97,96,103,100,97,99,99,105,97 74 | 2017,Rangers,109,111,110,104,102,120,104,98,103,101,101,104,100,104 75 | 2017,Blue Jays,100,101,97,98,105,102,102,100,99,100,99,98,97,101 76 | 2017,Diamondbacks,105,108,109,100,106,127,104,100,100,101,100,101,93,101 77 | 2017,Braves,101,101,99,102,101,92,96,99,99,101,99,102,101,99 78 | 2017,Cubs,100,101,106,99,99,105,100,100,102,100,99,100,97,100 79 | 2017,Reds,102,102,101,98,100,95,108,104,103,98,100,99,102,102 80 | 2017,Rockies,115,115,115,109,112,135,111,96,102,106,99,105,89,106 81 | 2017,Marlins,95,91,94,99,96,103,90,99,100,101,99,98,95,97 82 | 2017,Astros,97,94,92,97,96,105,101,102,98,100,97,97,100,99 83 | 2017,Dodgers,96,94,99,96,100,75,101,100,94,99,100,96,104,98 84 | 2017,Brewers,101,101,103,98,100,94,107,102,100,99,100,100,97,101 85 | 2017,Nationals,102,102,103,103,103,83,100,100,101,99,103,100,97,102 86 | 2017,Mets,94,93,93,95,94,86,97,101,100,97,100,97,109,98 87 | 2017,Phillies,100,99,103,98,96,94,109,105,100,99,101,97,109,102 88 | 2017,Pirates,98,97,98,102,101,95,93,97,100,102,98,103,99,100 89 | 2017,Cardinals,96,96,95,101,100,96,95,97,98,101,101,102,102,98 90 | 2017,Padres,97,98,92,99,99,95,96,100,102,101,98,102,98,99 91 | 2017,Giants,96,98,93,101,99,120,86,99,100,101,96,99,98,94 92 | 2018,Angels,98,99,99,99,95,84,100,102,98,100,100,98,100,100 93 | 2018,Orioles,102,102,99,102,96,87,105,98,100,101,103,99,101,104 94 | 2018,Red Sox,105,103,104,103,114,104,96,99,99,102,98,103,101,99 95 | 2018,White Sox,98,99,97,99,95,96,103,103,101,99,102,98,102,101 96 | 2018,Indians,104,101,106,102,106,85,102,100,101,102,97,102,92,100 97 | 2018,Tigers,101,103,97,100,98,120,100,96,99,100,104,101,105,101 98 | 2018,Royals,102,101,103,101,106,113,92,97,99,101,100,102,95,98 99 | 2018,Twins,101,101,100,102,103,103,102,98,100,102,100,101,101,101 100 | 2018,Yankees,100,99,106,100,95,83,112,101,101,98,102,100,105,102 101 | 2018,Athletics,97,97,92,99,102,108,93,98,99,100,100,100,101,98 102 | 
2018,Mariners,96,96,93,97,94,87,100,103,99,97,102,96,104,98 103 | 2018,Rays,96,96,97,99,94,97,96,103,100,97,99,99,105,97 104 | 2018,Rangers,109,112,116,104,102,120,104,98,103,101,101,104,100,104 105 | 2018,Blue Jays,100,99,98,98,105,102,102,100,99,100,99,98,97,101 106 | 2018,Diamondbacks,100,101,103,101,99,130,99,100,102,100,98,104,101,99 107 | 2018,Braves,101,101,106,102,101,92,96,99,99,101,99,102,101,99 108 | 2018,Cubs,100,102,104,99,99,105,100,100,102,100,99,100,97,100 109 | 2018,Reds,102,103,106,98,100,95,108,104,103,98,100,99,102,102 110 | 2018,Rockies,115,115,112,109,112,135,111,96,102,106,99,105,89,106 111 | 2018,Marlins,95,95,88,99,96,103,90,99,100,101,99,98,95,97 112 | 2018,Astros,97,98,99,97,96,105,101,102,98,100,97,97,100,99 113 | 2018,Dodgers,96,96,94,96,100,75,101,100,94,99,100,96,104,98 114 | 2018,Brewers,101,101,101,98,100,94,107,102,100,99,100,100,97,101 115 | 2018,Nationals,102,104,106,103,103,83,100,100,101,99,103,100,97,102 116 | 2018,Mets,94,92,87,95,94,86,97,101,100,97,100,97,109,98 117 | 2018,Phillies,100,103,102,98,96,94,109,105,100,99,101,97,109,102 118 | 2018,Pirates,98,98,94,102,101,95,93,97,100,102,98,103,99,100 119 | 2018,Cardinals,96,96,97,101,100,96,95,97,98,101,101,102,102,98 120 | 2018,Padres,97,95,102,99,99,95,96,100,102,101,98,102,98,99 121 | 2018,Giants,96,94,101,101,99,120,86,99,100,101,96,99,98,94 122 | 2019,Angels,98,99,101,100,96,88,98,102,97,101,100,98,100,100 123 | 2019,Orioles,102,102,104,101,96,87,106,98,100,101,102,100,100,104 124 | 2019,Red Sox,105,103,103,103,112,103,95,99,99,102,97,103,101,99 125 | 2019,White Sox,98,99,98,98,95,93,105,102,103,98,101,98,105,101 126 | 2019,Indians,104,101,99,101,106,83,102,100,100,101,97,101,92,100 127 | 2019,Tigers,101,103,105,101,99,123,101,96,100,100,104,101,105,101 128 | 2019,Royals,102,101,103,101,105,114,92,97,99,100,101,101,95,98 129 | 2019,Twins,101,101,99,103,103,104,101,98,99,102,101,102,100,101 130 | 2019,Yankees,100,99,91,99,95,83,112,101,100,98,102,98,102,102 131 | 2019,Athletics,97,97,95,99,102,113,94,98,99,100,102,100,105,98 132 | 2019,Mariners,96,96,98,98,96,85,99,103,99,97,102,97,107,98 133 | 2019,Rays,96,96,95,99,94,100,96,103,100,98,100,100,104,97 134 | 2019,Rangers,109,112,111,103,100,109,101,98,104,100,100,103,98,104 135 | 2019,Blue Jays,100,99,101,97,108,97,103,100,99,99,99,100,98,101 136 | 2019,Diamondbacks,100,101,99,100,106,127,104,100,100,101,100,101,93,99 137 | 2019,Braves,101,101,100,101,102,92,99,100,97,100,100,103,101,99 138 | 2019,Cubs,100,102,97,100,100,103,101,101,102,100,99,101,97,100 139 | 2019,Reds,102,103,102,98,100,91,108,104,102,98,100,98,102,102 140 | 2019,Rockies,115,115,118,109,109,132,110,95,102,107,99,107,90,106 141 | 2019,Marlins,95,95,104,99,98,108,90,99,101,101,99,97,95,97 142 | 2019,Astros,97,98,104,98,96,109,102,103,99,100,98,98,100,99 143 | 2019,Dodgers,96,96,96,97,99,77,100,100,94,99,100,96,102,98 144 | 2019,Brewers,101,101,99,99,101,96,108,101,101,99,100,101,97,101 145 | 2019,Nationals,102,104,105,104,102,81,97,99,100,100,102,101,98,102 146 | 2019,Mets,94,92,95,95,96,86,99,102,101,97,101,98,110,98 147 | 2019,Phillies,100,103,102,98,97,95,111,104,101,99,101,97,108,102 148 | 2019,Pirates,98,98,100,102,99,96,92,97,99,102,98,104,99,100 149 | 2019,Cardinals,96,96,96,100,99,96,95,97,98,100,101,101,103,98 150 | 2019,Padres,97,95,93,98,96,96,96,101,100,100,98,99,97,99 151 | 2019,Giants,96,94,90,100,99,122,86,100,101,101,97,98,98,94 152 | -------------------------------------------------------------------------------- /download_scripts/data_helper.py: 
-------------------------------------------------------------------------------- 1 | """Baseball Data Helper Functions""" 2 | 3 | __author__ = 'Stephen Diehl' 4 | 5 | import pandas as pd 6 | import numpy as np 7 | import re 8 | import io 9 | from pathlib import Path 10 | import statsmodels.api as sm 11 | from IPython.display import HTML, display 12 | from sqlalchemy.types import SmallInteger, Integer, BigInteger, Float 13 | 14 | 15 | def to_csv_with_types(df, filename): 16 | """ 17 | Save df to csv file and save df.dtypes to csv file. 18 | 19 | If filename ends in .gz, Pandas will use gzip compression. 20 | 21 | This is intended to be used after optimizing df column types. 22 | Read back with: from_csv_with_types() 23 | 24 | Persistence with data types cannot currently be done with hdf5 because 25 | the new Int64 and similar data types are not supported. 26 | """ 27 | 28 | p = Path(filename) 29 | types_name = p.name.split('.')[0] + '_types.csv' 30 | p_types = p.parent / types_name 31 | 32 | dtypes = df.dtypes.to_frame('dtypes').reset_index() 33 | 34 | dtypes.to_csv(p_types, index=False) 35 | df.to_csv(p, index=False) 36 | 37 | 38 | def from_csv_with_types(filename, usecols=None, nrows=None): 39 | """ 40 | Read df.dtypes from csv file and read df from csv file. 41 | 42 | If filename ends in .gz, Pandas will use gzip decompression. 43 | This is the complement of to_csv_with_types(). 44 | """ 45 | 46 | p = Path(filename) 47 | types_name = p.name.split('.')[0] + '_types.csv' 48 | p_types = p.parent / types_name 49 | dates, dtypes = read_types(p_types) 50 | 51 | # only parse dates that are in usecols 52 | if dates and usecols: 53 | dates = list(set(dates) & set(usecols)) 54 | 55 | return pd.read_csv(p, parse_dates=dates, dtype=dtypes, usecols=usecols, nrows=nrows) 56 | 57 | 58 | def read_types(filename): 59 | """Read the data types file to get a list of date fields and a dictionary mapping of types. 60 | 61 | """ 62 | types = pd.read_csv(filename).set_index('index').to_dict() 63 | dtypes = types['dtypes'] 64 | 65 | dates = [key for key, value in dtypes.items() if value.startswith('datetime')] 66 | for field in dates: 67 | dtypes.pop(field) 68 | 69 | return dates, dtypes 70 | 71 | 72 | def get_optimal_data_type(s): 73 | # if the integer is outside the range of values that can be converted to a nullable integer type 74 | # use float64 75 | convert_type = 'float64' 76 | 77 | dtype_range = get_dtype_range() 78 | if s.min() >= 0: 79 | for dtype in ['UInt8', 'UInt16', 'UInt32', 'UInt64']: 80 | if s.max() <= dtype_range[dtype][2]: 81 | convert_type = dtype 82 | break 83 | else: 84 | for dtype in ['Int8', 'Int16', 'Int32', 'Int64']: 85 | if s.max() <= dtype_range[dtype][2] and s.min() >= dtype_range[dtype][1]: 86 | convert_type = dtype 87 | break 88 | 89 | return convert_type 90 | 91 | 92 | def optimize_df_dtypes(df, ignore=None): 93 | """ 94 | Downcasts DataFrame Column Types based on values. 95 | 96 | Modification is inplace. 97 | 98 | Parameters: 99 | df (pd.DataFrame): reduce size of datatypes as appropriate for its values. 100 | 101 | ignore (list): column names to exclude from downcasting.
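Example (illustrative): an integer column whose values all lie in 0..255 is downcast to uint8; a float column holding only whole numbers and NaNs, such as 1.0, 2.0, NaN, becomes the nullable UInt8.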
102 | """ 103 | 104 | # columns to consider for downcasting 105 | process_cols = df.columns 106 | if ignore: 107 | process_cols = df.columns.difference(ignore) 108 | 109 | if len(process_cols) == 0: 110 | return df 111 | 112 | # get the integer columns, if any 113 | df_int = df[process_cols].select_dtypes(include=[np.int]) 114 | 115 | # downcast integer columns to smallest unsigned int that will hold the values 116 | if len(df_int.columns) > 0: 117 | df[df_int.columns] = df_int.apply(pd.to_numeric, downcast='unsigned') 118 | 119 | # if there were any negative values, the above creates int64, downcast int64 as well 120 | df_int64 = df[process_cols].select_dtypes(include=[np.int64]) 121 | if len(df_int64.columns) > 0: 122 | df[df_int64.columns] = df_int64.apply(pd.to_numeric, downcast='signed') 123 | 124 | # convert float columns that are integers with nans to best nullable integer type 125 | df_float = df[process_cols].select_dtypes(include=['float']) 126 | if len(df_float.columns) > 0: 127 | filt = df_float.apply(is_int) 128 | int_col_names = df_float.columns[filt] 129 | if filt.any(): 130 | for col in int_col_names: 131 | convert_type = get_optimal_data_type(df[col]) 132 | df[col] = df[col].astype(convert_type) 133 | 134 | 135 | def get_dtype_range(): 136 | """Create a Dictionary having min/max values per Data Type 137 | 138 | Key: string representation of data type 139 | Value: list of length 3 140 | value[0] is 0 or np.nan 141 | value[1] is min for that data type 142 | value[2] is max for that data type 143 | 144 | This dictionary can be used to create a 3 row DataFrame which demonstrates 145 | that the specified data type can hold the specified values. 146 | 147 | Pandas data type limits: 148 | Int8 nullable with same limits as np.int8 149 | UInt8 nullable with same limits as np.uint8 150 | Int16 nullable with same limits as np.int16 151 | UInt16 nullable with same limits as np.uint16 152 | Int32 nullable with same limits as np.int32 153 | UInt32 nullable with same limits as np.uint32 154 | Int64 nullable with min/max limits about 1/2 of np.int64 155 | UInt64 nullable with max limit about 1/4 of np.uint64 156 | """ 157 | data = [] 158 | for dtype in ['uint8', 'int8', 'uint16', 'int16', 'uint32', 'int32', 'uint64', 'int64']: 159 | data.append([0, np.iinfo(dtype).min, np.iinfo(dtype).max]) 160 | data.append([np.nan, np.iinfo(dtype).min, np.iinfo(dtype).max]) 161 | 162 | keys = ['uint8', 'UInt8', 'int8', 'Int8', 'uint16', 'UInt16', 'int16', 'Int16', 163 | 'uint32', 'UInt32', 'int32', 'Int32', 'uint64', 'UInt64', 'int64', 'Int64'] 164 | 165 | dtype_range = dict(zip(keys, data)) 166 | 167 | # Pandas has different limits than numpy for the following 168 | dtype_range['UInt64'][2] = 2**61 169 | dtype_range['Int64'][1] = -2**61 170 | dtype_range['Int64'][2] = 2**61 171 | 172 | return dtype_range 173 | 174 | 175 | def optimize_db_dtypes(df): 176 | """ 177 | Choose smallest ANSI SQL Column Type for integer that fits the optimized DataFrame.
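For example (per the mappings below): uint8 and Int16 columns map to SmallInteger, UInt16 and Int32 to Integer, UInt32 and Int64 to BigInteger, and UInt64 to double precision.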
178 | 179 | Relies on: 180 | from sqlalchemy.types import SmallInteger, Integer, BigInteger 181 | 182 | SQL Column Types are signed, so uint16 might not fit in smallinteger 183 | TODO: below is safe but inefficient for uint16, UInt16, uint32 and UInt32 184 | """ 185 | small_int = {col: SmallInteger for col in df.select_dtypes( 186 | include=[pd.Int8Dtype, np.int8, pd.UInt8Dtype, np.uint8, 187 | pd.Int16Dtype, np.int16]).columns} 188 | 189 | integer = {col: Integer for col in df.select_dtypes( 190 | include=[pd.UInt16Dtype, np.uint16, pd.Int32Dtype, np.int32]).columns} 191 | 192 | big_int = {col: BigInteger for col in df.select_dtypes( 193 | include=[pd.UInt32Dtype, np.uint32, pd.Int64Dtype, np.int64]).columns} 194 | 195 | # use double precision for unsigned 64 bit integers 196 | # Float(precision=53) is the SQL data type for double precision 197 | double = {col: Float(precision=53) for col in df.select_dtypes( 198 | include=[np.uint64, pd.UInt64Dtype]).columns} 199 | 200 | dtypes = {**small_int, **integer, **big_int, **double} 201 | 202 | return dtypes 203 | 204 | 205 | def mem_usage(df): 206 | """Returns a string representing df memory usage in MB.""" 207 | mem = df.memory_usage(deep=True).sum() 208 | mem = mem / 2 ** 20 # convert to megabytes 209 | return f'{mem:03.2f} MB' 210 | 211 | 212 | def is_int(s): 213 | """Returns True if all non-null values are integers. 214 | 215 | Useful for determining if the df column (pd.Series) is 216 | float just to hold missing values. 217 | """ 218 | notnull = s.notnull() 219 | is_integer = s.apply(lambda x: (x % 1 == 0.0)) 220 | return (notnull == is_integer).all() 221 | 222 | 223 | def convert_camel_case(name): 224 | """ 225 | CamelCase to snake_case. 226 | 227 | This is from: 228 | https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case#answer-1176023 229 | """ 230 | s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) 231 | return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() 232 | 233 | 234 | def is_unique(df, cols, ignore_null=False): 235 | """Fast determination of multi-column uniqueness.""" 236 | if ignore_null: 237 | df.dropna(subset=cols, inplace=True) 238 | return not (df.duplicated(subset=cols)).any() 239 | 240 | 241 | def df_info(df): 242 | """Use buffer to capture output from df.info()""" 243 | buffer = io.StringIO() 244 | df.info(buf=buffer) 245 | return buffer.getvalue() 246 | 247 | 248 | def order_cols(df, cols): 249 | """Put columns in cols first, followed by rest of columns""" 250 | rest = [col for col in df.columns if col not in cols] 251 | df = df[cols + rest] 252 | return df 253 | 254 | 255 | def sum_stats_for_dups(df, pkey, stat_cols): 256 | """Sum stat columns for rows having the same primary key. 257 | 258 | This is a "best guess" fix for rows with duplicate primary keys. 259 | 260 | The first value for a non-pkey non-stat column will be kept.
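Example (illustrative): with pkey=['game_id', 'player_id'] and stat_cols=['h'], two rows sharing the same key with h=1 and h=2 collapse into a single row with h=3.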
261 | """ 262 | # dups is true for all rows that are duplicates 263 | dups = df.duplicated(subset=pkey, keep=False) 264 | if not dups.any(): 265 | return df 266 | 267 | # get the duplicated rows 268 | df_dups = df.loc[dups] 269 | 270 | # for the duplicate rows, sum the stat columns only 271 | df_summed = df_dups.groupby(pkey)[stat_cols].sum() 272 | 273 | # often, setting the index to the primary key makes data processing easier 274 | df.set_index(pkey, inplace=True) 275 | 276 | # remove all but one of each group of duplicated rows 277 | df = df.loc[~df.index.duplicated(keep='first')].copy() 278 | 279 | # set the kept row (per group) equal to the summed row computed above 280 | df.loc[df_summed.index, stat_cols] = df_summed 281 | 282 | df.reset_index(inplace=True) 283 | 284 | return df 285 | 286 | 287 | def move_column_after(df, after_col, col): 288 | idx = df.columns.get_loc(after_col) 289 | cols = list(df.columns) 290 | cols.remove(col) 291 | cols.insert(idx + 1, col) 292 | return df.reindex(cols, axis=1) 293 | 294 | 295 | def game_id_to_url(game_id): 296 | """Game ID to URL for Jupyter Notebooks""" 297 | dir = game_id[:3] 298 | url = 'https://www.baseball-reference.com/boxes/' + dir + '/' + game_id + '.shtml' 299 | display(HTML(f'<a href="{url}">{game_id}</a>')) 300 | 301 | 302 | def player_id_to_url(player_id): 303 | """Baseball Reference Player ID to URL for Jupyter Notebooks""" 304 | dir = player_id[0] 305 | url = 'https://www.baseball-reference.com/players/' + dir + '/' + player_id + '.shtml' 306 | display(HTML(f'<a href="{url}">{player_id}</a>')) 307 | 308 | 309 | def simple_loess(x, y, df, frac=1 / 6, it=0): 310 | """Smooths noisy data. 311 | 312 | Increase frac to get more smoothing. 313 | Decrease frac to get less smoothing. 314 | 315 | sns.lmplot has a loess option, but it uses poor and unchangeable defaults.""" 316 | z = sm.nonparametric.lowess(df[y], df[x], frac=frac, it=it) 317 | return pd.DataFrame(data=z, columns=[x, y]) 318 | -------------------------------------------------------------------------------- /download_scripts/retrosheet_wrangle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Wrangle Retrosheet Data from {data_dir}/retrosheet/raw to {data_dir}/retrosheet/wrangled 4 | 5 | Wrangles: player per game and team per game data 6 | """ 7 | 8 | __author__ = 'Stephen Diehl' 9 | 10 | import argparse 11 | import re 12 | import shutil 13 | from pathlib import Path 14 | import logging 15 | import sys 16 | import collections 17 | 18 | import pandas as pd 19 | import numpy as np 20 | 21 | import data_helper as dh 22 | 23 | logger = logging.getLogger(__name__) 24 | logger.setLevel(logging.DEBUG) 25 | 26 | 27 | def get_parser(): 28 | """Args Description""" 29 | 30 | parser = argparse.ArgumentParser( 31 | description=__doc__, 32 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 33 | 34 | parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data') 35 | parser.add_argument("-v", "--verbose", help="verbose output", action="store_true") 36 | parser.add_argument("--log", dest="log_level", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 37 | help="Set the logging level") 38 | 39 | return parser 40 | 41 | 42 | def get_game(p_retrosheet_collected): 43 | """Read in collected results of the cwgame parser.""" 44 | logger.info('Reading game.csv.gz ...') 45 | filename = p_retrosheet_collected / 'game.csv.gz' 46 | game = dh.from_csv_with_types(filename) 47 | n_rows, n_cols = game.shape 48 | logger.info(f'game
loaded {n_rows:,d} rows with {n_cols:,d} columns') 49 | return game 50 | 51 | 52 | def get_player_game(p_retrosheet_collected): 53 | """Read in collected results of the cwdaily parser.""" 54 | logger.info('Reading player_game.csv.gz ...') 55 | filename = p_retrosheet_collected / 'player_game.csv.gz' 56 | player_game = dh.from_csv_with_types(filename) 57 | n_rows, n_cols = player_game.shape 58 | logger.info(f'player_game loaded {n_rows:,d} rows with {n_cols:,d} columns') 59 | return player_game 60 | 61 | 62 | def clean_player_game(player_game): 63 | """Ensure Primary Key is Unique.""" 64 | 65 | # Fix Duplicate Primary Key 66 | pkey = ['game_id', 'player_id'] 67 | if not dh.is_unique(player_game, pkey): 68 | # if pkey is dup, sum the stat rows for the dups 69 | dups = player_game.duplicated(subset=pkey) 70 | df_dups = player_game.loc[dups, pkey] 71 | logger.warning(f'Dup PKey Found - summing stats for:\n{df_dups.to_string()}') 72 | 73 | # TODO flag fields should be ORed not summed 74 | # this is not currently a problem with the single dup found 75 | # data integrity tests verify that all flag fields are either 0 or 1 76 | """Flag Fields (value is 0 or 1): 77 | b_g b_g_dh b_g_ph b_g_pr p_g p_gs p_cg p_sho p_gf p_w p_l p_sv f_p_g f_p_gs f_c_g 78 | f_c_gs f_1b_g f_1b_gs f_2b_g f_2b_gs f_3b_g f_3b_gs f_ss_g f_ss_gs f_lf_g f_lf_gs 79 | f_cf_g f_cf_gs f_rf_g f_rf_gs 80 | """ 81 | 82 | # player stat columns b_ for batter, p_ for pitcher, f_ for fielder 83 | stat_columns = [col for col in player_game.columns if re.search(r'^[bpf]_', col)] 84 | stat_columns.remove('b_g') # don't sum this column 85 | 86 | player_game = dh.sum_stats_for_dups(player_game, pkey, stat_columns) 87 | 88 | return player_game 89 | 90 | 91 | def create_batting(player_game, game_start, p_retrosheet_wrangled): 92 | """Create batting.csv for batting attributes per player per game.""" 93 | # column names of the batting attributes 94 | b_cols = [col for col in player_game.columns if col.startswith('b_')] 95 | 96 | # Note: any player who is in a game in any role will have b_g = 1 97 | # even if b_pa == 0 (no plate appearances) 98 | 99 | # fields which uniquely identify a record 100 | pkey = ['game_id', 'player_id'] 101 | 102 | # fields to join to other "tables" 103 | fkey = ['team_id'] 104 | 105 | batting = player_game.loc[:, pkey + fkey + b_cols].copy() 106 | 107 | # remove b_ from the column names; rename b_2b and b_3b to valid identifiers 108 | b_cols_new = {col: col[2:] for col in b_cols} 109 | b_cols_new['b_2b'] = 'double' 110 | b_cols_new['b_3b'] = 'triple' 111 | b_cols_new['b_gdp'] = 'gidp' # to match Lahman 112 | b_cols_new['b_hp'] = 'hbp' # to match Lahman 113 | batting.rename(columns=b_cols_new, inplace=True) 114 | 115 | # add game_start.dt.year as many queries use year 116 | batting = pd.merge(batting, game_start[['game_id', 'game_start']]) 117 | batting['year'] = batting['game_start'].dt.year.astype('int16') 118 | 119 | dh.optimize_df_dtypes(batting, ignore=['year']) 120 | logger.info('Writing and compressing batting.
This could take several minutes ...') 121 | dh.to_csv_with_types(batting, p_retrosheet_wrangled / 'batting.csv.gz') 122 | 123 | 124 | def create_pitching(player_game, game_start, p_retrosheet_wrangled): 125 | """Create pitching.csv for pitching attributes per player per game.""" 126 | # column names of the pitching attributes 127 | p_cols = [col for col in player_game.columns if col.startswith('p_')] 128 | 129 | # if all pitching attributes are 0 then the player did not pitch 130 | # note: all attributes are unsigned integers, so if their sum is zero, all are zero 131 | p_filt = player_game[p_cols].sum(axis=1) == 0 132 | 133 | # fields which uniquely identify a record 134 | pkey = ['game_id', 'player_id'] 135 | 136 | # fields to join to other "tables" 137 | fkey = ['team_id'] 138 | 139 | # data with some non-zero attributes 140 | pitching = player_game.loc[~p_filt, pkey + fkey + p_cols].copy() 141 | 142 | # remove p_ from the column names; rename p_2b and p_3b to valid identifiers 143 | p_cols_new = {col: col[2:] for col in p_cols} 144 | p_cols_new['p_2b'] = 'double' 145 | p_cols_new['p_3b'] = 'triple' 146 | p_cols_new['p_gdp'] = 'gidp' # to match Lahman 147 | p_cols_new['p_hp'] = 'hbp' # to match Lahman 148 | pitching.rename(columns=p_cols_new, inplace=True) 149 | 150 | # add game_start.dt.year as many queries use year 151 | pitching = pd.merge(pitching, game_start[['game_id', 'game_start']]) 152 | pitching['year'] = pitching['game_start'].dt.year.astype('int16') 153 | 154 | dh.optimize_df_dtypes(pitching, ignore=['year']) 155 | logger.info('Writing and compressing pitching. This could take several minutes ...') 156 | dh.to_csv_with_types(pitching, p_retrosheet_wrangled / 'pitching.csv.gz') 157 | 158 | 159 | def create_fielding(player_game, game_start, p_retrosheet_wrangled): 160 | """Create fielding.csv for fielding attributes per player per game.""" 161 | # column names for fielding attributes 162 | f_cols = [col for col in player_game.columns if col.startswith('f_')] 163 | 164 | # create orig_cols dictionary which maps fielder's pos to original fielding column names 165 | # create new_cols dictionary which maps fielder's pos to new fielding column names 166 | # pos: P, C, 1B, 2B, 3B, SS, LF, CF, RF 167 | # column name pattern: f_{pos}_{stat} 168 | orig_cols = collections.defaultdict(list) 169 | new_cols = collections.defaultdict(list) 170 | for col in f_cols: 171 | match = re.search(r'f_(\w{1,2})_(\w*)', col) 172 | pos = match.group(1) 173 | stat = match.group(2) 174 | orig_cols[pos].append(col) 175 | stat = stat.replace('out', 'inn_outs') # to match Lahman 176 | new_cols[pos].append(stat) 177 | 178 | # full pkey will be: ['game_id', 'player_id', 'pos'] 179 | pkey = ['game_id', 'player_id'] 180 | 181 | # fields to join to other "tables" 182 | fkey = ['team_id'] 183 | 184 | """For each record created by cwdaily, create up to 9 new records, one per position. 185 | Each record will temporarily go in its own dataframe and then be concatenated.
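For example, a player who catches and later moves to first base in the same game yields two records: one with pos='C' and one with pos='1B'.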
186 | 187 | Each dataframe has the same columns.""" 188 | dfs = [] 189 | for pos in orig_cols.keys(): 190 | # if all fielding attributes for this pos are 0 then the player did not play that pos 191 | # note: all attributes are unsigned integers 192 | f_filt = player_game[orig_cols[pos]].sum(axis=1) == 0 193 | 194 | df = pd.DataFrame() 195 | df[pkey + fkey + new_cols[pos]] = \ 196 | player_game.loc[~f_filt, pkey + fkey + orig_cols[pos]].copy() 197 | 198 | # add the position column to the df 199 | # use upper case to match Lahman position values 200 | df.insert(2, 'pos', pos.upper()) 201 | 202 | # orig_cols['c'] has pb and xi columns 203 | # all other positions do not have pb and xi 204 | if pos != 'c': 205 | df['pb'] = 0 206 | df['xi'] = 0 207 | 208 | dfs.append(df) 209 | 210 | fielding = pd.concat(dfs, ignore_index=True) 211 | 212 | # add game_start.dt.year as many queries use year 213 | fielding = pd.merge(fielding, game_start[['game_id', 'game_start']]) 214 | fielding['year'] = fielding['game_start'].dt.year.astype('int16') 215 | 216 | dh.optimize_df_dtypes(fielding, ignore=['year']) 217 | logger.info('Writing and compressing fielding. This could take several minutes ...') 218 | dh.to_csv_with_types(fielding, p_retrosheet_wrangled / 'fielding.csv.gz') 219 | 220 | 221 | def wrangle_game(game, p_retrosheet_wrangled): 222 | """Tidy the Game Data 223 | 224 | There are 3 types of data: 225 | 226 | data specific to a game -- the 'game' columns below 227 | data specific to the home team for that game -- the 'home' columns below 228 | data specific to the away team for that game -- the 'away' columns below 229 | The attributes for the home team are identical to the attributes for the away team. 230 | 231 | This suggests breaking this out into 2 csv files. 232 | 233 | 1. team_game.csv with key (game_id, team_id) -- stats per team per game (e.g. runs scored) 234 | 2. game.csv with key (game_id) -- stats per game (e.g.
attendance) 235 | """ 236 | 237 | home_cols = [col for col in game.columns if col.startswith('home')] 238 | away_cols = [col for col in game.columns if col.startswith('away')] 239 | game_cols = [col for col in game.columns 240 | if not col.startswith('home') and not col.startswith('away')] 241 | 242 | game_tidy = game[game_cols].copy() 243 | home_team_game = game[['game_id'] + home_cols].copy() 244 | away_team_game = game[['game_id'] + away_cols].copy() 245 | 246 | home_team_game['bat_last'] = True 247 | away_team_game['bat_last'] = False 248 | home_team_game = dh.move_column_after(home_team_game, 'game_id', 'bat_last') 249 | away_team_game = dh.move_column_after(away_team_game, 'game_id', 'bat_last') 250 | 251 | # remove the leading 'home_' or 'away_' prefix from fields 252 | home_team_game.rename(columns=lambda col: col[5:] if col.startswith('home_') else col, inplace=True) 253 | away_team_game.rename(columns=lambda col: col[5:] if col.startswith('away_') else col, inplace=True) 254 | 255 | # include opponent team_id in each row 256 | home_team_game.insert(4, 'opponent_team_id', away_team_game['team_id']) 257 | away_team_game.insert(4, 'opponent_team_id', home_team_game['team_id']) 258 | team_game = pd.concat([home_team_game, away_team_game]) 259 | 260 | # improve column names 261 | names = {col: col.replace('_ct', '') for col in team_game.columns if col.endswith('_ct')} 262 | 263 | # handle invalid identifiers 264 | names['2b_ct'] = 'double' 265 | names['3b_ct'] = 'triple' 266 | 267 | # pitcher_ct (number of pitchers) is a good name though, keep it 268 | names.pop('pitcher_ct') 269 | 270 | # additional fields to rename for consistency 271 | names['bi_ct'] = 'rbi' 272 | names['gdp_ct'] = 'gidp' 273 | names['hits_ct'] = 'h' 274 | names['hp_ct'] = 'hbp' 275 | names['err_ct'] = 'e' 276 | names['score_ct'] = 'r' 277 | 278 | team_game = team_game.rename(columns=names) 279 | 280 | # create new datetime column 281 | game_tidy['game_start'] = game_tidy.apply(parse_datetime, axis=1) 282 | game_tidy = dh.move_column_after(game_tidy, 'game_id', 'game_start') 283 | 284 | # these fields are no longer necessary 285 | game_tidy = game_tidy.drop(['start_game_tm', 'game_dt', 'game_dy'], axis=1) 286 | 287 | # add the game_start column to team_game to simplify queries 288 | team_game = pd.merge(team_game, game_tidy[['game_id', 'game_start']]) 289 | team_game['year'] = team_game['game_start'].dt.year.astype('int16') 290 | 291 | logger.info('Writing and compressing team_game.
This could take several minutes ...') 292 | dh.optimize_df_dtypes(team_game, ignore=['year']) 293 | dh.to_csv_with_types(team_game, p_retrosheet_wrangled / 'team_game.csv.gz') 294 | 295 | # convert designated hitter to True/False and rename 296 | game_tidy['dh'] = False 297 | filt = game_tidy['dh_fl'] == 'T' 298 | game_tidy.loc[filt, 'dh'] = True 299 | game_tidy.drop('dh_fl', axis=1, inplace=True) 300 | 301 | # convert impossible attendance values to null and rename 302 | filt = game_tidy['attend_park_ct'] <= 0 303 | impossible_values = game_tidy.loc[filt, 'attend_park_ct'].unique() 304 | game_tidy['attendance'] = game_tidy['attend_park_ct'].replace(impossible_values, np.nan) 305 | game_tidy.drop('attend_park_ct', axis=1, inplace=True) 306 | 307 | # convert impossible temperature values to null and rename 308 | filt = game_tidy['temp_park_ct'] <= 0 309 | impossible_values = game_tidy.loc[filt, 'temp_park_ct'].unique() 310 | game_tidy['temperature'] = game_tidy['temp_park_ct'].replace(impossible_values, np.nan) 311 | game_tidy.drop('temp_park_ct', axis=1, inplace=True) 312 | 313 | # replace code values with strings 314 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-winddirection 315 | direction = { 316 | 0: 'unknown', 317 | 1: 'to_lf', 318 | 2: 'to_cf', 319 | 3: 'to_rf', 320 | 4: 'l_to_r', 321 | 5: 'from_lf', 322 | 6: 'from_cf', 323 | 7: 'from_rf', 324 | 8: 'r_to_l'} 325 | game_tidy['wind_direction'] = \ 326 | game_tidy['wind_direction_park_cd'].map(direction).replace('unknown', np.nan) 327 | game_tidy.drop('wind_direction_park_cd', axis=1, inplace=True) 328 | 329 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-windspeed 330 | # convert impossible wind speed values to null and rename 331 | filt = game_tidy['wind_speed_park_ct'] < 0 332 | impossible_values = game_tidy.loc[filt, 'wind_speed_park_ct'].unique() 333 | game_tidy['wind_speed'] = game_tidy['wind_speed_park_ct'].replace(impossible_values, np.nan) 334 | game_tidy.drop('wind_speed_park_ct', axis=1, inplace=True) 335 | 336 | # replace code values with strings 337 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-fieldcondition 338 | condition = { 339 | 0: 'unknown', 340 | 1: 'soaked', 341 | 2: 'wet', 342 | 3: 'damp', 343 | 4: 'dry'} 344 | game_tidy['field_condition'] = \ 345 | game_tidy['field_park_cd'].map(condition).replace('unknown', np.nan) 346 | game_tidy.drop('field_park_cd', axis=1, inplace=True) 347 | 348 | # replace code values with strings 349 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-precipitation 350 | precip = { 351 | 0: 'unknown', 352 | 1: 'none', 353 | 2: 'drizzle', 354 | 3: 'showers', 355 | 4: 'rain', 356 | 5: 'snow'} 357 | game_tidy['precip_type'] = \ 358 | game_tidy['precip_park_cd'].map(precip).replace('unknown', np.nan) 359 | game_tidy.drop('precip_park_cd', axis=1, inplace=True) 360 | 361 | # replace code values with strings 362 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-sky 363 | sky = { 364 | 0: 'unknown', 365 | 1: 'sunny', 366 | 2: 'cloudy', 367 | 3: 'overcast', 368 | 4: 'night', 369 | 5: 'dome'} 370 | game_tidy['sky_condition'] = \ 371 | game_tidy['sky_park_cd'].map(sky).replace('unknown', np.nan) 372 | game_tidy.drop('sky_park_cd', axis=1, inplace=True) 373 | 374 | logger.info('Writing and compressing game. 
This could take several minutes ...') 375 | dh.optimize_df_dtypes(game_tidy) 376 | dh.to_csv_with_types(game_tidy, p_retrosheet_wrangled / 'game.csv.gz') 377 | 378 | # to add game date to other tables 379 | return game_tidy[['game_id', 'game_start']] 380 | 381 | 382 | def parse_datetime(row): 383 | """Determine AM/PM from MLB domain knowledge and Day/Night Flag 384 | 385 | Here is the relevant information. 386 | 387 | * am/pm is not specified 388 | * start_game_tm is an integer 389 | * example: 130 represents 1:30 (am or pm) 390 | * start_game_tm == 0 means the game start time is unknown 391 | * there are no start_game_tm < 100 that are not exactly zero 392 | * daynight_park_cd is never missing 393 | * based on the data, almost always a game that starts between 5 and 9 is classified as a night game 394 | This is likely because "night" actually means that the stadium lights must be turned on before a 395 | game of typical length ends. 396 | * MLB domain knowledge: A game may start "early" to allow for travel, but games never start 397 | before 9 am so: 100 <= start_game_tm < 900 => pm 398 | * example: 830 => 8:30 pm 399 | * MLB domain knowledge: A game may start "late" due to rain delay, but games never start 400 | after midnight so: 900 <= start_game_tm < 1200 => am or pm depending on the day/night flag 401 | * example: 1030 Day => 10:30 am 402 | * example: 1030 Night => 10:30 pm 403 | """ 404 | date = row['game_dt'] 405 | time = row['start_game_tm'] 406 | day_night = row['daynight_park_cd'] 407 | 408 | if 0 < time < 900: 409 | time += 1200 410 | elif (900 <= time < 1200) and day_night == 'N': 411 | time += 1200 412 | 413 | time_str = f'{time // 100:02d}:{time % 100:02d}' 414 | datetime_str = str(date) + ' ' + time_str 415 | return pd.to_datetime(datetime_str, format='%Y%m%d %H:%M') 416 | 417 | 418 | def wrangle_event(p_retrosheet_collected, p_retrosheet_wrangled): 419 | """Wrangle event 420 | 421 | At this time, there is nothing to do, just copy the collected data.""" 422 | source = p_retrosheet_collected / 'event.csv.gz' 423 | destination = p_retrosheet_wrangled / 'event.csv.gz' 424 | shutil.copyfile(source, destination) 425 | 426 | source = p_retrosheet_collected / 'event_types.csv' 427 | destination = p_retrosheet_wrangled / 'event_types.csv' 428 | shutil.copyfile(source, destination) 429 | 430 | 431 | def wrangle_parks(data_dir, retrosheet_wrangle): 432 | parks_filename = data_dir / 'retrosheet/raw/misc/parkcode.txt' 433 | parks = pd.read_csv(parks_filename, parse_dates=['START', 'END']) 434 | cols = [col.lower() for col in parks.columns] 435 | parks.columns = cols 436 | parks = parks.rename(columns={'parkid': 'park_id'}) 437 | dh.to_csv_with_types(parks, retrosheet_wrangle / 'parks.csv') 438 | 439 | 440 | def wrangle_teams(data_dir, retrosheet_wrangle): 441 | team_dir = data_dir / 'retrosheet/raw/event/regular' 442 | 443 | dfs = [] 444 | team_files = team_dir.glob('TEAM*') 445 | for team in sorted(team_files): 446 | year = int(team.name[-4:]) 447 | df = pd.read_csv(team, header=None, names=['team_id', 'lg_id', 'city', 'name']) 448 | df.insert(1, 'year', year) 449 | dfs.append(df) 450 | retro_teams = pd.concat(dfs, ignore_index=True) 451 | dh.to_csv_with_types(retro_teams, retrosheet_wrangle / 'teams.csv') 452 | 453 | 454 | def main(): 455 | """Wrangle the data.
456 | """ 457 | parser = get_parser() 458 | args = parser.parse_args() 459 | 460 | if args.log_level: 461 | fh = logging.FileHandler('download.log') 462 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 463 | fh.setFormatter(formatter) 464 | fh.setLevel(args.log_level) 465 | logger.addHandler(fh) 466 | 467 | if args.verbose: 468 | # send INFO level logging to stdout 469 | sh = logging.StreamHandler(sys.stdout) 470 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 471 | sh.setFormatter(formatter) 472 | sh.setLevel(logging.INFO) 473 | logger.addHandler(sh) 474 | 475 | data_dir = Path(args.data_dir) 476 | p_retrosheet_collected = (data_dir / 'retrosheet/collected').resolve() 477 | p_retrosheet_wrangled = (data_dir / 'retrosheet/wrangled').resolve() 478 | 479 | # get collected data from parsers 480 | game = get_game(p_retrosheet_collected) # cwgame 481 | game_start = wrangle_game(game, p_retrosheet_wrangled) 482 | 483 | player_game = get_player_game(p_retrosheet_collected) # cwdaily 484 | player_game = clean_player_game(player_game) 485 | 486 | create_batting(player_game, game_start, p_retrosheet_wrangled) 487 | create_pitching(player_game, game_start, p_retrosheet_wrangled) 488 | create_fielding(player_game, game_start, p_retrosheet_wrangled) 489 | 490 | wrangle_event(p_retrosheet_collected, p_retrosheet_wrangled) # cwevent 491 | 492 | # parkcode.txt is included with the Retrosheet data. It is a csv file. 493 | wrangle_parks(data_dir, p_retrosheet_wrangled) 494 | 495 | # TEAM files are included in the Retrosheet data. They are csv files. 496 | wrangle_teams(data_dir, p_retrosheet_wrangled) 497 | 498 | logger.info('Finished') 499 | 500 | 501 | if __name__ == '__main__': 502 | main() 503 | -------------------------------------------------------------------------------- /download_scripts/tests/test_data.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Stephen Diehl' 2 | 3 | import zipfile 4 | import re 5 | import pandas as pd 6 | import numpy as np 7 | from ..
import data_helper as dh 8 | 9 | 10 | def test_lahman_download(data_dir): 11 | """Verify the Lahman Data was downloaded, unzipped and reorganized.""" 12 | lahman_dir = data_dir / 'lahman' 13 | raw_dir = lahman_dir / 'raw' 14 | wrangled_dir = lahman_dir / 'wrangled' 15 | 16 | assert lahman_dir.is_dir() 17 | assert wrangled_dir.is_dir() 18 | assert raw_dir.is_dir() 19 | 20 | # 2 directories and 1 file 21 | assert len(list(lahman_dir.iterdir())) == 3 22 | 23 | # zip from master branch of https://github.com/chadwickbureau/baseballdatabank 24 | zipfilename = raw_dir.joinpath('baseballdatabank-master.zip') 25 | assert zipfilename.is_file() 26 | 27 | zipped = zipfile.ZipFile(zipfilename) 28 | zip_core_files = [file for file in zipped.namelist() 29 | if file.startswith('baseballdatabank-master/core/') and 30 | file.endswith('.csv')] 31 | 32 | # each csv file in the zipfile should be in raw_dir 33 | assert len(list(raw_dir.glob('*.csv'))) == len(zip_core_files) 34 | 35 | 36 | def test_retrosheet_download(data_dir): 37 | """Verify the Retrosheet data was downloaded and unzipped.""" 38 | retrosheet_dir = data_dir / 'retrosheet' 39 | raw_dir = retrosheet_dir / 'raw' 40 | wrangled_dir = retrosheet_dir / 'wrangled' 41 | 42 | assert retrosheet_dir.is_dir() 43 | assert wrangled_dir.is_dir() 44 | assert raw_dir.is_dir() 45 | 46 | teams = raw_dir.glob('TEAM*') 47 | years = sorted([team.name[4:] for team in teams]) 48 | 49 | for year in years: 50 | zipdata = raw_dir.joinpath(f'{year}eve.zip') 51 | assert zipdata.exists() 52 | 53 | # should be same number of files in raw_dir as in zipfile 54 | files = [file for file in raw_dir.glob(f'*{year}*') if not file.name.endswith('.zip')] 55 | zipped = zipfile.ZipFile(zipdata) 56 | assert len(files) == len(zipped.namelist()) 57 | 58 | 59 | def test_download_years(batting): 60 | """Verify the Retrosheet years 1974 through 2019 inclusive were downloaded.
61 | 62 | The data consistency tests have accuracy bounds tested on these years only!""" 63 | assert (batting['year'].agg(['min', 'max']) == (1974, 2019)).all() 64 | assert batting['year'].nunique() == (2019 - 1974) + 1 65 | 66 | 67 | def test_lahman_people_pkey(lahman_people): 68 | """Verify the Lahman People primary and foreign keys.""" 69 | assert dh.is_unique(lahman_people, ['player_id']) # lahman player id 70 | assert dh.is_unique(lahman_people, ['retro_id'], ignore_null=True) # retrosheet player id 71 | 72 | 73 | def test_lahman_fielding_pkey(lahman_fielding): 74 | """Verify the Lahman Fielding primary keys.""" 75 | assert dh.is_unique(lahman_fielding, ['player_id', 'year', 'stint', 'pos']) 76 | 77 | 78 | def test_lahman_batting_pkey(lahman_batting): 79 | """Verify the Lahman Batting primary key.""" 80 | assert dh.is_unique(lahman_batting, ['player_id', 'year', 'stint']) 81 | 82 | 83 | def test_lahman_pitching_pkey(lahman_pitching): 84 | """Verify the Lahman Pitching primary key.""" 85 | assert dh.is_unique(lahman_pitching, ['player_id', 'year', 'stint']) 86 | 87 | 88 | def test_lahman_salaries_pkey(data_dir): 89 | """Verify the Lahman Salaries primary key.""" 90 | filename = data_dir / 'lahman' / 'wrangled' / 'salaries.csv' 91 | 92 | # check for duplicate IDs 93 | salaries = dh.from_csv_with_types(filename) 94 | assert dh.is_unique(salaries, ['player_id', 'year', 'team_id']) 95 | 96 | 97 | def test_lahman_teams_pkey(lahman_teams): 98 | """Verify the Lahman Teams primary key.""" 99 | assert dh.is_unique(lahman_teams, ['team_id', 'year']) # lahman team_id 100 | assert dh.is_unique(lahman_teams, ['team_id_retro', 'year']) # retrosheet team_id 101 | 102 | 103 | def test_lahman_parks_pkey(data_dir): 104 | """Verify the Lahman Parks primary key.""" 105 | filename = data_dir / 'lahman' / 'wrangled' / 'parks.csv' 106 | 107 | # check for duplicate IDs 108 | parks = dh.from_csv_with_types(filename) 109 | assert dh.is_unique(parks, ['park_key']) 110 | 111 | # park_name is not unique 112 | # assert dh.is_unique(parks, ['park_name']) 113 | 114 | 115 | def test_game_id(team_game): 116 | """Verify 1st 3 characters of game_id are the team batting last.""" 117 | filt = team_game['bat_last'] == False 118 | team_game['home_team_id'] = team_game['team_id'] 119 | team_game.loc[filt, 'home_team_id'] = team_game.loc[filt, 'opponent_team_id'] 120 | 121 | assert (team_game['game_id'].str[:3] == team_game['home_team_id']).all() 122 | 123 | 124 | def test_batting_flags(batting): 125 | """Verify the batting flags are 0 or 1. 126 | 127 | g means in the game in the specified role. 128 | For example, g_pr means in the game as a pinch runner.""" 129 | flag_cols = [ 130 | 'g', 131 | 'g_dh', 132 | 'g_ph', 133 | 'g_pr' 134 | ] 135 | 136 | assert batting[flag_cols].min().min() == 0 137 | assert batting[flag_cols].max().max() == 1 138 | 139 | 140 | def test_pitching_flags(pitching): 141 | """Verify the pitching flags are 0 or 1.
142 | 143 | For example: 144 | gs means the pitcher started the game 145 | gf means the pitcher finished the game""" 146 | flag_cols = [ 147 | 'g', 148 | 'gs', 149 | 'cg', 150 | 'sho', 151 | 'gf', 152 | 'w', 153 | 'l', 154 | 'sv' 155 | ] 156 | 157 | assert pitching[flag_cols].min().min() == 0 158 | assert pitching[flag_cols].max().max() == 1 159 | 160 | 161 | def test_fielding_flags(fielding): 162 | """Verify the fielding flags are either 0 or 1.""" 163 | flag_cols = [ 164 | 'g', 165 | 'gs' 166 | ] 167 | 168 | assert fielding[flag_cols].min().min() == 0 169 | assert fielding[flag_cols].max().max() == 1 170 | 171 | 172 | def test_batting_pkey(batting): 173 | """Verify the Retrosheet batting primary key.""" 174 | assert dh.is_unique(batting, ['player_id', 'game_id']) 175 | 176 | 177 | def test_pitching_pkey(pitching): 178 | """Verify the Retrosheet pitching primary key.""" 179 | assert dh.is_unique(pitching, ['player_id', 'game_id']) 180 | 181 | 182 | def test_fielding_pkey(fielding): 183 | """Verify the Retrosheet fielding primary key.""" 184 | assert dh.is_unique(fielding, ['player_id', 'game_id', 'pos']) 185 | 186 | 187 | def test_team_game_pkey(team_game): 188 | """Verify the Retrosheet team_game primary key.""" 189 | assert dh.is_unique(team_game, ['team_id', 'game_id']) 190 | 191 | 192 | def test_game_pkey(game): 193 | """Verify the Retrosheet game primary key.""" 194 | assert dh.is_unique(game, ['game_id']) 195 | 196 | 197 | def test_lahman_retro_batting_data(batting, lahman_batting): 198 | """Compare Aggregated Lahman batting data to Aggregated Retrosheet batting data""" 199 | # columns in common -- these are the columns to compare 200 | b_cols = set(batting.columns) & set(lahman_batting.columns) 201 | b_cols -= {'player_id', 'team_id', 'year'} 202 | 203 | # there are 17 columns in common 204 | assert len(b_cols) == 17 205 | 206 | l_batting = lahman_batting[b_cols] 207 | r_batting = batting[b_cols] 208 | 209 | l_sums = l_batting.agg('sum').astype(int) 210 | l_sums.sort_index(inplace=True) 211 | 212 | r_sums = r_batting.agg('sum').astype(int) 213 | r_sums.sort_index(inplace=True) 214 | 215 | # verify all 17 batting attributes 216 | # are within plus/minus 0.01% of each other when summed 217 | assert (np.abs(1.0 - (l_sums / r_sums)) < .0001).all() 218 | 219 | 220 | def test_lahman_retro_pitching_data(pitching, lahman_pitching): 221 | """Compare Aggregated Lahman pitching data to Aggregated Retrosheet pitching data""" 222 | # columns in common -- these are the columns to compare 223 | p_cols = set(lahman_pitching.columns) & set(pitching.columns) 224 | p_cols -= {'player_id', 'team_id', 'year'} 225 | 226 | # there are 21 columns in common 227 | assert len(p_cols) == 21 228 | 229 | l_pitching = lahman_pitching[p_cols] 230 | r_pitching = pitching[p_cols] 231 | 232 | l_sums = l_pitching.agg('sum').astype(int) 233 | l_sums.sort_index(inplace=True) 234 | 235 | r_sums = r_pitching.agg('sum').astype(int) 236 | r_sums.sort_index(inplace=True) 237 | 238 | # verify all values are within plus/minus 0.06% of each other 239 | assert (np.abs(1.0 - (l_sums / r_sums)) < .0006).all() 240 | 241 | 242 | def test_lahman_retro_fielding_data(fielding, lahman_fielding): 243 | """Compare Aggregated Lahman fielding per position data to 244 | Aggregated Retrosheet fielding per position data.""" 245 | # find the common columns 246 | f_cols = set(lahman_fielding.columns) & set(fielding.columns) 247 | f_cols -= {'player_id', 'pos', 'team_id', 'year'} 248 | f_cols = list(f_cols) 249 | 250 | # work-around for Pandas 
1.0.1 bugs 251 | # sum does not up-cast for nullable integer types 252 | # select_dtypes does not distinguish between nullable and non-nullable int types 253 | idx = lahman_fielding[f_cols].dtypes.isin([pd.UInt8Dtype(), pd.UInt16Dtype()]) 254 | for col in lahman_fielding[f_cols].columns[idx]: 255 | lahman_fielding[col] = lahman_fielding[col].astype('Int32') 256 | 257 | l_sums = lahman_fielding.groupby('pos')[f_cols].agg('sum') 258 | l_sums.sort_index(inplace=True) 259 | 260 | # there are 7 fielding attributes and 7 fielding positions in Lahman 261 | assert l_sums.shape == (7, 7) 262 | 263 | r_sums = fielding.groupby('pos')[f_cols].agg('sum').astype('int') 264 | 265 | # Lahman uses OF for sum of LF, CF, RF 266 | r_sums.loc['OF'] = r_sums.loc['LF'] + r_sums.loc['CF'] + r_sums.loc['RF'] 267 | r_sums = r_sums.drop(['LF', 'CF', 'RF']) 268 | r_sums.sort_index(inplace=True) 269 | 270 | # there are now 7 fielding attributes and 7 fielding positions in Retrosheet sums 271 | assert r_sums.shape == (7, 7) 272 | 273 | # the indexes and columns should now be the same 274 | assert l_sums.index.equals(r_sums.index) 275 | assert l_sums.columns.equals(r_sums.columns) 276 | 277 | filt = fielding['pos'].isin(['LF', 'CF', 'RF']) 278 | r_of = fielding[filt] 279 | 280 | # account for outfielders who played more than 1 outfield position in the same game 281 | total_dups = r_of.duplicated(subset=['player_id', 'game_id'], keep=False).sum() 282 | counted_dups = r_of.duplicated(subset=['player_id', 'game_id'], keep='first').sum() 283 | r_sums.loc['OF', 'g'] -= (total_dups - counted_dups) 284 | 285 | rel_accuracy = l_sums / r_sums 286 | 287 | # relative accuracy is within 0.8% for all 49 aggregated values 288 | assert (np.abs(1.0 - rel_accuracy) < 0.008).all().all() 289 | 290 | 291 | def test_batting_team_game_data(batting, team_game): 292 | """Verify Retrosheet batting aggregated by (game_id, team_id) 293 | is the same as team_game batting stats.""" 294 | exclude = ['game_id', 'team_id', 'player_id', 'game_start', 'year'] 295 | cols = set(batting.columns) & set(team_game.columns) - set(exclude) 296 | cols = list(cols) 297 | 298 | assert len(cols) == 17 299 | 300 | b = batting[['game_id', 'team_id'] + cols].groupby(['game_id', 'team_id']).agg('sum') 301 | b = b.reset_index().sort_index() 302 | 303 | tg = team_game[['game_id', 'team_id'] + cols].sort_values( 304 | ['game_id', 'team_id']).reset_index(drop=True) 305 | 306 | assert b.equals(tg) 307 | 308 | 309 | def test_pitching_team_game_data(pitching, team_game): 310 | """Verify Retrosheet pitching aggregated by (game_id, team_id) 311 | is the same as team_game pitching stats 312 | 313 | This shows that the two Retrosheet parsers are consistent with one another.""" 314 | cols = ['wp', 'bk', 'er'] 315 | 316 | p = pitching[['game_id', 'team_id'] + cols].groupby(['game_id', 'team_id']).agg('sum') 317 | p = p.reset_index().sort_index() 318 | 319 | tg = team_game[['game_id', 'team_id'] + cols].sort_values( 320 | ['game_id', 'team_id']).reset_index(drop=True) 321 | 322 | assert p.equals(tg) 323 | 324 | 325 | def test_fielding_team_game_data(fielding, team_game): 326 | """Verify Retrosheet fielding aggregated by (game_id, team_id) 327 | is the same as team_game fielding stats 328 | 329 | This shows that the two Retrosheet parsers are consistent with one another.""" 330 | cols = ['a', 'e', 'po', 'pb'] 331 | 332 | f = fielding[['game_id', 'team_id'] + cols].groupby(['game_id', 'team_id']).agg('sum') 333 | f = f.reset_index().sort_index() 334 | 335 | tg = team_game[['game_id',
'team_id'] + cols].sort_values( 336 | ['game_id', 'team_id']).reset_index(drop=True) 337 | 338 | assert f.equals(tg) 339 | 340 | 341 | def test_batting_lahman_game_data(batting, lahman_teams): 342 | """Verify Retrosheet batting aggregated by (year, team_id_lahman) 343 | is the same as Lahman_teams. 344 | 345 | This shows that Retrosheet batting and Lahman Teams are consistent with each other.""" 346 | # Add team_id_lahman 347 | retro_batting = pd.merge(batting, lahman_teams[['team_id', 'year', 'team_id_retro']], 348 | left_on=['year', 'team_id'], 349 | right_on=['year', 'team_id_retro'], 350 | how='inner', suffixes=['_retrosheet', '_lahman']) 351 | 352 | # team_id_retro is now the same as team_id_retrosheet 353 | retro_batting.drop('team_id_retro', axis=1, inplace=True) 354 | 355 | pkey = ['year', 'team_id'] 356 | compare_cols = set(lahman_teams.columns) & set(retro_batting.columns) - set(pkey) 357 | compare_cols -= {'g'} # cannot sum g by player per team to get g per team 358 | compare_cols -= {'sb', 'cs'} # these stats are close, but don't tie out as well as others 359 | compare_cols = list(compare_cols) 360 | 361 | assert len(compare_cols) == 10 362 | 363 | retro_batting_sums = retro_batting.groupby(['year', 'team_id_lahman'])[compare_cols].sum().astype('int') 364 | retro_batting_sums.sort_index(inplace=True) 365 | 366 | year_min, year_max = retro_batting['year'].aggregate(['min', 'max']) 367 | year_filt = (lahman_teams['year'] >= year_min) & (lahman_teams['year'] <= year_max) 368 | l_teams = lahman_teams.loc[year_filt, pkey + compare_cols] 369 | l_teams = l_teams.set_index(pkey).sort_index() 370 | 371 | # verify all 12880 values are within 0.5% of each other 372 | assert np.abs(1.0 - (l_teams / retro_batting_sums)).max().max() < 0.005 373 | 374 | 375 | def test_attendance_values(game): 376 | """Verify attendance has plausible values.""" 377 | # There was one baseball game in which the public was not allowed to attend. 378 | # This is considered null rather than 0, as people wanted to attend, but were not allowed. 
379 | # https://www.baseball-reference.com/boxes/BAL/BAL201504290.shtml 380 | assert game['attendance'].min() > 0 381 | 382 | 383 | def test_temperature_values(game): 384 | """Verify temperature has plausible values.""" 385 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-temperature 386 | assert game['temperature'].min() > 0 387 | 388 | 389 | def test_wind_speed_values(game): 390 | """Verify wind speed has plausible values.""" 391 | assert game['wind_speed'].min() >= 0 392 | 393 | 394 | def test_wind_direction_values(game): 395 | """Verify wind direction is in a known category.""" 396 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-winddirection 397 | valid_values = ['to_lf', 'to_cf', 'to_rf', 'l_to_r', 'from_lf', 'from_cf', 398 | 'from_rf', 'r_to_l'] 399 | assert game['wind_direction'].dropna().isin(valid_values).all() 400 | 401 | 402 | def test_field_condition_values(game): 403 | """Verify field condition is in a known category.""" 404 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-fieldcondition 405 | valid_values = ['soaked', 'wet', 'damp', 'dry'] 406 | assert game['field_condition'].dropna().isin(valid_values).all() 407 | 408 | 409 | def test_precip_type_values(game): 410 | """Verify precipitation type is in a known category.""" 411 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-precipitation 412 | valid_values = ['none', 'drizzle', 'showers', 'rain', 'snow'] 413 | assert game['precip_type'].dropna().isin(valid_values).all() 414 | 415 | 416 | def test_sky_condition_values(game): 417 | """Verify sky condition is in a known category.""" 418 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-sky 419 | valid_values = ['sunny', 'cloudy', 'overcast', 'night', 'dome'] 420 | assert game['sky_condition'].dropna().isin(valid_values).all() 421 | 422 | 423 | def test_game_length_values(game): 424 | """Verify number of outs is consistent with number of innings.""" 425 | outs = game['outs_ct'] 426 | inns = game['inn_ct'] 427 | 428 | # this is defined by the rules of baseball 429 | assert ((5 * inns <= outs) & (outs <= 6 * inns)).all() 430 | 431 | 432 | def test_game_length_minute_values(game): 433 | """Verify game length per out is plausible.""" 434 | outs = game['outs_ct'] 435 | mins = game['minutes_game_ct'] 436 | mins_per_out = mins / outs 437 | 438 | # these bounds should be wide enough to encompass any future game 439 | assert mins_per_out.min() > 1 and mins_per_out.max() < 6 440 | 441 | 442 | def test_retro_lahman_batting_players(batting, lahman_people, lahman_batting): 443 | """Verify all Retrosheet batters are in Lahman batting""" 444 | lahman_batters = pd.merge(lahman_batting['player_id'], lahman_people[['player_id', 'retro_id']]) 445 | r_batters = set(batting['player_id'].unique()) 446 | l_batters = set(lahman_batters['retro_id'].unique()) 447 | assert r_batters == l_batters 448 | 449 | 450 | def test_retro_lahman_fielding_players(fielding, lahman_people, lahman_fielding): 451 | """Verify all Retrosheet fielders are in Lahman fielding""" 452 | lahman_fielders = pd.merge(lahman_fielding['player_id'], lahman_people[['player_id', 'retro_id']]) 453 | r_fielders = set(fielding['player_id'].unique()) 454 | l_fielders = set(lahman_fielders['retro_id'].unique()) 455 | 456 | # There is one Retrosheet fielder not in Lahman fielding 457 | assert len(r_fielders - l_fielders) == 1 458 | assert len(l_fielders - r_fielders) == 0 459 | 460 | missing_fielder = f'{(r_fielders - l_fielders).pop()}' 461 | missing =
fielding.query(f'player_id == "{missing_fielder}"') 462 | 463 | # The missing fielder had zero fielding total chances. 464 | assert missing['tc'].sum() == 0 465 | 466 | # The missing fielder was on the field for no outs. 467 | assert missing['inn_outs'].sum() == 0 468 | 469 | 470 | def test_retro_lahman_pitching_players(pitching, lahman_pitching, lahman_people): 471 | """Verify all Retrosheet pitchers are in Lahman pitchers""" 472 | lahman_pitchers = pd.merge(lahman_pitching['player_id'], lahman_people[['player_id', 'retro_id']]) 473 | r_pitchers = set(pitching['player_id'].unique()) 474 | l_pitchers = set(lahman_pitchers['retro_id'].unique()) 475 | assert r_pitchers == l_pitchers 476 | 477 | 478 | def test_retro_lahman_player_ids(batting, lahman_people): 479 | """Verify the inverse of Lahman player_id to Retrosheet player_id mapping is valid. 480 | 481 | In other words, each Retrosheet player_id is mapped to exactly one Lahman player_id. 482 | 483 | Other tests verify that Retrosheet player_ids and Lahman player_ids are unique. 484 | 485 | Note: every player who was in a game has a Retrosheet batting record even if 486 | they had no plate appearances.""" 487 | retro_players = pd.Series(batting['player_id'].unique(), name='player_id') 488 | 489 | # use an inner join to verify that the mapping is one-to-one and onto 490 | mapping = lahman_people[['player_id', 'retro_id']].merge( 491 | retro_players, how='inner', 492 | left_on=['retro_id'], 493 | right_on=['player_id'], 494 | suffixes=('_lahman', '_retro')) 495 | 496 | assert len(retro_players) == len(mapping) 497 | 498 | 499 | def test_retro_lahman_team_ids(team_game, lahman_teams): 500 | """Verify the inverse of the Lahman to Retrosheet mapping is valid. 501 | A team is identified by (team_id, year). 502 | 503 | The logic is analogous to test_retro_lahman_player_ids() above.""" 504 | 505 | # create a Retrosheet dataframe having just the unique values 506 | retro_team_ids = team_game[['team_id', 'year']].copy() 507 | retro_team_ids = retro_team_ids.drop_duplicates(subset=['team_id', 'year']) 508 | 509 | # use an inner join to verify that the mapping is one-to-one and onto 510 | mapping = lahman_teams.merge(retro_team_ids, how='inner', 511 | left_on=['team_id_retro', 'year'], 512 | right_on=['team_id', 'year']) 513 | 514 | assert len(retro_team_ids) == len(mapping) 515 | 516 | 517 | def test_retro_pitching_batting(pitching, batting): 518 | """Verify Retrosheet batting stats == pitching stats (allowed)""" 519 | exclude = ['game_id', 'team_id', 'player_id', 'g', 'game_start', 'year'] 520 | cols = set(pitching.columns) & set(batting.columns) - set(exclude) 521 | cols = list(cols) 522 | assert len(cols) == 16 523 | 524 | # sum over all pitchers over all years 525 | p = pitching[cols].agg('sum') 526 | 527 | # sum over all batters over all years 528 | b = batting[cols].agg('sum') 529 | 530 | # Retrosheet is completely consistent 531 | assert p.equals(b) 532 | 533 | 534 | def test_lahman_pitching_batting(lahman_pitching, lahman_batting): 535 | """Verify Lahman batting stats == pitching stats (allowed)""" 536 | exclude = ['lg_id', 'player_id', 'stint', 'team_id', 'year', 'g'] 537 | cols = set(lahman_pitching.columns) & set(lahman_batting.columns) 538 | cols -= set(exclude) 539 | assert len(cols) == 10 540 | 541 | # sum over all pitchers over all years 542 | p = lahman_pitching[cols].agg('sum') 543 | 544 | # sum over all batters over all years 545 | b = lahman_batting[cols].agg('sum') 546 | 547 | # the biggest difference is less than 0.01% 548 | assert np.abs(1.0 - p / b).max()
< 0.0001 549 | 550 | 551 | def test_lahman_batting_teams(lahman_batting, lahman_teams): 552 | """Verify Lahman batting aggregated to the team level matches Lahman teams.""" 553 | exclude = ['lg_id', 'team_id', 'year', 'g'] 554 | key = ['team_id', 'year'] 555 | cols = set(lahman_batting.columns) & set(lahman_teams.columns) - set(exclude) 556 | cols = list(cols) 557 | assert len(cols) == 12 558 | 559 | # work-around for Pandas 1.0.1 bugs 560 | # sum does not up-cast for nullable integer types 561 | # select_dtypes does not distinguish between nullable and non-nullable int types 562 | idx = lahman_batting[cols].dtypes.isin([pd.UInt8Dtype(), pd.UInt16Dtype()]) 563 | for col in lahman_batting[cols].columns[idx]: 564 | lahman_batting[col] = lahman_batting[col].astype('Int32') 565 | 566 | idx = lahman_teams[cols].dtypes.isin([pd.UInt8Dtype(), pd.UInt16Dtype()]) 567 | for col in lahman_teams[cols].columns[idx]: 568 | lahman_teams[col] = lahman_teams[col].astype('Int32') 569 | 570 | b = lahman_batting[key + cols].groupby(key).agg('sum').reset_index() 571 | 572 | t = lahman_teams[key + cols].sort_values(key).reset_index(drop=True) 573 | 574 | # ensure the dtypes are the same 575 | for col in t.columns: 576 | if not col == 'team_id' and not col == 'year': 577 | b[col] = b[col].astype('int') 578 | t[col] = t[col].astype('int') 579 | 580 | assert b[cols].equals(t[cols]) 581 | 582 | 583 | def test_lahman_pitching_teams(lahman_pitching, lahman_teams): 584 | """Verify Lahman pitching aggregated to the team level matches Lahman teams.""" 585 | # most of the common columns are for batting, not pitching 586 | # era cannot be summed 587 | # sho for team is counted differently than for pitcher 588 | # er for team is counted differently than for pitcher 589 | exclude = ['lg_id', 'team_id', 'year', 'g', 'era', 590 | 'bb', 'h', 'hbp', 'hr', 'r', 'sf', 'so', 'sho', 'er'] 591 | key = ['team_id', 'year'] 592 | cols = set(lahman_pitching.columns) & set(lahman_teams.columns) - set(exclude) 593 | cols = list(cols) 594 | assert len(cols) == 5 595 | 596 | p = lahman_pitching[key + cols].groupby(key).agg('sum').reset_index() 597 | 598 | t = lahman_teams[key + cols].sort_values(key).reset_index(drop=True) 599 | 600 | # dtypes need to be the same 601 | for col in p.columns: 602 | if not col == 'year' and not col == 'team_id': 603 | p[col] = p[col].astype('int') 604 | t[col] = t[col].astype('int') 605 | 606 | assert np.abs(p[cols] - t[cols]).max().max() == 1 607 | 608 | 609 | def test_lahman_fielding_teams(lahman_fielding, lahman_teams): 610 | """Verify Lahman fielding aggregated to the team level matches Lahman teams.""" 611 | # dp is excluded because in fielding, each fielder involved gets a dp 612 | # whereas in team only one dp is counted 613 | exclude = ['lg_id', 'team_id', 'year', 'g', 'dp', 'player_id'] 614 | key = ['team_id', 'year'] 615 | cols = set(lahman_fielding.columns) & set(lahman_teams.columns) - set(exclude) 616 | cols = list(cols) 617 | assert len(cols) == 1 618 | 619 | f = lahman_fielding[key + cols].groupby(key).agg('sum').reset_index() 620 | 621 | t = lahman_teams[key + cols].sort_values(key).reset_index(drop=True) 622 | 623 | # ensure the dtypes are the same 624 | col = 'e' 625 | f[cols] = f[cols].astype('int') 626 | t[cols] = t[cols].astype('int') 627 | 628 | # When comparing large values, it is best to use their relative differences. 629 | # When comparing small values, it is best to use their absolute differences. 
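# e.g. error totals of 1000 vs 1002 differ by only 0.2% in relative terms, while 8 vs 10 differ by 20%; for small counts an absolute bound (here at most 2) is the fairer comparison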
631 | 
632 | 
633 | def test_event(event, team_game):
634 |     """Verify play-by-play data aggregated per team per game matches team_game data.
635 | 
636 |     About 10 fields were added to the cwevent output by custom parsing of event_tx.
637 |     These 10 fields are included in this test."""
638 | 
639 |     key = ['game_id', 'team_id', 'opponent_team_id']
640 |     compare_cols = set(team_game.columns) & set(event.columns) - set(key)
641 |     compare_cols = list(compare_cols)
642 |     assert len(compare_cols) == 21
643 | 
644 |     event_team_game = event[key + compare_cols].groupby(key).agg('sum')
645 | 
646 |     # e, dp, tp, pb, wp, and bk should be charged to the opponent when
647 |     # aggregating values to compare with team_game
648 |     opp_cols = ['e', 'dp', 'tp', 'pb', 'wp', 'bk']
649 |     tmp = event_team_game.reset_index()
650 |     opp = event_team_game.sort_values(['game_id', 'opponent_team_id']).reset_index()
651 | 
652 |     # swap column values
653 |     tmp[opp_cols] = opp[opp_cols]
654 |     event_team_game = tmp
655 | 
656 |     tg = team_game.set_index(['game_id', 'team_id']).sort_index()
657 |     etg = event_team_game.set_index(['game_id', 'team_id']).sort_index()
658 | 
659 |     diff = tg[compare_cols] - etg[compare_cols]
660 | 
661 |     assert diff.max().max() == 0
662 |     assert diff.min().min() == 0
663 | 
664 | 
665 | def test_event_pkey(event):
666 |     """Verify the Retrosheet event primary key."""
667 |     assert dh.is_unique(event, ['game_id', 'event_id'])
668 | 
669 | 
670 | def test_line_score(team_game):
671 |     """Verify that the line score total equals the run total."""
672 | 
673 |     def line_score_to_runs(row):
674 |         line_score = row['line_tx']
675 | 
676 |         # example: 0102(11)0500
677 |         # capture a multi-digit inning enclosed in parentheses,
678 |         # using a positive lookbehind for '(' and
679 |         # a positive lookahead for ')',
680 |         # OR capture one digit at a time
681 |         runs = 0
682 |         for value in re.findall(r'(?<=\()\d+(?=\))|\d', line_score):
683 |             runs += int(value)
684 | 
685 |         return runs
686 | 
687 |     runs = team_game.apply(line_score_to_runs, axis=1)
688 |     assert (runs == team_game['r']).all()
689 | 
--------------------------------------------------------------------------------
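A worked example of the line-score regex used in test_line_score, on the documented sample '0102(11)0500' (the parenthesized 11-run inning is captured whole, every other digit individually):

import re

line_score = '0102(11)0500'
values = re.findall(r'(?<=\()\d+(?=\))|\d', line_score)
assert values == ['0', '1', '0', '2', '11', '0', '5', '0', '0']
assert sum(int(v) for v in values) == 19
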
/data/lahman/readme2017.txt:
--------------------------------------------------------------------------------
1 | The Lahman Baseball Database
2 | 
3 | 2017 Version
4 | Release Date: March 31, 2018
5 | 
6 | ----------------------------------------------------------------------
7 | 
8 | README CONTENTS
9 | 0.1 Copyright Notice
10 | 0.2 Contact Information
11 | 
12 | 1.0 Release Contents
13 | 1.1 Introduction
14 | 1.2 What's New
15 | 1.3 Acknowledgements
16 | 1.4 Using this Database
17 | 1.5 Revision History
18 | 
19 | 2.0 Data Tables
20 | 
21 | ----------------------------------------------------------------------
22 | 
23 | 0.1 Copyright Notice & Limited Use License
24 | 
25 | This database is copyright 1996-2018 by Sean Lahman.
26 | 
27 | This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License. For details see: http://creativecommons.org/licenses/by-sa/3.0/
28 | 
29 | 
30 | For licensing information or further information, contact Sean Lahman
31 | at: seanlahman@gmail.com
32 | 
33 | ----------------------------------------------------------------------
34 | 
35 | 0.2 Contact Information
36 | 
37 | Web site: http://www.baseball1.com
38 | E-Mail : seanlahman@gmail.com
39 | 
40 | If you're interested in contributing to the maintenance of this
41 | database or making suggestions for improvement, please consider
42 | joining our mailing list at:
43 | 
44 | http://groups.yahoo.com/group/baseball-databank/
45 | 
46 | If you are interested in similar databases for other sports, please
47 | visit the Open Source Sports website at http://OpenSourceSports.com
48 | 
49 | ----------------------------------------------------------------------
50 | 1.0 Release Contents
51 | 
52 | This release of the database can be downloaded in several formats. The
53 | contents of each version are listed below.
54 | 
55 | MS Access Version:
56 | lahman2017.mdb
57 | 2017readme.txt
58 | 
59 | SQL Version:
60 | lahman2017.sql
61 | 2017readme.txt
62 | 
63 | Comma Delimited Version:
64 | 2017readme.txt
65 | AllStarFull.csv
66 | Appearances.csv
67 | AwardsManagers.csv
68 | AwardsPlayers.csv
69 | AwardsShareManagers.csv
70 | AwardsSharePlayers.csv
71 | Batting.csv
72 | BattingPost.csv
73 | CollegePlaying.csv
74 | Fielding.csv
75 | FieldingOF.csv
76 | FieldingPost.csv
77 | FieldingOFsplit
78 | HallOfFame.csv
79 | HomeGames.csv
80 | Managers.csv
81 | ManagersHalf.csv
82 | Parks.csv
83 | People.csv
84 | Pitching.csv
85 | PitchingPost.csv
86 | README.txt
87 | Salaries.csv
88 | Schools.csv
89 | SeriesPost.csv
90 | Teams.csv
91 | TeamsFranchises.csv
92 | TeamsHalf.csv
93 | 
94 | ----------------------------------------------------------------------
95 | 1.1 Introduction
96 | 
97 | This database contains pitching, hitting, and fielding statistics for
98 | Major League Baseball from 1871 through 2017. It includes data from
99 | the two current leagues (American and National), the four other "major"
100 | leagues (American Association, Union Association, Players League, and
101 | Federal League), and the National Association of 1871-1875.
102 | 
103 | This database was created by Sean Lahman, who pioneered the effort to
104 | make baseball statistics freely available to the general public. What
105 | started as a one-man effort in 1994 has grown tremendously, and now a
106 | team of researchers have combined their efforts to make this the
107 | largest and most accurate source for baseball statistics available
108 | anywhere. (See Acknowledgements below for a list of the key
109 | contributors to this project.)
110 | 
111 | None of what we have done would have been possible without the
112 | pioneering work of Hy Turkin, S.C. Thompson, David Neft, and Pete
113 | Palmer (among others). All baseball fans owe a debt of gratitude
114 | to the people who have worked so hard to build the tremendous set
115 | of data that we have today. Our thanks also to the many members of
116 | the Society for American Baseball Research who have helped us over
117 | the years. We strongly urge you to support and join their efforts.
118 | Please visit their website (www.sabr.org).
119 | 
120 | If you have any problems or find any errors, please let us know. Any
121 | feedback is appreciated.
122 | 
123 | ----------------------------------------------------------------------
124 | 1.2 What's New in 2017
125 | 
126 | Player stats have been updated through the 2017 season, and many of the other tables
127 | have been updated based on new research into the historical record.
128 | 
129 | One notable change: The name of the table that contains biographical information
130 | for players has been changed from "Master" to "People" to better reflect its
131 | contents.
132 | 
133 | ----------------------------------------------------------------------
134 | 1.3 Acknowledgements
135 | 
136 | Much of the raw data contained in this database comes from the work of
137 | Pete Palmer, the legendary statistician, who has had a hand in most
138 | of the baseball encyclopedias published since 1974. He is largely
139 | responsible for bringing the batting, pitching, and fielding data out
140 | of the dark ages and into the computer era. Without him, none of this
141 | would be possible. For more on Pete's work, please read his own
142 | account at: http://sabr.org/cmsfiles/PalmerDatabaseHistory.pdf
143 | 
144 | Three people have been key contributors to the work that followed, first
145 | by taking the raw data and creating a relational database, and later
146 | by extending the database to make it more accessible to researchers.
147 | 
148 | Sean Lahman launched the Baseball Archive's website back before
149 | most people had heard of the world wide web. Frustrated by the
150 | lack of sports data available, he led the effort to build a
151 | baseball database that everyone could use. He created the first version
152 | of the database and began to make it available for free download from
153 | his website in 1995.
154 | 
155 | The work of Sean Forman to create and maintain an online encyclopedia
156 | at Baseball-Reference.com was a quantum leap for both fans and researchers.
157 | The website launched in 2000, providing a user-friendly interface to the Lahman
158 | Baseball Database. Forman and Lahman launched the Baseball Databank in 2001,
159 | a group of researchers whose goal was to update and maintain the database
160 | as an open source collection available to all.
161 | 
162 | Ted Turocy has done the lion's share of the work of updating the main
163 | data tables since 2012, automating the work of annual updates and linking
164 | historical data to play-by-play accounts compiled by Retrosheet.
165 | 
166 | A handful of researchers have made substantial contributions to
167 | maintaining this database over the years. Listed alphabetically, they
168 | are: Derek Adair, Mike Crain, Kevin Johnson, Rod Nelson, Tom Tango,
169 | and Paul Wendt. These folks did much of the heavy lifting, and are
170 | largely responsible for the improvements made since 2000.
171 | 
172 | Others who made important contributions include: Dvd Avins,
173 | Clifford Blau, Bill Burgess, Clem Comly, Jeff Burk, Randy Cox,
174 | Mitch Dickerman, Paul DuBois, Mike Emeigh, F.X. Flinn, Bill Hickman,
175 | Jerry Hoffman, Dan Holmes, Micke Hovmoller, Peter Kreutzer,
176 | Danile Levine, Bruce Macleod, Ken Matinale, Michael Mavrogiannis,
177 | Cliff Otto, Alberto Perdomo, Dave Quinn, John Rickert, Tom Ruane,
178 | Theron Skyles, Hans Van Slooten, Michael Westbay, and Rob Wood.
179 | 
180 | Many other people have made significant contributions to the database
181 | over the years. Tom Ruane's contribution to the overall
182 | quality of the underlying data has been tremendous. His work at
183 | retrosheet.org integrates the yearly data with the day-by-day data,
184 | creating a reference source of startling depth.
185 | 
186 | Sean Holtz helped with a major overhaul and redesign before the
187 | 2000 season. Keith Woolner was instrumental in helping turn
188 | a huge collection of stats into a relational database in the mid-1990s.
189 | Clifford Otto & Ted Nye also helped provide guidance to the early
190 | versions. Lee Sinnis, John Northey & Erik Greenwood helped supply key
191 | pieces of data. Many others have written in with corrections and
192 | suggestions that made each subsequent version even better than what
193 | preceded it.
194 | 
195 | The work of the SABR Baseball Records Committee, led by Lyle Spatz,
196 | has been invaluable. So has the work of Bill Carle and the SABR
197 | Biographical Committee. David Vincent, keeper of the Home Run Log and
198 | other bits of hard-to-find info, has always been helpful. The recent
199 | addition of colleges to player bios is the result of much research by
200 | members of SABR's Collegiate Baseball committee.
201 | 
202 | Salary data was first supplied by Doug Pappas, who passed away during
203 | the summer of 2004. He was the leading authority on many subjects,
204 | most significantly the financial history of Major League Baseball.
205 | We are grateful that he allowed us to include some of the data he
206 | compiled. His work has been continued by the SABR Business of
207 | Baseball committee.
208 | 
209 | Thanks are also due to the staff at the National Baseball Library
210 | in Cooperstown who have been so helpful over the years, including
211 | Tim Wiles, Jim Gates, Bruce Markusen, and the rest of the staff.
212 | 
213 | A special debt of gratitude is owed to Dave Smith and the folks at
214 | Retrosheet. There is no other group working so hard to compile and
215 | share baseball data. Their website (www.retrosheet.org) will give
216 | you a taste of the wealth of information Dave and the gang have collected.
217 | 
218 | Thanks to all contributors great and small. What you have created is
219 | a wonderful thing.
220 | 
221 | ----------------------------------------------------------------------
222 | 1.4 Using this Database
223 | 
224 | This version of the database is available in Microsoft Access
225 | format, as SQL files, or in a generic, comma-delimited format. Because this is a
226 | relational database, you will not be able to use the data in a
227 | flat-database application.
228 | 
229 | Please note that this is not a stand-alone application. It requires
230 | a database application or some other application designed specifically
231 | to interact with the database.
232 | 
233 | If you are unable to import the data directly, you should download the
234 | database in the delimited text format. Then use the documentation
235 | in section 2.0 of this document to import the data into
236 | your database application.
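
As a sketch (not part of the official release), the comma-delimited files can be imported with a few lines of Python, assuming the CSV files from section 1.0 sit in the current directory and SQLite serves as the database application:

import sqlite3
import pandas as pd

conn = sqlite3.connect('lahman2017.sqlite')
for name in ['People', 'Batting', 'Pitching', 'Fielding', 'Teams']:
    # one database table per CSV file, named after the file
    pd.read_csv(f'{name}.csv').to_sql(name, conn, if_exists='replace', index=False)
conn.close()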
237 | 
238 | ----------------------------------------------------------------------
239 | 1.5 Revision History
240 | 
241 | Version   Date            Comments
242 | 1.0       December 1992   Database ported from dBase
243 | 1.1       May 1993        Becomes fully relational
244 | 1.2       July 1993       Corrections made to full database
245 | 1.21      December 1993   1993 statistics added
246 | 1.3       July 1994       Pre-1900 data added
247 | 1.31      February 1995   1994 statistics added
248 | 1.32      August 1995     Statistics added for other leagues
249 | 1.4       September 1995  Fielding data added
250 | 1.41      November 1995   1995 statistics added
251 | 1.42      March 1996      HOF/All-Star tables added
252 | 1.5-MS    October 1996    1st public release - MS Access format
253 | 1.5-GV    October 1996    Released generic comma-delimited files
254 | 1.6-MS    December 1996   Updated with 1996 stats, some corrections
255 | 1.61-MS   December 1996   Corrected error in MASTER table
256 | 1.62      February 1997   Corrected 1914-1915 batters data and updated
257 | 2.0       February 1998   Major revisions - added teams & managers
258 | 2.1       October 1998    Interim release w/1998 stats
259 | 2.2       January 1999    New release w/post-season stats & awards added
260 | 3.0       November 1999   Major release - fixed errors and 1999 statistics added
261 | 4.0       May 2001        Major release - proofed & redesigned tables
262 | 4.5       March 2002      Updated with 2001 stats and added new biographical data
263 | 5.0       December 2002   Major revision - new tables and data
264 | 5.1       January 2004    Updated with 2003 data, and new pitching categories
265 | 5.2       November 2004   Updated with 2004 season statistics
266 | 5.3       December 2005   Updated with 2005 season statistics
267 | 5.4       December 2006   Updated with 2006 season statistics
268 | 5.5       December 2007   Updated with 2007 season statistics
269 | 5.6       December 2008   Updated with 2008 season statistics
270 | 5.7       December 2009   Updated for 2009 and added several tables
271 | 5.8       December 2010   Updated with 2010 season statistics
272 | 5.9       December 2011   Updated for 2011 and removed obsolete tables
273 | 2012      December 2012   Updated with 2012 season statistics
274 | 2013      December 2013   Updated with 2013 season statistics
275 | 2014      December 2014   Updated with 2014 season statistics
276 | 2015      December 2015   Updated with 2015 season statistics
277 | 2016      February 2017   Updated for 2016 and added several tables
278 | 2017      March 2018      Updated for 2017
279 | 
280 | ------------------------------------------------------------------------------
281 | 2.0 Data Tables
282 | 
283 | The design follows these general principles. Each player is assigned a
284 | unique number (playerID). All of the information relating to that player
285 | is tagged with his playerID. The playerIDs are linked to names and
286 | birthdates in the MASTER table.
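
A sketch of that linking principle using pandas (assumes the comma-delimited release; the biographical table is People, as noted in section 1.2):

import pandas as pd

people = pd.read_csv('People.csv')
batting = pd.read_csv('Batting.csv')

# every Batting row links back to one People row via playerID
named = batting.merge(people[['playerID', 'nameFirst', 'nameLast']],
                      on='playerID', how='left')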
287 | 
288 | The database comprises the following main tables:
289 | 
290 | People - Player names, DOB, and biographical info
291 | Batting - batting statistics
292 | Pitching - pitching statistics
293 | Fielding - fielding statistics
294 | 
295 | It is supplemented by these tables:
296 | 
297 | AllStarFull - All-Star appearances
298 | HallofFame - Hall of Fame voting data
299 | Managers - managerial statistics
300 | Teams - yearly stats and standings
301 | BattingPost - post-season batting statistics
302 | PitchingPost - post-season pitching statistics
303 | TeamFranchises - franchise information
304 | FieldingOF - outfield position data
305 | FieldingPost - post-season fielding data
306 | FieldingOFsplit - LF/CF/RF splits
307 | ManagersHalf - split season data for managers
308 | TeamsHalf - split season data for teams
309 | Salaries - player salary data
310 | SeriesPost - post-season series information
311 | AwardsManagers - awards won by managers
312 | AwardsPlayers - awards won by players
313 | AwardsShareManagers - award voting for manager awards
314 | AwardsSharePlayers - award voting for player awards
315 | Appearances - details on the positions a player appeared at
316 | Schools - list of colleges that players attended
317 | CollegePlaying - list of players and the colleges they attended
318 | Parks - list of major league ballparks
319 | HomeGames - Number of home games played by each team in each ballpark
320 | 
321 | 
322 | 
323 | --------------------------------------------------------------------------
324 | 2.1 People table
325 | 
326 | 
327 | playerID       A unique code assigned to each player. The playerID links
328 |                the data in this file with records in the other files.
329 | birthYear      Year player was born
330 | birthMonth     Month player was born
331 | birthDay       Day player was born
332 | birthCountry   Country where player was born
333 | birthState     State where player was born
334 | birthCity      City where player was born
335 | deathYear      Year player died
336 | deathMonth     Month player died
337 | deathDay       Day player died
338 | deathCountry   Country where player died
339 | deathState     State where player died
340 | deathCity      City where player died
341 | nameFirst      Player's first name
342 | nameLast       Player's last name
343 | nameGiven      Player's given name (typically first and middle)
344 | weight         Player's weight in pounds
345 | height         Player's height in inches
346 | bats           Player's batting hand (left, right, or both)
347 | throws         Player's throwing hand (left or right)
348 | debut          Date that player made first major league appearance
349 | finalGame      Date that player made final major league appearance (blank if still active)
350 | retroID        ID used by Retrosheet
351 | bbrefID        ID used by Baseball Reference website
352 | 
353 | 
354 | ------------------------------------------------------------------------------
355 | 2.2 Batting Table
356 | playerID   Player ID code
357 | yearID     Year
358 | stint      player's stint (order of appearances within a season)
359 | teamID     Team
360 | lgID       League
361 | G          Games
362 | AB         At Bats
363 | R          Runs
364 | H          Hits
365 | 2B         Doubles
366 | 3B         Triples
367 | HR         Homeruns
368 | RBI        Runs Batted In
369 | SB         Stolen Bases
370 | CS         Caught Stealing
371 | BB         Base on Balls
372 | SO         Strikeouts
373 | IBB        Intentional walks
374 | HBP        Hit by pitch
375 | SH         Sacrifice hits
376 | SF         Sacrifice flies
377 | GIDP       Grounded into double plays
378 | 
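From the Batting table columns above, singles and the common rate stats can be derived; a sketch (assumes the comma-delimited Batting.csv):

import pandas as pd

batting = pd.read_csv('Batting.csv')

# singles are not stored; derive them from the other hit columns
singles = batting['H'] - batting['2B'] - batting['3B'] - batting['HR']
total_bases = singles + 2 * batting['2B'] + 3 * batting['3B'] + 4 * batting['HR']

ba = batting['H'] / batting['AB']    # batting average (NaN/inf when AB == 0)
slg = total_bases / batting['AB']    # slugging percentage
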
379 | ------------------------------------------------------------------------------
380 | 2.3 Pitching table
381 | 
382 | playerID   Player ID code
383 | yearID     Year
384 | stint      player's stint (order of appearances within a season)
385 | teamID     Team
386 | lgID       League
387 | W          Wins
388 | L          Losses
389 | G          Games
390 | GS         Games Started
391 | CG         Complete Games
392 | SHO        Shutouts
393 | SV         Saves
394 | IPOuts     Outs Pitched (innings pitched x 3)
395 | H          Hits
396 | ER         Earned Runs
397 | HR         Homeruns
398 | BB         Walks
399 | SO         Strikeouts
400 | BAOpp      Opponent's Batting Average
401 | ERA        Earned Run Average
402 | IBB        Intentional Walks
403 | WP         Wild Pitches
404 | HBP        Batters Hit By Pitch
405 | BK         Balks
406 | BFP        Batters faced by Pitcher
407 | GF         Games Finished
408 | R          Runs Allowed
409 | SH         Sacrifices by opposing batters
410 | SF         Sacrifice flies by opposing batters
411 | GIDP       Grounded into double plays by opposing batter
412 | ------------------------------------------------------------------------------
413 | 2.4 Fielding Table
414 | 
415 | playerID   Player ID code
416 | yearID     Year
417 | stint      player's stint (order of appearances within a season)
418 | teamID     Team
419 | lgID       League
420 | Pos        Position
421 | G          Games
422 | GS         Games Started
423 | InnOuts    Time played in the field expressed as outs
424 | PO         Putouts
425 | A          Assists
426 | E          Errors
427 | DP         Double Plays
428 | PB         Passed Balls (by catchers)
429 | WP         Wild Pitches (by catchers)
430 | SB         Opponent Stolen Bases (by catchers)
431 | CS         Opponents Caught Stealing (by catchers)
432 | ZR         Zone Rating
433 | 
434 | ------------------------------------------------------------------------------
435 | 2.5 AllstarFull table
436 | 
437 | playerID      Player ID code
438 | yearID        Year
439 | gameNum       Game number (zero if only one All-Star game played that season)
440 | gameID        Retrosheet ID for the game
441 | teamID        Team
442 | lgID          League
443 | GP            1 if played in the game
444 | startingPos   If player was a game starter, the position played
445 | ------------------------------------------------------------------------------
446 | 2.6 HallOfFame table
447 | 
448 | playerID      Player ID code
449 | yearID        Year of ballot
450 | votedBy       Method by which player was voted upon
451 | ballots       Total ballots cast in that year
452 | needed        Number of votes needed for selection in that year
453 | votes         Total votes received
454 | inducted      Whether player was inducted by that vote or not (Y or N)
455 | category      Category in which candidate was honored
456 | needed_note   Explanation of qualifiers for special elections
457 | ------------------------------------------------------------------------------
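The Pitching tables store outs pitched (IPOuts = innings pitched x 3) rather than innings. A sketch of recovering innings and recomputing ERA from ER (assumes the comma-delimited Pitching.csv):

import pandas as pd

pitching = pd.read_csv('Pitching.csv')

innings = pitching['IPOuts'] / 3       # outs pitched back to innings
era = 9 * pitching['ER'] / innings     # earned runs per nine innings
# era should agree with the stored ERA column up to rounding
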
458 | 2.7 Managers table
459 | 
460 | playerID   Player ID Number
461 | yearID     Year
462 | teamID     Team
463 | lgID       League
464 | inseason   Managerial order. Zero if the individual managed the team
465 |            the entire year. Otherwise denotes where the manager appeared
466 |            in the managerial order (1 for first manager, 2 for second, etc.)
467 | G          Games managed
468 | W          Wins
469 | L          Losses
470 | rank       Team's final position in standings that year
471 | plyrMgr    Player Manager (denoted by 'Y')
472 | 
473 | ------------------------------------------------------------------------------
474 | 2.8 Teams table
475 | 
476 | yearID           Year
477 | lgID             League
478 | teamID           Team
479 | franchID         Franchise (links to TeamsFranchise table)
480 | divID            Team's division
481 | Rank             Position in final standings
482 | G                Games played
483 | GHome            Games played at home
484 | W                Wins
485 | L                Losses
486 | DivWin           Division Winner (Y or N)
487 | WCWin            Wild Card Winner (Y or N)
488 | LgWin            League Champion (Y or N)
489 | WSWin            World Series Winner (Y or N)
490 | R                Runs scored
491 | AB               At bats
492 | H                Hits by batters
493 | 2B               Doubles
494 | 3B               Triples
495 | HR               Homeruns by batters
496 | BB               Walks by batters
497 | SO               Strikeouts by batters
498 | SB               Stolen bases
499 | CS               Caught stealing
500 | HBP              Batters hit by pitch
501 | SF               Sacrifice flies
502 | RA               Opponents runs scored
503 | ER               Earned runs allowed
504 | ERA              Earned run average
505 | CG               Complete games
506 | SHO              Shutouts
507 | SV               Saves
508 | IPOuts           Outs Pitched (innings pitched x 3)
509 | HA               Hits allowed
510 | HRA              Homeruns allowed
511 | BBA              Walks allowed
512 | SOA              Strikeouts by pitchers
513 | E                Errors
514 | DP               Double Plays
515 | FP               Fielding percentage
516 | name             Team's full name
517 | park             Name of team's home ballpark
518 | attendance      Home attendance total
519 | BPF              Three-year park factor for batters
520 | PPF              Three-year park factor for pitchers
521 | teamIDBR         Team ID used by Baseball Reference website
522 | teamIDlahman45   Team ID used in Lahman database version 4.5
523 | teamIDretro      Team ID used by Retrosheet
524 | 
525 | ------------------------------------------------------------------------------
526 | 2.9 BattingPost table
527 | 
528 | yearID     Year
529 | round      Level of playoffs
530 | playerID   Player ID code
531 | teamID     Team
532 | lgID       League
533 | G          Games
534 | AB         At Bats
535 | R          Runs
536 | H          Hits
537 | 2B         Doubles
538 | 3B         Triples
539 | HR         Homeruns
540 | RBI        Runs Batted In
541 | SB         Stolen Bases
542 | CS         Caught stealing
543 | BB         Base on Balls
544 | SO         Strikeouts
545 | IBB        Intentional walks
546 | HBP        Hit by pitch
547 | SH         Sacrifices
548 | SF         Sacrifice flies
549 | GIDP       Grounded into double plays
550 | 
551 | ------------------------------------------------------------------------------
552 | 2.10 PitchingPost table
553 | 
554 | playerID   Player ID code
555 | yearID     Year
556 | round      Level of playoffs
557 | teamID     Team
558 | lgID       League
559 | W          Wins
560 | L          Losses
561 | G          Games
562 | GS         Games Started
563 | CG         Complete Games
564 | SHO        Shutouts
565 | SV         Saves
566 | IPOuts     Outs Pitched (innings pitched x 3)
567 | H          Hits
568 | ER         Earned Runs
569 | HR         Homeruns
570 | BB         Walks
571 | SO         Strikeouts
572 | BAOpp      Opponents' batting average
573 | ERA        Earned Run Average
574 | IBB        Intentional Walks
575 | WP         Wild Pitches
576 | HBP        Batters Hit By Pitch
577 | BK         Balks
578 | BFP        Batters faced by Pitcher
579 | GF         Games Finished
580 | R          Runs Allowed
581 | SH         Sacrifice Hits allowed
582 | SF         Sacrifice Flies allowed
583 | GIDP       Grounded into Double Plays
584 | 
585 | ------------------------------------------------------------------------------
586 | 2.11 TeamFranchises table
587 | 
588 | franchID     Franchise ID
589 | franchName   Franchise name
590 | active       Whether team is currently active (Y or N)
591 | NAassoc      ID of National Association team franchise played as
592 | 
593 | ------------------------------------------------------------------------------
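The franchID column links each Teams row to TeamsFranchises; a sketch of that join (assumes the comma-delimited files):

import pandas as pd

teams = pd.read_csv('Teams.csv')
franchises = pd.read_csv('TeamsFranchises.csv')

# attach the franchise name to each season's team record
teams = teams.merge(franchises[['franchID', 'franchName']],
                    on='franchID', how='left')
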
594 | 2.12 FieldingOF table
595 | 
596 | playerID   Player ID code
597 | yearID     Year
598 | stint      player's stint (order of appearances within a season)
599 | Glf        Games played in left field
600 | Gcf        Games played in center field
601 | Grf        Games played in right field
602 | 
603 | ------------------------------------------------------------------------------
604 | 2.13 ManagersHalf table
605 | 
606 | playerID   Manager ID code
607 | yearID     Year
608 | teamID     Team
609 | lgID       League
610 | inseason   Managerial order. One if the individual managed the team
611 |            the entire year. Otherwise denotes where the manager appeared
612 |            in the managerial order (1 for first manager, 2 for second, etc.)
613 | half       First or second half of season
614 | G          Games managed
615 | W          Wins
616 | L          Losses
617 | rank       Team's position in standings for the half
618 | 
619 | ------------------------------------------------------------------------------
620 | 2.14 TeamsHalf table
621 | 
622 | yearID     Year
623 | lgID       League
624 | teamID     Team
625 | half       First or second half of season
626 | divID      Division
627 | DivWin     Won Division (Y or N)
628 | rank       Team's position in standings for the half
629 | G          Games played
630 | W          Wins
631 | L          Losses
632 | 
633 | ------------------------------------------------------------------------------
634 | 2.15 Salaries table
635 | 
636 | yearID     Year
637 | teamID     Team
638 | lgID       League
639 | playerID   Player ID code
640 | salary     Salary
641 | 
642 | ------------------------------------------------------------------------------
643 | 2.16 SeriesPost table
644 | 
645 | yearID         Year
646 | round          Level of playoffs
647 | teamIDwinner   Team ID of the team that won the series
648 | lgIDwinner     League ID of the team that won the series
649 | teamIDloser    Team ID of the team that lost the series
650 | lgIDloser      League ID of the team that lost the series
651 | wins           Wins by team that won the series
652 | losses         Losses by team that won the series
653 | ties           Tie games
654 | ------------------------------------------------------------------------------
655 | 2.17 AwardsManagers table
656 | 
657 | playerID   Manager ID code
658 | awardID    Name of award won
659 | yearID     Year
660 | lgID       League
661 | tie        Award was a tie (Y or N)
662 | notes      Notes about the award
663 | 
664 | ------------------------------------------------------------------------------
665 | 2.18 AwardsPlayers table
666 | 
667 | playerID   Player ID code
668 | awardID    Name of award won
669 | yearID     Year
670 | lgID       League
671 | tie        Award was a tie (Y or N)
672 | notes      Notes about the award
673 | 
674 | ------------------------------------------------------------------------------
675 | 2.19 AwardsShareManagers table
676 | 
677 | awardID      name of award votes were received for
678 | yearID       Year
679 | lgID         League
680 | playerID     Manager ID code
681 | pointsWon    Number of points received
682 | pointsMax    Maximum number of points possible
683 | votesFirst   Number of first place votes
684 | 
685 | ------------------------------------------------------------------------------
686 | 2.20 AwardsSharePlayers table
687 | 
688 | awardID      name of award votes were received for
689 | yearID       Year
690 | lgID         League
691 | playerID     Player ID code
692 | pointsWon    Number of points received
693 | pointsMax    Maximum number of points possible
694 | votesFirst   Number of first place votes
695 | 
696 | ------------------------------------------------------------------------------
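In both award-share tables, pointsWon and pointsMax give a natural vote-share ratio; a sketch (assumes the comma-delimited AwardsSharePlayers.csv):

import pandas as pd

shares = pd.read_csv('AwardsSharePlayers.csv')

# fraction of the maximum possible points each player received
shares['share'] = shares['pointsWon'] / shares['pointsMax']
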
697 | 2.21 FieldingPost table
698 | 
699 | playerID   Player ID code
700 | yearID     Year
701 | teamID     Team
702 | lgID       League
703 | round      Level of playoffs
704 | Pos        Position
705 | G          Games
706 | GS         Games Started
707 | InnOuts    Time played in the field expressed as outs
708 | PO         Putouts
709 | A          Assists
710 | E          Errors
711 | DP         Double Plays
712 | TP         Triple Plays
713 | PB         Passed Balls
714 | SB         Stolen Bases allowed (by catcher)
715 | CS         Caught Stealing (by catcher)
716 | 
717 | ------------------------------------------------------------------------------
718 | 2.22 Appearances table
719 | 
720 | yearID      Year
721 | teamID      Team
722 | lgID        League
723 | playerID    Player ID code
724 | G_all       Total games played
725 | GS          Games started
726 | G_batting   Games in which player batted
727 | G_defense   Games in which player appeared on defense
728 | G_p         Games as pitcher
729 | G_c         Games as catcher
730 | G_1b        Games as first baseman
731 | G_2b        Games as second baseman
732 | G_3b        Games as third baseman
733 | G_ss        Games as shortstop
734 | G_lf        Games as left fielder
735 | G_cf        Games as center fielder
736 | G_rf        Games as right fielder
737 | G_of        Games as outfielder
738 | G_dh        Games as designated hitter
739 | G_ph        Games as pinch hitter
740 | G_pr        Games as pinch runner
741 | 
742 | 
743 | ------------------------------------------------------------------------------
744 | 2.23 Schools table
745 | schoolID      school ID code
746 | schoolName    school name
747 | schoolCity    city where school is located
748 | schoolState   state where school's city is located
749 | schoolNick    nickname for school's baseball team
750 | 
751 | 
752 | ------------------------------------------------------------------------------
753 | 2.24 CollegePlaying table
754 | playerID   Player ID code
755 | schoolID   school ID code
756 | year       year
757 | 
758 | 
759 | 
760 | ------------------------------------------------------------------------------
761 | 2.25 FieldingOFsplit table
762 | playerID   Player ID code
763 | yearID     Year
764 | stint      player's stint (order of appearances within a season)
765 | teamID     Team
766 | lgID       League
767 | Pos        Position
768 | G          Games
769 | GS         Games Started
770 | InnOuts    Time played in the field expressed as outs
771 | PO         Putouts
772 | A          Assists
773 | E          Errors
774 | DP         Double Plays
775 | 
776 | 
777 | ------------------------------------------------------------------------------
778 | 2.26 Parks table
779 | park.key     ballpark ID code
780 | park.name    name of ballpark
781 | park.alias   alternate names of ballpark
782 | city         city
783 | state        state
784 | country      country
785 | 
786 | ------------------------------------------------------------------------------
787 | 2.27 HomeGames table
788 | year.key     year
789 | league.key   league
790 | team.key     team ID
791 | park.key     ballpark ID
792 | span.first   date of first game played
793 | span.last    date of last game played
794 | games        total number of games
795 | openings     total number of dates played
796 | attendance   total attendance
797 | 
798 | 
799 | 
800 | 
801 | 
--------------------------------------------------------------------------------
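The park.key column links HomeGames to Parks; a sketch joining the two and computing average attendance per date played (assumes the comma-delimited files):

import pandas as pd

home_games = pd.read_csv('HomeGames.csv')
parks = pd.read_csv('Parks.csv')

# attach the ballpark name, then compute attendance per opening
hg = home_games.merge(parks[['park.key', 'park.name']], on='park.key', how='left')
hg['avg_attendance'] = hg['attendance'] / hg['openings']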