├── download_scripts ├── __init__.py ├── tests │ ├── __init__.py │ ├── test_func.py │ └── test_data.py ├── pytest.ini ├── run_all_scripts.py ├── retrosheet_download.py ├── conftest.py ├── lahman_download.py ├── retrosheet_datadictionary.py ├── README.md ├── retrosheet_parse.py ├── postgres_load_data.py ├── lahman_wrangle.py ├── retrosheet_collect.py ├── data_helper.py └── retrosheet_wrangle.py ├── data ├── retrosheet │ ├── nb_data │ │ ├── pf_types.csv │ │ ├── fangraphs_types.csv │ │ ├── pf.csv │ │ └── fangraphs.csv │ ├── event_types.csv │ ├── player_game_types.csv │ ├── game_types.csv │ ├── cwdaily_datadictionary.txt │ └── cwgame_datadictionary.txt └── lahman │ └── readme2017.txt ├── .gitignore ├── LICENSE ├── MLB_Data_Details.md ├── baseball_jupyter_nb └── README.md ├── RetrosheetParsers.md ├── MLB_Data_Overview.md └── README.md /download_scripts/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /download_scripts/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /download_scripts/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | markers = 3 | slow: marks tests as slow 4 | -------------------------------------------------------------------------------- /data/retrosheet/nb_data/pf_types.csv: -------------------------------------------------------------------------------- 1 | index,dtypes 2 | team_id,object 3 | year,int64 4 | pf,float64 5 | pf_half,float64 6 | name,object 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | data/lahman/raw 3 | data/lahman/wrangled 4 | data/retrosheet/raw 5 | data/retrosheet/parsed 6 | data/retrosheet/collected 7 | data/retrosheet/wrangled 8 | *.log 9 | test_data 10 | tmp* 11 | __pycache__ 12 | *checkpoint.ipynb 13 | -------------------------------------------------------------------------------- /data/retrosheet/nb_data/fangraphs_types.csv: -------------------------------------------------------------------------------- 1 | index,dtypes 2 | Season,int64 3 | Team,object 4 | Basic (5yr),int64 5 | 3yr,int64 6 | 1yr,int64 7 | 1B,int64 8 | 2B,int64 9 | 3B,int64 10 | HR,int64 11 | SO,int64 12 | BB,int64 13 | GB,int64 14 | FB,int64 15 | LD,int64 16 | IFFB,int64 17 | FIP,int64 18 | -------------------------------------------------------------------------------- /data/retrosheet/event_types.csv: -------------------------------------------------------------------------------- 1 | index,dtypes 2 | game_id,object 3 | inn_ct,uint8 4 | home_half,uint8 5 | away_score_ct,uint8 6 | home_score_ct,uint8 7 | bat_id,object 8 | pit_id,object 9 | event_tx,object 10 | h_cd,uint8 11 | outs,uint8 12 | e,uint8 13 | event_id,uint8 14 | team_id,object 15 | opponent_team_id,object 16 | inn_runs_ct,uint8 17 | start_bases_cd,uint8 18 | end_bases_cd,uint8 19 | r,uint8 20 | fate_runs_ct,uint8 21 | ab,bool 22 | sh,bool 23 | sf,bool 24 | dp,bool 25 | tp,bool 26 | wp,bool 27 | pb,bool 28 | inn_end,bool 29 | pa,bool 30 | bat_safe_err,bool 31 | so,bool 32 | sb,uint8 33 | cs,uint8 34 | bk,bool 35 | ibb,bool 36 | bb,bool 37 | hbp,bool 38 | xi,bool 39 | single,bool 40 | double,bool 41 | triple,bool 42 | hr,bool 43 | h,bool 44 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019,2020 by Stephen Diehl 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /MLB_Data_Details.md: -------------------------------------------------------------------------------- 1 | ## MLB Data Details 2 | 3 | ### Lahman Data Source 4 | 5 | The most recent data will be downloaded from: https://github.com/chadwickbureau/baseballdatabank/archive/master.zip 6 | 7 | **Lahman Data License** from https://github.com/chadwickbureau/baseballdatabank readme.txt 8 | 9 | ``` 10 | This work is licensed under a Creative Commons Attribution-ShareAlike 11 | 3.0 Unported License. For details see: 12 | http://creativecommons.org/licenses/by-sa/3.0/ 13 | ``` 14 | 15 | #### Lahman Data Dictionary 16 | 17 | The most recent data dictionary is: http://www.seanlahman.com/files/database/readme2017.txt It is applicable to the 2019 Lahman data. 18 | 19 | This file is also copied to this repo at: `data/lahman/readme2017.txt` 20 | 21 | ### Retrosheet Data Source 22 | 23 | The play-by-play data will be downloaded from: https://github.com/chadwickbureau/retrosheet/archive/master.zip 24 | 25 | The retrosheet_download script will put these in: `../data/retrosheet/raw` 26 | 27 | **Retrosheet Data License** from https://www.retrosheet.org/notice.txt 28 | 29 | ``` 30 | The information used here was obtained free of 31 | charge from and is copyrighted by Retrosheet. Interested 32 | parties may contact Retrosheet at "www.retrosheet.org". 
33 | ``` 34 | -------------------------------------------------------------------------------- /download_scripts/run_all_scripts.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Run all scripts""" 4 | 5 | __author__ = 'Stephen Diehl' 6 | 7 | import argparse 8 | import sys 9 | import subprocess 10 | 11 | 12 | def get_parser(): 13 | """Args Description""" 14 | 15 | # current_year = datetime.datetime.today().year 16 | parser = argparse.ArgumentParser( 17 | description=__doc__, 18 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 19 | 20 | parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data') 21 | parser.add_argument("--start-year", type=int, help="start year", default='1955') 22 | parser.add_argument("--end-year", type=int, help="end year", default='2019') 23 | 24 | return parser 25 | 26 | 27 | def run_cmd(cmd): 28 | proc = subprocess.Popen(cmd, stdout=subprocess.PIPE) 29 | for line in proc.stdout: 30 | sys.stdout.buffer.write(line) 31 | sys.stdout.buffer.flush() 32 | 33 | 34 | def main(): 35 | parser = get_parser() 36 | args = parser.parse_args() 37 | data_dir = f'--data-dir={args.data_dir}' 38 | start_year = f'--start-year={args.start_year}' 39 | end_year = f'--end-year={args.end_year}' 40 | 41 | print('Running lahman_download:') 42 | cmd = ['./lahman_download.py', '-v', '--log=INFO', data_dir] 43 | run_cmd(cmd) 44 | 45 | print('Running lahman_wrangle:') 46 | cmd = ['./lahman_wrangle.py', '-v', '--log=INFO', data_dir] 47 | run_cmd(cmd) 48 | 49 | print('Running retrosheet_download:') 50 | cmd = ['./retrosheet_download.py', '-v', '--log=INFO', data_dir] 51 | run_cmd(cmd) 52 | 53 | print('Running retrosheet_parse:') 54 | cmd = ['./retrosheet_parse.py', '-v', '--log=INFO', '--run-cwevent', data_dir, start_year, end_year] 55 | run_cmd(cmd) 56 | 57 | print('Running retrosheet_collect:') 58 | cmd = ['./retrosheet_collect.py', '-v', '--log=INFO', '--use-datatypes', data_dir] 59 | run_cmd(cmd) 60 | 61 | print('Running retrosheet_wrangle:') 62 | cmd = ['./retrosheet_wrangle.py', '-v', '--log=INFO', data_dir] 63 | run_cmd(cmd) 64 | 65 | print('Running pytest:') 66 | cmd = ['pytest', '-v', data_dir] 67 | run_cmd(cmd) 68 | print('All scripts have run.') 69 | 70 | 71 | if __name__ == '__main__': 72 | main() 73 | -------------------------------------------------------------------------------- /download_scripts/tests/test_func.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pandas as pd 4 | 5 | __author__ = 'Stephen Diehl' 6 | 7 | from .. 
import data_helper as dh 8 | 9 | 10 | def test_python_version(): 11 | assert sys.version_info.major == 3 12 | assert sys.version_info.minor >= 7 13 | 14 | 15 | def test_data_dir(data_dir): 16 | # if this does not pass either data_dir was passed incorrectly or 17 | # pytest was not run from the download_scripts directory 18 | assert data_dir.is_dir() 19 | 20 | 21 | def test_optimize_df(): 22 | # 16 columns but only 3 data types chosen by Pandas 0.25.x 23 | df = pd.DataFrame(dh.get_dtype_range()) 24 | assert len(df.columns) == 16 25 | assert df.dtypes.nunique() == 3 26 | 27 | # optimize the data types (modifies df inplace) 28 | dh.optimize_df_dtypes(df) 29 | assert len(df.columns) == 16 30 | assert df.dtypes.nunique() == 16 31 | 32 | # Pandas will silently convert to float if int value is too large 33 | # Verify that no implicit conversion took place 34 | assert (df.dtypes.values == df.columns.values).all() 35 | 36 | 37 | def test_rw_with_types(data_dir): 38 | dtype_range = dh.get_dtype_range() 39 | df = pd.DataFrame(dtype_range) 40 | dtypes_orig = df.dtypes 41 | 42 | dh.optimize_df_dtypes(df) 43 | dh.to_csv_with_types(df, data_dir / 'tmp.csv.gz') 44 | df = dh.from_csv_with_types(data_dir / 'tmp.csv.gz') 45 | 46 | assert (df.dtypes == list(dtype_range.keys())).all() 47 | assert not (df.dtypes == dtypes_orig).all() 48 | 49 | assert (data_dir / 'tmp.csv.gz').is_file() 50 | assert (data_dir / 'tmp_types.csv').is_file() 51 | os.remove(data_dir / 'tmp.csv.gz') 52 | os.remove(data_dir / 'tmp_types.csv') 53 | 54 | 55 | def test_sum_stats_for_dups(): 56 | data = {'pkey1': [1, 2, 3, 3, 4, 5, 5, 5], 57 | 'pkey2': [2, 3, 4, 4, 5, 6, 6, 6], 58 | 'stat1': [1, 2, 3, 4, 5, 6, 7, 8], 59 | 'stat2': [1, 1, 1, 1, 1, 1, 1, 1], 60 | 'misc': ['a', 'b', 'c1', 'c2', 'd', 'e1', 'e2', 'e3']} 61 | df = pd.DataFrame(data) 62 | 63 | df = dh.sum_stats_for_dups(df, ['pkey1', 'pkey2'], ['stat1', 'stat2']) 64 | 65 | chk = {'pkey1': [1, 2, 3, 4, 5], 66 | 'pkey2': [2, 3, 4, 5, 6], 67 | 'stat1': [1, 2, 7, 5, 21], 68 | 'stat2': [1, 1, 2, 1, 3], 69 | 'misc': ['a', 'b', 'c1', 'd', 'e1']} 70 | df_chk = pd.DataFrame(chk) 71 | 72 | assert df.equals(df_chk) 73 | -------------------------------------------------------------------------------- /baseball_jupyter_nb/README.md: -------------------------------------------------------------------------------- 1 | ## Jupyter Notebooks 2 | 3 | The Jupyter Notebooks: 4 | 5 | * with a CSV suffix use CSV files as the data source 6 | * with a SQL suffix use SQL to perform the same analysis 7 | 8 | The links below will display the notebook using nbviewer: 9 | 10 | - [01_Intro_CSV](https://nbviewer.jupyter.org/github/sdiehl28/baseball-analytics/blob/master/baseball_jupyter_nb/01_Intro_CSV.ipynb) 11 | - how has game length increased over the years? 12 | - how has pitcher count increased over the years? 13 | - what is the relationship between pitcher count and game length? 14 | - how many more runs are scored in games for which the DH is used? 15 | - [01_Intro_SQL](https://nbviewer.jupyter.org/github/sdiehl28/baseball-analytics/blob/master/baseball_jupyter_nb/01_Intro_SQL.ipynb) 16 | - same as above, but using SQL as much as possible 17 | - [02_Data_Consistency_CSV](https://nbviewer.jupyter.org/github/sdiehl28/baseball-analytics/blob/master/baseball_jupyter_nb/02_Data_Consistency_CSV.ipynb) 18 | - Compare Retrosheet stats aggregated to season level, to Lahman stats. 19 | - Compare individual stats aggregated to team level, to team stats, for both Retrosheet and Lahman.
20 | - Compare batting stats to pitching-allowed stats, for both Retrosheet and Lahman. 21 | - [03a_ParkFactor_CSV](https://nbviewer.jupyter.org/github/sdiehl28/baseball-analytics/blob/master/baseball_jupyter_nb/03a_ParkFactor_CSV.ipynb) 22 | - Compute the Park Factor, for all teams for several years, accounting for home games not played in home park. 23 | - Web scrape FanGraphs and compare. 24 | - It is shown that ESPN and FanGraphs included Boston's runs scored in London, as part of the Fenway Park runs, thereby mistakenly increasing the Park Factor for Fenway Park. 25 | - [03b_ParkFactor_CSV](https://nbviewer.jupyter.org/github/sdiehl28/baseball-analytics/blob/master/baseball_jupyter_nb/03b_ParkFactor_CSV.ipynb) 26 | - Compute Park Factor, for all teams for several years, accounting for each team's road schedule. 27 | - It is shown that the road schedule can significantly impact the home park factor for a couple of teams each year. 28 | - [04_LinearWeights_CSV](https://nbviewer.jupyter.org/github/sdiehl28/baseball-analytics/blob/master/baseball_jupyter_nb/04_LinearWeights_CSV.ipynb) 29 | - Model runs per half-inning using Linear Regression. 30 | - The coefficient for single, double, triple, home run and other plays is determined. 31 | - The model accounts for 78% of the variance of runs scored per half-inning. -------------------------------------------------------------------------------- /data/retrosheet/player_game_types.csv: -------------------------------------------------------------------------------- 1 | index,dtypes 2 | game_id,object 3 | game_dt,uint32 4 | game_ct,uint8 5 | appear_dt,uint32 6 | team_id,object 7 | player_id,object 8 | slot_ct,uint8 9 | seq_ct,uint8 10 | home_fl,uint8 11 | opponent_id,object 12 | park_id,object 13 | b_g,uint8 14 | b_pa,uint8 15 | b_ab,uint8 16 | b_r,uint8 17 | b_h,uint8 18 | b_tb,uint8 19 | b_2b,uint8 20 | b_3b,uint8 21 | b_hr,uint8 22 | b_hr4,uint8 23 | b_rbi,uint8 24 | b_bb,uint8 25 | b_ibb,uint8 26 | b_so,uint8 27 | b_gdp,uint8 28 | b_hp,uint8 29 | b_sh,uint8 30 | b_sf,uint8 31 | b_sb,uint8 32 | b_cs,uint8 33 | b_xi,uint8 34 | b_g_dh,uint8 35 | b_g_ph,uint8 36 | b_g_pr,uint8 37 | p_g,uint8 38 | p_gs,uint8 39 | p_cg,uint8 40 | p_sho,uint8 41 | p_gf,uint8 42 | p_w,uint8 43 | p_l,uint8 44 | p_sv,uint8 45 | p_out,uint8 46 | p_tbf,uint8 47 | p_ab,uint8 48 | p_r,uint8 49 | p_er,uint8 50 | p_h,uint8 51 | p_tb,uint8 52 | p_2b,uint8 53 | p_3b,uint8 54 | p_hr,uint8 55 | p_hr4,uint8 56 | p_bb,uint8 57 | p_ibb,uint8 58 | p_so,uint8 59 | p_gdp,uint8 60 | p_hp,uint8 61 | p_sh,uint8 62 | p_sf,uint8 63 | p_xi,uint8 64 | p_wp,uint8 65 | p_bk,uint8 66 | p_ir,uint8 67 | p_irs,uint8 68 | p_go,uint8 69 | p_ao,uint8 70 | p_pitch,UInt8 71 | p_strike,UInt8 72 | f_p_g,uint8 73 | f_p_gs,uint8 74 | f_p_out,uint8 75 | f_p_tc,uint8 76 | f_p_po,uint8 77 | f_p_a,uint8 78 | f_p_e,uint8 79 | f_p_dp,uint8 80 | f_p_tp,uint8 81 | f_c_g,uint8 82 | f_c_gs,uint8 83 | f_c_out,uint8 84 | f_c_tc,uint8 85 | f_c_po,uint8 86 | f_c_a,uint8 87 | f_c_e,uint8 88 | f_c_dp,uint8 89 | f_c_tp,uint8 90 | f_c_pb,uint8 91 | f_c_xi,uint8 92 | f_1b_g,uint8 93 | f_1b_gs,uint8 94 | f_1b_out,uint8 95 | f_1b_tc,uint8 96 | f_1b_po,uint8 97 | f_1b_a,uint8 98 | f_1b_e,uint8 99 | f_1b_dp,uint8 100 | f_1b_tp,uint8 101 | f_2b_g,uint8 102 | f_2b_gs,uint8 103 | f_2b_out,uint8 104 | f_2b_tc,uint8 105 | f_2b_po,uint8 106 | f_2b_a,uint8 107 | f_2b_e,uint8 108 | f_2b_dp,uint8 109 | f_2b_tp,uint8 110 | f_3b_g,uint8 111 | f_3b_gs,uint8 112 | f_3b_out,uint8 113 | f_3b_tc,uint8 114 | f_3b_po,uint8 115 | f_3b_a,uint8 
116 | f_3b_e,uint8 117 | f_3b_dp,uint8 118 | f_3b_tp,uint8 119 | f_ss_g,uint8 120 | f_ss_gs,uint8 121 | f_ss_out,uint8 122 | f_ss_tc,uint8 123 | f_ss_po,uint8 124 | f_ss_a,uint8 125 | f_ss_e,uint8 126 | f_ss_dp,uint8 127 | f_ss_tp,uint8 128 | f_lf_g,uint8 129 | f_lf_gs,uint8 130 | f_lf_out,uint8 131 | f_lf_tc,uint8 132 | f_lf_po,uint8 133 | f_lf_a,uint8 134 | f_lf_e,uint8 135 | f_lf_dp,uint8 136 | f_lf_tp,uint8 137 | f_cf_g,uint8 138 | f_cf_gs,uint8 139 | f_cf_out,uint8 140 | f_cf_tc,uint8 141 | f_cf_po,uint8 142 | f_cf_a,uint8 143 | f_cf_e,uint8 144 | f_cf_dp,uint8 145 | f_cf_tp,uint8 146 | f_rf_g,uint8 147 | f_rf_gs,uint8 148 | f_rf_out,uint8 149 | f_rf_tc,uint8 150 | f_rf_po,uint8 151 | f_rf_a,uint8 152 | f_rf_e,uint8 153 | f_rf_dp,uint8 154 | f_rf_tp,uint8 155 | -------------------------------------------------------------------------------- /RetrosheetParsers.md: -------------------------------------------------------------------------------- 1 | ## Retrosheet Parsers: Source and Executables 2 | 3 | The open source parsers created by Dr. T. L. Turocy to parse the Retrosheet play-by-play data are excellent. 4 | 5 | These parsers must be installed and on the path for the Python scripts to make use of them. 6 | 7 | Parser Description: http://chadwick.sourceforge.net/doc/cwtools.html 8 | Parser Executables and Source: https://sourceforge.net/projects/chadwick/ 9 | 10 | At the time of this writing, version 0.7.2 is the latest version. Executable versions of the parsers are available for Windows. Source code is available for Linux and MacOS. See [How to Build Retrosheet Parsers on Linux](#how-to-build-retrosheet-parsers-on-linux). 11 | 12 | ## Retrosheet Parsers 13 | 14 | Three parsers are used: 15 | 16 | * cwevent - creates one record for each single, double, error, stolen base, hit by pitch, balk, etc. 17 | * cwdaily - similar to a box score in which each player's stats for a game are created 18 | * cwgame - similar to a line score in which each team's stats for a game are created 19 | 20 | In more detail: 21 | 22 | * cwevent 23 | * was missing about 10 fields that cwgame creates and that are useful for analysis. These 10 fields were added by using regular expressions to parse the event_tx field. These 10 fields were then aggregated to the game level and verified to be 100% consistent with the output of cwgame. 24 | * cwdaily 25 | * the output of cwdaily is split into 3 csv files: batting, pitching and fielding. This is how Lahman structures their data as well. 26 | * cwgame 27 | * the output of cwgame is split into 2 csv files: team_game and game. The data in game is specific to the game as a whole, such as which park it was played in. The data in team_game is specific to a team for that game. 28 | 29 | All possible fields are extracted from the cwdaily and cwgame parsers. Both parsers are run automatically by the retrosheet_parse.py script. 30 | 31 | The cwevent parser creates a great many rows. As such, a default subset of the fields is selected. This parser is optionally run by the retrosheet_parse.py script. 32 | 33 | ## Retrosheet Parsers License 34 | 35 | From https://github.com/chadwickbureau/chadwick README 36 | 37 | ``` 38 | This is Chadwick, a library and toolset for baseball play-by-play 39 | and statistics. 40 | 41 | Chadwick is Open Source software, distributed under the terms of the 42 | GNU General Public License (GPL).
43 | ``` 44 | 45 | Parser Description: http://chadwick.sourceforge.net/doc/cwtools.html 46 | Parser Executables and Source: https://sourceforge.net/projects/chadwick/ 47 | 48 | ## How to Build Retrosheet Parsers on Linux 49 | 50 | If you do not already have a build environment: 51 | 52 | 1. sudo apt install gcc 53 | 2. sudo apt install build-essential 54 | 55 | cd to the source directory: 56 | 57 | 1. ./configure 58 | 2. make 59 | 3. sudo make install 60 | 61 | Result 62 | 63 | 1. The cw command line tools will be installed in /usr/local/bin. 64 | 2. The cw library will be installed in /usr/local/lib. 65 | 66 | To allow the command line tools to find the shared libraries, add the following to your .bashrc and then: source .bashrc 67 | `export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib` -------------------------------------------------------------------------------- /download_scripts/retrosheet_download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Download and Unzip Retrosheet Data to {data_dir}/retrosheet/raw 4 | 5 | Will not download data if it has already been downloaded. 6 | """ 7 | 8 | __author__ = 'Stephen Diehl' 9 | 10 | import os 11 | import shutil 12 | import argparse 13 | import requests 14 | from pathlib import Path 15 | import zipfile 16 | import logging 17 | import sys 18 | 19 | logger = logging.getLogger(__name__) 20 | logger.setLevel(logging.DEBUG) 21 | 22 | 23 | def get_parser(): 24 | """Args Description""" 25 | 26 | parser = argparse.ArgumentParser( 27 | description=__doc__, 28 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 29 | 30 | parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data') 31 | parser.add_argument("-v", "--verbose", help="verbose output", action="store_true") 32 | parser.add_argument("--log", dest="log_level", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 33 | help="Set the logging level") 34 | 35 | return parser 36 | 37 | 38 | def mk_dirs(data_dir): 39 | """Make data directories""" 40 | p_retrosheet_raw = data_dir / 'retrosheet/raw' 41 | p_retrosheet_wrangled = data_dir / 'retrosheet/wrangled' 42 | 43 | # create directories from these path objects 44 | p_retrosheet_raw.mkdir(parents=True, exist_ok=True) 45 | p_retrosheet_wrangled.mkdir(parents=True, exist_ok=True) 46 | 47 | 48 | def download_data(raw_dir): 49 | """download and unzip retrosheet event files""" 50 | 51 | os.chdir(raw_dir) 52 | 53 | # download most recent Retrosheet data 54 | # most recent data is from chadwickbureau on github. 
55 | zip_filename = 'retrosheet-master.zip' 56 | 57 | if not Path(zip_filename).is_file(): 58 | logger.info('Downloading >200 MB of Data ...') 59 | 60 | url = 'https://github.com/chadwickbureau/retrosheet/archive/master.zip' 61 | r = requests.get(url) 62 | r.raise_for_status() 63 | with open(zip_filename, 'wb') as f: 64 | f.write(r.content) 65 | 66 | # unzip it 67 | with zipfile.ZipFile(zip_filename, "r") as zip_ref: 68 | zip_ref.extractall('.') 69 | 70 | 71 | def reorg_files(raw_dir): 72 | """move the unzipped files to the raw directory and remove the extract directory""" 73 | os.chdir(raw_dir) 74 | 75 | unzip_dir = raw_dir / 'retrosheet-master' 76 | 77 | if unzip_dir.exists(): 78 | # move the subdirectories up one directory 79 | for dir in os.listdir(unzip_dir): 80 | shutil.move(unzip_dir.joinpath(dir).as_posix(), '.') 81 | 82 | # rm the extract directory 83 | shutil.rmtree('retrosheet-master') 84 | 85 | 86 | def main(): 87 | """Download Retrosheet Event Files 88 | """ 89 | parser = get_parser() 90 | args = parser.parse_args() 91 | 92 | if args.log_level: 93 | fh = logging.FileHandler('download.log') 94 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 95 | fh.setFormatter(formatter) 96 | fh.setLevel(args.log_level) 97 | logger.addHandler(fh) 98 | 99 | if args.verbose: 100 | # send INFO level logging to stdout 101 | sh = logging.StreamHandler(sys.stdout) 102 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 103 | sh.setFormatter(formatter) 104 | sh.setLevel(logging.INFO) 105 | logger.addHandler(sh) 106 | 107 | data_dir = Path(args.data_dir) 108 | mk_dirs(data_dir) 109 | 110 | raw_dir = (data_dir / 'retrosheet/raw').resolve() 111 | download_data(raw_dir) 112 | reorg_files(raw_dir) 113 | 114 | 115 | if __name__ == '__main__': 116 | main() 117 | -------------------------------------------------------------------------------- /data/retrosheet/game_types.csv: -------------------------------------------------------------------------------- 1 | index,dtypes 2 | game_id,object 3 | game_dt,uint32 4 | game_ct,uint8 5 | game_dy,object 6 | start_game_tm,uint16 7 | dh_fl,object 8 | daynight_park_cd,object 9 | away_team_id,object 10 | home_team_id,object 11 | park_id,object 12 | away_start_pit_id,object 13 | home_start_pit_id,object 14 | base4_ump_id,object 15 | base1_ump_id,object 16 | base2_ump_id,object 17 | base3_ump_id,object 18 | attend_park_ct,int32 19 | scorer_record_id,object 20 | translator_record_id,object 21 | inputter_record_id,object 22 | input_record_ts,object 23 | method_record_cd,uint8 24 | pitches_record_cd,uint8 25 | temp_park_ct,int8 26 | wind_direction_park_cd,uint8 27 | wind_speed_park_ct,int8 28 | field_park_cd,uint8 29 | precip_park_cd,uint8 30 | sky_park_cd,uint8 31 | minutes_game_ct,uint16 32 | inn_ct,uint8 33 | away_score_ct,uint8 34 | home_score_ct,uint8 35 | away_hits_ct,uint8 36 | home_hits_ct,uint8 37 | away_err_ct,uint8 38 | home_err_ct,uint8 39 | away_lob_ct,uint8 40 | home_lob_ct,uint8 41 | win_pit_id,object 42 | lose_pit_id,object 43 | save_pit_id,object 44 | gwrbi_bat_id,object 45 | away_lineup1_bat_id,object 46 | away_lineup1_fld_cd,uint8 47 | away_lineup2_bat_id,object 48 | away_lineup2_fld_cd,uint8 49 | away_lineup3_bat_id,object 50 | away_lineup3_fld_cd,uint8 51 | away_lineup4_bat_id,object 52 | away_lineup4_fld_cd,uint8 53 | away_lineup5_bat_id,object 54 | away_lineup5_fld_cd,uint8 55 | away_lineup6_bat_id,object 56 | away_lineup6_fld_cd,uint8 57 | away_lineup7_bat_id,object 58 | 
away_lineup7_fld_cd,uint8 59 | away_lineup8_bat_id,object 60 | away_lineup8_fld_cd,uint8 61 | away_lineup9_bat_id,object 62 | away_lineup9_fld_cd,uint8 63 | home_lineup1_bat_id,object 64 | home_lineup1_fld_cd,uint8 65 | home_lineup2_bat_id,object 66 | home_lineup2_fld_cd,uint8 67 | home_lineup3_bat_id,object 68 | home_lineup3_fld_cd,uint8 69 | home_lineup4_bat_id,object 70 | home_lineup4_fld_cd,uint8 71 | home_lineup5_bat_id,object 72 | home_lineup5_fld_cd,uint8 73 | home_lineup6_bat_id,object 74 | home_lineup6_fld_cd,uint8 75 | home_lineup7_bat_id,object 76 | home_lineup7_fld_cd,uint8 77 | home_lineup8_bat_id,object 78 | home_lineup8_fld_cd,uint8 79 | home_lineup9_bat_id,object 80 | home_lineup9_fld_cd,uint8 81 | away_finish_pit_id,object 82 | home_finish_pit_id,object 83 | away_team_league_id,object 84 | home_team_league_id,object 85 | outs_ct,uint8 86 | away_line_tx,object 87 | home_line_tx,object 88 | away_ab_ct,uint8 89 | away_2b_ct,uint8 90 | away_3b_ct,uint8 91 | away_hr_ct,uint8 92 | away_bi_ct,uint8 93 | away_sh_ct,uint8 94 | away_sf_ct,uint8 95 | away_hp_ct,uint8 96 | away_bb_ct,uint8 97 | away_ibb_ct,uint8 98 | away_so_ct,uint8 99 | away_sb_ct,uint8 100 | away_cs_ct,uint8 101 | away_gdp_ct,uint8 102 | away_xi_ct,uint8 103 | away_pitcher_ct,uint8 104 | away_er_ct,uint8 105 | away_ter_ct,uint8 106 | away_wp_ct,uint8 107 | away_bk_ct,uint8 108 | away_po_ct,uint8 109 | away_a_ct,uint8 110 | away_pb_ct,uint8 111 | away_dp_ct,uint8 112 | away_tp_ct,uint8 113 | home_ab_ct,uint8 114 | home_2b_ct,uint8 115 | home_3b_ct,uint8 116 | home_hr_ct,uint8 117 | home_bi_ct,uint8 118 | home_sh_ct,uint8 119 | home_sf_ct,uint8 120 | home_hp_ct,uint8 121 | home_bb_ct,uint8 122 | home_ibb_ct,uint8 123 | home_so_ct,uint8 124 | home_sb_ct,uint8 125 | home_cs_ct,uint8 126 | home_gdp_ct,uint8 127 | home_xi_ct,uint8 128 | home_pitcher_ct,uint8 129 | home_er_ct,uint8 130 | home_ter_ct,uint8 131 | home_wp_ct,uint8 132 | home_bk_ct,uint8 133 | home_po_ct,uint8 134 | home_a_ct,uint8 135 | home_pb_ct,uint8 136 | home_dp_ct,uint8 137 | home_tp_ct,uint8 138 | win_pit_name_tx,object 139 | lose_pit_name_tx,object 140 | save_pit_name_tx,object 141 | goahead_rbi_id,object 142 | goahead_rbi_name_tx,object 143 | away_lineup1_bat_name_tx,object 144 | away_lineup2_bat_name_tx,object 145 | away_lineup3_bat_name_tx,object 146 | away_lineup4_bat_name_tx,object 147 | away_lineup5_bat_name_tx,object 148 | away_lineup6_bat_name_tx,object 149 | away_lineup7_bat_name_tx,object 150 | away_lineup8_bat_name_tx,object 151 | away_lineup9_bat_name_tx,object 152 | home_lineup1_bat_name_tx,object 153 | home_lineup2_bat_name_tx,object 154 | home_lineup3_bat_name_tx,object 155 | home_lineup4_bat_name_tx,object 156 | home_lineup5_bat_name_tx,object 157 | home_lineup6_bat_name_tx,object 158 | home_lineup7_bat_name_tx,object 159 | home_lineup8_bat_name_tx,object 160 | home_lineup9_bat_name_tx,object 161 | -------------------------------------------------------------------------------- /download_scripts/conftest.py: -------------------------------------------------------------------------------- 1 | """Fixtures for Data Consistency Testing 2 | 3 | Data Consistency Testing is for the year 1974 through 2019 inclusive. 4 | """ 5 | import pytest 6 | from pathlib import Path 7 | from . 
import data_helper as dh 8 | 9 | 10 | def pytest_addoption(parser): 11 | parser.addoption( 12 | "--data-dir", action='store', default="../data", type=str, help="baseball data directory" 13 | ) 14 | parser.addoption( 15 | "--runslow", action="store_true", default=False, help="run slow tests" 16 | ) 17 | 18 | 19 | def pytest_collection_modifyitems(config, items): 20 | if config.getoption("--runslow"): 21 | # --runslow given in cli: do not skip slow tests 22 | return 23 | 24 | skip_slow = pytest.mark.skip(reason="need --runslow option to run") 25 | for item in items: 26 | if "slow" in item.keywords: 27 | item.add_marker(skip_slow) 28 | 29 | 30 | @pytest.fixture(scope='session') 31 | def data_dir(request): 32 | return Path(request.config.getoption("--data-dir")) 33 | 34 | 35 | # @pytest.fixture(scope='session') 36 | # def player_game(data_dir): 37 | # # depending upon the amount of data, it could take 30 seconds to decompress player_game.csv.gz 38 | # filename = data_dir / 'retrosheet' / 'collected' / 'player_game.csv.gz' 39 | # player_game = dh.from_csv_with_types(filename) 40 | # return player_game 41 | 42 | 43 | @pytest.fixture(scope='session') 44 | def team_game(data_dir): 45 | filename = data_dir / 'retrosheet' / 'wrangled' / 'team_game.csv.gz' 46 | team_game = dh.from_csv_with_types(filename) 47 | team_game = team_game.query('1974 <= game_start.dt.year <= 2019') 48 | return team_game 49 | 50 | 51 | @pytest.fixture(scope='session') 52 | def game(data_dir): 53 | filename = data_dir / 'retrosheet' / 'wrangled' / 'game.csv.gz' 54 | game = dh.from_csv_with_types(filename) 55 | game = game.query('1974 <= game_start.dt.year <= 2019') 56 | return game 57 | 58 | 59 | @pytest.fixture(scope='session') 60 | def batting(data_dir): 61 | filename = data_dir / 'retrosheet' / 'wrangled' / 'batting.csv.gz' 62 | batting = dh.from_csv_with_types(filename) 63 | batting = batting.query('1974 <= game_start.dt.year <= 2019') 64 | return batting 65 | 66 | 67 | @pytest.fixture(scope='session') 68 | def pitching(data_dir): 69 | filename = data_dir / 'retrosheet' / 'wrangled' / 'pitching.csv.gz' 70 | pitching = dh.from_csv_with_types(filename) 71 | pitching = pitching.query('1974 <= game_start.dt.year <= 2019') 72 | return pitching 73 | 74 | 75 | @pytest.fixture(scope='session') 76 | def fielding(data_dir): 77 | filename = data_dir / 'retrosheet' / 'wrangled' / 'fielding.csv.gz' 78 | fielding = dh.from_csv_with_types(filename) 79 | fielding = fielding.query('1974 <= game_start.dt.year <= 2019') 80 | return fielding 81 | 82 | 83 | @pytest.fixture(scope='session') 84 | def lahman_batting(data_dir): 85 | filename = data_dir / 'lahman' / 'wrangled' / 'batting.csv' 86 | batting = dh.from_csv_with_types(filename) 87 | batting = batting.query('1974 <= year <= 2019') 88 | return batting 89 | 90 | 91 | @pytest.fixture(scope='session') 92 | def lahman_pitching(data_dir): 93 | filename = data_dir / 'lahman' / 'wrangled' / 'pitching.csv' 94 | pitching = dh.from_csv_with_types(filename) 95 | pitching = pitching.query('1974 <= year <= 2019') 96 | return pitching 97 | 98 | 99 | @pytest.fixture(scope='session') 100 | def lahman_fielding(data_dir): 101 | filename = data_dir / 'lahman' / 'wrangled' / 'fielding.csv' 102 | fielding = dh.from_csv_with_types(filename) 103 | fielding = fielding.query('1974 <= year <= 2019') 104 | return fielding 105 | 106 | 107 | @pytest.fixture(scope='session') 108 | def lahman_teams(data_dir): 109 | filename = data_dir / 'lahman' / 'wrangled' / 'teams.csv' 110 | teams = dh.from_csv_with_types(filename) 
111 | teams = teams.query('1974 <= year <= 2019') 112 | return teams 113 | 114 | 115 | @pytest.fixture(scope='session') 116 | def lahman_people(data_dir): 117 | filename = data_dir / 'lahman' / 'wrangled' / 'people.csv' 118 | return dh.from_csv_with_types(filename) 119 | 120 | 121 | @pytest.fixture(scope='session') 122 | def event(data_dir): 123 | filename = data_dir / 'retrosheet' / 'wrangled' / 'event.csv.gz' 124 | return dh.from_csv_with_types(filename) 125 | -------------------------------------------------------------------------------- /download_scripts/lahman_download.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Download and Unzip Lahman Data to {data_dir}/lahman/raw 4 | 5 | Will not download data if it has already been downloaded. 6 | """ 7 | 8 | __author__ = 'Stephen Diehl' 9 | 10 | import os 11 | import shutil 12 | import argparse 13 | import requests 14 | from pathlib import Path 15 | import zipfile 16 | import logging 17 | import sys 18 | 19 | logger = logging.getLogger(__name__) 20 | logger.setLevel(logging.DEBUG) 21 | 22 | 23 | def get_parser(): 24 | """Args Description""" 25 | 26 | parser = argparse.ArgumentParser( 27 | description=__doc__, 28 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 29 | 30 | parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data') 31 | parser.add_argument("-v", "--verbose", help="verbose output", action="store_true") 32 | parser.add_argument("--log", dest="log_level", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 33 | help="Set the logging level") 34 | 35 | return parser 36 | 37 | 38 | def mk_dirs(data_dir): 39 | """Make data directories""" 40 | p_lahman = Path(data_dir) / 'lahman' 41 | p_lahman_raw = p_lahman / 'raw' 42 | p_lahman_wrangled = p_lahman / 'wrangled' 43 | 44 | # create directories from these path objects 45 | p_lahman_raw.mkdir(parents=True, exist_ok=True) 46 | p_lahman_wrangled.mkdir(parents=True, exist_ok=True) 47 | 48 | msg = " ".join(os.listdir(p_lahman)) 49 | logger.info(f'{p_lahman} contents: {msg}') 50 | 51 | 52 | def download_data(raw_dir): 53 | """download and unzip Lahman zip file""" 54 | os.chdir(raw_dir) 55 | 56 | # download most recent data dictionary (accurate for 2019) 57 | url = 'http://www.seanlahman.com/files/database/readme2017.txt' 58 | dd_filename = '../readme2017.txt' 59 | if not Path(dd_filename).is_file(): 60 | r = requests.get(url) 61 | r.raise_for_status() 62 | with open(dd_filename, 'wb') as f: 63 | f.write(r.content) 64 | 65 | # download most recent Lahman data 66 | # most recent data is not from www.seanlahman.com. It is from chadwickbureau on github. 
67 | zip_filename = 'baseballdatabank-master.zip' 68 | 69 | if not Path(zip_filename).is_file(): 70 | logger.info('Downloading Data ...') 71 | 72 | url = 'https://github.com/chadwickbureau/baseballdatabank/archive/master.zip' 73 | r = requests.get(url) 74 | r.raise_for_status() 75 | with open(zip_filename, 'wb') as f: 76 | f.write(r.content) 77 | 78 | # unzip it 79 | with zipfile.ZipFile(zip_filename, "r") as zip_ref: 80 | zip_ref.extractall('.') 81 | 82 | 83 | def reorg_files(raw_dir): 84 | """move the unzipped files to the raw directory and remove the extract directory""" 85 | os.chdir(raw_dir) 86 | 87 | if not Path('People.csv').is_file(): 88 | unzip_dir = raw_dir / 'baseballdatabank-master' / 'core' 89 | 90 | # move the unzipped csv files to the current working directory 91 | for root, dirs, files in os.walk(unzip_dir): 92 | for file in files: 93 | shutil.move(root + '/' + file, '.') 94 | 95 | # rm the extract directory 96 | shutil.rmtree('baseballdatabank-master') 97 | 98 | msg = '\n'.join(os.listdir('.')) 99 | logger.info(f'{raw_dir} contents:\n {msg}') 100 | 101 | 102 | def main(): 103 | """Download and Unzip Lahman Data to {data_dir}/lahman/raw""" 104 | parser = get_parser() 105 | args = parser.parse_args() 106 | 107 | if args.log_level: 108 | fh = logging.FileHandler('download.log') 109 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 110 | fh.setFormatter(formatter) 111 | fh.setLevel(args.log_level) 112 | logger.addHandler(fh) 113 | 114 | if args.verbose: 115 | # send INFO level logging to stdout 116 | sh = logging.StreamHandler(sys.stdout) 117 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 118 | sh.setFormatter(formatter) 119 | sh.setLevel(logging.INFO) 120 | logger.addHandler(sh) 121 | 122 | data_dir = Path(args.data_dir) 123 | mk_dirs(data_dir) 124 | 125 | raw_dir = data_dir / 'lahman/raw' 126 | raw_dir = raw_dir.resolve() 127 | download_data(raw_dir) 128 | reorg_files(raw_dir) 129 | 130 | logger.info('Finished') 131 | 132 | 133 | if __name__ == '__main__': 134 | main() 135 | -------------------------------------------------------------------------------- /download_scripts/retrosheet_datadictionary.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Use the Retrosheet parsers to generate their Data Dictionaries.""" 4 | 5 | __author__ = 'Stephen Diehl' 6 | 7 | import csv 8 | import subprocess 9 | import os 10 | from pathlib import Path 11 | import io 12 | import re 13 | import argparse 14 | 15 | 16 | def get_parser(): 17 | """Args Description""" 18 | 19 | # current_year = datetime.datetime.today().year 20 | parser = argparse.ArgumentParser( 21 | description=__doc__, 22 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 23 | 24 | parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data') 25 | parser.add_argument("-v", "--verbose", help="verbose output", action="store_true") 26 | 27 | return parser 28 | 29 | 30 | def check_for_retrosheet_parsers(): 31 | """Check that parsers can be executed.""" 32 | p1 = subprocess.run(['cwdaily', '-h'], shell=False, capture_output=True) 33 | if p1.returncode != 0: 34 | raise FileNotFoundError('could not execute cwdaily') 35 | 36 | p1 = subprocess.run(['cwgame', '-h'], shell=False, capture_output=True) 37 | if p1.returncode != 0: 38 | raise FileNotFoundError('could not execute cwgame') 39 | 40 | 41 | def get_cwdaily_values(description): 42 | """Get cwdaily field 
descriptions""" 43 | cwdaily_values = [] 44 | for line in io.StringIO(description): 45 | 46 | # if the line starts with a number 47 | if re.match(r'^\d+', line): 48 | tmp = line.rstrip()[8:] 49 | 50 | if ':' in tmp: 51 | key, value = tmp.split(':') 52 | value = value.strip() 53 | else: 54 | value = tmp.strip() 55 | cwdaily_values.append(value) 56 | 57 | return cwdaily_values 58 | 59 | 60 | def get_cwgame_values(description): 61 | """get cwgame field descriptions""" 62 | cwgame_values = [] 63 | for line in io.StringIO(description): 64 | 65 | # if the line starts with a number 66 | if re.match(r'^\d+', line): 67 | tmp = line.rstrip()[8:] 68 | 69 | if ':' in tmp: 70 | key, value = tmp.split(':') 71 | value = value.strip() 72 | else: 73 | value = tmp.strip() 74 | cwgame_values.append(value) 75 | 76 | return cwgame_values 77 | 78 | 79 | def main(): 80 | """Generate Data Dictionary from cwdaily and cwgame parsers""" 81 | check_for_retrosheet_parsers() 82 | 83 | parser = get_parser() 84 | args = parser.parse_args() 85 | 86 | p_data = Path(args.data_dir).resolve() 87 | p_data_raw = p_data.joinpath('retrosheet/raw') 88 | os.chdir(p_data_raw) 89 | 90 | # TODO allow this to work with any event file found in raw directory 91 | if not Path('2019LAN.EVN').is_file(): 92 | raise FileNotFoundError('retrosheet data must be downloaded first') 93 | 94 | args = ['cwdaily', '-f', '0-153', '-n', '-y', '2019', '2019LAN.EVN'] 95 | result = subprocess.run(args, shell=False, text=True, capture_output=True) 96 | 97 | # get header row 98 | cwdaily_keys = next(csv.reader(io.StringIO(result.stdout))) 99 | 100 | args = ['cwgame', '-f', '0-83', '-x', '0-94', '-n', '-y', '2019', '2019LAN.EVN'] 101 | result = subprocess.run(args, shell=False, text=True, capture_output=True) 102 | 103 | # get header row 104 | cwgame_keys = next(csv.reader(io.StringIO(result.stdout))) 105 | 106 | args = ['cwdaily', '-f', '0-153', '-d'] 107 | result = subprocess.run(args, shell=False, text=True, capture_output=True) 108 | 109 | # stderr not stdout 110 | cwdaily_values = get_cwdaily_values(result.stderr) 111 | 112 | args = ['cwgame', '-f', '0-83', '-x', '0-94', '-d'] 113 | result = subprocess.run(args, shell=False, text=True, capture_output=True) 114 | 115 | # stderr not stdout 116 | cwgame_values = get_cwgame_values(result.stderr) 117 | 118 | assert len(cwdaily_keys) == len(cwdaily_values) 119 | assert len(cwgame_keys) == len(cwgame_values) 120 | cwdaily_dict = dict(zip(cwdaily_keys, cwdaily_values)) 121 | cwgame_dict = dict(zip(cwgame_keys, cwgame_values)) 122 | 123 | p_retrosheet = p_data.joinpath('retrosheet') 124 | os.chdir(p_retrosheet) 125 | with open('cwdaily_datadictionary.txt', 'w') as fh: 126 | for key, value in cwdaily_dict.items(): 127 | fh.write(f'{key} = {value}\n') 128 | 129 | with open('cwgame_datadictionary.txt', 'w') as fh: 130 | for key, value in cwgame_dict.items(): 131 | fh.write(f'{key} = {value}\n') 132 | 133 | 134 | if __name__ == '__main__': 135 | main() 136 | -------------------------------------------------------------------------------- /data/retrosheet/cwdaily_datadictionary.txt: -------------------------------------------------------------------------------- 1 | GAME_ID = game id 2 | GAME_DT = date 3 | GAME_CT = game number (0 = no double header) 4 | APPEAR_DT = apperance date 5 | TEAM_ID = team id 6 | PLAYER_ID = player id 7 | SLOT_CT = player slot in batting order 8 | SEQ_CT = sequence in batting order slot 9 | HOME_FL = home flag 10 | OPPONENT_ID = opponent id 11 | PARK_ID = park id 12 | B_G = games played 13 | 
B_PA = plate appearances 14 | B_AB = at bats 15 | B_R = runs 16 | B_H = hits 17 | B_TB = total bases 18 | B_2B = doubles 19 | B_3B = triples 20 | B_HR = home runs 21 | B_HR4 = grand slams 22 | B_RBI = runs batted in 23 | B_GW = game winning RBI 24 | B_BB = walks 25 | B_IBB = intentional walks 26 | B_SO = strikeouts 27 | B_GDP = grounded into DP 28 | B_HP = hit by pitch 29 | B_SH = sacrifice hits 30 | B_SF = sacrifice flies 31 | B_SB = stolen bases 32 | B_CS = caught stealing 33 | B_XI = reached on interference 34 | B_G_DH = games as DH 35 | B_G_PH = games as PH 36 | B_G_PR = games as PR 37 | P_G = games pitched 38 | P_GS = games started 39 | P_CG = complete games 40 | P_SHO = shutouts 41 | P_GF = games finished 42 | P_W = wins 43 | P_L = losses 44 | P_SV = saves 45 | P_OUT = outs recorded (innings pitched times 3) 46 | P_TBF = batters faced 47 | P_AB = at bats 48 | P_R = runs allowed 49 | P_ER = earned runs allowed 50 | P_H = hits allowed 51 | P_TB = total bases allowed 52 | P_2B = doubles allowed 53 | P_3B = triples allowed 54 | P_HR = home runs allowed 55 | P_HR4 = grand slams allowed 56 | P_BB = walks allowed 57 | P_IBB = intentional walks allowed 58 | P_SO = strikeouts 59 | P_GDP = grounded into double play 60 | P_HP = hit batsmen 61 | P_SH = sacrifice hits against 62 | P_SF = sacrifice flies against 63 | P_XI = reached on interference 64 | P_WP = wild pitches 65 | P_BK = balks 66 | P_IR = inherited runners 67 | P_IRS = inherited runners scored 68 | P_GO = ground outs 69 | P_AO = air outs 70 | P_PITCH = pitches 71 | P_STRIKE = strikes 72 | F_P_G = games at P 73 | F_P_GS = games started at P 74 | F_P_OUT = outs recorded at P (innings fielded times 3) 75 | F_P_TC = total chances at P 76 | F_P_PO = putouts at P 77 | F_P_A = assists at P 78 | F_P_E = errors at P 79 | F_P_DP = double plays at P 80 | F_P_TP = triple plays at P 81 | F_C_G = games at C 82 | F_C_GS = games started at C 83 | F_C_OUT = outs recorded at C (innings fielded times 3) 84 | F_C_TC = total chances at C 85 | F_C_PO = putouts at C 86 | F_C_A = assists at C 87 | F_C_E = errors at C 88 | F_C_DP = double plays at C 89 | F_C_TP = triple plays at C 90 | F_C_PB = passed balls at C 91 | F_C_XI = catcher's interference at C 92 | F_1B_G = games at 1B 93 | F_1B_GS = games started at 1B 94 | F_1B_OUT = outs recorded at 1B (innings fielded times 3) 95 | F_1B_TC = total chances at 1B 96 | F_1B_PO = putouts at 1B 97 | F_1B_A = assists at 1B 98 | F_1B_E = errors at 1B 99 | F_1B_DP = double plays at 1B 100 | F_1B_TP = triple plays at 1B 101 | F_2B_G = games at 2B 102 | F_2B_GS = games started at 2B 103 | F_2B_OUT = outs recorded at 2B (innings fielded times 3) 104 | F_2B_TC = total chances at 2B 105 | F_2B_PO = putouts at 2B 106 | F_2B_A = assists at 2B 107 | F_2B_E = errors at 2B 108 | F_2B_DP = double plays at 2B 109 | F_2B_TP = triple plays at 2B 110 | F_3B_G = games at 3B 111 | F_3B_GS = games started at 3B 112 | F_3B_OUT = outs recorded at 3B (innings fielded times 3) 113 | F_3B_TC = total chances at 3B 114 | F_3B_PO = putouts at 3B 115 | F_3B_A = assists at 3B 116 | F_3B_E = errors at 3B 117 | F_3B_DP = double plays at 3B 118 | F_3B_TP = triple plays at 3B 119 | F_SS_G = games at SS 120 | F_SS_GS = games started at SS 121 | F_SS_OUT = outs recorded at SS (innings fielded times 3) 122 | F_SS_TC = total chances at SS 123 | F_SS_PO = putouts at SS 124 | F_SS_A = assists at SS 125 | F_SS_E = errors at SS 126 | F_SS_DP = double plays at SS 127 | F_SS_TP = triple plays at SS 128 | F_LF_G = games at LF 129 | F_LF_GS = games started at LF 
130 | F_LF_OUT = outs recorded at LF (innings fielded times 3) 131 | F_LF_TC = total chances at LF 132 | F_LF_PO = putouts at LF 133 | F_LF_A = assists at LF 134 | F_LF_E = errors at LF 135 | F_LF_DP = double plays at LF 136 | F_LF_TP = triple plays at LF 137 | F_CF_G = games at CF 138 | F_CF_GS = games started at CF 139 | F_CF_OUT = outs recorded at CF (innings fielded times 3) 140 | F_CF_TC = total chances at CF 141 | F_CF_PO = putouts at CF 142 | F_CF_A = assists at CF 143 | F_CF_E = errors at CF 144 | F_CF_DP = double plays at CF 145 | F_CF_TP = triple plays at CF 146 | F_RF_G = games at RF 147 | F_RF_GS = games started at RF 148 | F_RF_OUT = outs recorded at RF (innings fielded times 3) 149 | F_RF_TC = total chances at RF 150 | F_RF_PO = putouts at RF 151 | F_RF_A = assists at RF 152 | F_RF_E = errors at RF 153 | F_RF_DP = double plays at RF 154 | F_RF_TP = triple plays at RF 155 | -------------------------------------------------------------------------------- /data/retrosheet/nb_data/pf.csv: -------------------------------------------------------------------------------- 1 | team_id,year,pf,pf_half,name 2 | ANA,2015,0.86,0.93,Angels 3 | ANA,2016,0.91,0.96,Angels 4 | ANA,2017,0.95,0.97,Angels 5 | ANA,2018,0.97,0.98,Angels 6 | ANA,2019,1.01,1.0,Angels 7 | ARI,2015,1.06,1.03,Diamondbacks 8 | ARI,2016,1.22,1.11,Diamondbacks 9 | ARI,2017,1.2,1.1,Diamondbacks 10 | ARI,2018,1.06,1.03,Diamondbacks 11 | ARI,2019,0.98,0.99,Diamondbacks 12 | ATL,2015,0.94,0.97,Braves 13 | ATL,2016,1.06,1.03,Braves 14 | ATL,2017,0.98,0.99,Braves 15 | ATL,2018,1.12,1.06,Braves 16 | ATL,2019,1.0,1.0,Braves 17 | BAL,2015,1.21,1.1,Orioles 18 | BAL,2016,0.95,0.98,Orioles 19 | BAL,2017,1.03,1.01,Orioles 20 | BAL,2018,0.98,0.99,Orioles 21 | BAL,2019,1.09,1.04,Orioles 22 | BOS,2015,1.19,1.1,Red Sox 23 | BOS,2016,1.2,1.1,Red Sox 24 | BOS,2017,1.03,1.01,Red Sox 25 | BOS,2018,1.08,1.04,Red Sox 26 | BOS,2019,1.03,1.01,Red Sox 27 | CHA,2015,0.9,0.95,White Sox 28 | CHA,2016,0.93,0.96,White Sox 29 | CHA,2017,1.0,1.0,White Sox 30 | CHA,2018,0.94,0.97,White Sox 31 | CHA,2019,0.97,0.98,White Sox 32 | CHN,2015,0.95,0.98,Cubs 33 | CHN,2016,0.87,0.94,Cubs 34 | CHN,2017,1.13,1.07,Cubs 35 | CHN,2018,1.08,1.04,Cubs 36 | CHN,2019,0.93,0.97,Cubs 37 | CIN,2015,1.12,1.06,Reds 38 | CIN,2016,0.99,0.99,Reds 39 | CIN,2017,1.02,1.01,Reds 40 | CIN,2018,1.13,1.06,Reds 41 | CIN,2019,1.03,1.02,Reds 42 | CLE,2015,1.26,1.13,Indians 43 | CLE,2016,1.21,1.1,Indians 44 | CLE,2017,0.97,0.99,Indians 45 | CLE,2018,1.12,1.06,Indians 46 | CLE,2019,0.97,0.99,Indians 47 | COL,2015,1.44,1.22,Rockies 48 | COL,2016,1.37,1.18,Rockies 49 | COL,2017,1.33,1.17,Rockies 50 | COL,2018,1.27,1.14,Rockies 51 | COL,2019,1.39,1.2,Rockies 52 | DET,2015,0.9,0.95,Tigers 53 | DET,2016,1.02,1.01,Tigers 54 | DET,2017,1.17,1.08,Tigers 55 | DET,2018,0.95,0.97,Tigers 56 | DET,2019,1.11,1.05,Tigers 57 | HOU,2015,0.93,0.96,Astros 58 | HOU,2016,0.81,0.9,Astros 59 | HOU,2017,0.82,0.91,Astros 60 | HOU,2018,0.99,0.99,Astros 61 | HOU,2019,1.1,1.05,Astros 62 | KCA,2015,1.02,1.01,Royals 63 | KCA,2016,1.17,1.09,Royals 64 | KCA,2017,0.93,0.96,Royals 65 | KCA,2018,1.06,1.03,Royals 66 | KCA,2019,1.07,1.04,Royals 67 | LAN,2015,0.92,0.96,Dodgers 68 | LAN,2016,0.81,0.91,Dodgers 69 | LAN,2017,0.97,0.99,Dodgers 70 | LAN,2018,0.86,0.93,Dodgers 71 | LAN,2019,0.9,0.95,Dodgers 72 | MIA,2015,0.95,0.98,Marlins 73 | MIA,2016,0.83,0.92,Marlins 74 | MIA,2017,0.85,0.93,Marlins 75 | MIA,2018,0.75,0.87,Marlins 76 | MIA,2019,1.09,1.04,Marlins 77 | MIL,2015,1.1,1.05,Brewers 78 | MIL,2016,0.97,0.99,Brewers 79 | 
MIL,2017,1.08,1.04,Brewers 80 | MIL,2018,1.01,1.01,Brewers 81 | MIL,2019,0.98,0.99,Brewers 82 | MIN,2015,0.99,1.0,Twins 83 | MIN,2016,1.04,1.02,Twins 84 | MIN,2017,1.1,1.05,Twins 85 | MIN,2018,1.02,1.01,Twins 86 | MIN,2019,0.98,0.99,Twins 87 | NYA,2015,1.02,1.01,Yankees 88 | NYA,2016,1.04,1.02,Yankees 89 | NYA,2017,1.0,1.0,Yankees 90 | NYA,2018,1.13,1.06,Yankees 91 | NYA,2019,0.84,0.92,Yankees 92 | NYN,2015,0.87,0.94,Mets 93 | NYN,2016,0.99,0.99,Mets 94 | NYN,2017,0.86,0.93,Mets 95 | NYN,2018,0.73,0.87,Mets 96 | NYN,2019,0.89,0.95,Mets 97 | OAK,2015,0.94,0.97,Athletics 98 | OAK,2016,0.83,0.91,Athletics 99 | OAK,2017,1.1,1.05,Athletics 100 | OAK,2018,0.84,0.92,Athletics 101 | OAK,2019,0.89,0.94,Athletics 102 | PHI,2015,1.04,1.02,Phillies 103 | PHI,2016,0.84,0.92,Phillies 104 | PHI,2017,1.07,1.04,Phillies 105 | PHI,2018,1.04,1.02,Phillies 106 | PHI,2019,1.05,1.02,Phillies 107 | PIT,2015,0.93,0.97,Pirates 108 | PIT,2016,1.01,1.0,Pirates 109 | PIT,2017,0.95,0.97,Pirates 110 | PIT,2018,0.88,0.94,Pirates 111 | PIT,2019,1.0,1.0,Pirates 112 | SDN,2015,0.93,0.97,Padres 113 | SDN,2016,1.01,1.01,Padres 114 | SDN,2017,0.83,0.91,Padres 115 | SDN,2018,1.04,1.02,Padres 116 | SDN,2019,0.86,0.93,Padres 117 | SEA,2015,0.88,0.94,Mariners 118 | SEA,2016,0.94,0.97,Mariners 119 | SEA,2017,0.92,0.96,Mariners 120 | SEA,2018,0.85,0.92,Mariners 121 | SEA,2019,0.95,0.98,Mariners 122 | SFN,2015,0.85,0.92,Giants 123 | SFN,2016,1.01,1.01,Giants 124 | SFN,2017,0.85,0.92,Giants 125 | SFN,2018,1.01,1.01,Giants 126 | SFN,2019,0.8,0.9,Giants 127 | SLN,2015,0.93,0.97,Cardinals 128 | SLN,2016,0.92,0.96,Cardinals 129 | SLN,2017,0.89,0.94,Cardinals 130 | SLN,2018,0.93,0.96,Cardinals 131 | SLN,2019,0.92,0.96,Cardinals 132 | TBA,2015,0.96,0.98,Rays 133 | TBA,2016,0.89,0.94,Rays 134 | TBA,2017,0.92,0.96,Rays 135 | TBA,2018,0.93,0.96,Rays 136 | TBA,2019,0.89,0.95,Rays 137 | TEX,2015,1.14,1.07,Rangers 138 | TEX,2016,1.16,1.08,Rangers 139 | TEX,2017,1.22,1.11,Rangers 140 | TEX,2018,1.35,1.18,Rangers 141 | TEX,2019,1.24,1.12,Rangers 142 | TOR,2015,0.91,0.95,Blue Jays 143 | TOR,2016,1.16,1.08,Blue Jays 144 | TOR,2017,0.95,0.97,Blue Jays 145 | TOR,2018,0.96,0.98,Blue Jays 146 | TOR,2019,1.03,1.02,Blue Jays 147 | WAS,2015,1.0,1.0,Nationals 148 | WAS,2016,0.96,0.98,Nationals 149 | WAS,2017,1.06,1.03,Nationals 150 | WAS,2018,1.13,1.07,Nationals 151 | WAS,2019,1.1,1.05,Nationals 152 | -------------------------------------------------------------------------------- /download_scripts/README.md: -------------------------------------------------------------------------------- 1 | # Data Preparation Scripts for Baseball Analytics 2 | 3 | These scripts download, parse and wrangle the Lahman and Retrosheet data. 4 | 5 | An optional script creates Postgres tables with appropriate primary key constraints and loads the csv files into these tables. 6 | 7 | All scripts should be run from the download_scripts directory. 
8 | 9 | For all scripts: 10 | 11 | * --help for help 12 | * -v for verbose: logs to stdout 13 | * --log INFO: appends to download.log file (at the INFO level) 14 | * --data-dir ../data: specifies the data directory (default is ../data) 15 | 16 | Scripts with example command line arguments: 17 | 18 | * **./run_all_scripts.py** --start-year=1974 --end-year=2019 19 | * convenience script to run all scripts with -v --log=INFO 20 | * default data directory is ../data 21 | * all data is downloaded but only the years specified are parsed and wrangled 22 | * **./lahman_download.py** -v --log=INFO 23 | * downloads all the lahman data and unzips it to `../data/lahman/raw` 24 | 25 | * **./lahman_wrangle.py** -v --log=INFO 26 | * converts field names to snake_case 27 | * performs custom parsing of dates 28 | * drops fielding columns that have more than 90% missing values 29 | * optimizes data types 30 | * persists with optimized data types to `../data/lahman/wrangled` 31 | * **./retrosheet_download.py** -v --log=INFO 32 | * downloads the retrosheet data and unzips it to `../data/retrosheet/raw` 33 | * **./retrosheet_parse.py** -v --log=INFO --start-year=1974 --end-year=2019 34 | * parses data in `data/retrosheet/raw` for the specified years 35 | * cwdaily and cwgame are always run 36 | * use '--run-cwevent' to run the cwevent parser as well 37 | * use '--cwevent-fields' to specify your own set of fields using the cwevent syntax 38 | * for example, to specify all fields use: --cwevent-fields='-f 0-96 -x 0-62' 39 | * **./retrosheet_collect.py** -v --log=INFO --use-datatypes 40 | * with --use-datatypes option 41 | * uses the precomputed optimized data types: `data/retrosheet/*_types.csv` 42 | * this can save several Gigs of RAM, if data goes back to the 1950s or earlier 43 | * without --use-datatypes option 44 | * will compute and save the optimized data types 45 | * may require more than 16 Gig of RAM, if data goes back to the 1950s or earlier 46 | * collects the results into one DataFrame for cwdaily and one DataFrame for cwgame 47 | * if there are cwevent files, it will collect these into a single DataFrame as well 48 | * if there are cwevent files, it will add the following new fields to make play-by-play analysis easier: so, sb, cs, bk, bb, ibb, hbp, xi, single, double, triple, hr 49 | * converts the field names to lower case 50 | * drops columns that have more than 99% missing values 51 | * persists the results to `../data/retrosheet/collected` 52 | * the csv files are compressed using gzip 53 | * **./retrosheet_datadictionary.py** 54 | * this is an optional script which produces the data dictionary for the cwdaily and cwgame parsers 55 | * the results of running this script are published in this github repo at `data/retrosheet` as cwdaily_datadictionary.txt and cwgame_datadictionary.txt 56 | * **./retrosheet_wrangle.py** -v --log=INFO 57 | * data cleanup for non-unique primary key (player_id, game_id) 58 | * between 1948 and 2019 there is only one duplicate primary key 59 | * custom parsing of game start time 60 | * restructure cwdaily output to create batting/pitching/fielding csv files that have a row only if the player has a non-zero batting/pitching/fielding statistic for that game 61 | * restructure cwgame output to create stats per team per game (team_game.csv) and stats per game (game.csv) 62 | * the csv files are compressed using gzip 63 | * **./postgres_load_data.py** -v --log=INFO 64 | * optional script to: 65 | * create tables with optimized data types 66 | * create primary and foreign key
constraints 67 | * load data into tables 68 | * the baseball database must have already been created 69 | * connect string: f'postgresql://{db_user}:{db_pass}@localhost:5432/baseball' 70 | 71 | ### Performing Data Validation 72 | 73 | pytest is used to automate the running of more than 50 data integrity and data consistency tests. 74 | 75 | Running pytest: 76 | 77 | * recommend: 'pytest -v' 78 | * must be run from the `download_scripts` directory 79 | * must be run after the scripts which download and parse the data have been run 80 | * accepts custom option: --data-dir= 81 | 82 | If you like, you may spot check the data using [Baseball Reference](https://www.baseball-reference.com/). Baseball Reference uses the Retrosheet data. The box score for a game can be constructed from the game_id using: 83 | `'https://www.baseball-reference.com/boxes/' + game_id.str[:3] + '/' + game_id + '.shtml'` 84 | For example, to verify that there are two entries for Chris Young for game_id = BOS201708250, the url is: 85 | https://www.baseball-reference.com/boxes/BOS/BOS201708250.shtml 86 | 87 | ### Rerunning the Scripts 88 | 89 | It is rarely necessary to re-download the data. Minor tweaks are continually being made to Lahman and Retrosheet for very old data, but recent data is usually accurate and complete the first time it is made available. 90 | 91 | The data is not updated during the season. It is added to both Lahman and Retrosheet around late December. For example, all of the 2019 regular and post-season data for both Lahman and Retrosheet became available in late December 2019. 92 | 93 | To rerun the scripts, it is only necessary to remove the data from data directories other than the raw data directories. -------------------------------------------------------------------------------- /MLB_Data_Overview.md: -------------------------------------------------------------------------------- 1 | ## MLB Data Overview 2 | 3 | ### Tidy Data Definition 4 | 5 | Data is [tidy](https://en.wikipedia.org/wiki/Tidy_data) if: 6 | 7 | 1. Each variable forms a column. 8 | 2. Each observation forms a row. 9 | 3. Each type of observational unit forms a table or csv file. 10 | 11 | The above is nearly identical to the database term "3rd normal form". Arguably the last rule above is not required for data analysis, but it saves space and helps to ensure data consistency. 12 | 13 | The benefit of making the data tidy is that data analysis is much easier. 14 | 15 | ### Lahman Overview 16 | 17 | The Lahman data is tidy. The description of these csv files is in the `data/lahman` directory and is called readme2017.txt. It was copied from the Lahman website and it is accurate for 2018 and 2019 as well. 18 | 19 | A description of the data might be called a "data dictionary", a "code book", or simply a "readme.txt". 20 | 21 | As of December 2019, Lahman has data through the end of the 2019 season. 22 | 23 | ### Retrosheet Overview 24 | 25 | The Retrosheet data is not tidy, nor is it in csv format; rather, it is in a custom text format. Reading this format is most easily done using the open-source parsers by Dr. T. L. Turocy, which convert the Retrosheet text files into csv files with a header row. 26 | 27 | As of December 2019, Retrosheet has data through the 2019 season. 28 | 29 | ### Field Names 30 | 31 | The field names in both datasets are based on standard baseball abbreviations. See for example https://en.wikipedia.org/wiki/Baseball_statistics.
86 |
87 | ### Rerunning the Scripts
88 |
89 | It is rarely necessary to re-download the data. Minor tweaks are continually being made to Lahman and Retrosheet for very old data, but recent data is usually accurate and complete the first time it is made available.
90 |
91 | The data is not updated during the season. It is added to both Lahman and Retrosheet around late December. For example, all of the 2019 regular and post-season data for both Lahman and Retrosheet became available in late December 2019.
92 |
93 | To rerun the scripts, it is only necessary to remove the data from the data directories other than the raw data directories.
--------------------------------------------------------------------------------
/MLB_Data_Overview.md:
--------------------------------------------------------------------------------
1 | ## MLB Data Overview
2 |
3 | ### Tidy Data Definition
4 |
5 | Data is [tidy](https://en.wikipedia.org/wiki/Tidy_data) if:
6 |
7 | 1. Each variable forms a column.
8 | 2. Each observation forms a row.
9 | 3. Each type of observational unit forms a table or csv file.
10 |
11 | The above is nearly identical to the database term "3rd normal form". Arguably the last rule above is not required for data analysis, but it saves space and helps to ensure data consistency.
12 |
13 | The benefit of making the data tidy is that data analysis is much easier.
14 |
15 | ### Lahman Overview
16 |
17 | The Lahman data is tidy. The description of these csv files is in the `data/lahman` directory and is called readme2017.txt. It was copied from the Lahman website and it is accurate for 2018 and 2019 as well.
18 |
19 | A description of the data might be called a "data dictionary", a "code book", or simply a "readme.txt".
20 |
21 | As of December 2019, Lahman has data through the end of the 2019 season.
22 |
23 | ### Retrosheet Overview
24 |
25 | The Retrosheet data is neither tidy nor in csv format; rather, it is in a custom text format. Reading this format is most easily done using the open-source parsers by Dr. T. L. Turocy, which convert the Retrosheet text files into csv files with a header row.
26 |
27 | As of December 2019, Retrosheet has data through the 2019 season.
28 |
29 | ### Field Names
30 |
31 | The field names in both datasets are based on standard baseball abbreviations. See for example https://en.wikipedia.org/wiki/Baseball_statistics.
32 |
33 | The field names have been changed as little as possible to remain familiar. Field name changes include:
34 |
35 | * columns in different csv files with the same meaning now have the same column name
36 | * CamelCase is converted to snake_case
37 | * '2B' and '3B' are changed to 'double' and 'triple' to make them valid identifiers
38 | * Retrosheet's 'gdp' is changed to 'gidp' to match Lahman
39 | * Retrosheet's 'hp' is changed to 'hbp' to match Lahman
40 |
41 | ### CSV Files Created
42 |
43 | After data wrangling, the following csv files exist:
44 |
45 | **Lahman**
46 |
47 | * Stats per Player per Year:
48 |   * batting.csv
49 |   * pitching.csv
50 |   * fielding.csv
51 | * Postseason Stats per Round per Player per Year:
52 |   * battingpost.csv
53 |   * pitchingpost.csv
54 |   * fieldingpost.csv
55 | * Stats per Team per Year:
56 |   * teams.csv -- contains team_id for both Lahman and Retrosheet
57 | * Other:
58 |   * people.csv -- contains player_id for Lahman, Retrosheet and Baseball-Reference
59 |   * salaries.csv
60 |   * parks.csv
61 |   * more to be added soon ...
62 |
63 |
64 | **Retrosheet**
65 |
66 | * Stats per Event:
67 |   * event.csv.gz
68 | * Stats per Player per Game:
69 |   * batting.csv.gz
70 |   * pitching.csv.gz
71 |   * fielding.csv.gz
72 | * Stats per Team per Game:
73 |   * team_game.csv.gz
74 | * Stats per Game:
75 |   * game.csv.gz
76 | * Postseason stats: to be added soon ...
77 |
78 | A script to create Postgres tables with appropriate primary key constraints, and to load each of the above csv files into these tables, is provided.
79 |
80 | ### Unique Identifiers (Primary Keys)
81 |
82 | When performing data analysis, it is essential to know which field(s) uniquely identify a row in a csv file (or table). It turns out that cwgame generates the equivalent of two entries for the same "box score" exactly once since 1948. These two entries were summed appropriately so that the expected unique identifiers work properly.
83 |
84 | Not having unique identifiers greatly complicates data analysis.
85 |
86 | ### Data Types
87 |
88 | There are several reasons to pay close attention to the data types used by Pandas and/or Postgres:
89 |
90 | * the data type provides information about the field
91 | * the data type helps to ensure correct code
92 | * using the smallest appropriate data type saves memory and database storage
93 |
94 | For example, the default data type for an integer in Pandas is 'int64', and yet the maximum number of hits in a game can be saved in just 8 bits with a 'uint8'. Pandas nullable integer data types are also used.
95 |
96 | The data type optimizations per column per csv file are persisted to disk by writing a corresponding csv file with the suffix _types.csv. I have written Python functions which then read the csv back into a dataframe using the optimized persisted data types.
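A minimal round-trip sketch using the helpers in `download_scripts/data_helper.py` (the file name and column names here are made up for illustration):

```python
import pandas as pd
import data_helper as dh  # from the download_scripts directory

df = pd.DataFrame({'team_id': ['BOS', 'NYA'],
                   'hr': pd.array([2, 0], dtype='UInt8')})

# writes demo.csv plus demo_types.csv alongside it
dh.to_csv_with_types(df, 'demo.csv')

# reads demo_types.csv first, then loads demo.csv with the persisted dtypes
df2 = dh.from_csv_with_types('demo.csv')
assert df2['hr'].dtype.name == 'UInt8'
```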
97 |
98 | ## Data Wrangling
99 |
100 | The scripts which wrangle the Lahman and Retrosheet data will:
101 |
102 | * ensure that the same field name has the same meaning across all csv files for both Lahman and Retrosheet
103 | * ensure that the field names conform to official baseball abbreviations as much as possible
104 |   * with the caveat that all field names must be valid Python identifiers and valid SQL column names
105 | * determine the most efficient data type, for both Pandas and Postgres tables, and persist that data type for each corresponding csv file
106 | * automate the running of 3 Retrosheet parsers and tidy the output
107 | * translate numeric codes into text so they can be understood
108 | * identify the different ways in which missing data is represented and create the appropriate value in Pandas
109 | * translate unusual date and time representations to appropriate date and time Pandas data types
110 | * normalize the data
111 |   * for example, every player does not play every fielding position in every game, and yet that is how the output of the cwdaily parser presents the data. As such, that output is almost all zeros. A better representation is to create a row for each player for each fielding position they actually played in a game (see the sketch below).
112 | * and more ...
113 |
114 | ### Baseball Player Roles
115 |
116 | A baseball player may have several roles during the course of a game, such as batter, pitcher and any of the 9 fielding positions.
117 |
118 | Attribute names for batters and pitchers are the same where it makes sense to do so. For example, if a batter hits a "hr" then the opposing team's pitcher must have given up a "hr".
119 |
120 | All attribute names for the 9 fielding positions are identical, even though passed-ball only applies to the catcher and interference is mostly relevant to the catcher, pitcher and first baseman. This allows for a single csv file for fielding with no null values.
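A minimal sketch of the normalization mentioned above. The column names `f_p_g` and `f_c_g` (games at pitcher and catcher) are illustrative stand-ins for the per-position fielding columns produced by the parser:

```python
import pandas as pd

# hypothetical wide output: one 'games at position' column per position
wide = pd.DataFrame({
    'player_id': ['smithj01', 'jonesb01'],
    'game_id': ['BOS201708250', 'BOS201708250'],
    'f_p_g': [1, 0],
    'f_c_g': [0, 1],
})

# one row per player per position actually played
long = wide.melt(id_vars=['player_id', 'game_id'], var_name='pos', value_name='g')
long['pos'] = long['pos'].str.extract(r'f_(\w+)_g')[0].str.upper()
long = long.query('g > 0')
print(long)
```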
--------------------------------------------------------------------------------
/download_scripts/retrosheet_parse.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """Parse all event files in {data_dir}/retrosheet/raw and put result in {data_dir}/retrosheet/parsed"""
4 |
5 | __author__ = 'Stephen Diehl'
6 |
7 | import argparse
8 | import subprocess
9 | import sys
10 | from pathlib import Path
11 | import os
12 | import glob
13 | import logging
14 |
15 | logger = logging.getLogger(__name__)
16 | logger.setLevel(logging.DEBUG)
17 |
18 |
19 | def get_parser():
20 |     """Args Description"""
21 |
22 |     parser = argparse.ArgumentParser(
23 |         description=__doc__,
24 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
25 |
26 |     parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data')
27 |
28 |     # Some Key MLB Data Dates
29 |     # 1955: sacrifice flies, sacrifice bunts and intentional walks are recorded for the first time
30 |     # 1969: divisional play begins
31 |     # 1974: Retrosheet is missing no games from 1974 to present
32 |     parser.add_argument("--start-year", type=int, help="start year", default=1955)
33 |
34 |     # Retrosheet Data for 2019 became available in December 2019
35 |     parser.add_argument("--end-year", type=int, help="end year", default=2019)
36 |
37 |     parser.add_argument("-v", "--verbose", help="verbose output", action="store_true")
38 |     parser.add_argument("--log", dest="log_level", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
39 |                         help="Set the logging level")
40 |
41 |     parser.add_argument("--run-cwevent", help="run the cwevent parser as well", action="store_true")
42 |     parser.add_argument("--cwevent-fields", type=str, help="cwevent field specification",
43 |                         default='-f 0,2,3,8,9,10,14,29,36-42,44,45,51,96 -x 1,2,5,8,11,13,14,45,50,55')
44 |
45 |     return parser
46 |
47 |
48 | def check_for_retrosheet_parsers():
49 |     """Check that parsers can be executed."""
50 |     p1 = subprocess.run(['cwevent', '-h'], shell=False, capture_output=True)
51 |     if p1.returncode != 0:
52 |         raise FileNotFoundError('could not execute cwevent')
53 |
54 |     p1 = subprocess.run(['cwdaily', '-h'], shell=False, capture_output=True)
55 |     if p1.returncode != 0:
56 |         raise FileNotFoundError('could not execute cwdaily')
57 |
58 |     p1 = subprocess.run(['cwgame', '-h'], shell=False, capture_output=True)
59 |     if p1.returncode != 0:
60 |         raise FileNotFoundError('could not execute cwgame')
61 |
62 |
63 | def parse_event_files(raw_dir, parse_dir, parser, fields, start_year, end_year):
64 |     """Parse raw Retrosheet data"""
65 |     os.chdir(raw_dir)
66 |
67 |     for year in range(start_year, end_year + 1):
68 |         files = sorted(glob.glob(f'{year}*.EV*'))
69 |         first = True
70 |
71 |         cmd = [parser]
72 |         cmd.extend(fields.split(' '))
73 |
74 |         logger.info(f'{parser} parsing {len(files)} teams for {year} ...')
75 |
76 |         for file in files:
77 |             out = f'{parse_dir.as_posix()}/{parser}{year}.csv'
78 |             if first:
79 |                 # print csv header using -n
80 |                 cmd.append('-n')
81 |                 cmd.extend(['-y', str(year)])
82 |
83 |                 cmd_full = cmd + [file]
84 |                 logger.debug(f'{" ".join(cmd_full)}')
85 |
86 |                 # overwrite existing file if it exists
87 |                 with open(out, "w+") as outfile:
88 |                     result = subprocess.run(cmd_full, shell=False, stdout=outfile, stderr=subprocess.DEVNULL)
89 |                 first = False
90 |
91 |                 # don't print csv header for subsequent teams in the same year
92 |                 cmd.remove('-n')
93 |             else:
94 |                 cmd_full = cmd + [file]
95 |                 logger.debug(f'{"
".join(cmd_full)}') 96 | 97 | # append to existing file 98 | with open(out, "a+") as outfile: 99 | result = subprocess.run(cmd_full, shell=False, stdout=outfile, stderr=subprocess.DEVNULL) 100 | 101 | 102 | def main(): 103 | """Parse the data and organize the results. 104 | """ 105 | parser = get_parser() 106 | args = parser.parse_args() 107 | 108 | if args.log_level: 109 | fh = logging.FileHandler('download.log') 110 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 111 | fh.setFormatter(formatter) 112 | fh.setLevel(args.log_level) 113 | logger.addHandler(fh) 114 | 115 | if args.verbose: 116 | # send INFO level logging to stdout 117 | sh = logging.StreamHandler(sys.stdout) 118 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 119 | sh.setFormatter(formatter) 120 | sh.setLevel(logging.INFO) 121 | logger.addHandler(sh) 122 | 123 | if args.start_year > 1974: 124 | logger.warning('data consistency tests require start_year <= 1974') 125 | args.start_year = 1974 126 | 127 | if args.end_year < 2019: 128 | logger.warning('data consistency tests require end-year >= 2019') 129 | args.end_year = 2019 130 | 131 | check_for_retrosheet_parsers() 132 | 133 | p_data = Path(args.data_dir).resolve() 134 | 135 | p_data_raw = p_data.joinpath('retrosheet/raw/event/regular') 136 | p_data_parsed = p_data.joinpath('retrosheet/parsed') 137 | p_data_collected = p_data.joinpath('retrosheet/collected') 138 | 139 | # create directories, if they do not exist 140 | p_data_parsed.mkdir(parents=True, exist_ok=True) 141 | p_data_collected.mkdir(parents=True, exist_ok=True) 142 | 143 | # this selection of fields appears to support most play-by-play analysis 144 | if args.run_cwevent: 145 | if (p_data_parsed / 'cwevent2019.csv').exists(): 146 | logger.info('Skipping cwevent parsing -- already performed') 147 | else: 148 | parse_event_files(p_data_raw, p_data_parsed, 'cwevent', 149 | args.cwevent_fields, args.start_year, args.end_year) 150 | 151 | # request all available fields for cwdaily and cwgame 152 | if (p_data_parsed / 'cwdaily2019.csv').exists(): 153 | logger.info('Skipping cwdaily parser -- already performed') 154 | else: 155 | parse_event_files(p_data_raw, p_data_parsed, 'cwdaily', '-f 0-153', args.start_year, args.end_year) 156 | 157 | if (p_data_parsed / 'cwgame2019.csv').exists(): 158 | logger.info('Skipping cwgame parser -- already performed') 159 | else: 160 | parse_event_files(p_data_raw, p_data_parsed, 'cwgame', '-f 0-83 -x 0-94', args.start_year, args.end_year) 161 | 162 | 163 | if __name__ == '__main__': 164 | main() 165 | -------------------------------------------------------------------------------- /data/retrosheet/cwgame_datadictionary.txt: -------------------------------------------------------------------------------- 1 | GAME_ID = game id 2 | GAME_DT = date 3 | GAME_CT = game number (0 = no double header) 4 | GAME_DY = day of week 5 | START_GAME_TM = start time 6 | DH_FL = DH used flag 7 | DAYNIGHT_PARK_CD = day/night flag 8 | AWAY_TEAM_ID = visiting team 9 | HOME_TEAM_ID = home team 10 | PARK_ID = game site 11 | AWAY_START_PIT_ID = vis. 
starting pitcher 12 | HOME_START_PIT_ID = home starting pitcher 13 | BASE4_UMP_ID = home plate umpire 14 | BASE1_UMP_ID = first base umpire 15 | BASE2_UMP_ID = second base umpire 16 | BASE3_UMP_ID = third base umpire 17 | LF_UMP_ID = left field umpire 18 | RF_UMP_ID = right field umpire 19 | ATTEND_PARK_CT = attendance 20 | SCORER_RECORD_ID = PS scorer 21 | TRANSLATOR_RECORD_ID = translator 22 | INPUTTER_RECORD_ID = inputter 23 | INPUT_RECORD_TS = input time 24 | EDIT_RECORD_TS = edit time 25 | METHOD_RECORD_CD = how scored 26 | PITCHES_RECORD_CD = pitches entered? 27 | TEMP_PARK_CT = temperature 28 | WIND_DIRECTION_PARK_CD = wind direction 29 | WIND_SPEED_PARK_CT = wind speed 30 | FIELD_PARK_CD = field condition 31 | PRECIP_PARK_CD = precipitation 32 | SKY_PARK_CD = sky 33 | MINUTES_GAME_CT = time of game 34 | INN_CT = number of innings 35 | AWAY_SCORE_CT = visitor final score 36 | HOME_SCORE_CT = home final score 37 | AWAY_HITS_CT = visitor hits 38 | HOME_HITS_CT = home hits 39 | AWAY_ERR_CT = visitor errors 40 | HOME_ERR_CT = home errors 41 | AWAY_LOB_CT = visitor left on base 42 | HOME_LOB_CT = home left on base 43 | WIN_PIT_ID = winning pitcher 44 | LOSE_PIT_ID = losing pitcher 45 | SAVE_PIT_ID = save for 46 | GWRBI_BAT_ID = GW RBI 47 | AWAY_LINEUP1_BAT_ID = visitor batter 1 48 | AWAY_LINEUP1_FLD_CD = visitor position 1 49 | AWAY_LINEUP2_BAT_ID = visitor batter 2 50 | AWAY_LINEUP2_FLD_CD = visitor position 2 51 | AWAY_LINEUP3_BAT_ID = visitor batter 3 52 | AWAY_LINEUP3_FLD_CD = visitor position 3 53 | AWAY_LINEUP4_BAT_ID = visitor batter 4 54 | AWAY_LINEUP4_FLD_CD = visitor position 4 55 | AWAY_LINEUP5_BAT_ID = visitor batter 5 56 | AWAY_LINEUP5_FLD_CD = visitor position 5 57 | AWAY_LINEUP6_BAT_ID = visitor batter 6 58 | AWAY_LINEUP6_FLD_CD = visitor position 6 59 | AWAY_LINEUP7_BAT_ID = visitor batter 7 60 | AWAY_LINEUP7_FLD_CD = visitor position 7 61 | AWAY_LINEUP8_BAT_ID = visitor batter 8 62 | AWAY_LINEUP8_FLD_CD = visitor position 8 63 | AWAY_LINEUP9_BAT_ID = visitor batter 9 64 | AWAY_LINEUP9_FLD_CD = visitor position 9 65 | HOME_LINEUP1_BAT_ID = home batter 1 66 | HOME_LINEUP1_FLD_CD = home position 1 67 | HOME_LINEUP2_BAT_ID = home batter 2 68 | HOME_LINEUP2_FLD_CD = home position 2 69 | HOME_LINEUP3_BAT_ID = home batter 3 70 | HOME_LINEUP3_FLD_CD = home position 3 71 | HOME_LINEUP4_BAT_ID = home batter 4 72 | HOME_LINEUP4_FLD_CD = home position 4 73 | HOME_LINEUP5_BAT_ID = home batter 5 74 | HOME_LINEUP5_FLD_CD = home position 5 75 | HOME_LINEUP6_BAT_ID = home batter 6 76 | HOME_LINEUP6_FLD_CD = home position 6 77 | HOME_LINEUP7_BAT_ID = home batter 7 78 | HOME_LINEUP7_FLD_CD = home position 7 79 | HOME_LINEUP8_BAT_ID = home batter 8 80 | HOME_LINEUP8_FLD_CD = home position 8 81 | HOME_LINEUP9_BAT_ID = home batter 9 82 | HOME_LINEUP9_FLD_CD = home position 9 83 | AWAY_FINISH_PIT_ID = visiting finisher (NULL if complete game) 84 | HOME_FINISH_PIT_ID = home finisher (NULL if complete game) 85 | AWAY_TEAM_LEAGUE_ID = visiting team league 86 | HOME_TEAM_LEAGUE_ID = home team league 87 | AWAY_TEAM_GAME_CT = visiting team game number 88 | HOME_TEAM_GAME_CT = home team game number 89 | OUTS_CT = length of game in outs 90 | COMPLETION_TX = information on completion of game 91 | FORFEIT_TX = information on forfeit of game 92 | PROTEST_TX = information on protest of game 93 | AWAY_LINE_TX = visiting team linescore 94 | HOME_LINE_TX = home team linescore 95 | AWAY_AB_CT = visiting team AB 96 | AWAY_2B_CT = visiting team 2B 97 | AWAY_3B_CT = visiting team 3B 98 | AWAY_HR_CT = visiting 
team HR 99 | AWAY_BI_CT = visiting team RBI 100 | AWAY_SH_CT = visiting team SH 101 | AWAY_SF_CT = visiting team SF 102 | AWAY_HP_CT = visiting team HP 103 | AWAY_BB_CT = visiting team BB 104 | AWAY_IBB_CT = visiting team IBB 105 | AWAY_SO_CT = visiting team SO 106 | AWAY_SB_CT = visiting team SB 107 | AWAY_CS_CT = visiting team CS 108 | AWAY_GDP_CT = visiting team GDP 109 | AWAY_XI_CT = visiting team reach on interference 110 | AWAY_PITCHER_CT = number of pitchers used by visiting team 111 | AWAY_ER_CT = visiting team individual ER allowed 112 | AWAY_TER_CT = visiting team team ER allowed 113 | AWAY_WP_CT = visiting team WP 114 | AWAY_BK_CT = visiting team BK 115 | AWAY_PO_CT = visiting team PO 116 | AWAY_A_CT = visiting team A 117 | AWAY_PB_CT = visiting team PB 118 | AWAY_DP_CT = visiting team DP 119 | AWAY_TP_CT = visiting team TP 120 | HOME_AB_CT = home team AB 121 | HOME_2B_CT = home team 2B 122 | HOME_3B_CT = home team 3B 123 | HOME_HR_CT = home team HR 124 | HOME_BI_CT = home team RBI 125 | HOME_SH_CT = home team SH 126 | HOME_SF_CT = home team SF 127 | HOME_HP_CT = home team HP 128 | HOME_BB_CT = home team BB 129 | HOME_IBB_CT = home team IBB 130 | HOME_SO_CT = home team SO 131 | HOME_SB_CT = home team SB 132 | HOME_CS_CT = home team CS 133 | HOME_GDP_CT = home team GDP 134 | HOME_XI_CT = home team reach on interference 135 | HOME_PITCHER_CT = number of pitchers used by home team 136 | HOME_ER_CT = home team individual ER allowed 137 | HOME_TER_CT = home team team ER allowed 138 | HOME_WP_CT = home team WP 139 | HOME_BK_CT = home team BK 140 | HOME_PO_CT = home team PO 141 | HOME_A_CT = home team A 142 | HOME_PB_CT = home team PB 143 | HOME_DP_CT = home team DP 144 | HOME_TP_CT = home team TP 145 | UMP_HOME_NAME_TX = home plate umpire name 146 | UMP_1B_NAME_TX = first base umpire name 147 | UMP_2B_NAME_TX = second base umpire name 148 | UMP_3B_NAME_TX = third base umpire name 149 | UMP_LF_NAME_TX = left field umpire name 150 | UMP_RF_NAME_TX = right field umpire name 151 | AWAY_MANAGER_ID = visitors manager ID 152 | AWAY_MANAGER_NAME_TX = visitors manager name 153 | HOME_MANAGER_ID = home manager ID 154 | HOME_MANAGER_NAME_TX = home manager name 155 | WIN_PIT_NAME_TX = winning pitcher name 156 | LOSE_PIT_NAME_TX = losing pitcher name 157 | SAVE_PIT_NAME_TX = save pitcher name 158 | GOAHEAD_RBI_ID = batter with goahead RBI ID 159 | GOAHEAD_RBI_NAME_TX = batter with goahead RBI 160 | AWAY_LINEUP1_BAT_NAME_TX = visitor batter 1 name 161 | AWAY_LINEUP2_BAT_NAME_TX = visitor batter 2 name 162 | AWAY_LINEUP3_BAT_NAME_TX = visitor batter 3 name 163 | AWAY_LINEUP4_BAT_NAME_TX = visitor batter 4 name 164 | AWAY_LINEUP5_BAT_NAME_TX = visitor batter 5 name 165 | AWAY_LINEUP6_BAT_NAME_TX = visitor batter 6 name 166 | AWAY_LINEUP7_BAT_NAME_TX = visitor batter 7 name 167 | AWAY_LINEUP8_BAT_NAME_TX = visitor batter 8 name 168 | AWAY_LINEUP9_BAT_NAME_TX = visitor batter 9 name 169 | HOME_LINEUP1_BAT_NAME_TX = home batter 1 name 170 | HOME_LINEUP2_BAT_NAME_TX = home batter 2 name 171 | HOME_LINEUP3_BAT_NAME_TX = home batter 3 name 172 | HOME_LINEUP4_BAT_NAME_TX = home batter 4 name 173 | HOME_LINEUP5_BAT_NAME_TX = home batter 5 name 174 | HOME_LINEUP6_BAT_NAME_TX = home batter 6 name 175 | HOME_LINEUP7_BAT_NAME_TX = home batter 7 name 176 | HOME_LINEUP8_BAT_NAME_TX = home batter 8 name 177 | HOME_LINEUP9_BAT_NAME_TX = home batter 9 name 178 | ADD_INFO_TX = additional information 179 | ACQ_INFO_TX = acquisition information 180 | 
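The AWAY_LINE_TX and HOME_LINE_TX linescore fields above are digit strings such as 001001001 (as noted in retrosheet_collect.py, they must be read as text, not numbers). A minimal sketch of expanding one into per-inning runs, assuming the common Retrosheet convention that an inning with ten or more runs is parenthesized:

```python
import re

def expand_linescore(line_tx):
    """Per-inning runs from a linescore string, e.g. '30(10)1' -> [3, 0, 10, 1].

    Non-digit markers (such as an 'x' for an unplayed half-inning) are skipped.
    """
    return [int(tok.strip('()')) for tok in re.findall(r'\(\d+\)|\d', line_tx)]

print(expand_linescore('001001001'))  # [0, 0, 1, 0, 0, 1, 0, 0, 1]
print(expand_linescore('30(10)1'))    # [3, 0, 10, 1]
```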
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Baseball Analytics
2 |
3 | ## Overview
4 |
5 | Scripts are provided which download, parse, and wrangle the Lahman and Retrosheet data to produce a set of tidy csv files that can be analyzed in Python and Pandas, or R. There is also an optional script to load the data into Postgres.
6 |
7 | Examples of data analysis are provided using Python and Pandas in Jupyter Notebooks.
8 |
9 | The value of publishing the scripts and the analysis is that the results are repeatable. The precise data used and the precise data processing are made available for anyone to use, modify and evaluate.
10 |
11 | The value of wrangling the data is that the analysis is much easier and the RAM and storage requirements are much less.
12 |
13 | ## Data Science and Sabermetrics
14 |
15 | [Sabermetrics](https://en.wikipedia.org/wiki/Sabermetrics) was created before the advent of modern software tools for data analysis and fast personal computers. One aim is to create metrics that make it easy for people to quickly grasp how much a baseball player contributes to his team's wins. In data science terminology, this is an example of explanatory modeling.
16 |
17 | Another aim of Sabermetrics is to identify metrics that are likely to be useful in a predictive model. In data science terminology, a baseball domain expert uses feature engineering to create inputs (Sabermetrics) to improve predictive accuracy.
18 |
19 | Data Science, and science in general, must produce results that can be repeated by others. See [Reproducible Research](https://en.wikipedia.org/wiki/Reproducibility#Reproducible_research). A problem with many Sabermetric blog posts is that the results cannot be repeated because the code used to perform the analysis, and the data itself, are not made public.
20 |
21 | The emphasis here is on repeatable data analysis. The scripts to download the data are provided. The data is wrangled to simplify the analysis, and the data wrangling scripts are provided. Over 50 tests are also provided to verify the data wrangling, verify the Retrosheet parsers, and determine how consistent the Retrosheet data is with the Lahman data. These tests can be run with the single command 'pytest'. The data analysis is published in unambiguous code in the form of Jupyter notebooks.
22 |
23 | ### Data Preparation Scripts
24 |
25 | The Python scripts prepare the data for analysis, including running the open-source [Retrosheet Parsers](https://github.com/sdiehl28/baseball-analytics/blob/master/RetrosheetParsers.md). These scripts are at [download_scripts](https://github.com/sdiehl28/baseball-analytics/tree/master/download_scripts).
26 |
27 | ### Data Analysis
28 |
29 | Examples of baseball analysis are presented using Jupyter Notebooks with Python, Pandas and matplotlib/seaborn plots.
30 |
31 | Some initial analysis includes:
32 |
33 | - How many more runs per game are there when the DH is used? Could this difference be due to chance?
34 | - How have game length and pitcher count increased over the years?
35 |   - How is game length related to pitcher count? Could this relationship be due to chance?
36 | - Computing the Park Factor
37 |   - What did ESPN, Fangraphs, and others get wrong about the park factor for Fenway Park in 2019?
38 | - Demonstrate that accounting for each team's road schedule will strongly affect the home park factor, for a few teams each year.
39 | - Compute the game-weighted average Park Factor on the road, for each team, for several years.
40 | - Linear Modeling of Runs per Half Inning
41 |   - How much does a single, double, triple and home run contribute to run scoring per half-inning?
42 |
43 | These Jupyter Notebooks are in this repo at: [Baseball Analysis](https://github.com/sdiehl28/baseball-analytics/tree/master/baseball_jupyter_nb).
44 |
45 | ### Data Validation and Wrangling Validation
46 |
47 | There is no way to know the accuracy of the Retrosheet play-by-play data, but it is assumed to be quite accurate given the large number of volunteers who have worked on it for decades.
48 |
49 | The Lahman data was originally gathered at the season level, independently of the Retrosheet data, and is therefore inconsistent with Retrosheet in some cases. For the last few years it appears the Lahman seasonal data is derived from the Retrosheet data, so there are no new discrepancies. Lahman also includes data not in Retrosheet, such as players' salaries.
50 |
51 | The following data checks can be made:
52 |
53 | * how close is the Retrosheet data to the Lahman data
54 | * how consistent is the data produced by the three Retrosheet parsers with each other
55 | * how consistent is the data in the Lahman tables
56 |
57 | Performing these checks on the wrangled data also verifies that the wrangling (data restructuring) code did not change the data.
58 |
59 | pytest is used to automate more than 50 tests which check more than 100 attributes. The data is checked for all years between 1974 and 2019, as this is the period for which there is no missing Retrosheet data.
60 |
61 | The data consistency tests show that the [Retrosheet parsers](https://github.com/sdiehl28/baseball-analytics/blob/master/RetrosheetParsers.md) are 100% self-consistent. In other words, when the data from one Retrosheet parser is aggregated to the same level as another Retrosheet parser and compared, the results are identical. This shows that there are no errors in the parsers, and no errors in my restructuring of the parser output (see the sketch below).
62 |
63 | The data consistency tests show that the Lahman data is almost 100% self-consistent. For example, when the data in batting is aggregated to the team level and compared with the batting data in teams, the results are almost identical.
64 |
65 | The data consistency tests show that the Retrosheet data, when aggregated and compared with the Lahman data over the period 1974 through 2019, is:
66 |
67 | - for batting stats: within 0.01%
68 | - for pitching stats: within 0.06%
69 | - for fielding stats: within 0.8%
70 |
71 | For a detailed description of many of the data consistency tests, see my Jupyter notebook [Data Consistency](https://nbviewer.jupyter.org/github/sdiehl28/baseball-analytics/blob/master/baseball_jupyter_nb/02_Data_Consistency_CSV.ipynb)
72 |
73 | ### Ongoing
74 |
75 | Additional examples of baseball data analysis are continually being added.
76 |
77 | Retrosheet postseason data will soon be parsed and wrangled. All Retrosheet regular season data has been parsed and wrangled.
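To make the aggregate-and-compare idea above concrete, here is a minimal sketch that checks per-player batting (from cwdaily) against per-team totals (from cwgame). The file paths, and the presence of the game_id/team_id/hr columns in both files, are assumptions based on the wrangled files described in MLB_Data_Overview.md:

```python
import data_helper as dh  # from download_scripts

# per-player-per-game batting, aggregated up to the team-game level
batting = dh.from_csv_with_types('../data/retrosheet/wrangled/batting.csv.gz',
                                 usecols=['game_id', 'team_id', 'hr'])
daily = (batting.groupby(['game_id', 'team_id'])['hr']
         .sum().rename('hr_daily').reset_index())

# the same stat as reported per team per game by cwgame
team_game = dh.from_csv_with_types('../data/retrosheet/wrangled/team_game.csv.gz',
                                   usecols=['game_id', 'team_id', 'hr'])

merged = daily.merge(team_game, on=['game_id', 'team_id'])
print((merged['hr_daily'] == merged['hr']).mean())  # 1.0 means fully consistent
```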
78 |
79 | ## Additional Information
80 |
81 | For more information about the Lahman and Retrosheet data sets and how they were wrangled, see: [MLB Data Overview](https://github.com/sdiehl28/baseball-analytics/blob/master/MLB_Data_Overview.md)
82 |
83 | For the data sources and their licenses see: [MLB Data Details](https://github.com/sdiehl28/baseball-analytics/blob/master/MLB_Data_Details.md)
84 |
85 | ## Development Environment
86 |
87 | Clone the repo: `git clone https://github.com/sdiehl28/baseball-analytics.git`
88 |
89 | Activate your conda environment. If creating a new conda environment, run `conda install anaconda`. If using Postgres, also run `conda install psycopg2`.
90 |
91 | The scripts and Jupyter Notebooks were tested using Python 3.7 and Pandas 1.0.1 in a full [Anaconda](https://www.anaconda.com/distribution/) 2019.10 environment.
92 |
93 | The [open-source parsers](https://sourceforge.net/projects/chadwick/) for Retrosheet must be installed to run the scripts. See: [Retrosheet Parsers](https://github.com/sdiehl28/baseball-analytics/blob/master/RetrosheetParsers.md).
94 |
95 |
--------------------------------------------------------------------------------
/download_scripts/postgres_load_data.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """Load Wrangled data into Postgres"""
4 |
5 | __author__ = 'Stephen Diehl'
6 |
7 | import os
8 | import sys
9 | from pathlib import Path
10 | import argparse
11 | import logging
12 | import csv
13 | from io import StringIO
14 |
15 | from sqlalchemy import create_engine
16 |
17 | import data_helper as dh
18 |
19 | logger = logging.getLogger(__name__)
20 | logger.setLevel(logging.DEBUG)
21 |
22 |
23 | def get_parser():
24 |     """Args Description"""
25 |
26 |     parser = argparse.ArgumentParser(
27 |         description=__doc__,
28 |         formatter_class=argparse.ArgumentDefaultsHelpFormatter)
29 |
30 |     parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data')
31 |     parser.add_argument("-v", "--verbose", help="verbose output", action="store_true")
32 |     parser.add_argument("--log", dest="log_level", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
33 |                         help="Set the logging level")
34 |
35 |     return parser
36 |
37 |
38 | # This improves df.to_sql() write speed by a couple orders of magnitude!
39 | # This method was copied verbatim from: 40 | # https://pandas.pydata.org/pandas-docs/stable/user_guide/io.html#io-sql-method 41 | # Alternative to_sql() *method* for DBs that support COPY FROM 42 | def psql_insert_copy(table, conn, keys, data_iter): 43 | # gets a DBAPI connection that can provide a cursor 44 | dbapi_conn = conn.connection 45 | with dbapi_conn.cursor() as cur: 46 | s_buf = StringIO() 47 | writer = csv.writer(s_buf) 48 | writer.writerows(data_iter) 49 | s_buf.seek(0) 50 | 51 | columns = ', '.join('"{}"'.format(k) for k in keys) 52 | if table.schema: 53 | table_name = '{}.{}'.format(table.schema, table.name) 54 | else: 55 | table_name = table.name 56 | 57 | sql = 'COPY {} ({}) FROM STDIN WITH CSV'.format( 58 | table_name, columns) 59 | cur.copy_expert(sql=sql, file=s_buf) 60 | 61 | 62 | def create_and_load_table(engine, prefix, filename, pkey=None): 63 | table = prefix + filename.name.split('.')[0] 64 | logger.info(f'{table} loading ...') 65 | 66 | # read with optimized Pandas data types 67 | df = dh.from_csv_with_types(filename) 68 | 69 | # compute optimized database data types 70 | db_dtypes = dh.optimize_db_dtypes(df) 71 | 72 | # drop table and its dependencies (e.g. primary key constraint) 73 | engine.execute(f'DROP TABLE IF EXISTS {table} CASCADE') 74 | df.to_sql(table, engine, index=False, dtype=db_dtypes, method=psql_insert_copy) 75 | 76 | # add primary key constraint 77 | if pkey: 78 | pkeys_str = ', '.join(pkey) 79 | sql = f'ALTER TABLE {table} ADD PRIMARY KEY ({pkeys_str})' 80 | engine.execute(sql) 81 | 82 | # rows added 83 | rs = engine.execute(f'SELECT COUNT(*) from {table}') 84 | result = rs.fetchall() 85 | rows = result[0][0] 86 | 87 | logger.info(f'{table} added with {rows} rows') 88 | 89 | 90 | def load_lahman_tables(engine, data_dir): 91 | lahman_data = data_dir.joinpath('lahman/wrangled') 92 | 93 | create_and_load_table(engine, 'lahman_', lahman_data / 'people.csv', ['player_id']) 94 | sql = 'ALTER TABLE lahman_people ADD CONSTRAINT retro_player_unique UNIQUE (retro_id)' 95 | engine.execute(sql) 96 | 97 | create_and_load_table(engine, 'lahman_', lahman_data / 'batting.csv', 98 | ['player_id', 'year', 'stint']) 99 | create_and_load_table(engine, 'lahman_', lahman_data / 'battingpost.csv', 100 | ['player_id', 'year', 'round']) 101 | create_and_load_table(engine, 'lahman_', lahman_data / 'pitching.csv', 102 | ['player_id', 'year', 'stint']) 103 | create_and_load_table(engine, 'lahman_', lahman_data / 'pitchingpost.csv', 104 | ['player_id', 'year', 'round']) 105 | create_and_load_table(engine, 'lahman_', lahman_data / 'fielding.csv', 106 | ['player_id', 'year', 'stint', 'pos']) 107 | create_and_load_table(engine, 'lahman_', lahman_data / 'fieldingpost.csv', 108 | ['player_id', 'year', 'round', 'pos']) 109 | create_and_load_table(engine, 'lahman_', lahman_data / 'parks.csv', 110 | ['park_key']) 111 | create_and_load_table(engine, 'lahman_', lahman_data / 'salaries.csv', 112 | ['player_id', 'year', 'team_id']) 113 | create_and_load_table(engine, 'lahman_', lahman_data / 'teams.csv', 114 | ['team_id', 'year']) 115 | sql = 'ALTER TABLE lahman_teams ADD CONSTRAINT retro_team_unique UNIQUE (team_id_retro, year)' 116 | engine.execute(sql) 117 | 118 | 119 | def load_retrosheet_tables(engine, data_dir): 120 | retro_data = data_dir.joinpath('retrosheet/wrangled') 121 | 122 | create_and_load_table(engine, 'retro_', retro_data / 'batting.csv.gz', 123 | ['player_id', 'game_id']) 124 | sql = """ALTER TABLE retro_batting 125 | ADD CONSTRAINT batting_player_id 126 | 
FOREIGN KEY(player_id) 127 | REFERENCES lahman_people(retro_id) 128 | """ 129 | engine.execute(sql) 130 | 131 | create_and_load_table(engine, 'retro_', retro_data / 'pitching.csv.gz', 132 | ['player_id', 'game_id']) 133 | sql = """ALTER TABLE retro_pitching 134 | ADD CONSTRAINT pitching_player_id 135 | FOREIGN KEY(player_id) 136 | REFERENCES lahman_people(retro_id) 137 | """ 138 | engine.execute(sql) 139 | 140 | create_and_load_table(engine, 'retro_', retro_data / 'fielding.csv.gz', 141 | ['player_id', 'game_id', 'pos']) 142 | sql = """ALTER TABLE retro_fielding 143 | ADD CONSTRAINT fielding_player_id 144 | FOREIGN KEY(player_id) 145 | REFERENCES lahman_people(retro_id) 146 | """ 147 | engine.execute(sql) 148 | 149 | create_and_load_table(engine, 'retro_', retro_data / 'game.csv.gz', 150 | ['game_id']) 151 | 152 | create_and_load_table(engine, 'retro_', retro_data / 'team_game.csv.gz', 153 | ['team_id', 'game_id']) 154 | 155 | sql = """ALTER TABLE retro_team_game 156 | ADD CONSTRAINT retro_team_id FOREIGN KEY (team_id, year) 157 | REFERENCES lahman_teams (team_id_retro, year) 158 | """ 159 | engine.execute(sql) 160 | 161 | create_and_load_table(engine, 'retro_', retro_data / 'event.csv.gz', 162 | ['game_id', 'event_id']) 163 | 164 | 165 | def main(): 166 | """Load the data in Postgres. 167 | """ 168 | parser = get_parser() 169 | args = parser.parse_args() 170 | 171 | if args.log_level: 172 | fh = logging.FileHandler('download.log') 173 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 174 | fh.setFormatter(formatter) 175 | fh.setLevel(args.log_level) 176 | logger.addHandler(fh) 177 | 178 | if args.verbose: 179 | # send INFO level logging to stdout 180 | sh = logging.StreamHandler(sys.stdout) 181 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 182 | sh.setFormatter(formatter) 183 | sh.setLevel(logging.INFO) 184 | logger.addHandler(sh) 185 | 186 | # Get the user and password from the environment (rather than hardcoding it) 187 | db_user = os.environ.get('DB_USER') 188 | db_pass = os.environ.get('DB_PASS') 189 | 190 | # avoid putting passwords directly in code 191 | connect_str = f'postgresql://{db_user}:{db_pass}@localhost:5432/baseball' 192 | 193 | # for distinction between engine.execute() and engine.connect().execute() see: 194 | # https://stackoverflow.com/questions/34322471/sqlalchemy-engine-connection-and-session-difference#answer-42772654 195 | engine = create_engine(connect_str) 196 | 197 | data_dir = Path('../data') 198 | load_lahman_tables(engine, data_dir) 199 | load_retrosheet_tables(engine, data_dir) 200 | 201 | logger.info('Finished') 202 | 203 | 204 | if __name__ == '__main__': 205 | main() 206 | -------------------------------------------------------------------------------- /download_scripts/lahman_wrangle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Wrangle Lahman Data from {data_dir}/lahman/raw to {data_dir}/lahman/wrangled 4 | 5 | Wrangles: batting, pitching, fielding, people, teams, salaries, parks 6 | """ 7 | 8 | __author__ = 'Stephen Diehl' 9 | 10 | import pandas as pd 11 | 12 | import os 13 | import argparse 14 | from pathlib import Path 15 | import logging 16 | import sys 17 | 18 | import data_helper as dh 19 | 20 | logger = logging.getLogger(__name__) 21 | logger.setLevel(logging.DEBUG) 22 | 23 | 24 | def get_fieldname_mapping(): 25 | """Dictionary of fieldnames to modify.""" 26 | 27 | # It is easier to maintain fieldname 
mappings in a single location 28 | new_names = { 29 | 'playerID': 'player_id', 30 | 'yearID': 'year', 31 | 'teamID': 'team_id', 32 | 'lgID': 'lg_id', 33 | '2B': 'double', 34 | '3B': 'triple', 35 | 'BAOpp': 'ba_opp', 36 | 'IPouts': 'ip_outs', 37 | 'InnOuts': 'inn_outs', 38 | 'franchID': 'franch_id', 39 | 'divID': 'div_id', 40 | 'Ghome': 'g_home', 41 | 'DivWin': 'div_win', 42 | 'WCWin': 'wc_win', 43 | 'LgWin': 'lg_win', 44 | 'WSWin': 'ws_win', 45 | 'teamIDBR': 'team_id_br', 46 | 'teamIDlahman45': 'team_id_lahman45', 47 | 'teamIDretro': 'team_id_retro', 48 | 'birthYear': 'birth_year', 49 | 'birthMonth': 'birth_month', 50 | 'birthDay': 'birth_day', 51 | 'birthCountry': 'birth_country', 52 | 'birthState': 'birth_state', 53 | 'birthCity': 'birth_city', 54 | 'deathYear': 'death_year', 55 | 'deathMonth': 'death_month', 56 | 'deathDay': 'death_day', 57 | 'deathCountry': 'death_country', 58 | 'deathState': 'death_state', 59 | 'deathCity': 'death_city', 60 | 'nameFirst': 'name_first', 61 | 'nameLast': 'name_last', 62 | 'nameGiven': 'name_given', 63 | 'finalGame': 'final_game', 64 | 'retroID': 'retro_id', 65 | 'bbrefID': 'bb_ref_id', 66 | 'park.key': 'park_key', 67 | 'park.name': 'park_name', 68 | 'park.alias': 'park_alias' 69 | } 70 | 71 | return new_names 72 | 73 | 74 | def get_parser(): 75 | """Args Description""" 76 | 77 | parser = argparse.ArgumentParser( 78 | description=__doc__, 79 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 80 | 81 | parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data') 82 | parser.add_argument("-v", "--verbose", help="verbose output", action="store_true") 83 | parser.add_argument("--log", dest="log_level", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 84 | help="Set the logging level") 85 | 86 | return parser 87 | 88 | 89 | def to_date(row, prefix): 90 | """Custom Parsing of birth and death dates""" 91 | y = row[prefix + '_year'] 92 | m = row[prefix + '_month'] 93 | d = row[prefix + '_day'] 94 | 95 | # NaT if year is missing 96 | if pd.isna(y): 97 | return pd.NaT 98 | 99 | # if year present but month missing 100 | if pd.isna(m): 101 | m = 1 102 | 103 | # if year present but day missing 104 | if pd.isna(d): 105 | d = 1 106 | 107 | return pd.to_datetime(f'{int(y)}-{int(m)}-{int(d)}') 108 | 109 | 110 | def wrangle_basic(p_raw, p_wrangled, filename): 111 | """Basic Wrangle: converts fieldnames, optimizes datatypes and persists data 112 | """ 113 | filename_lower = str(filename).lower() 114 | wrangled_file = p_wrangled.joinpath(filename_lower) 115 | 116 | if wrangled_file.exists(): 117 | logger.info(f'Skipping wrangle of {filename} - already performed') 118 | return 119 | 120 | os.chdir(p_raw) 121 | df = pd.read_csv(filename) 122 | 123 | df.rename(columns=get_fieldname_mapping(), inplace=True) 124 | df.columns = df.columns.str.lower() 125 | 126 | # downcast integers and convert float to Int64, if data permits 127 | dh.optimize_df_dtypes(df) 128 | 129 | msg = dh.df_info(df) 130 | logger.info(f'{filename}\n{msg}') 131 | 132 | # persist with optimized datatypes 133 | os.chdir(p_wrangled) 134 | dh.to_csv_with_types(df, wrangled_file) 135 | 136 | 137 | def wrangle_people(p_raw, p_wrangled): 138 | """Custom parsing of dates, converts fieldnames, optimizes datatypes and persists data 139 | """ 140 | if p_wrangled.joinpath('people.csv').exists(): 141 | logger.info('Skipping wrangle of People.csv - already performed') 142 | return 143 | 144 | os.chdir(p_raw) 145 | people = pd.read_csv('People.csv', parse_dates=['debut', 
'finalGame']) 146 | 147 | people.rename(columns=get_fieldname_mapping(), inplace=True) 148 | people.columns = people.columns.str.lower() 149 | 150 | people['birth_date'] = people.apply(lambda x: to_date(x, 'birth'), axis=1) 151 | people['death_date'] = people.apply(lambda x: to_date(x, 'death'), axis=1) 152 | people = people.drop( 153 | ['birth_year', 'birth_month', 'birth_day', 154 | 'death_year', 'death_month', 'death_day'], axis=1) 155 | 156 | msg = dh.df_info(people) 157 | logger.info('people\n{}'.format(msg)) 158 | 159 | # persist as a csv file with data types 160 | os.chdir(p_wrangled) 161 | dh.to_csv_with_types(people, 'people.csv') 162 | 163 | 164 | def wrangle_fielding(p_raw, p_wrangled): 165 | """Drops cols > 90% null, converts fieldnames, optimizes datatypes and persists data 166 | """ 167 | if p_wrangled.joinpath('fielding.csv').exists(): 168 | logger.info('Skipping wrangle of Fielding.csv - already performed') 169 | return 170 | 171 | os.chdir(p_raw) 172 | fielding = pd.read_csv('Fielding.csv') 173 | 174 | fielding.rename(columns=get_fieldname_mapping(), inplace=True) 175 | fielding.columns = fielding.columns.str.lower() 176 | 177 | # drop any column that is more than 90% null 178 | filt = fielding.isna().mean() > 0.90 179 | if filt.any(): 180 | drop_cols = fielding.columns[filt] 181 | logger.warning(f'Cols > 90% missing being dropped: {" ".join(drop_cols)}') 182 | fielding.drop(drop_cols, axis=1, inplace=True) 183 | 184 | dh.optimize_df_dtypes(fielding) 185 | 186 | msg = dh.df_info(fielding) 187 | logger.info('fielding\n{}'.format(msg)) 188 | 189 | # persist 190 | os.chdir(p_wrangled) 191 | dh.to_csv_with_types(fielding, 'fielding.csv') 192 | 193 | 194 | def main(): 195 | """Wrangle the data""" 196 | parser = get_parser() 197 | args = parser.parse_args() 198 | 199 | if args.log_level: 200 | fh = logging.FileHandler('download.log') 201 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 202 | fh.setFormatter(formatter) 203 | fh.setLevel(args.log_level) 204 | logger.addHandler(fh) 205 | 206 | if args.verbose: 207 | # send INFO level logging to stdout 208 | sh = logging.StreamHandler(sys.stdout) 209 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 210 | sh.setFormatter(formatter) 211 | sh.setLevel(logging.INFO) 212 | logger.addHandler(sh) 213 | 214 | p_lahman_raw = Path(args.data_dir).joinpath('lahman/raw').resolve() 215 | p_lahman_wrangled = Path(args.data_dir).joinpath('lahman/wrangled').resolve() 216 | 217 | wrangle_people(p_lahman_raw, p_lahman_wrangled) 218 | wrangle_fielding(p_lahman_raw, p_lahman_wrangled) 219 | 220 | # TODO add fieldname mappings to support other Lahman csv files 221 | wrangle_basic(p_lahman_raw, p_lahman_wrangled, 'Batting.csv') 222 | wrangle_basic(p_lahman_raw, p_lahman_wrangled, 'BattingPost.csv') 223 | wrangle_basic(p_lahman_raw, p_lahman_wrangled, 'FieldingPost.csv') 224 | wrangle_basic(p_lahman_raw, p_lahman_wrangled, 'Pitching.csv') 225 | wrangle_basic(p_lahman_raw, p_lahman_wrangled, 'PitchingPost.csv') 226 | wrangle_basic(p_lahman_raw, p_lahman_wrangled, 'Teams.csv') 227 | wrangle_basic(p_lahman_raw, p_lahman_wrangled, 'Salaries.csv') 228 | wrangle_basic(p_lahman_raw, p_lahman_wrangled, 'Parks.csv') 229 | 230 | 231 | if __name__ == '__main__': 232 | main() 233 | -------------------------------------------------------------------------------- /download_scripts/retrosheet_collect.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env 
python 2 | 3 | """Collect parsed event files""" 4 | 5 | __author__ = 'Stephen Diehl' 6 | 7 | import argparse 8 | import sys 9 | from pathlib import Path 10 | import os 11 | import glob 12 | import pandas as pd 13 | import data_helper as dh 14 | import logging 15 | 16 | logger = logging.getLogger(__name__) 17 | logger.setLevel(logging.DEBUG) 18 | 19 | 20 | def get_parser(): 21 | """Args Description""" 22 | 23 | parser = argparse.ArgumentParser( 24 | description=__doc__, 25 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 26 | 27 | parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data') 28 | parser.add_argument("-v", "--verbose", help="verbose output", action="store_true") 29 | parser.add_argument("--use-datatypes", help="use precomputed datatypes", action="store_true") 30 | parser.add_argument("--log", dest="log_level", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 31 | help="Set the logging level") 32 | 33 | return parser 34 | 35 | 36 | def collect_parsed_files(parse_dir, collect_dir, parser, use_datatypes): 37 | """Collect all parsed files and optimize datatypes. 38 | """ 39 | 40 | os.chdir(parse_dir) 41 | # read the augmented files, not the ones created by cwevent 42 | if parser == 'cwevent': 43 | dailyfiles = glob.glob(f'{parser}*_plus.csv') 44 | else: 45 | dailyfiles = glob.glob(f'{parser}*.csv') 46 | dailyfiles.sort() 47 | 48 | logger.info(f'Collecting {len(dailyfiles)} {parser} parsed csv files into single dataframe ...') 49 | 50 | if use_datatypes: 51 | # this can save gigabytes of RAM by using precomputed datatypes 52 | logger.info('Using precomputed data types') 53 | if parser == 'cwdaily': 54 | filename = '../player_game_types.csv' 55 | elif parser == 'cwgame': 56 | filename = '../game_types.csv' 57 | elif parser == 'cwevent': 58 | filename = '../event_types.csv' 59 | else: 60 | raise ValueError(f'Unrecognized parser: {parser}') 61 | 62 | dates, dtypes = dh.read_types(filename) 63 | dtypes = {key.upper(): value for key, value in dtypes.items()} 64 | 65 | df = pd.concat((pd.read_csv(f, parse_dates=dates, dtype=dtypes) for f in dailyfiles), 66 | ignore_index=True, copy=False) 67 | logger.info(f'Optimized Memory Usage: {dh.mem_usage(df)}') 68 | else: 69 | # This could use twice the RAM required to hold the unoptimized DataFrame! 70 | # cwgame parser will output the line score (line_tx) like: 001001001 71 | # but without double quotes around it, so it gets interpreted as a number. 72 | # Specify dtype for line score fields to get around this. 
73 |         df = pd.concat((pd.read_csv(f, dtype={'AWAY_LINE_TX': str, 'HOME_LINE_TX': str})
74 |                         for f in dailyfiles), ignore_index=True, copy=False)
75 |
76 |         logger.info(f'Unoptimized Memory Usage: {dh.mem_usage(df)}')
77 |         logger.info('Optimizing Data Types to reduce memory ...')
78 |
79 |         # for cwdaily, optimize_df_dtypes reduces the size of the dataframe by a factor of 3
80 |         dh.optimize_df_dtypes(df)
81 |         logger.info(f'Optimized Memory Usage: {dh.mem_usage(df)}')
82 |
83 |     # convert column names to lower case
84 |     df.columns = df.columns.str.lower()
85 |
86 |     # drop any column that is more than 99% null
87 |     filt = df.isna().mean() > 0.99
88 |     if filt.any():
89 |         drop_cols = df.columns[filt]
90 |         logger.warning(f'Cols > 99% missing being dropped: {" ".join(drop_cols)}')
91 |         df.drop(drop_cols, axis=1, inplace=True)
92 |
93 |     # persist optimized dataframe
94 |     # gzip chosen over xz because this runs on a client computer and gzip is faster
95 |     logger.info('persisting dataframe using compression - this could take several minutes ...')
96 |     os.chdir(collect_dir)
97 |     if parser == 'cwdaily':
98 |         filename = 'player_game.csv.gz'
99 |     elif parser == 'cwgame':
100 |         filename = 'game.csv.gz'
101 |     elif parser == 'cwevent':  # was wrangled in parser to save RAM, write to wrangled dir
102 |         filename = 'event.csv.gz'
103 |     else:
104 |         raise ValueError(f'Unrecognized parser: {parser}')
105 |
106 |     dh.to_csv_with_types(df, filename)
107 |     logger.info(f'{parser} data persisted')
108 |
109 |
110 | def augment_event_files(p_data_parsed):
111 |     """Add New Play-by-Play Fields
112 |
113 |     cwevent does not produce a boolean or int for the following values:
114 |     'so', 'sb', 'cs', 'bk', 'bb', 'ibb', 'hbp', 'xi', 'single', 'double', 'triple', 'hr'
115 |     Extract these from event_tx and h_cd.
116 |
117 |     The advantage of creating these fields is:
118 |     1) some play-by-play analysis is easier
119 |     2) the new fields can be aggregated to the game level and compared with cwgame to
120 |        verify data consistency
121 |
122 |     This method is in retrosheet_collect.py rather than retrosheet_wrangle.py, because
123 |     many gigabytes of RAM can be saved by collecting csv files that replace the value 'T'
124 |     with the value True (and likewise 'F' with False).
125 | """ 126 | os.chdir(p_data_parsed) 127 | files = p_data_parsed.glob('cwevent????.csv') 128 | for file in sorted(files): 129 | df = pd.read_csv(file) 130 | logger.info(f'Creating Augmented Event File: {file.name.split(".")[0]}_plus.csv') 131 | 132 | # change column names to lowercase 133 | cols = [col.lower() for col in df.columns] 134 | df.columns = cols 135 | 136 | # prepare to remove _fl from flag fields 137 | flag_fields = [col for col in df.columns if col.endswith('_fl')] 138 | new_names = [col[:-3] for col in flag_fields] 139 | 140 | # convert 'T' to True/False 141 | # a bool takes 8 times less memory than the object 'T' 142 | df[new_names] = df[flag_fields].applymap(lambda s: s == 'T') 143 | df.drop(columns=flag_fields, inplace=True) 144 | 145 | # use "better" names 146 | names = {'event_outs_ct': 'outs', 'err_ct': 'e', 'event_runs_ct': 'r', 147 | 'bat_home_id': 'home_half', 'pa_new': 'pa', 'bat_team_id': 'team_id', 148 | 'fld_team_id': 'opponent_team_id'} 149 | df = df.rename(columns=names) 150 | 151 | df['so'] = df['event_tx'].str.contains(r'^K') 152 | df['sb'] = df['event_tx'].str.count('SB') # counts multiple stolen bases on one play 153 | df['cs'] = df['event_tx'].str.count('CS') # counts multiple cs on one play 154 | df['bk'] = df['event_tx'].str.contains('BK') 155 | 156 | # 'I' not preceded by 'D' or 'B' or '/' and not followed by 'N' 157 | df['ibb'] = df['event_tx'].str.contains(r'(? 0 175 | 176 | df.to_csv(f'{file.name.split(".")[0]}_plus.csv', index=False) 177 | 178 | 179 | def main(): 180 | """Collect the CSV files.""" 181 | parser = get_parser() 182 | args = parser.parse_args() 183 | 184 | if args.log_level: 185 | fh = logging.FileHandler('download.log') 186 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 187 | fh.setFormatter(formatter) 188 | fh.setLevel(args.log_level) 189 | logger.addHandler(fh) 190 | 191 | if args.verbose: 192 | # send INFO level logging to stdout 193 | sh = logging.StreamHandler(sys.stdout) 194 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 195 | sh.setFormatter(formatter) 196 | sh.setLevel(logging.INFO) 197 | logger.addHandler(sh) 198 | 199 | p_data = Path(args.data_dir).resolve() 200 | p_data_parsed = p_data.joinpath('retrosheet/parsed') 201 | p_data_collected = p_data.joinpath('retrosheet/collected') 202 | 203 | # create directories, if they do not exist 204 | p_data_parsed.mkdir(parents=True, exist_ok=True) 205 | p_data_collected.mkdir(parents=True, exist_ok=True) 206 | 207 | event_files = list(p_data_parsed.glob('cwevent*.csv')) 208 | if event_files: 209 | if p_data.joinpath('retrosheet', 'collected', 'event.csv.gz').exists(): 210 | logger.info('Skipping cwevent collection -- already performed') 211 | else: 212 | augment_event_files(p_data_parsed) 213 | collect_parsed_files(p_data_parsed, p_data_collected, 'cwevent', args.use_datatypes) 214 | 215 | if p_data.joinpath('retrosheet', 'collected', 'player_game.csv.gz').exists(): 216 | logger.info('Skipping cwdaily collection -- already performed') 217 | else: 218 | collect_parsed_files(p_data_parsed, p_data_collected, 'cwdaily', args.use_datatypes) 219 | 220 | if p_data.joinpath('retrosheet', 'collected', 'game.csv.gz').exists(): 221 | logger.info('Skipping cwgame collection -- already performed') 222 | else: 223 | collect_parsed_files(p_data_parsed, p_data_collected, 'cwgame', args.use_datatypes) 224 | 225 | 226 | if __name__ == '__main__': 227 | main() 228 | 
-------------------------------------------------------------------------------- /data/retrosheet/nb_data/fangraphs.csv: -------------------------------------------------------------------------------- 1 | Season,Team,Basic (5yr),3yr,1yr,1B,2B,3B,HR,SO,BB,GB,FB,LD,IFFB,FIP 2 | 2015,Angels,97,95,93,100,96,88,98,102,97,101,100,98,100,98 3 | 2015,Orioles,101,101,108,101,96,87,106,98,100,101,102,100,100,103 4 | 2015,Red Sox,104,107,109,103,112,103,95,99,99,102,97,103,101,98 5 | 2015,White Sox,99,98,95,98,95,93,105,102,103,98,101,98,105,102 6 | 2015,Indians,102,106,112,101,106,83,102,100,100,101,97,101,92,100 7 | 2015,Tigers,102,99,95,101,99,123,101,96,100,100,104,101,105,102 8 | 2015,Royals,102,103,101,101,105,114,92,97,99,100,101,101,95,98 9 | 2015,Twins,102,102,100,103,103,104,101,98,99,102,101,102,100,101 10 | 2015,Yankees,101,100,101,99,95,83,112,101,100,98,102,98,102,104 11 | 2015,Athletics,98,97,97,99,102,113,94,98,99,100,102,100,105,98 12 | 2015,Mariners,96,95,94,98,96,85,99,103,99,97,102,97,107,98 13 | 2015,Rays,97,98,99,99,94,100,96,103,100,98,100,100,104,98 14 | 2015,Rangers,105,105,107,103,100,109,101,98,104,100,100,103,98,102 15 | 2015,Blue Jays,101,101,96,97,108,97,103,100,99,99,99,100,98,101 16 | 2015,Diamondbacks,105,107,103,100,106,127,104,100,100,101,100,101,93,101 17 | 2015,Braves,99,99,97,100,98,96,94,103,102,98,100,101,99,97 18 | 2015,Cubs,100,96,98,100,100,103,101,101,102,100,99,101,97,100 19 | 2015,Reds,101,101,105,98,100,91,108,104,102,98,100,98,102,102 20 | 2015,Rockies,116,120,120,109,109,132,110,95,102,107,99,107,90,107 21 | 2015,Marlins,97,97,98,99,98,108,90,99,101,101,99,97,95,97 22 | 2015,Astros,97,96,97,98,96,109,102,103,99,100,98,98,100,99 23 | 2015,Dodgers,96,94,96,97,99,77,100,100,94,99,100,96,102,98 24 | 2015,Brewers,102,101,105,99,101,96,108,101,101,99,100,101,97,103 25 | 2015,Nationals,101,100,100,104,102,81,97,99,100,100,102,101,98,99 26 | 2015,Mets,95,95,94,95,96,86,99,102,101,97,101,98,110,100 27 | 2015,Phillies,100,97,102,98,97,95,111,104,101,99,101,97,108,103 28 | 2015,Pirates,98,99,97,102,99,96,92,97,99,102,98,104,99,98 29 | 2015,Cardinals,97,99,97,100,99,96,95,97,98,100,101,101,103,99 30 | 2015,Padres,95,97,97,98,96,96,96,101,100,100,98,99,97,98 31 | 2015,Giants,96,96,93,100,99,122,86,100,101,101,97,98,98,95 32 | 2016,Angels,97,96,96,99,95,84,100,102,98,100,100,98,100,99 33 | 2016,Orioles,100,102,98,102,96,87,105,98,100,101,103,99,101,103 34 | 2016,Red Sox,105,106,109,103,114,104,96,99,99,102,98,103,101,99 35 | 2016,White Sox,99,97,97,99,95,96,103,103,101,99,102,98,102,100 36 | 2016,Indians,104,106,110,102,106,85,102,100,101,102,97,102,92,101 37 | 2016,Tigers,100,101,101,100,98,120,100,96,99,100,104,101,105,101 38 | 2016,Royals,101,101,108,101,106,113,92,97,99,101,100,102,95,98 39 | 2016,Twins,102,102,102,102,103,103,102,98,100,102,100,101,101,102 40 | 2016,Yankees,101,101,102,100,95,83,112,101,101,98,102,100,105,104 41 | 2016,Athletics,98,98,92,99,102,108,93,98,99,100,100,100,101,98 42 | 2016,Mariners,95,96,97,97,94,87,100,103,99,97,102,96,104,98 43 | 2016,Rays,97,96,95,99,94,97,96,103,100,97,99,99,105,97 44 | 2016,Rangers,108,108,107,104,102,120,104,98,103,101,101,104,100,104 45 | 2016,Blue Jays,100,100,107,98,105,102,102,100,99,100,99,98,97,100 46 | 2016,Diamondbacks,105,108,110,100,106,127,104,100,100,101,100,101,93,101 47 | 2016,Braves,99,99,103,100,98,96,94,103,102,98,100,101,99,97 48 | 2016,Cubs,100,99,94,99,99,105,100,100,102,100,99,100,97,100 49 | 2016,Reds,102,102,100,98,100,95,108,104,103,98,100,99,102,103 50 | 
2016,Rockies,116,117,117,109,112,135,111,96,102,106,99,105,89,107 51 | 2016,Marlins,95,94,92,99,96,103,90,99,100,101,99,98,95,97 52 | 2016,Astros,96,93,91,97,96,105,101,102,98,100,97,97,100,99 53 | 2016,Dodgers,96,95,91,96,100,75,101,100,94,99,100,96,104,98 54 | 2016,Brewers,101,102,99,98,100,94,107,102,100,99,100,100,97,102 55 | 2016,Nationals,102,100,98,103,103,83,100,100,101,99,103,100,97,100 56 | 2016,Mets,94,95,99,95,94,86,97,101,100,97,100,97,109,98 57 | 2016,Phillies,99,99,92,98,96,94,109,105,100,99,101,97,109,102 58 | 2016,Pirates,98,98,100,102,101,95,93,97,100,102,98,103,99,99 59 | 2016,Cardinals,98,96,96,101,100,96,95,97,98,101,101,102,102,99 60 | 2016,Padres,97,96,101,99,99,95,96,100,102,101,98,102,98,99 61 | 2016,Giants,97,95,101,101,99,120,86,99,100,101,96,99,98,95 62 | 2017,Angels,98,97,98,99,95,84,100,102,98,100,100,98,100,100 63 | 2017,Orioles,102,99,101,102,96,87,105,98,100,101,103,99,101,104 64 | 2017,Red Sox,105,105,101,103,114,104,96,99,99,102,98,103,101,99 65 | 2017,White Sox,98,98,100,99,95,96,103,103,101,99,102,98,102,101 66 | 2017,Indians,104,105,99,102,106,85,102,100,101,102,97,102,92,100 67 | 2017,Tigers,101,102,108,100,98,120,100,96,99,100,104,101,105,101 68 | 2017,Royals,102,102,97,101,106,113,92,97,99,101,100,102,95,98 69 | 2017,Twins,101,102,104,102,103,103,102,98,100,102,100,101,101,101 70 | 2017,Yankees,100,103,101,100,95,83,112,101,101,98,102,100,105,102 71 | 2017,Athletics,97,96,105,99,102,108,93,98,99,100,100,100,101,98 72 | 2017,Mariners,96,95,96,97,94,87,100,103,99,97,102,96,104,98 73 | 2017,Rays,96,96,95,99,94,97,96,103,100,97,99,99,105,97 74 | 2017,Rangers,109,111,110,104,102,120,104,98,103,101,101,104,100,104 75 | 2017,Blue Jays,100,101,97,98,105,102,102,100,99,100,99,98,97,101 76 | 2017,Diamondbacks,105,108,109,100,106,127,104,100,100,101,100,101,93,101 77 | 2017,Braves,101,101,99,102,101,92,96,99,99,101,99,102,101,99 78 | 2017,Cubs,100,101,106,99,99,105,100,100,102,100,99,100,97,100 79 | 2017,Reds,102,102,101,98,100,95,108,104,103,98,100,99,102,102 80 | 2017,Rockies,115,115,115,109,112,135,111,96,102,106,99,105,89,106 81 | 2017,Marlins,95,91,94,99,96,103,90,99,100,101,99,98,95,97 82 | 2017,Astros,97,94,92,97,96,105,101,102,98,100,97,97,100,99 83 | 2017,Dodgers,96,94,99,96,100,75,101,100,94,99,100,96,104,98 84 | 2017,Brewers,101,101,103,98,100,94,107,102,100,99,100,100,97,101 85 | 2017,Nationals,102,102,103,103,103,83,100,100,101,99,103,100,97,102 86 | 2017,Mets,94,93,93,95,94,86,97,101,100,97,100,97,109,98 87 | 2017,Phillies,100,99,103,98,96,94,109,105,100,99,101,97,109,102 88 | 2017,Pirates,98,97,98,102,101,95,93,97,100,102,98,103,99,100 89 | 2017,Cardinals,96,96,95,101,100,96,95,97,98,101,101,102,102,98 90 | 2017,Padres,97,98,92,99,99,95,96,100,102,101,98,102,98,99 91 | 2017,Giants,96,98,93,101,99,120,86,99,100,101,96,99,98,94 92 | 2018,Angels,98,99,99,99,95,84,100,102,98,100,100,98,100,100 93 | 2018,Orioles,102,102,99,102,96,87,105,98,100,101,103,99,101,104 94 | 2018,Red Sox,105,103,104,103,114,104,96,99,99,102,98,103,101,99 95 | 2018,White Sox,98,99,97,99,95,96,103,103,101,99,102,98,102,101 96 | 2018,Indians,104,101,106,102,106,85,102,100,101,102,97,102,92,100 97 | 2018,Tigers,101,103,97,100,98,120,100,96,99,100,104,101,105,101 98 | 2018,Royals,102,101,103,101,106,113,92,97,99,101,100,102,95,98 99 | 2018,Twins,101,101,100,102,103,103,102,98,100,102,100,101,101,101 100 | 2018,Yankees,100,99,106,100,95,83,112,101,101,98,102,100,105,102 101 | 2018,Athletics,97,97,92,99,102,108,93,98,99,100,100,100,101,98 102 | 
2018,Mariners,96,96,93,97,94,87,100,103,99,97,102,96,104,98 103 | 2018,Rays,96,96,97,99,94,97,96,103,100,97,99,99,105,97 104 | 2018,Rangers,109,112,116,104,102,120,104,98,103,101,101,104,100,104 105 | 2018,Blue Jays,100,99,98,98,105,102,102,100,99,100,99,98,97,101 106 | 2018,Diamondbacks,100,101,103,101,99,130,99,100,102,100,98,104,101,99 107 | 2018,Braves,101,101,106,102,101,92,96,99,99,101,99,102,101,99 108 | 2018,Cubs,100,102,104,99,99,105,100,100,102,100,99,100,97,100 109 | 2018,Reds,102,103,106,98,100,95,108,104,103,98,100,99,102,102 110 | 2018,Rockies,115,115,112,109,112,135,111,96,102,106,99,105,89,106 111 | 2018,Marlins,95,95,88,99,96,103,90,99,100,101,99,98,95,97 112 | 2018,Astros,97,98,99,97,96,105,101,102,98,100,97,97,100,99 113 | 2018,Dodgers,96,96,94,96,100,75,101,100,94,99,100,96,104,98 114 | 2018,Brewers,101,101,101,98,100,94,107,102,100,99,100,100,97,101 115 | 2018,Nationals,102,104,106,103,103,83,100,100,101,99,103,100,97,102 116 | 2018,Mets,94,92,87,95,94,86,97,101,100,97,100,97,109,98 117 | 2018,Phillies,100,103,102,98,96,94,109,105,100,99,101,97,109,102 118 | 2018,Pirates,98,98,94,102,101,95,93,97,100,102,98,103,99,100 119 | 2018,Cardinals,96,96,97,101,100,96,95,97,98,101,101,102,102,98 120 | 2018,Padres,97,95,102,99,99,95,96,100,102,101,98,102,98,99 121 | 2018,Giants,96,94,101,101,99,120,86,99,100,101,96,99,98,94 122 | 2019,Angels,98,99,101,100,96,88,98,102,97,101,100,98,100,100 123 | 2019,Orioles,102,102,104,101,96,87,106,98,100,101,102,100,100,104 124 | 2019,Red Sox,105,103,103,103,112,103,95,99,99,102,97,103,101,99 125 | 2019,White Sox,98,99,98,98,95,93,105,102,103,98,101,98,105,101 126 | 2019,Indians,104,101,99,101,106,83,102,100,100,101,97,101,92,100 127 | 2019,Tigers,101,103,105,101,99,123,101,96,100,100,104,101,105,101 128 | 2019,Royals,102,101,103,101,105,114,92,97,99,100,101,101,95,98 129 | 2019,Twins,101,101,99,103,103,104,101,98,99,102,101,102,100,101 130 | 2019,Yankees,100,99,91,99,95,83,112,101,100,98,102,98,102,102 131 | 2019,Athletics,97,97,95,99,102,113,94,98,99,100,102,100,105,98 132 | 2019,Mariners,96,96,98,98,96,85,99,103,99,97,102,97,107,98 133 | 2019,Rays,96,96,95,99,94,100,96,103,100,98,100,100,104,97 134 | 2019,Rangers,109,112,111,103,100,109,101,98,104,100,100,103,98,104 135 | 2019,Blue Jays,100,99,101,97,108,97,103,100,99,99,99,100,98,101 136 | 2019,Diamondbacks,100,101,99,100,106,127,104,100,100,101,100,101,93,99 137 | 2019,Braves,101,101,100,101,102,92,99,100,97,100,100,103,101,99 138 | 2019,Cubs,100,102,97,100,100,103,101,101,102,100,99,101,97,100 139 | 2019,Reds,102,103,102,98,100,91,108,104,102,98,100,98,102,102 140 | 2019,Rockies,115,115,118,109,109,132,110,95,102,107,99,107,90,106 141 | 2019,Marlins,95,95,104,99,98,108,90,99,101,101,99,97,95,97 142 | 2019,Astros,97,98,104,98,96,109,102,103,99,100,98,98,100,99 143 | 2019,Dodgers,96,96,96,97,99,77,100,100,94,99,100,96,102,98 144 | 2019,Brewers,101,101,99,99,101,96,108,101,101,99,100,101,97,101 145 | 2019,Nationals,102,104,105,104,102,81,97,99,100,100,102,101,98,102 146 | 2019,Mets,94,92,95,95,96,86,99,102,101,97,101,98,110,98 147 | 2019,Phillies,100,103,102,98,97,95,111,104,101,99,101,97,108,102 148 | 2019,Pirates,98,98,100,102,99,96,92,97,99,102,98,104,99,100 149 | 2019,Cardinals,96,96,96,100,99,96,95,97,98,100,101,101,103,98 150 | 2019,Padres,97,95,93,98,96,96,96,101,100,100,98,99,97,99 151 | 2019,Giants,96,94,90,100,99,122,86,100,101,101,97,98,98,94 152 | -------------------------------------------------------------------------------- /download_scripts/data_helper.py: 
-------------------------------------------------------------------------------- 1 | """Baseball Data Helper Functions""" 2 | 3 | __author__ = 'Stephen Diehl' 4 | 5 | import pandas as pd 6 | import numpy as np 7 | import re 8 | import io 9 | from pathlib import Path 10 | import statsmodels.api as sm 11 | from IPython.display import HTML, display 12 | from sqlalchemy.types import SmallInteger, Integer, BigInteger, Float 13 | 14 | 15 | def to_csv_with_types(df, filename): 16 | """ 17 | Save df to csv file and save df.dtypes to csv file. 18 | 19 | If filename ends in .gz, Pandas will use gzip compression. 20 | 21 | This is intended to be used after optimizing df column types. 22 | Read back with: from_csv_with_types() 23 | 24 | Persistence with data types cannot currently be done with hdf5 because 25 | the new Int64 and similar data types are not supported. 26 | """ 27 | 28 | p = Path(filename) 29 | types_name = p.name.split('.')[0] + '_types.csv' 30 | p_types = p.parent / types_name 31 | 32 | dtypes = df.dtypes.to_frame('dtypes').reset_index() 33 | 34 | dtypes.to_csv(p_types, index=False) 35 | df.to_csv(p, index=False) 36 | 37 | 38 | def from_csv_with_types(filename, usecols=None, nrows=None): 39 | """ 40 | Read df.dtypes from csv file and read df from csv file. 41 | 42 | If filename ends in .gz, Pandas will use gzip decompression. 43 | This is the complement of to_csv_with_types(). 44 | """ 45 | 46 | p = Path(filename) 47 | types_name = p.name.split('.')[0] + '_types.csv' 48 | p_types = p.parent / types_name 49 | dates, dtypes = read_types(p_types) 50 | 51 | # only parse dates that are in usecols 52 | if dates and usecols: 53 | dates = list(set(dates) & set(usecols)) 54 | 55 | return pd.read_csv(p, parse_dates=dates, dtype=dtypes, usecols=usecols, nrows=nrows) 56 | 57 | 58 | def read_types(filename): 59 | """Read the data types file to get a list of date fields and a dictionary mapping of types. 60 | 61 | """ 62 | types = pd.read_csv(filename).set_index('index').to_dict() 63 | dtypes = types['dtypes'] 64 | 65 | dates = [key for key, value in dtypes.items() if value.startswith('datetime')] 66 | for field in dates: 67 | dtypes.pop(field) 68 | 69 | return dates, dtypes 70 | 71 | 72 | def get_optimal_data_type(s): 73 | # if the integer is outside the range of values that can be converted to a nullable integer type 74 | # use float64 75 | convert_type = 'float64' 76 | 77 | dtype_range = get_dtype_range() 78 | if s.min() >= 0: 79 | for dtype in ['UInt8', 'UInt16', 'UInt32', 'UInt64']: 80 | if s.max() <= dtype_range[dtype][2]: 81 | convert_type = dtype 82 | break 83 | else: 84 | for dtype in ['Int8', 'Int16', 'Int32', 'Int64']: 85 | if s.max() <= dtype_range[dtype][2] and s.min() >= dtype_range[dtype][1]: 86 | convert_type = dtype 87 | break 88 | 89 | return convert_type 90 | 91 | 92 | def optimize_df_dtypes(df, ignore=None): 93 | """ 94 | Downcasts DataFrame Column Types based on values. 95 | 96 | Modification is inplace. 97 | 98 | Parameters: 99 | df (pd.DataFrame): reduce size of datatypes as appropriate for its values. 100 | 101 | ignore (list): column names to exclude from downcasting.
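Example (illustrative): an integer column whose values all lie in 0..255 is downcast to uint8; a float column holding only whole numbers and NaNs, such as 1.0, 2.0, NaN, becomes the nullable UInt8.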
102 | """ 103 | 104 | # columns to consider for downcasting 105 | process_cols = df.columns 106 | if ignore: 107 | process_cols = df.columns.difference(ignore) 108 | 109 | if len(process_cols) == 0: 110 | return df 111 | 112 | # get the integer columns, if any 113 | df_int = df[process_cols].select_dtypes(include=[np.int]) 114 | 115 | # downcast integer columns to smallest unsigned int that will hold the values 116 | if len(df_int.columns) > 0: 117 | df[df_int.columns] = df_int.apply(pd.to_numeric, downcast='unsigned') 118 | 119 | # if there were any negative values, the above creates int64, downcast int64 as well 120 | df_int64 = df[process_cols].select_dtypes(include=[np.int64]) 121 | if len(df_int64.columns) > 0: 122 | df[df_int64.columns] = df_int64.apply(pd.to_numeric, downcast='signed') 123 | 124 | # convert float columns that are integers with nans to best nullable integer type 125 | df_float = df[process_cols].select_dtypes(include=['float']) 126 | if len(df_float.columns) > 0: 127 | filt = df_float.apply(is_int) 128 | int_col_names = df_float.columns[filt] 129 | if filt.any(): 130 | for col in int_col_names: 131 | convert_type = get_optimal_data_type(df[col]) 132 | df[col] = df[col].astype(convert_type) 133 | 134 | 135 | def get_dtype_range(): 136 | """Create a Dictionary having min/max values per Data Type 137 | 138 | Key: string representation of data type 139 | Value: list of length 3 140 | value[0] is 0 or np.nan 141 | value[1] is min for that data type 142 | value[2] is max for that data type 143 | 144 | This dictionary can be used to create a 3 row DataFrame which demonstrates 145 | that the specified data type can hold the specified values. 146 | 147 | Pandas data type limits: 148 | Int8 nullable with same limits as np.int8 149 | UInt8 nullable with same limits as np.uint8 150 | Int16 nullable with same limits as np.int16 151 | UInt16 nullable with same limits as np.uint16 152 | Int32 nullable with same limits as np.int32 153 | UInt32 nullable with same limits as np.uint32 154 | Int64 nullable with min/max limits about 1/2 of np.int64 155 | UInt64 nullable with max limit about 1/4 of np.uint64 156 | """ 157 | data = [] 158 | for dtype in ['uint8', 'int8', 'uint16', 'int16', 'uint32', 'int32', 'uint64', 'int64']: 159 | data.append([0, np.iinfo(dtype).min, np.iinfo(dtype).max]) 160 | data.append([np.nan, np.iinfo(dtype).min, np.iinfo(dtype).max]) 161 | 162 | keys = ['uint8', 'UInt8', 'int8', 'Int8', 'uint16', 'UInt16', 'int16', 'Int16', 163 | 'uint32', 'UInt32', 'int32', 'Int32', 'uint64', 'UInt64', 'int64', 'Int64'] 164 | 165 | dtype_range = dict(zip(keys, data)) 166 | 167 | # Pandas has different limits than numpy for the following 168 | dtype_range['UInt64'][2] = 2**61 169 | dtype_range['Int64'][1] = -2**61 170 | dtype_range['Int64'][2] = 2**61 171 | 172 | return dtype_range 173 | 174 | 175 | def optimize_db_dtypes(df): 176 | """ 177 | Choose smallest ANSI SQL Column Type for integer that fits the optimized DataFrame.
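For example (per the mappings below): uint8 and Int16 columns map to SmallInteger, UInt16 and Int32 to Integer, UInt32 and Int64 to BigInteger, and UInt64 to double precision.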
178 | 179 | Relies on: 180 | from sqlalchemy.types import SmallInteger, Integer, BigInteger 181 | 182 | SQL Column Types are signed, so uint16 might not fit in smallinteger 183 | TODO: below is safe but inefficient for uint16, UInt16, uint32 and UInt32 184 | """ 185 | small_int = {col: SmallInteger for col in df.select_dtypes( 186 | include=[pd.Int8Dtype, np.int8, pd.UInt8Dtype, np.uint8, 187 | pd.Int16Dtype, np.int16]).columns} 188 | 189 | integer = {col: Integer for col in df.select_dtypes( 190 | include=[pd.UInt16Dtype, np.uint16, pd.Int32Dtype, np.int32]).columns} 191 | 192 | big_int = {col: BigInteger for col in df.select_dtypes( 193 | include=[pd.UInt32Dtype, np.uint32, pd.Int64Dtype, np.int64]).columns} 194 | 195 | # use double precision for unsigned 64 bit integers 196 | # Float(precision=53) is the SQL data type for double precision 197 | double = {col: Float(precision=53) for col in df.select_dtypes( 198 | include=[np.uint64, pd.UInt64Dtype]).columns} 199 | 200 | dtypes = {**small_int, **integer, **big_int, **double} 201 | 202 | return dtypes 203 | 204 | 205 | def mem_usage(df): 206 | """Returns a string representing df memory usage in MB.""" 207 | mem = df.memory_usage(deep=True).sum() 208 | mem = mem / 2 ** 20 # convert to megabytes 209 | return f'{mem:03.2f} MB' 210 | 211 | 212 | def is_int(s): 213 | """Returns True if all non-null values are integers. 214 | 215 | Useful for determining if the df column (pd.Series) is 216 | float just to hold missing values. 217 | """ 218 | notnull = s.notnull() 219 | is_integer = s.apply(lambda x: (x % 1 == 0.0)) 220 | return (notnull == is_integer).all() 221 | 222 | 223 | def convert_camel_case(name): 224 | """ 225 | CamelCase to snake_case. 226 | 227 | This is from: 228 | https://stackoverflow.com/questions/1175208/elegant-python-function-to-convert-camelcase-to-snake-case#answer-1176023 229 | """ 230 | s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) 231 | return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower() 232 | 233 | 234 | def is_unique(df, cols, ignore_null=False): 235 | """Fast determination of multi-column uniqueness.""" 236 | if ignore_null: 237 | df.dropna(subset=cols, inplace=True) 238 | return not (df.duplicated(subset=cols)).any() 239 | 240 | 241 | def df_info(df): 242 | """Use buffer to capture output from df.info()""" 243 | buffer = io.StringIO() 244 | df.info(buf=buffer) 245 | return buffer.getvalue() 246 | 247 | 248 | def order_cols(df, cols): 249 | """Put columns in cols first, followed by rest of columns""" 250 | rest = [col for col in df.columns if col not in cols] 251 | df = df[cols + rest] 252 | return df 253 | 254 | 255 | def sum_stats_for_dups(df, pkey, stat_cols): 256 | """Sum stat columns for rows having the same primary key. 257 | 258 | This is a "best guess" fix for rows with duplicate primary keys. 259 | 260 | The first value for a non-pkey non-stat column will be kept.
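Example (illustrative): with pkey=['game_id', 'player_id'] and stat_cols=['h'], two rows sharing the same key with h=1 and h=2 collapse into a single row with h=3.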
261 | """ 262 | # dups is true for all rows that are duplicates 263 | dups = df.duplicated(subset=pkey, keep=False) 264 | if not dups.any(): 265 | return df 266 | 267 | # get the duplicated rows 268 | df_dups = df.loc[dups] 269 | 270 | # for the duplicate rows, sum the stat columns only 271 | df_summed = df_dups.groupby(pkey)[stat_cols].sum() 272 | 273 | # often, setting the index to the primary key makes data processing easier 274 | df.set_index(pkey, inplace=True) 275 | 276 | # remove all but one of each group of duplicated rows 277 | df = df.loc[~df.index.duplicated(keep='first')].copy() 278 | 279 | # set the kept row (per group) equal to the summed row computed above 280 | df.loc[df_summed.index, stat_cols] = df_summed 281 | 282 | df.reset_index(inplace=True) 283 | 284 | return df 285 | 286 | 287 | def move_column_after(df, after_col, col): 288 | idx = df.columns.get_loc(after_col) 289 | cols = list(df.columns) 290 | cols.remove(col) 291 | cols.insert(idx + 1, col) 292 | return df.reindex(cols, axis=1) 293 | 294 | 295 | def game_id_to_url(game_id): 296 | """Game ID to URL for Jupyter Notebooks""" 297 | dir = game_id[:3] 298 | url = 'https://www.baseball-reference.com/boxes/' + dir + '/' + game_id + '.shtml' 299 | display(HTML(f'<a href="{url}">{game_id}</a>')) 300 | 301 | 302 | def player_id_to_url(player_id): 303 | """Baseball Reference Player ID to URL for Jupyter Notebooks""" 304 | dir = player_id[0] 305 | url = 'https://www.baseball-reference.com/players/' + dir + '/' + player_id + '.shtml' 306 | display(HTML(f'<a href="{url}">{player_id}</a>')) 307 | 308 | 309 | def simple_loess(x, y, df, frac=1 / 6, it=0): 310 | """Smooths noisy data. 311 | 312 | Increase frac to get more smoothing. 313 | Decrease frac to get less smoothing. 314 | 315 | sns.lmplot has a loess option, but it uses poor and unchangeable defaults.""" 316 | z = sm.nonparametric.lowess(df[y], df[x], frac=frac, it=it) 317 | return pd.DataFrame(data=z, columns=[x, y]) 318 | -------------------------------------------------------------------------------- /download_scripts/retrosheet_wrangle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """Wrangle Retrosheet Data from {data_dir}/retrosheet/raw to {data_dir}/retrosheet/wrangled 4 | 5 | Wrangles: player per game and team per game data 6 | """ 7 | 8 | __author__ = 'Stephen Diehl' 9 | 10 | import argparse 11 | import re 12 | import shutil 13 | from pathlib import Path 14 | import logging 15 | import sys 16 | import collections 17 | 18 | import pandas as pd 19 | import numpy as np 20 | 21 | import data_helper as dh 22 | 23 | logger = logging.getLogger(__name__) 24 | logger.setLevel(logging.DEBUG) 25 | 26 | 27 | def get_parser(): 28 | """Args Description""" 29 | 30 | parser = argparse.ArgumentParser( 31 | description=__doc__, 32 | formatter_class=argparse.ArgumentDefaultsHelpFormatter) 33 | 34 | parser.add_argument("--data-dir", type=str, help="baseball data directory", default='../data') 35 | parser.add_argument("-v", "--verbose", help="verbose output", action="store_true") 36 | parser.add_argument("--log", dest="log_level", choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 37 | help="Set the logging level") 38 | 39 | return parser 40 | 41 | 42 | def get_game(p_retrosheet_collected): 43 | """Read in collected results of the cwgame parser.""" 44 | logger.info('Reading game.csv.gz ...') 45 | filename = p_retrosheet_collected / 'game.csv.gz' 46 | game = dh.from_csv_with_types(filename) 47 | n_rows, n_cols = game.shape 48 | logger.info(f'game
loaded {n_rows:,d} rows with {n_cols:,d} columns') 49 | return game 50 | 51 | 52 | def get_player_game(p_retrosheet_collected): 53 | """Read in collected results of the cwdaily parser.""" 54 | logger.info('Reading player_game.csv.gz ...') 55 | filename = p_retrosheet_collected / 'player_game.csv.gz' 56 | player_game = dh.from_csv_with_types(filename) 57 | n_rows, n_cols = player_game.shape 58 | logger.info(f'player_game loaded {n_rows:,d} rows with {n_cols:,d} columns') 59 | return player_game 60 | 61 | 62 | def clean_player_game(player_game): 63 | """Ensure Primary Key is Unique.""" 64 | 65 | # Fix Duplicate Primary Key 66 | pkey = ['game_id', 'player_id'] 67 | if not dh.is_unique(player_game, pkey): 68 | # if pkey is dup, sum the stat rows for the dups 69 | dups = player_game.duplicated(subset=pkey) 70 | df_dups = player_game.loc[dups, pkey] 71 | logger.warning(f'Dup PKey Found - summing stats for:\n{df_dups.to_string()}') 72 | 73 | # TODO flag fields should be ORed not summed 74 | # this is not currently a problem with the single dup found 75 | # data integrity tests verify that all flag fields are either 0 or 1 76 | """Flag Fields (value is 0 or 1): 77 | b_g b_g_dh b_g_ph b_g_pr p_g p_gs p_cg p_sho p_gf p_w p_l p_sv f_p_g f_p_gs f_c_g 78 | f_c_gs f_1b_g f_1b_gs f_2b_g f_2b_gs f_3b_g f_3b_gs f_ss_g f_ss_gs f_lf_g f_lf_gs 79 | f_cf_g f_cf_gs f_rf_g f_rf_gs 80 | """ 81 | 82 | # player stat columns b_ for batter, p_ for pitcher, f_ for fielder 83 | stat_columns = [col for col in player_game.columns if re.search(r'^[bpf]_', col)] 84 | stat_columns.remove('b_g') # don't sum this column 85 | 86 | player_game = dh.sum_stats_for_dups(player_game, pkey, stat_columns) 87 | 88 | return player_game 89 | 90 | 91 | def create_batting(player_game, game_start, p_retrosheet_wrangled): 92 | """Create batting.csv for batting attributes per player per game.""" 93 | # column names of the batting attributes 94 | b_cols = [col for col in player_game.columns if col.startswith('b_')] 95 | 96 | # Note: any player who is in a game in any role will have b_g = 1 97 | # even if b_pa == 0 (no plate appearances) 98 | 99 | # fields which uniquely identify a record 100 | pkey = ['game_id', 'player_id'] 101 | 102 | # fields to join to other "tables" 103 | fkey = ['team_id'] 104 | 105 | batting = player_game.loc[:, pkey + fkey + b_cols].copy() 106 | 107 | # remove b_ from the column names; rename b_2b and b_3b to valid identifiers 108 | b_cols_new = {col: col[2:] for col in b_cols} 109 | b_cols_new['b_2b'] = 'double' 110 | b_cols_new['b_3b'] = 'triple' 111 | b_cols_new['b_gdp'] = 'gidp' # to match Lahman 112 | b_cols_new['b_hp'] = 'hbp' # to match Lahman 113 | batting.rename(columns=b_cols_new, inplace=True) 114 | 115 | # add game_start.dt.year as many queries use year 116 | batting = pd.merge(batting, game_start[['game_id', 'game_start']]) 117 | batting['year'] = batting['game_start'].dt.year.astype('int16') 118 | 119 | dh.optimize_df_dtypes(batting, ignore=['year']) 120 | logger.info('Writing and compressing batting.
This could take several minutes ...') 121 | dh.to_csv_with_types(batting, p_retrosheet_wrangled / 'batting.csv.gz') 122 | 123 | 124 | def create_pitching(player_game, game_start, p_retrosheet_wrangled): 125 | """Create pitching.csv for pitching attributes per player per game.""" 126 | # column names of the pitching attributes 127 | p_cols = [col for col in player_game.columns if col.startswith('p_')] 128 | 129 | # if all pitching attributes are 0 then the player did not pitch 130 | # note: all attributes are unsigned integers, so if their sum is zero, all are zero 131 | p_filt = player_game[p_cols].sum(axis=1) == 0 132 | 133 | # fields which uniquely identify a record 134 | pkey = ['game_id', 'player_id'] 135 | 136 | # fields to join to other "tables" 137 | fkey = ['team_id'] 138 | 139 | # data with some non-zero attributes 140 | pitching = player_game.loc[~p_filt, pkey + fkey + p_cols].copy() 141 | 142 | # remove p_ from the column names; rename p_2b and p_3b to valid identifiers 143 | p_cols_new = {col: col[2:] for col in p_cols} 144 | p_cols_new['p_2b'] = 'double' 145 | p_cols_new['p_3b'] = 'triple' 146 | p_cols_new['p_gdp'] = 'gidp' # to match Lahman 147 | p_cols_new['p_hp'] = 'hbp' # to match Lahman 148 | pitching.rename(columns=p_cols_new, inplace=True) 149 | 150 | # add game_start.dt.year as many queries use year 151 | pitching = pd.merge(pitching, game_start[['game_id', 'game_start']]) 152 | pitching['year'] = pitching['game_start'].dt.year.astype('int16') 153 | 154 | dh.optimize_df_dtypes(pitching, ignore=['year']) 155 | logger.info('Writing and compressing pitching. This could take several minutes ...') 156 | dh.to_csv_with_types(pitching, p_retrosheet_wrangled / 'pitching.csv.gz') 157 | 158 | 159 | def create_fielding(player_game, game_start, p_retrosheet_wrangled): 160 | """Create fielding.csv for fielding attributes per player per game.""" 161 | # column names for fielding attributes 162 | f_cols = [col for col in player_game.columns if col.startswith('f_')] 163 | 164 | # create orig_cols dictionary which maps fielder's pos to original fielding column names 165 | # create new_cols dictionary which maps fielder's pos to new fielding column names 166 | # pos: P, C, 1B, 2B, 3B, SS, LF, CF, RF 167 | # column name pattern: f_{pos}_{stat} 168 | orig_cols = collections.defaultdict(list) 169 | new_cols = collections.defaultdict(list) 170 | for col in f_cols: 171 | match = re.search(r'f_(\w{1,2})_(\w*)', col) 172 | pos = match.group(1) 173 | stat = match.group(2) 174 | orig_cols[pos].append(col) 175 | stat = stat.replace('out', 'inn_outs') # to match Lahman 176 | new_cols[pos].append(stat) 177 | 178 | # full pkey will be: ['game_id', 'player_id', 'pos'] 179 | pkey = ['game_id', 'player_id'] 180 | 181 | # fields to join to other "tables" 182 | fkey = ['team_id'] 183 | 184 | """For each record created by cwdaily, create up to 9 new records, one per position. 185 | Each record will temporarily go in its own dataframe and then be concatenated.
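For example, a player who catches and later moves to first base in the same game yields two records: one with pos='C' and one with pos='1B'.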
186 | 187 | Each dataframe has the same columns.""" 188 | dfs = [] 189 | for pos in orig_cols.keys(): 190 | # if all fielding attributes for this pos are 0 then the player did not play that pos 191 | # note: all attributes are unsigned integers 192 | f_filt = player_game[orig_cols[pos]].sum(axis=1) == 0 193 | 194 | df = pd.DataFrame() 195 | df[pkey + fkey + new_cols[pos]] = \ 196 | player_game.loc[~f_filt, pkey + fkey + orig_cols[pos]].copy() 197 | 198 | # add the position column to the df 199 | # use upper case to match Lahman position values 200 | df.insert(2, 'pos', pos.upper()) 201 | 202 | # orig_cols['c'] has pb and xi columns 203 | # all other positions do not have pb and xi 204 | if pos != 'c': 205 | df['pb'] = 0 206 | df['xi'] = 0 207 | 208 | dfs.append(df) 209 | 210 | fielding = pd.concat(dfs, ignore_index=True) 211 | 212 | # add game_start.dt.year as many queries use year 213 | fielding = pd.merge(fielding, game_start[['game_id', 'game_start']]) 214 | fielding['year'] = fielding['game_start'].dt.year.astype('int16') 215 | 216 | dh.optimize_df_dtypes(fielding, ignore=['year']) 217 | logger.info('Writing and compressing fielding. This could take several minutes ...') 218 | dh.to_csv_with_types(fielding, p_retrosheet_wrangled / 'fielding.csv.gz') 219 | 220 | 221 | def wrangle_game(game, p_retrosheet_wrangled): 222 | """Tidy the Game Data 223 | 224 | There are 3 types of data: 225 | 226 | data specific to a game -- the 'game' columns below 227 | data specific to the home team for that game -- the 'home' columns below 228 | data specific to the away team for that game -- the 'away' columns below 229 | The attributes for the home team are identical to the attributes for the away team. 230 | 231 | This suggests breaking this out into 2 csv files. 232 | 233 | 1. team_game.csv with key (game_id, team_id) -- stats per team per game (e.g. runs scored) 234 | 2. game.csv with key (game_id) -- stats per game (e.g.
attendance) 235 | """ 236 | 237 | home_cols = [col for col in game.columns if col.startswith('home')] 238 | away_cols = [col for col in game.columns if col.startswith('away')] 239 | game_cols = [col for col in game.columns 240 | if not col.startswith('home') and not col.startswith('away')] 241 | 242 | game_tidy = game[game_cols].copy() 243 | home_team_game = game[['game_id'] + home_cols].copy() 244 | away_team_game = game[['game_id'] + away_cols].copy() 245 | 246 | home_team_game['bat_last'] = True 247 | away_team_game['bat_last'] = False 248 | home_team_game = dh.move_column_after(home_team_game, 'game_id', 'bat_last') 249 | away_team_game = dh.move_column_after(away_team_game, 'game_id', 'bat_last') 250 | 251 | # remove the leading 'home_' or 'away_' prefix from fields 252 | home_team_game.rename(columns=lambda col: col[5:] if col.startswith('home_') else col, inplace=True) 253 | away_team_game.rename(columns=lambda col: col[5:] if col.startswith('away_') else col, inplace=True) 254 | 255 | # include opponent team_id in each row 256 | home_team_game.insert(4, 'opponent_team_id', away_team_game['team_id']) 257 | away_team_game.insert(4, 'opponent_team_id', home_team_game['team_id']) 258 | team_game = pd.concat([home_team_game, away_team_game]) 259 | 260 | # improve column names 261 | names = {col: col.replace('_ct', '') for col in team_game.columns if col.endswith('_ct')} 262 | 263 | # handle invalid identifiers 264 | names['2b_ct'] = 'double' 265 | names['3b_ct'] = 'triple' 266 | 267 | # pitcher_ct (number of pitchers) is a good name though, keep it 268 | names.pop('pitcher_ct') 269 | 270 | # additional fields to rename for consistency 271 | names['bi_ct'] = 'rbi' 272 | names['gdp_ct'] = 'gidp' 273 | names['hits_ct'] = 'h' 274 | names['hp_ct'] = 'hbp' 275 | names['err_ct'] = 'e' 276 | names['score_ct'] = 'r' 277 | 278 | team_game = team_game.rename(columns=names) 279 | 280 | # create new datetime column 281 | game_tidy['game_start'] = game_tidy.apply(parse_datetime, axis=1) 282 | game_tidy = dh.move_column_after(game_tidy, 'game_id', 'game_start') 283 | 284 | # these fields are no longer necessary 285 | game_tidy = game_tidy.drop(['start_game_tm', 'game_dt', 'game_dy'], axis=1) 286 | 287 | # add the game_start column to team_game to simplify queries 288 | team_game = pd.merge(team_game, game_tidy[['game_id', 'game_start']]) 289 | team_game['year'] = team_game['game_start'].dt.year.astype('int16') 290 | 291 | logger.info('Writing and compressing team_game.
This could take several minutes ...') 292 | dh.optimize_df_dtypes(team_game, ignore=['year']) 293 | dh.to_csv_with_types(team_game, p_retrosheet_wrangled / 'team_game.csv.gz') 294 | 295 | # convert designated hitter to True/False and rename 296 | game_tidy['dh'] = False 297 | filt = game_tidy['dh_fl'] == 'T' 298 | game_tidy.loc[filt, 'dh'] = True 299 | game_tidy.drop('dh_fl', axis=1, inplace=True) 300 | 301 | # convert impossible attendance values to null and rename 302 | filt = game_tidy['attend_park_ct'] <= 0 303 | impossible_values = game_tidy.loc[filt, 'attend_park_ct'].unique() 304 | game_tidy['attendance'] = game_tidy['attend_park_ct'].replace(impossible_values, np.nan) 305 | game_tidy.drop('attend_park_ct', axis=1, inplace=True) 306 | 307 | # convert impossible temperature values to null and rename 308 | filt = game_tidy['temp_park_ct'] <= 0 309 | impossible_values = game_tidy.loc[filt, 'temp_park_ct'].unique() 310 | game_tidy['temperature'] = game_tidy['temp_park_ct'].replace(impossible_values, np.nan) 311 | game_tidy.drop('temp_park_ct', axis=1, inplace=True) 312 | 313 | # replace code values with strings 314 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-winddirection 315 | direction = { 316 | 0: 'unknown', 317 | 1: 'to_lf', 318 | 2: 'to_cf', 319 | 3: 'to_rf', 320 | 4: 'l_to_r', 321 | 5: 'from_lf', 322 | 6: 'from_cf', 323 | 7: 'from_rf', 324 | 8: 'r_to_l'} 325 | game_tidy['wind_direction'] = \ 326 | game_tidy['wind_direction_park_cd'].map(direction).replace('unknown', np.nan) 327 | game_tidy.drop('wind_direction_park_cd', axis=1, inplace=True) 328 | 329 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-windspeed 330 | # convert impossible wind speed values to null and rename 331 | filt = game_tidy['wind_speed_park_ct'] < 0 332 | impossible_values = game_tidy.loc[filt, 'wind_speed_park_ct'].unique() 333 | game_tidy['wind_speed'] = game_tidy['wind_speed_park_ct'].replace(impossible_values, np.nan) 334 | game_tidy.drop('wind_speed_park_ct', axis=1, inplace=True) 335 | 336 | # replace code values with strings 337 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-fieldcondition 338 | condition = { 339 | 0: 'unknown', 340 | 1: 'soaked', 341 | 2: 'wet', 342 | 3: 'damp', 343 | 4: 'dry'} 344 | game_tidy['field_condition'] = \ 345 | game_tidy['field_park_cd'].map(condition).replace('unknown', np.nan) 346 | game_tidy.drop('field_park_cd', axis=1, inplace=True) 347 | 348 | # replace code values with strings 349 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-precipitation 350 | precip = { 351 | 0: 'unknown', 352 | 1: 'none', 353 | 2: 'drizzle', 354 | 3: 'showers', 355 | 4: 'rain', 356 | 5: 'snow'} 357 | game_tidy['precip_type'] = \ 358 | game_tidy['precip_park_cd'].map(precip).replace('unknown', np.nan) 359 | game_tidy.drop('precip_park_cd', axis=1, inplace=True) 360 | 361 | # replace code values with strings 362 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-sky 363 | sky = { 364 | 0: 'unknown', 365 | 1: 'sunny', 366 | 2: 'cloudy', 367 | 3: 'overcast', 368 | 4: 'night', 369 | 5: 'dome'} 370 | game_tidy['sky_condition'] = \ 371 | game_tidy['sky_park_cd'].map(sky).replace('unknown', np.nan) 372 | game_tidy.drop('sky_park_cd', axis=1, inplace=True) 373 | 374 | logger.info('Writing and compressing game. 
This could take several minutes ...') 375 | dh.optimize_df_dtypes(game_tidy) 376 | dh.to_csv_with_types(game_tidy, p_retrosheet_wrangled / 'game.csv.gz') 377 | 378 | # to add game date to other tables 379 | return game_tidy[['game_id', 'game_start']] 380 | 381 | 382 | def parse_datetime(row): 383 | """Determine AM/PM from MLB domain knowledge and Day/Night Flag 384 | 385 | Here is the relevant information. 386 | 387 | * am/pm is not specified 388 | * start_game_tm is an integer 389 | * example: 130 represents 1:30 (am or pm) 390 | * start_game_tm == 0 means the game start time is unknown 391 | * there are no start_game_tm < 100 that are not exactly zero 392 | * daynight_park_cd is never missing 393 | * based on the data, almost always a game that starts between 5 and 9 is classified as a night game 394 | This is likely because "night" actually means that the stadium lights must be turned on before a 395 | game of typical length ends. 396 | * MLB domain knowledge: A game may start "early" to allow for travel, but games never start 397 | before 9 am so: 100 <= start_game_tm < 900 => pm 398 | * example: 830 => 8:30 pm 399 | * MLB domain knowledge: A game may start "late" due to rain delay, but games never start 400 | after midnight so: 900 <= start_game_tm < 1200 => am or pm depending on the day/night flag 401 | * example: 1030 Day => 10:30 am 402 | * example: 1030 Night => 10:30 pm 403 | """ 404 | date = row['game_dt'] 405 | time = row['start_game_tm'] 406 | day_night = row['daynight_park_cd'] 407 | 408 | if 0 < time < 900: 409 | time += 1200 410 | elif (900 <= time < 1200) and day_night == 'N': 411 | time += 1200 412 | 413 | time_str = f'{time // 100:02d}:{time % 100:02d}' 414 | datetime_str = str(date) + ' ' + time_str 415 | return pd.to_datetime(datetime_str, format='%Y%m%d %H:%M') 416 | 417 | 418 | def wrangle_event(p_retrosheet_collected, p_retrosheet_wrangled): 419 | """Wrangle event 420 | 421 | At this time, there is nothing to do, just copy the collected data.""" 422 | source = p_retrosheet_collected / 'event.csv.gz' 423 | destination = p_retrosheet_wrangled / 'event.csv.gz' 424 | shutil.copyfile(source, destination) 425 | 426 | source = p_retrosheet_collected / 'event_types.csv' 427 | destination = p_retrosheet_wrangled / 'event_types.csv' 428 | shutil.copyfile(source, destination) 429 | 430 | 431 | def wrangle_parks(data_dir, retrosheet_wrangle): 432 | parks_filename = data_dir / 'retrosheet/raw/misc/parkcode.txt' 433 | parks = pd.read_csv(parks_filename, parse_dates=['START', 'END']) 434 | cols = [col.lower() for col in parks.columns] 435 | parks.columns = cols 436 | parks = parks.rename(columns={'parkid': 'park_id'}) 437 | dh.to_csv_with_types(parks, retrosheet_wrangle / 'parks.csv') 438 | 439 | 440 | def wrangle_teams(data_dir, retrosheet_wrangle): 441 | team_dir = data_dir / 'retrosheet/raw/event/regular' 442 | 443 | dfs = [] 444 | team_files = team_dir.glob('TEAM*') 445 | for team in sorted(team_files): 446 | year = int(team.name[-4:]) 447 | df = pd.read_csv(team, header=None, names=['team_id', 'lg_id', 'city', 'name']) 448 | df.insert(1, 'year', year) 449 | dfs.append(df) 450 | retro_teams = pd.concat(dfs, ignore_index=True) 451 | dh.to_csv_with_types(retro_teams, retrosheet_wrangle / 'teams.csv') 452 | 453 | 454 | def main(): 455 | """Wrangle the data.
456 | """ 457 | parser = get_parser() 458 | args = parser.parse_args() 459 | 460 | if args.log_level: 461 | fh = logging.FileHandler('download.log') 462 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 463 | fh.setFormatter(formatter) 464 | fh.setLevel(args.log_level) 465 | logger.addHandler(fh) 466 | 467 | if args.verbose: 468 | # send INFO level logging to stdout 469 | sh = logging.StreamHandler(sys.stdout) 470 | formatter = logging.Formatter('%(asctime)s:%(name)s:%(levelname)s: %(message)s') 471 | sh.setFormatter(formatter) 472 | sh.setLevel(logging.INFO) 473 | logger.addHandler(sh) 474 | 475 | data_dir = Path(args.data_dir) 476 | p_retrosheet_collected = (data_dir / 'retrosheet/collected').resolve() 477 | p_retrosheet_wrangled = (data_dir / 'retrosheet/wrangled').resolve() 478 | 479 | # get collected data from parsers 480 | game = get_game(p_retrosheet_collected) # cwgame 481 | game_start = wrangle_game(game, p_retrosheet_wrangled) 482 | 483 | player_game = get_player_game(p_retrosheet_collected) # cwdaily 484 | player_game = clean_player_game(player_game) 485 | 486 | create_batting(player_game, game_start, p_retrosheet_wrangled) 487 | create_pitching(player_game, game_start, p_retrosheet_wrangled) 488 | create_fielding(player_game, game_start, p_retrosheet_wrangled) 489 | 490 | wrangle_event(p_retrosheet_collected, p_retrosheet_wrangled) # cwevent 491 | 492 | # parkcode.txt is included with the Retrosheet data. It is a csv file. 493 | wrangle_parks(data_dir, p_retrosheet_wrangled) 494 | 495 | # TEAM files are included in the Retrosheet data. They are csv files. 496 | wrangle_teams(data_dir, p_retrosheet_wrangled) 497 | 498 | logger.info('Finished') 499 | 500 | 501 | if __name__ == '__main__': 502 | main() 503 | -------------------------------------------------------------------------------- /download_scripts/tests/test_data.py: -------------------------------------------------------------------------------- 1 | __author__ = 'Stephen Diehl' 2 | 3 | import zipfile 4 | import re 5 | import pandas as pd 6 | import numpy as np 7 | from ..
import data_helper as dh 8 | 9 | 10 | def test_lahman_download(data_dir): 11 | """Verify the Lahman Data was downloaded, unzipped and reorganized.""" 12 | lahman_dir = data_dir / 'lahman' 13 | raw_dir = lahman_dir / 'raw' 14 | wrangled_dir = lahman_dir / 'wrangled' 15 | 16 | assert lahman_dir.is_dir() 17 | assert wrangled_dir.is_dir() 18 | assert raw_dir.is_dir() 19 | 20 | # 2 directories and 1 file 21 | assert len(list(lahman_dir.iterdir())) == 3 22 | 23 | # zip from master branch of https://github.com/chadwickbureau/baseballdatabank 24 | zipfilename = raw_dir.joinpath('baseballdatabank-master.zip') 25 | assert zipfilename.is_file() 26 | 27 | zipped = zipfile.ZipFile(zipfilename) 28 | zip_core_files = [file for file in zipped.namelist() 29 | if file.startswith('baseballdatabank-master/core/') and 30 | file.endswith('.csv')] 31 | 32 | # each csv file in the zipfile should be in raw_dir 33 | assert len(list(raw_dir.glob('*.csv'))) == len(zip_core_files) 34 | 35 | 36 | def test_retrosheet_download(data_dir): 37 | """Verify the Retrosheet data was downloaded and unzipped.""" 38 | retrosheet_dir = data_dir / 'retrosheet' 39 | raw_dir = retrosheet_dir / 'raw' 40 | wrangled_dir = retrosheet_dir / 'wrangled' 41 | 42 | assert retrosheet_dir.is_dir() 43 | assert wrangled_dir.is_dir() 44 | assert raw_dir.is_dir() 45 | 46 | teams = raw_dir.glob('TEAM*') 47 | years = sorted([team.name[4:] for team in teams]) 48 | 49 | for year in years: 50 | zipdata = raw_dir.joinpath(f'{year}eve.zip') 51 | assert zipdata.exists() 52 | 53 | # should be same number of files in raw_dir as in zipfile 54 | files = [file for file in raw_dir.glob(f'*{year}*') if not file.name.endswith('.zip')] 55 | zipped = zipfile.ZipFile(zipdata) 56 | assert len(files) == len(zipped.namelist()) 57 | 58 | 59 | def test_download_years(batting): 60 | """Verify the Retrosheet years 1974 through 2019 inclusive were downloaded.
61 | 62 | The data consistency tests have accuracy bounds tested on these years only!""" 63 | assert (batting['year'].agg(['min', 'max']) == (1974, 2019)).all() 64 | assert batting['year'].nunique() == (2019 - 1974) + 1 65 | 66 | 67 | def test_lahman_people_pkey(lahman_people): 68 | """Verify the Lahman People primary and foreign keys.""" 69 | assert dh.is_unique(lahman_people, ['player_id']) # lahman player id 70 | assert dh.is_unique(lahman_people, ['retro_id'], ignore_null=True) # retrosheet player id 71 | 72 | 73 | def test_lahman_fielding_pkey(lahman_fielding): 74 | """Verify the Lahman Fielding primary keys.""" 75 | assert dh.is_unique(lahman_fielding, ['player_id', 'year', 'stint', 'pos']) 76 | 77 | 78 | def test_lahman_batting_pkey(lahman_batting): 79 | """Verify the Lahman Batting primary key.""" 80 | assert dh.is_unique(lahman_batting, ['player_id', 'year', 'stint']) 81 | 82 | 83 | def test_lahman_pitching_pkey(lahman_pitching): 84 | """Verify the Lahman Pitching primary key.""" 85 | assert dh.is_unique(lahman_pitching, ['player_id', 'year', 'stint']) 86 | 87 | 88 | def test_lahman_salaries_pkey(data_dir): 89 | """Verify the Lahman Salaries primary key.""" 90 | filename = data_dir / 'lahman' / 'wrangled' / 'salaries.csv' 91 | 92 | # check for duplicate IDs 93 | salaries = dh.from_csv_with_types(filename) 94 | assert dh.is_unique(salaries, ['player_id', 'year', 'team_id']) 95 | 96 | 97 | def test_lahman_teams_pkey(lahman_teams): 98 | """Verify the Lahman Teams primary key.""" 99 | assert dh.is_unique(lahman_teams, ['team_id', 'year']) # lahman team_id 100 | assert dh.is_unique(lahman_teams, ['team_id_retro', 'year']) # retrosheet team_id 101 | 102 | 103 | def test_lahman_parks_pkey(data_dir): 104 | """Verify the Lahman Parks primary key.""" 105 | filename = data_dir / 'lahman' / 'wrangled' / 'parks.csv' 106 | 107 | # check for duplicate IDs 108 | parks = dh.from_csv_with_types(filename) 109 | assert dh.is_unique(parks, ['park_key']) 110 | 111 | # park_name is not unique 112 | # assert dh.is_unique(parks, ['park_name']) 113 | 114 | 115 | def test_game_id(team_game): 116 | """Verify 1st 3 characters of game_id are the team batting last.""" 117 | filt = team_game['bat_last'] == False 118 | team_game['home_team_id'] = team_game['team_id'] 119 | team_game.loc[filt, 'home_team_id'] = team_game.loc[filt, 'opponent_team_id'] 120 | 121 | assert (team_game['game_id'].str[:3] == team_game['home_team_id']).all() 122 | 123 | 124 | def test_batting_flags(batting): 125 | """Verify the batting flags are 0 or 1. 126 | 127 | g means in the game in the specified role. 128 | For example, g_pr means in the game as a pinch runner.""" 129 | flag_cols = [ 130 | 'g', 131 | 'g_dh', 132 | 'g_ph', 133 | 'g_pr' 134 | ] 135 | 136 | assert batting[flag_cols].min().min() == 0 137 | assert batting[flag_cols].max().max() == 1 138 | 139 | 140 | def test_pitching_flags(pitching): 141 | """Verify the pitching flags are 0 or 1.
142 | 143 | For example: 144 | gs means the pitcher started the game 145 | gf means the pitcher finished the game""" 146 | flag_cols = [ 147 | 'g', 148 | 'gs', 149 | 'cg', 150 | 'sho', 151 | 'gf', 152 | 'w', 153 | 'l', 154 | 'sv' 155 | ] 156 | 157 | assert pitching[flag_cols].min().min() == 0 158 | assert pitching[flag_cols].max().max() == 1 159 | 160 | 161 | def test_fielding_flags(fielding): 162 | """Verify the fielding flags are either 0 or 1.""" 163 | flag_cols = [ 164 | 'g', 165 | 'gs' 166 | ] 167 | 168 | assert fielding[flag_cols].min().min() == 0 169 | assert fielding[flag_cols].max().max() == 1 170 | 171 | 172 | def test_batting_pkey(batting): 173 | """Verify the Retrosheet batting primary key.""" 174 | assert dh.is_unique(batting, ['player_id', 'game_id']) 175 | 176 | 177 | def test_pitching_pkey(pitching): 178 | """Verify the Retrosheet pitching primary key.""" 179 | assert dh.is_unique(pitching, ['player_id', 'game_id']) 180 | 181 | 182 | def test_fielding_pkey(fielding): 183 | """Verify the Retrosheet fielding primary key.""" 184 | assert dh.is_unique(fielding, ['player_id', 'game_id', 'pos']) 185 | 186 | 187 | def test_team_game_pkey(team_game): 188 | """Verify the Retrosheet team_game primary key.""" 189 | assert dh.is_unique(team_game, ['team_id', 'game_id']) 190 | 191 | 192 | def test_game_pkey(game): 193 | """Verify the Retrosheet game primary key.""" 194 | assert dh.is_unique(game, ['game_id']) 195 | 196 | 197 | def test_lahman_retro_batting_data(batting, lahman_batting): 198 | """Compare Aggregated Lahman batting data to Aggregated Retrosheet batting data""" 199 | # columns in common -- these are the columns to compare 200 | b_cols = set(batting.columns) & set(lahman_batting.columns) 201 | b_cols -= {'player_id', 'team_id', 'year'} 202 | 203 | # there are 17 columns in common 204 | assert len(b_cols) == 17 205 | 206 | l_batting = lahman_batting[b_cols] 207 | r_batting = batting[b_cols] 208 | 209 | l_sums = l_batting.agg('sum').astype(int) 210 | l_sums.sort_index(inplace=True) 211 | 212 | r_sums = r_batting.agg('sum').astype(int) 213 | r_sums.sort_index(inplace=True) 214 | 215 | # verify all 17 batting attributes 216 | # are within plus/minus 0.01% of each other when summed 217 | assert (np.abs(1.0 - (l_sums / r_sums)) < .0001).all() 218 | 219 | 220 | def test_lahman_retro_pitching_data(pitching, lahman_pitching): 221 | """Compare Aggregated Lahman pitching data to Aggregated Retrosheet pitching data""" 222 | # columns in common -- these are the columns to compare 223 | p_cols = set(lahman_pitching.columns) & set(pitching.columns) 224 | p_cols -= {'player_id', 'team_id', 'year'} 225 | 226 | # there are 21 columns in common 227 | assert len(p_cols) == 21 228 | 229 | l_pitching = lahman_pitching[p_cols] 230 | r_pitching = pitching[p_cols] 231 | 232 | l_sums = l_pitching.agg('sum').astype(int) 233 | l_sums.sort_index(inplace=True) 234 | 235 | r_sums = r_pitching.agg('sum').astype(int) 236 | r_sums.sort_index(inplace=True) 237 | 238 | # verify all values are within plus/minus 0.06% of each other 239 | assert (np.abs(1.0 - (l_sums / r_sums)) < .0006).all() 240 | 241 | 242 | def test_lahman_retro_fielding_data(fielding, lahman_fielding): 243 | """Compare Aggregated Lahman fielding per position data to 244 | Aggregated Retrosheet fielding per position data.""" 245 | # find the common columns 246 | f_cols = set(lahman_fielding.columns) & set(fielding.columns) 247 | f_cols -= {'player_id', 'pos', 'team_id', 'year'} 248 | f_cols = list(f_cols) 249 | 250 | # work-around for Pandas 
1.0.1 bugs 251 | # sum does not up-cast for nullable integer types 252 | # select_dtypes does not distinguish between nullable and non-nullable int types 253 | idx = lahman_fielding[f_cols].dtypes.isin([pd.UInt8Dtype(), pd.UInt16Dtype()]) 254 | for col in lahman_fielding[f_cols].columns[idx]: 255 | lahman_fielding[col] = lahman_fielding[col].astype('Int32') 256 | 257 | l_sums = lahman_fielding.groupby('pos')[f_cols].agg('sum') 258 | l_sums.sort_index(inplace=True) 259 | 260 | # there are 7 fielding attributes and 7 fielding positions in Lahman 261 | assert l_sums.shape == (7, 7) 262 | 263 | r_sums = fielding.groupby('pos')[f_cols].agg('sum').astype('int') 264 | 265 | # Lahman uses OF for sum of LF, CF, RF 266 | r_sums.loc['OF'] = r_sums.loc['LF'] + r_sums.loc['CF'] + r_sums.loc['RF'] 267 | r_sums = r_sums.drop(['LF', 'CF', 'RF']) 268 | r_sums.sort_index(inplace=True) 269 | 270 | # there are now 7 fielding attributes and 7 fielding positions in Retrosheet sums 271 | assert r_sums.shape == (7, 7) 272 | 273 | # the indexes and columns should now be the same 274 | assert l_sums.index.equals(r_sums.index) 275 | assert l_sums.columns.equals(r_sums.columns) 276 | 277 | filt = fielding['pos'].isin(['LF', 'CF', 'RF']) 278 | r_of = fielding[filt] 279 | 280 | # account for outfielders who played more than 1 outfield position in the same game 281 | total_dups = r_of.duplicated(subset=['player_id', 'game_id'], keep=False).sum() 282 | counted_dups = r_of.duplicated(subset=['player_id', 'game_id'], keep='first').sum() 283 | r_sums.loc['OF', 'g'] -= (total_dups - counted_dups) 284 | 285 | rel_accuracy = l_sums / r_sums 286 | 287 | # relative accuracy is within 0.8% for all 49 aggregated values 288 | assert (np.abs(1.0 - rel_accuracy) < 0.008).all().all() 289 | 290 | 291 | def test_batting_team_game_data(batting, team_game): 292 | """Verify Retrosheet batting aggregated by (game_id, team_id) 293 | is the same as team_game batting stats.""" 294 | exclude = ['game_id', 'team_id', 'player_id', 'game_start', 'year'] 295 | cols = set(batting.columns) & set(team_game.columns) - set(exclude) 296 | cols = list(cols) 297 | 298 | assert len(cols) == 17 299 | 300 | b = batting[['game_id', 'team_id'] + cols].groupby(['game_id', 'team_id']).agg('sum') 301 | b = b.reset_index().sort_index() 302 | 303 | tg = team_game[['game_id', 'team_id'] + cols].sort_values( 304 | ['game_id', 'team_id']).reset_index(drop=True) 305 | 306 | assert b.equals(tg) 307 | 308 | 309 | def test_pitching_team_game_data(pitching, team_game): 310 | """Verify Retrosheet pitching aggregated by (game_id, team_id) 311 | is the same as team_game pitching stats 312 | 313 | This shows that the two Retrosheet parsers are consistent with one another.""" 314 | cols = ['wp', 'bk', 'er'] 315 | 316 | p = pitching[['game_id', 'team_id'] + cols].groupby(['game_id', 'team_id']).agg('sum') 317 | p = p.reset_index().sort_index() 318 | 319 | tg = team_game[['game_id', 'team_id'] + cols].sort_values( 320 | ['game_id', 'team_id']).reset_index(drop=True) 321 | 322 | assert p.equals(tg) 323 | 324 | 325 | def test_fielding_team_game_data(fielding, team_game): 326 | """Verify Retrosheet fielding aggregated by (game_id, team_id) 327 | is the same as team_game fielding stats 328 | 329 | This shows that the two Retrosheet parsers are consistent with one another.""" 330 | cols = ['a', 'e', 'po', 'pb'] 331 | 332 | f = fielding[['game_id', 'team_id'] + cols].groupby(['game_id', 'team_id']).agg('sum') 333 | f = f.reset_index().sort_index() 334 | 335 | tg = team_game[['game_id',
'team_id'] + cols].sort_values( 336 | ['game_id', 'team_id']).reset_index(drop=True) 337 | 338 | assert f.equals(tg) 339 | 340 | 341 | def test_batting_lahman_game_data(batting, lahman_teams): 342 | """Verify Retrosheet batting aggregated by (year, team_id_lahman) 343 | is the same as Lahman_teams. 344 | 345 | This shows that Retrosheet batting and Lahman Teams are consistent with each other.""" 346 | # Add team_id_lahman 347 | retro_batting = pd.merge(batting, lahman_teams[['team_id', 'year', 'team_id_retro']], 348 | left_on=['year', 'team_id'], 349 | right_on=['year', 'team_id_retro'], 350 | how='inner', suffixes=['_retrosheet', '_lahman']) 351 | 352 | # team_id_retro is now the same as team_id_retrosheet 353 | retro_batting.drop('team_id_retro', axis=1, inplace=True) 354 | 355 | pkey = ['year', 'team_id'] 356 | compare_cols = set(lahman_teams.columns) & set(retro_batting.columns) - set(pkey) 357 | compare_cols -= {'g'} # cannot sum g by player per team to get g per team 358 | compare_cols -= {'sb', 'cs'} # these stats are close, but don't tie out as well as others 359 | compare_cols = list(compare_cols) 360 | 361 | assert len(compare_cols) == 10 362 | 363 | retro_batting_sums = retro_batting.groupby(['year', 'team_id_lahman'])[compare_cols].sum().astype('int') 364 | retro_batting_sums.sort_index(inplace=True) 365 | 366 | year_min, year_max = retro_batting['year'].aggregate(['min', 'max']) 367 | year_filt = (lahman_teams['year'] >= year_min) & (lahman_teams['year'] <= year_max) 368 | l_teams = lahman_teams.loc[year_filt, pkey + compare_cols] 369 | l_teams = l_teams.set_index(pkey).sort_index() 370 | 371 | # verify all 12880 values are within 0.5% of each other 372 | assert np.abs(1.0 - (l_teams / retro_batting_sums)).max().max() < 0.005 373 | 374 | 375 | def test_attendance_values(game): 376 | """Verify attendance has plausible values.""" 377 | # There was one baseball game in which the public was not allowed to attend. 378 | # This is considered null rather than 0, as people wanted to attend, but were not allowed. 
379 | # https://www.baseball-reference.com/boxes/BAL/BAL201504290.shtml 380 | assert game['attendance'].min() > 0 381 | 382 | 383 | def test_temperature_values(game): 384 | """Verify temperature has plausible values.""" 385 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-temperature 386 | assert game['temperature'].min() > 0 387 | 388 | 389 | def test_wind_speed_values(game): 390 | """Verify wind speed has plausible values.""" 391 | assert game['wind_speed'].min() >= 0 392 | 393 | 394 | def test_wind_direction_values(game): 395 | """Verify wind direction is in a known category.""" 396 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-winddirection 397 | valid_values = ['to_lf', 'to_cf', 'to_rf', 'l_to_r', 'from_lf', 'from_cf', 398 | 'from_rf', 'r_to_l'] 399 | assert game['wind_direction'].dropna().isin(valid_values).all() 400 | 401 | 402 | def test_field_condition_values(game): 403 | """Verify field condition is in a known category.""" 404 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-fieldcondition 405 | valid_values = ['soaked', 'wet', 'damp', 'dry'] 406 | assert game['field_condition'].dropna().isin(valid_values).all() 407 | 408 | 409 | def test_precip_type_values(game): 410 | """Verify precipitation type is in a known category.""" 411 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-precipitation 412 | valid_values = ['none', 'drizzle', 'showers', 'rain', 'snow'] 413 | assert game['precip_type'].dropna().isin(valid_values).all() 414 | 415 | 416 | def test_sky_condition_values(game): 417 | """Verify sky condition is in a known category.""" 418 | # http://chadwick.sourceforge.net/doc/cwgame.html#cwtools-cwgame-sky 419 | valid_values = ['sunny', 'cloudy', 'overcast', 'night', 'dome'] 420 | assert game['sky_condition'].dropna().isin(valid_values).all() 421 | 422 | 423 | def test_game_length_values(game): 424 | """Verify number of outs is consistent with number of innings.""" 425 | outs = game['outs_ct'] 426 | inns = game['inn_ct'] 427 | 428 | # this is defined by the rules of baseball 429 | assert ((5 * inns <= outs) & (outs <= 6 * inns)).all() 430 | 431 | 432 | def test_game_length_minute_values(game): 433 | """Verify game length per out is plausible.""" 434 | outs = game['outs_ct'] 435 | mins = game['minutes_game_ct'] 436 | mins_per_out = mins / outs 437 | 438 | # these bounds should be wide enough to encompass any future game 439 | assert mins_per_out.min() > 1 and mins_per_out.max() < 6 440 | 441 | 442 | def test_retro_lahman_batting_players(batting, lahman_people, lahman_batting): 443 | """Verify all Retrosheet batters are in Lahman batting""" 444 | lahman_batters = pd.merge(lahman_batting['player_id'], lahman_people[['player_id', 'retro_id']]) 445 | r_batters = set(batting['player_id'].unique()) 446 | l_batters = set(lahman_batters['retro_id'].unique()) 447 | assert r_batters == l_batters 448 | 449 | 450 | def test_retro_lahman_fielding_players(fielding, lahman_people, lahman_fielding): 451 | """Verify all Retrosheet fielders are in Lahman fielding""" 452 | lahman_fielders = pd.merge(lahman_fielding['player_id'], lahman_people[['player_id', 'retro_id']]) 453 | r_fielders = set(fielding['player_id'].unique()) 454 | l_fielders = set(lahman_fielders['retro_id'].unique()) 455 | 456 | # There is one Retrosheet fielder not in Lahman fielding 457 | assert len(r_fielders - l_fielders) == 1 458 | assert len(l_fielders - r_fielders) == 0 459 | 460 | missing_fielder = f'{(r_fielders - l_fielders).pop()}' 461 | missing =
fielding.query(f'player_id == "{missing_fielder}"') 462 | 463 | # The missing fielder had zero fielding total chances. 464 | assert missing['tc'].sum() == 0 465 | 466 | # The missing fielder was on the field for no outs. 467 | assert missing['inn_outs'].sum() == 0 468 | 469 | 470 | def test_retro_lahman_pitching_players(pitching, lahman_pitching, lahman_people): 471 | """Verify all Retrosheet pitchers are in Lahman pitchers""" 472 | lahman_pitchers = pd.merge(lahman_pitching['player_id'], lahman_people[['player_id', 'retro_id']]) 473 | r_pitchers = set(pitching['player_id'].unique()) 474 | l_pitchers = set(lahman_pitchers['retro_id'].unique()) 475 | assert r_pitchers == l_pitchers 476 | 477 | 478 | def test_retro_lahman_player_ids(batting, lahman_people): 479 | """Verify the inverse of Lahman player_id to Retrosheet player_id mapping is valid. 480 | 481 | In other words, each Retrosheet player_id is mapped to exactly one Lahman player_id. 482 | 483 | Other tests verify that Retrosheet player_ids and Lahman player_ids are unique. 484 | 485 | Note: every player who was in a game has a Retrosheet batting record even if 486 | they had no plate appearances.""" 487 | retro_players = pd.Series(batting['player_id'].unique(), name='player_id') 488 | 489 | # use an inner join to verify that the mapping is one-to-one and onto 490 | mapping = lahman_people[['player_id', 'retro_id']].merge( 491 | retro_players, how='inner', 492 | left_on=['retro_id'], 493 | right_on=['player_id'], 494 | suffixes=('_lahman', '_retro')) 495 | 496 | assert len(retro_players) == len(mapping) 497 | 498 | 499 | def test_retro_lahman_team_ids(team_game, lahman_teams): 500 | """Verify the inverse of the Lahman to Retrosheet mapping is valid. 501 | A team is identified by (team_id, year). 502 | 503 | The logic is analogous to test_retro_lahman_player_ids() above.""" 504 | 505 | # create a Retrosheet dataframe having just the unique values 506 | retro_team_ids = team_game[['team_id', 'year']].copy() 507 | retro_team_ids = retro_team_ids.drop_duplicates(subset=['team_id', 'year']) 508 | 509 | # use an inner join to verify that the mapping is one-to-one and onto 510 | mapping = lahman_teams.merge(retro_team_ids, how='inner', 511 | left_on=['team_id_retro', 'year'], 512 | right_on=['team_id', 'year']) 513 | 514 | assert len(retro_team_ids) == len(mapping) 515 | 516 | 517 | def test_retro_pitching_batting(pitching, batting): 518 | """Verify Retrosheet batting stats == pitching stats (allowed)""" 519 | exclude = ['game_id', 'team_id', 'player_id', 'g', 'game_start', 'year'] 520 | cols = set(pitching.columns) & set(batting.columns) - set(exclude) 521 | cols = list(cols) 522 | assert len(cols) == 16 523 | 524 | # sum over all pitchers over all years 525 | p = pitching[cols].agg('sum') 526 | 527 | # sum over all batters over all years 528 | b = batting[cols].agg('sum') 529 | 530 | # Retrosheet is completely consistent 531 | assert p.equals(b) 532 | 533 | 534 | def test_lahman_pitching_batting(lahman_pitching, lahman_batting): 535 | """Verify Lahman batting stats == pitching stats (allowed)""" 536 | exclude = ['lg_id', 'player_id', 'stint', 'team_id', 'year', 'g'] 537 | cols = set(lahman_pitching.columns) & set(lahman_batting.columns) 538 | cols -= set(exclude) 539 | assert len(cols) == 10 540 | 541 | # sum over all pitchers over all years 542 | p = lahman_pitching[cols].agg('sum') 543 | 544 | # sum over all batters over all years 545 | b = lahman_batting[cols].agg('sum') 546 | 547 | # the biggest difference is less than 0.01% 548 | assert np.abs(1.0 - p / b).max()
< 0.0001 549 | 550 | 551 | def test_lahman_batting_teams(lahman_batting, lahman_teams): 552 | """Verify Lahman batting aggregated to the team level matches Lahman teams.""" 553 | exclude = ['lg_id', 'team_id', 'year', 'g'] 554 | key = ['team_id', 'year'] 555 | cols = set(lahman_batting.columns) & set(lahman_teams.columns) - set(exclude) 556 | cols = list(cols) 557 | assert len(cols) == 12 558 | 559 | # work-around for Pandas 1.0.1 bugs 560 | # sum does not up-cast for nullable integer types 561 | # select_dtypes does not distinguish between nullable and non-nullable int types 562 | idx = lahman_batting[cols].dtypes.isin([pd.UInt8Dtype(), pd.UInt16Dtype()]) 563 | for col in lahman_batting[cols].columns[idx]: 564 | lahman_batting[col] = lahman_batting[col].astype('Int32') 565 | 566 | idx = lahman_teams[cols].dtypes.isin([pd.UInt8Dtype(), pd.UInt16Dtype()]) 567 | for col in lahman_teams[cols].columns[idx]: 568 | lahman_teams[col] = lahman_teams[col].astype('Int32') 569 | 570 | b = lahman_batting[key + cols].groupby(key).agg('sum').reset_index() 571 | 572 | t = lahman_teams[key + cols].sort_values(key).reset_index(drop=True) 573 | 574 | # ensure the dtypes are the same 575 | for col in t.columns: 576 | if not col == 'team_id' and not col == 'year': 577 | b[col] = b[col].astype('int') 578 | t[col] = t[col].astype('int') 579 | 580 | assert b[cols].equals(t[cols]) 581 | 582 | 583 | def test_lahman_pitching_teams(lahman_pitching, lahman_teams): 584 | """Verify Lahman pitching aggregated to the team level matches Lahman teams.""" 585 | # most of the common columns are for batting, not pitching 586 | # era cannot be summed 587 | # sho for team is counted differently than for pitcher 588 | # er for team is counted differently than for pitcher 589 | exclude = ['lg_id', 'team_id', 'year', 'g', 'era', 590 | 'bb', 'h', 'hbp', 'hr', 'r', 'sf', 'so', 'sho', 'er'] 591 | key = ['team_id', 'year'] 592 | cols = set(lahman_pitching.columns) & set(lahman_teams.columns) - set(exclude) 593 | cols = list(cols) 594 | assert len(cols) == 5 595 | 596 | p = lahman_pitching[key + cols].groupby(key).agg('sum').reset_index() 597 | 598 | t = lahman_teams[key + cols].sort_values(key).reset_index(drop=True) 599 | 600 | # dtypes need to be the same 601 | for col in p.columns: 602 | if not col == 'year' and not col == 'team_id': 603 | p[col] = p[col].astype('int') 604 | t[col] = t[col].astype('int') 605 | 606 | assert np.abs(p[cols] - t[cols]).max().max() == 1 607 | 608 | 609 | def test_lahman_fielding_teams(lahman_fielding, lahman_teams): 610 | """Verify Lahman fielding aggregated to the team level matches Lahman teams.""" 611 | # dp is excluded because in fielding, each fielder involved gets a dp 612 | # whereas in team only one dp is counted 613 | exclude = ['lg_id', 'team_id', 'year', 'g', 'dp', 'player_id'] 614 | key = ['team_id', 'year'] 615 | cols = set(lahman_fielding.columns) & set(lahman_teams.columns) - set(exclude) 616 | cols = list(cols) 617 | assert len(cols) == 1 618 | 619 | f = lahman_fielding[key + cols].groupby(key).agg('sum').reset_index() 620 | 621 | t = lahman_teams[key + cols].sort_values(key).reset_index(drop=True) 622 | 623 | # ensure the dtypes are the same 624 | col = 'e' 625 | f[cols] = f[cols].astype('int') 626 | t[cols] = t[cols].astype('int') 627 | 628 | # When comparing large values, it is best to use their relative differences. 629 | # When comparing small values, it is best to use their absolute differences. 
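# e.g. error totals of 1000 vs 1002 differ by only 0.2% in relative terms, while 8 vs 10 differ by 20%; for small counts an absolute bound (here at most 2) is the fairer comparison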
631 | 
632 | 
633 | def test_event(event, team_game):
634 |     """Verify play-by-play data aggregated per team per game matches team_game data.
635 | 
636 |     About 10 fields were added to the cwevent output by custom parsing of event_tx.
637 |     These 10 fields are included in this test."""
638 | 
639 |     key = ['game_id', 'team_id', 'opponent_team_id']
640 |     compare_cols = set(team_game.columns) & set(event.columns) - set(key)
641 |     compare_cols = list(compare_cols)
642 |     assert len(compare_cols) == 21
643 | 
644 |     event_team_game = event[key + compare_cols].groupby(key).agg('sum')
645 | 
646 |     # e, dp, tp, pb, wp, and bk should be charged to the opponent when
647 |     # aggregating values to compare with team_game
648 |     opp_cols = ['e', 'dp', 'tp', 'pb', 'wp', 'bk']
649 |     tmp = event_team_game.reset_index()
650 |     opp = event_team_game.sort_values(['game_id', 'opponent_team_id']).reset_index()
651 | 
652 |     # swap column values
653 |     tmp[opp_cols] = opp[opp_cols]
654 |     event_team_game = tmp
655 | 
656 |     tg = team_game.set_index(['game_id', 'team_id']).sort_index()
657 |     etg = event_team_game.set_index(['game_id', 'team_id']).sort_index()
658 | 
659 |     diff = tg[compare_cols] - etg[compare_cols]
660 | 
661 |     assert diff.max().max() == 0
662 |     assert diff.min().min() == 0
663 | 
664 | 
665 | def test_event_pkey(event):
666 |     """Verify the Retrosheet event primary key."""
667 |     assert dh.is_unique(event, ['game_id', 'event_id'])
668 | 
669 | 
670 | def test_line_score(team_game):
671 |     """Verify that the line score total equals the run total."""
672 | 
673 |     def line_score_to_runs(row):
674 |         line_score = row['line_tx']
675 | 
676 |         # example: 0102(11)0500
677 |         # capture a multi-digit inning enclosed in parentheses,
678 |         # using a positive lookbehind for '(' and
679 |         # a positive lookahead for ')',
680 |         # OR capture one digit at a time
681 |         runs = 0
682 |         for value in re.findall(r'(?<=\()\d+(?=\))|\d', line_score):
683 |             runs += int(value)
684 | 
685 |         return runs
686 | 
687 |     runs = team_game.apply(line_score_to_runs, axis=1)
688 |     assert (runs == team_game['r']).all()
689 | 
--------------------------------------------------------------------------------
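A worked example of the line-score regex used in test_line_score, on the documented sample '0102(11)0500' (the parenthesized 11-run inning is captured whole, every other digit individually):

import re

line_score = '0102(11)0500'
values = re.findall(r'(?<=\()\d+(?=\))|\d', line_score)
assert values == ['0', '1', '0', '2', '11', '0', '5', '0', '0']
assert sum(int(v) for v in values) == 19
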
/data/lahman/readme2017.txt:
--------------------------------------------------------------------------------
1 | The Lahman Baseball Database
2 | 
3 | 2017 Version
4 | Release Date: March 31, 2018
5 | 
6 | ----------------------------------------------------------------------
7 | 
8 | README CONTENTS
9 | 0.1 Copyright Notice
10 | 0.2 Contact Information
11 | 
12 | 1.0 Release Contents
13 | 1.1 Introduction
14 | 1.2 What's New
15 | 1.3 Acknowledgements
16 | 1.4 Using this Database
17 | 1.5 Revision History
18 | 
19 | 2.0 Data Tables
20 | 
21 | ----------------------------------------------------------------------
22 | 
23 | 0.1 Copyright Notice & Limited Use License
24 | 
25 | This database is copyright 1996-2018 by Sean Lahman.
26 | 
27 | This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License. For details see: http://creativecommons.org/licenses/by-sa/3.0/
28 | 
29 | 
30 | For licensing information or further information, contact Sean Lahman
31 | at: seanlahman@gmail.com
32 | 
33 | ----------------------------------------------------------------------
34 | 
35 | 0.2 Contact Information
36 | 
37 | Web site: http://www.baseball1.com
38 | E-Mail : seanlahman@gmail.com
39 | 
40 | If you're interested in contributing to the maintenance of this
41 | database or making suggestions for improvement, please consider
42 | joining our mailing list at:
43 | 
44 | http://groups.yahoo.com/group/baseball-databank/
45 | 
46 | If you are interested in similar databases for other sports, please
47 | visit the Open Source Sports website at http://OpenSourceSports.com
48 | 
49 | ----------------------------------------------------------------------
50 | 1.0 Release Contents
51 | 
52 | This release of the database can be downloaded in several formats. The
53 | contents of each version are listed below.
54 | 
55 | MS Access Version:
56 | lahman2017.mdb
57 | 2017readme.txt
58 | 
59 | SQL Version:
60 | lahman2017.sql
61 | 2017readme.txt
62 | 
63 | Comma Delimited Version:
64 | 2017readme.txt
65 | AllStarFull.csv
66 | Appearances.csv
67 | AwardsManagers.csv
68 | AwardsPlayers.csv
69 | AwardsShareManagers.csv
70 | AwardsSharePlayers.csv
71 | Batting.csv
72 | BattingPost.csv
73 | CollegePlaying.csv
74 | Fielding.csv
75 | FieldingOF.csv
76 | FieldingPost.csv
77 | FieldingOFsplit
78 | HallOfFame.csv
79 | HomeGames.csv
80 | Managers.csv
81 | ManagersHalf.csv
82 | Parks.csv
83 | People.csv
84 | Pitching.csv
85 | PitchingPost.csv
86 | README.txt
87 | Salaries.csv
88 | Schools.csv
89 | SeriesPost.csv
90 | Teams.csv
91 | TeamsFranchises.csv
92 | TeamsHalf.csv
93 | 
94 | ----------------------------------------------------------------------
95 | 1.1 Introduction
96 | 
97 | This database contains pitching, hitting, and fielding statistics for
98 | Major League Baseball from 1871 through 2017. It includes data from
99 | the two current leagues (American and National), the four other "major"
100 | leagues (American Association, Union Association, Players League, and
101 | Federal League), and the National Association of 1871-1875.
102 | 
103 | This database was created by Sean Lahman, who pioneered the effort to
104 | make baseball statistics freely available to the general public. What
105 | started as a one-man effort in 1994 has grown tremendously, and now a
106 | team of researchers have combined their efforts to make this the
107 | largest and most accurate source for baseball statistics available
108 | anywhere. (See Acknowledgements below for a list of the key
109 | contributors to this project.)
110 | 
111 | None of what we have done would have been possible without the
112 | pioneering work of Hy Turkin, S.C. Thompson, David Neft, and Pete
113 | Palmer (among others). All baseball fans owe a debt of gratitude
114 | to the people who have worked so hard to build the tremendous set
115 | of data that we have today. Our thanks also to the many members of
116 | the Society for American Baseball Research who have helped us over
117 | the years. We strongly urge you to support and join their efforts.
118 | Please visit their website (www.sabr.org).
119 | 
120 | If you have any problems or find any errors, please let us know. Any
121 | feedback is appreciated.
122 | 
123 | ----------------------------------------------------------------------
124 | 1.2 What's New in 2017
125 | 
126 | Player stats have been updated through the 2017 season, and many of the other tables
127 | have been updated based on new research into the historical record.
128 | 
129 | One notable change: The name of the table that contains biographical information
130 | for players has been changed from "Master" to "People" to better reflect its
131 | contents.
132 | 
133 | ----------------------------------------------------------------------
134 | 1.3 Acknowledgements
135 | 
136 | Much of the raw data contained in this database comes from the work of
137 | Pete Palmer, the legendary statistician, who has had a hand in most
138 | of the baseball encyclopedias published since 1974. He is largely
139 | responsible for bringing the batting, pitching, and fielding data out
140 | of the dark ages and into the computer era. Without him, none of this
141 | would be possible. For more on Pete's work, please read his own
142 | account at: http://sabr.org/cmsfiles/PalmerDatabaseHistory.pdf
143 | 
144 | Three people have been key contributors to the work that followed, first
145 | by taking the raw data and creating a relational database, and later
146 | by extending the database to make it more accessible to researchers.
147 | 
148 | Sean Lahman launched the Baseball Archive's website back before
149 | most people had heard of the world wide web. Frustrated by the
150 | lack of sports data available, he led the effort to build a
151 | baseball database that everyone could use. He created the first version
152 | of the database and began to make it available for free download from
153 | his website in 1995.
154 | 
155 | The work of Sean Forman to create and maintain an online encyclopedia
156 | at Baseball-Reference.com was a quantum leap for both fans and researchers.
157 | The website launched in 2000, providing a user-friendly interface to the Lahman
158 | Baseball Database. Forman and Lahman launched the Baseball Databank in 2001,
159 | a group of researchers whose goal was to update and maintain the database
160 | as an open source collection available to all.
161 | 
162 | Ted Turocy has done the lion's share of the work of updating the main
163 | data tables since 2012, automating the work of annual updates and linking
164 | historical data to play-by-play accounts compiled by Retrosheet.
165 | 
166 | A handful of researchers have made substantial contributions to
167 | maintaining this database over the years. Listed alphabetically, they
168 | are: Derek Adair, Mike Crain, Kevin Johnson, Rod Nelson, Tom Tango,
169 | and Paul Wendt. These folks did much of the heavy lifting, and are
170 | largely responsible for the improvements made since 2000.
171 | 
172 | Others who made important contributions include: Dvd Avins,
173 | Clifford Blau, Bill Burgess, Clem Comly, Jeff Burk, Randy Cox,
174 | Mitch Dickerman, Paul DuBois, Mike Emeigh, F.X. Flinn, Bill Hickman,
175 | Jerry Hoffman, Dan Holmes, Micke Hovmoller, Peter Kreutzer,
176 | Danile Levine, Bruce Macleod, Ken Matinale, Michael Mavrogiannis,
177 | Cliff Otto, Alberto Perdomo, Dave Quinn, John Rickert, Tom Ruane,
178 | Theron Skyles, Hans Van Slooten, Michael Westbay, and Rob Wood.
179 | 
180 | Many other people have made significant contributions to the database
181 | over the years. Tom Ruane's contribution to the overall
182 | quality of the underlying data has been tremendous. His work at
183 | retrosheet.org integrates the yearly data with the day-by-day data,
184 | creating a reference source of startling depth.
185 | 
186 | Sean Holtz helped with a major overhaul and redesign before the
187 | 2000 season. Keith Woolner was instrumental in helping turn
188 | a huge collection of stats into a relational database in the mid-1990s.
189 | Clifford Otto & Ted Nye also helped provide guidance to the early
190 | versions. Lee Sinnis, John Northey & Erik Greenwood helped supply key
191 | pieces of data. Many others have written in with corrections and
192 | suggestions that made each subsequent version even better than what
193 | preceded it.
194 | 
195 | The work of the SABR Baseball Records Committee, led by Lyle Spatz,
196 | has been invaluable. So has the work of Bill Carle and the SABR
197 | Biographical Committee. David Vincent, keeper of the Home Run Log and
198 | other bits of hard-to-find info, has always been helpful. The recent
199 | addition of colleges to player bios is the result of much research by
200 | members of SABR's Collegiate Baseball committee.
201 | 
202 | Salary data was first supplied by Doug Pappas, who passed away during
203 | the summer of 2004. He was the leading authority on many subjects,
204 | most significantly the financial history of Major League Baseball.
205 | We are grateful that he allowed us to include some of the data he
206 | compiled. His work has been continued by the SABR Business of
207 | Baseball committee.
208 | 
209 | Thanks are also due to the staff at the National Baseball Library
210 | in Cooperstown who have been so helpful over the years, including
211 | Tim Wiles, Jim Gates, Bruce Markusen, and the rest of the staff.
212 | 
213 | A special debt of gratitude is owed to Dave Smith and the folks at
214 | Retrosheet. There is no other group working so hard to compile and
215 | share baseball data. Their website (www.retrosheet.org) will give
216 | you a taste of the wealth of information Dave and the gang have collected.
217 | 
218 | Thanks to all contributors great and small. What you have created is
219 | a wonderful thing.
220 | 
221 | ----------------------------------------------------------------------
222 | 1.4 Using this Database
223 | 
224 | This version of the database is available in Microsoft Access
225 | format, as SQL files, or in a generic, comma-delimited format. Because this is a
226 | relational database, you will not be able to use the data in a
227 | flat-database application.
228 | 
229 | Please note that this is not a stand-alone application. It requires
230 | a database application or some other application designed specifically
231 | to interact with the database.
232 | 
233 | If you are unable to import the data directly, you should download the
234 | database in the delimited text format. Then use the documentation
235 | in section 2.0 of this document to import the data into
236 | your database application.
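
As a sketch (not part of the official release), the comma-delimited files can be imported with a few lines of Python, assuming the CSV files from section 1.0 sit in the current directory and SQLite serves as the database application:

import sqlite3
import pandas as pd

conn = sqlite3.connect('lahman2017.sqlite')
for name in ['People', 'Batting', 'Pitching', 'Fielding', 'Teams']:
    # one database table per CSV file, named after the file
    pd.read_csv(f'{name}.csv').to_sql(name, conn, if_exists='replace', index=False)
conn.close()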
237 | 
238 | ----------------------------------------------------------------------
239 | 1.5 Revision History
240 | 
241 | Version   Date            Comments
242 | 1.0       December 1992   Database ported from dBase
243 | 1.1       May 1993        Becomes fully relational
244 | 1.2       July 1993       Corrections made to full database
245 | 1.21      December 1993   1993 statistics added
246 | 1.3       July 1994       Pre-1900 data added
247 | 1.31      February 1995   1994 statistics added
248 | 1.32      August 1995     Statistics added for other leagues
249 | 1.4       September 1995  Fielding data added
250 | 1.41      November 1995   1995 statistics added
251 | 1.42      March 1996      HOF/All-Star tables added
252 | 1.5-MS    October 1996    1st public release - MS Access format
253 | 1.5-GV    October 1996    Released generic comma-delimited files
254 | 1.6-MS    December 1996   Updated with 1996 stats, some corrections
255 | 1.61-MS   December 1996   Corrected error in MASTER table
256 | 1.62      February 1997   Corrected 1914-1915 batters data and updated
257 | 2.0       February 1998   Major revisions - added teams & managers
258 | 2.1       October 1998    Interim release w/1998 stats
259 | 2.2       January 1999    New release w/post-season stats & awards added
260 | 3.0       November 1999   Major release - fixed errors and 1999 statistics added
261 | 4.0       May 2001        Major release - proofed & redesigned tables
262 | 4.5       March 2002      Updated with 2001 stats and added new biographical data
263 | 5.0       December 2002   Major revision - new tables and data
264 | 5.1       January 2004    Updated with 2003 data, and new pitching categories
265 | 5.2       November 2004   Updated with 2004 season statistics
266 | 5.3       December 2005   Updated with 2005 season statistics
267 | 5.4       December 2006   Updated with 2006 season statistics
268 | 5.5       December 2007   Updated with 2007 season statistics
269 | 5.6       December 2008   Updated with 2008 season statistics
270 | 5.7       December 2009   Updated for 2009 and added several tables
271 | 5.8       December 2010   Updated with 2010 season statistics
272 | 5.9       December 2011   Updated for 2011 and removed obsolete tables
273 | 2012      December 2012   Updated with 2012 season statistics
274 | 2013      December 2013   Updated with 2013 season statistics
275 | 2014      December 2014   Updated with 2014 season statistics
276 | 2015      December 2015   Updated with 2015 season statistics
277 | 2016      February 2017   Updated for 2016 and added several tables
278 | 2017      March 2018      Updated for 2017
279 | 
280 | ------------------------------------------------------------------------------
281 | 2.0 Data Tables
282 | 
283 | The design follows these general principles. Each player is assigned a
284 | unique number (playerID). All of the information relating to that player
285 | is tagged with his playerID. The playerIDs are linked to names and
286 | birthdates in the MASTER table.
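
A sketch of that linking principle using pandas (assumes the comma-delimited release; the biographical table is People, as noted in section 1.2):

import pandas as pd

people = pd.read_csv('People.csv')
batting = pd.read_csv('Batting.csv')

# every Batting row links back to one People row via playerID
named = batting.merge(people[['playerID', 'nameFirst', 'nameLast']],
                      on='playerID', how='left')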
287 | 
288 | The database comprises the following main tables:
289 | 
290 | People - Player names, DOB, and biographical info
291 | Batting - batting statistics
292 | Pitching - pitching statistics
293 | Fielding - fielding statistics
294 | 
295 | It is supplemented by these tables:
296 | 
297 | AllStarFull - All-Star appearances
298 | HallofFame - Hall of Fame voting data
299 | Managers - managerial statistics
300 | Teams - yearly stats and standings
301 | BattingPost - post-season batting statistics
302 | PitchingPost - post-season pitching statistics
303 | TeamFranchises - franchise information
304 | FieldingOF - outfield position data
305 | FieldingPost - post-season fielding data
306 | FieldingOFsplit - LF/CF/RF splits
307 | ManagersHalf - split season data for managers
308 | TeamsHalf - split season data for teams
309 | Salaries - player salary data
310 | SeriesPost - post-season series information
311 | AwardsManagers - awards won by managers
312 | AwardsPlayers - awards won by players
313 | AwardsShareManagers - award voting for manager awards
314 | AwardsSharePlayers - award voting for player awards
315 | Appearances - details on the positions a player appeared at
316 | Schools - list of colleges that players attended
317 | CollegePlaying - list of players and the colleges they attended
318 | Parks - list of major league ballparks
319 | HomeGames - Number of home games played by each team in each ballpark
320 | 
321 | 
322 | 
323 | --------------------------------------------------------------------------
324 | 2.1 People table
325 | 
326 | 
327 | playerID       A unique code assigned to each player. The playerID links
328 |                the data in this file with records in the other files.
329 | birthYear      Year player was born
330 | birthMonth     Month player was born
331 | birthDay       Day player was born
332 | birthCountry   Country where player was born
333 | birthState     State where player was born
334 | birthCity      City where player was born
335 | deathYear      Year player died
336 | deathMonth     Month player died
337 | deathDay       Day player died
338 | deathCountry   Country where player died
339 | deathState     State where player died
340 | deathCity      City where player died
341 | nameFirst      Player's first name
342 | nameLast       Player's last name
343 | nameGiven      Player's given name (typically first and middle)
344 | weight         Player's weight in pounds
345 | height         Player's height in inches
346 | bats           Player's batting hand (left, right, or both)
347 | throws         Player's throwing hand (left or right)
348 | debut          Date that player made first major league appearance
349 | finalGame      Date that player made final major league appearance (blank if still active)
350 | retroID        ID used by Retrosheet
351 | bbrefID        ID used by Baseball Reference website
352 | 
353 | 
354 | ------------------------------------------------------------------------------
355 | 2.2 Batting Table
356 | playerID   Player ID code
357 | yearID     Year
358 | stint      player's stint (order of appearances within a season)
359 | teamID     Team
360 | lgID       League
361 | G          Games
362 | AB         At Bats
363 | R          Runs
364 | H          Hits
365 | 2B         Doubles
366 | 3B         Triples
367 | HR         Homeruns
368 | RBI        Runs Batted In
369 | SB         Stolen Bases
370 | CS         Caught Stealing
371 | BB         Base on Balls
372 | SO         Strikeouts
373 | IBB        Intentional walks
374 | HBP        Hit by pitch
375 | SH         Sacrifice hits
376 | SF         Sacrifice flies
377 | GIDP       Grounded into double plays
378 | 
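From the Batting table columns above, singles and the common rate stats can be derived; a sketch (assumes the comma-delimited Batting.csv):

import pandas as pd

batting = pd.read_csv('Batting.csv')

# singles are not stored; derive them from the other hit columns
singles = batting['H'] - batting['2B'] - batting['3B'] - batting['HR']
total_bases = singles + 2 * batting['2B'] + 3 * batting['3B'] + 4 * batting['HR']

ba = batting['H'] / batting['AB']    # batting average (NaN/inf when AB == 0)
slg = total_bases / batting['AB']    # slugging percentage
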
379 | ------------------------------------------------------------------------------
380 | 2.3 Pitching table
381 | 
382 | playerID   Player ID code
383 | yearID     Year
384 | stint      player's stint (order of appearances within a season)
385 | teamID     Team
386 | lgID       League
387 | W          Wins
388 | L          Losses
389 | G          Games
390 | GS         Games Started
391 | CG         Complete Games
392 | SHO        Shutouts
393 | SV         Saves
394 | IPOuts     Outs Pitched (innings pitched x 3)
395 | H          Hits
396 | ER         Earned Runs
397 | HR         Homeruns
398 | BB         Walks
399 | SO         Strikeouts
400 | BAOpp      Opponent's Batting Average
401 | ERA        Earned Run Average
402 | IBB        Intentional Walks
403 | WP         Wild Pitches
404 | HBP        Batters Hit By Pitch
405 | BK         Balks
406 | BFP        Batters faced by Pitcher
407 | GF         Games Finished
408 | R          Runs Allowed
409 | SH         Sacrifices by opposing batters
410 | SF         Sacrifice flies by opposing batters
411 | GIDP       Grounded into double plays by opposing batter
412 | ------------------------------------------------------------------------------
413 | 2.4 Fielding Table
414 | 
415 | playerID   Player ID code
416 | yearID     Year
417 | stint      player's stint (order of appearances within a season)
418 | teamID     Team
419 | lgID       League
420 | Pos        Position
421 | G          Games
422 | GS         Games Started
423 | InnOuts    Time played in the field expressed as outs
424 | PO         Putouts
425 | A          Assists
426 | E          Errors
427 | DP         Double Plays
428 | PB         Passed Balls (by catchers)
429 | WP         Wild Pitches (by catchers)
430 | SB         Opponent Stolen Bases (by catchers)
431 | CS         Opponents Caught Stealing (by catchers)
432 | ZR         Zone Rating
433 | 
434 | ------------------------------------------------------------------------------
435 | 2.5 AllstarFull table
436 | 
437 | playerID      Player ID code
438 | yearID        Year
439 | gameNum       Game number (zero if only one All-Star game played that season)
440 | gameID        Retrosheet ID for the game
441 | teamID        Team
442 | lgID          League
443 | GP            1 if played in the game
444 | startingPos   If player was a game starter, the position played
445 | ------------------------------------------------------------------------------
446 | 2.6 HallOfFame table
447 | 
448 | playerID      Player ID code
449 | yearID        Year of ballot
450 | votedBy       Method by which player was voted upon
451 | ballots       Total ballots cast in that year
452 | needed        Number of votes needed for selection in that year
453 | votes         Total votes received
454 | inducted      Whether player was inducted by that vote or not (Y or N)
455 | category      Category in which candidate was honored
456 | needed_note   Explanation of qualifiers for special elections
457 | ------------------------------------------------------------------------------
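The Pitching tables store outs pitched (IPOuts = innings pitched x 3) rather than innings. A sketch of recovering innings and recomputing ERA from ER (assumes the comma-delimited Pitching.csv):

import pandas as pd

pitching = pd.read_csv('Pitching.csv')

innings = pitching['IPOuts'] / 3       # outs pitched back to innings
era = 9 * pitching['ER'] / innings     # earned runs per nine innings
# era should agree with the stored ERA column up to rounding
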
458 | 2.7 Managers table
459 | 
460 | playerID   Player ID Number
461 | yearID     Year
462 | teamID     Team
463 | lgID       League
464 | inseason   Managerial order. Zero if the individual managed the team
465 |            the entire year. Otherwise denotes where the manager appeared
466 |            in the managerial order (1 for first manager, 2 for second, etc.)
467 | G          Games managed
468 | W          Wins
469 | L          Losses
470 | rank       Team's final position in standings that year
471 | plyrMgr    Player Manager (denoted by 'Y')
472 | 
473 | ------------------------------------------------------------------------------
474 | 2.8 Teams table
475 | 
476 | yearID           Year
477 | lgID             League
478 | teamID           Team
479 | franchID         Franchise (links to TeamsFranchise table)
480 | divID            Team's division
481 | Rank             Position in final standings
482 | G                Games played
483 | GHome            Games played at home
484 | W                Wins
485 | L                Losses
486 | DivWin           Division Winner (Y or N)
487 | WCWin            Wild Card Winner (Y or N)
488 | LgWin            League Champion (Y or N)
489 | WSWin            World Series Winner (Y or N)
490 | R                Runs scored
491 | AB               At bats
492 | H                Hits by batters
493 | 2B               Doubles
494 | 3B               Triples
495 | HR               Homeruns by batters
496 | BB               Walks by batters
497 | SO               Strikeouts by batters
498 | SB               Stolen bases
499 | CS               Caught stealing
500 | HBP              Batters hit by pitch
501 | SF               Sacrifice flies
502 | RA               Opponents runs scored
503 | ER               Earned runs allowed
504 | ERA              Earned run average
505 | CG               Complete games
506 | SHO              Shutouts
507 | SV               Saves
508 | IPOuts           Outs Pitched (innings pitched x 3)
509 | HA               Hits allowed
510 | HRA              Homeruns allowed
511 | BBA              Walks allowed
512 | SOA              Strikeouts by pitchers
513 | E                Errors
514 | DP               Double Plays
515 | FP               Fielding percentage
516 | name             Team's full name
517 | park             Name of team's home ballpark
518 | attendance      Home attendance total
519 | BPF              Three-year park factor for batters
520 | PPF              Three-year park factor for pitchers
521 | teamIDBR         Team ID used by Baseball Reference website
522 | teamIDlahman45   Team ID used in Lahman database version 4.5
523 | teamIDretro      Team ID used by Retrosheet
524 | 
525 | ------------------------------------------------------------------------------
526 | 2.9 BattingPost table
527 | 
528 | yearID     Year
529 | round      Level of playoffs
530 | playerID   Player ID code
531 | teamID     Team
532 | lgID       League
533 | G          Games
534 | AB         At Bats
535 | R          Runs
536 | H          Hits
537 | 2B         Doubles
538 | 3B         Triples
539 | HR         Homeruns
540 | RBI        Runs Batted In
541 | SB         Stolen Bases
542 | CS         Caught stealing
543 | BB         Base on Balls
544 | SO         Strikeouts
545 | IBB        Intentional walks
546 | HBP        Hit by pitch
547 | SH         Sacrifices
548 | SF         Sacrifice flies
549 | GIDP       Grounded into double plays
550 | 
551 | ------------------------------------------------------------------------------
552 | 2.10 PitchingPost table
553 | 
554 | playerID   Player ID code
555 | yearID     Year
556 | round      Level of playoffs
557 | teamID     Team
558 | lgID       League
559 | W          Wins
560 | L          Losses
561 | G          Games
562 | GS         Games Started
563 | CG         Complete Games
564 | SHO        Shutouts
565 | SV         Saves
566 | IPOuts     Outs Pitched (innings pitched x 3)
567 | H          Hits
568 | ER         Earned Runs
569 | HR         Homeruns
570 | BB         Walks
571 | SO         Strikeouts
572 | BAOpp      Opponents' batting average
573 | ERA        Earned Run Average
574 | IBB        Intentional Walks
575 | WP         Wild Pitches
576 | HBP        Batters Hit By Pitch
577 | BK         Balks
578 | BFP        Batters faced by Pitcher
579 | GF         Games Finished
580 | R          Runs Allowed
581 | SH         Sacrifice Hits allowed
582 | SF         Sacrifice Flies allowed
583 | GIDP       Grounded into Double Plays
584 | 
585 | ------------------------------------------------------------------------------
586 | 2.11 TeamFranchises table
587 | 
588 | franchID     Franchise ID
589 | franchName   Franchise name
590 | active       Whether team is currently active (Y or N)
591 | NAassoc      ID of National Association team franchise played as
592 | 
593 | ------------------------------------------------------------------------------
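The franchID column links each Teams row to TeamsFranchises; a sketch of that join (assumes the comma-delimited files):

import pandas as pd

teams = pd.read_csv('Teams.csv')
franchises = pd.read_csv('TeamsFranchises.csv')

# attach the franchise name to each season's team record
teams = teams.merge(franchises[['franchID', 'franchName']],
                    on='franchID', how='left')
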
594 | 2.12 FieldingOF table
595 | 
596 | playerID   Player ID code
597 | yearID     Year
598 | stint      player's stint (order of appearances within a season)
599 | Glf        Games played in left field
600 | Gcf        Games played in center field
601 | Grf        Games played in right field
602 | 
603 | ------------------------------------------------------------------------------
604 | 2.13 ManagersHalf table
605 | 
606 | playerID   Manager ID code
607 | yearID     Year
608 | teamID     Team
609 | lgID       League
610 | inseason   Managerial order. One if the individual managed the team
611 |            the entire year. Otherwise denotes where the manager appeared
612 |            in the managerial order (1 for first manager, 2 for second, etc.)
613 | half       First or second half of season
614 | G          Games managed
615 | W          Wins
616 | L          Losses
617 | rank       Team's position in standings for the half
618 | 
619 | ------------------------------------------------------------------------------
620 | 2.14 TeamsHalf table
621 | 
622 | yearID     Year
623 | lgID       League
624 | teamID     Team
625 | half       First or second half of season
626 | divID      Division
627 | DivWin     Won Division (Y or N)
628 | rank       Team's position in standings for the half
629 | G          Games played
630 | W          Wins
631 | L          Losses
632 | 
633 | ------------------------------------------------------------------------------
634 | 2.15 Salaries table
635 | 
636 | yearID     Year
637 | teamID     Team
638 | lgID       League
639 | playerID   Player ID code
640 | salary     Salary
641 | 
642 | ------------------------------------------------------------------------------
643 | 2.16 SeriesPost table
644 | 
645 | yearID         Year
646 | round          Level of playoffs
647 | teamIDwinner   Team ID of the team that won the series
648 | lgIDwinner     League ID of the team that won the series
649 | teamIDloser    Team ID of the team that lost the series
650 | lgIDloser      League ID of the team that lost the series
651 | wins           Wins by team that won the series
652 | losses         Losses by team that won the series
653 | ties           Tie games
654 | ------------------------------------------------------------------------------
655 | 2.17 AwardsManagers table
656 | 
657 | playerID   Manager ID code
658 | awardID    Name of award won
659 | yearID     Year
660 | lgID       League
661 | tie        Award was a tie (Y or N)
662 | notes      Notes about the award
663 | 
664 | ------------------------------------------------------------------------------
665 | 2.18 AwardsPlayers table
666 | 
667 | playerID   Player ID code
668 | awardID    Name of award won
669 | yearID     Year
670 | lgID       League
671 | tie        Award was a tie (Y or N)
672 | notes      Notes about the award
673 | 
674 | ------------------------------------------------------------------------------
675 | 2.19 AwardsShareManagers table
676 | 
677 | awardID      name of award votes were received for
678 | yearID       Year
679 | lgID         League
680 | playerID     Manager ID code
681 | pointsWon    Number of points received
682 | pointsMax    Maximum number of points possible
683 | votesFirst   Number of first place votes
684 | 
685 | ------------------------------------------------------------------------------
686 | 2.20 AwardsSharePlayers table
687 | 
688 | awardID      name of award votes were received for
689 | yearID       Year
690 | lgID         League
691 | playerID     Player ID code
692 | pointsWon    Number of points received
693 | pointsMax    Maximum number of points possible
694 | votesFirst   Number of first place votes
695 | 
696 | ------------------------------------------------------------------------------
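In both award-share tables, pointsWon and pointsMax give a natural vote-share ratio; a sketch (assumes the comma-delimited AwardsSharePlayers.csv):

import pandas as pd

shares = pd.read_csv('AwardsSharePlayers.csv')

# fraction of the maximum possible points each player received
shares['share'] = shares['pointsWon'] / shares['pointsMax']
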
697 | 2.21 FieldingPost table
698 | 
699 | playerID   Player ID code
700 | yearID     Year
701 | teamID     Team
702 | lgID       League
703 | round      Level of playoffs
704 | Pos        Position
705 | G          Games
706 | GS         Games Started
707 | InnOuts    Time played in the field expressed as outs
708 | PO         Putouts
709 | A          Assists
710 | E          Errors
711 | DP         Double Plays
712 | TP         Triple Plays
713 | PB         Passed Balls
714 | SB         Stolen Bases allowed (by catcher)
715 | CS         Caught Stealing (by catcher)
716 | 
717 | ------------------------------------------------------------------------------
718 | 2.22 Appearances table
719 | 
720 | yearID      Year
721 | teamID      Team
722 | lgID        League
723 | playerID    Player ID code
724 | G_all       Total games played
725 | GS          Games started
726 | G_batting   Games in which player batted
727 | G_defense   Games in which player appeared on defense
728 | G_p         Games as pitcher
729 | G_c         Games as catcher
730 | G_1b        Games as first baseman
731 | G_2b        Games as second baseman
732 | G_3b        Games as third baseman
733 | G_ss        Games as shortstop
734 | G_lf        Games as left fielder
735 | G_cf        Games as center fielder
736 | G_rf        Games as right fielder
737 | G_of        Games as outfielder
738 | G_dh        Games as designated hitter
739 | G_ph        Games as pinch hitter
740 | G_pr        Games as pinch runner
741 | 
742 | 
743 | ------------------------------------------------------------------------------
744 | 2.23 Schools table
745 | schoolID      school ID code
746 | schoolName    school name
747 | schoolCity    city where school is located
748 | schoolState   state where school's city is located
749 | schoolNick    nickname for school's baseball team
750 | 
751 | 
752 | ------------------------------------------------------------------------------
753 | 2.24 CollegePlaying table
754 | playerID   Player ID code
755 | schoolID   school ID code
756 | year       year
757 | 
758 | 
759 | 
760 | ------------------------------------------------------------------------------
761 | 2.25 FieldingOFsplit table
762 | playerID   Player ID code
763 | yearID     Year
764 | stint      player's stint (order of appearances within a season)
765 | teamID     Team
766 | lgID       League
767 | Pos        Position
768 | G          Games
769 | GS         Games Started
770 | InnOuts    Time played in the field expressed as outs
771 | PO         Putouts
772 | A          Assists
773 | E          Errors
774 | DP         Double Plays
775 | 
776 | 
777 | ------------------------------------------------------------------------------
778 | 2.26 Parks table
779 | park.key     ballpark ID code
780 | park.name    name of ballpark
781 | park.alias   alternate names of ballpark
782 | city         city
783 | state        state
784 | country      country
785 | 
786 | ------------------------------------------------------------------------------
787 | 2.27 HomeGames table
788 | year.key     year
789 | league.key   league
790 | team.key     team ID
791 | park.key     ballpark ID
792 | span.first   date of first game played
793 | span.last    date of last game played
794 | games        total number of games
795 | openings     total number of dates played
796 | attendance   total attendance
797 | 
798 | 
799 | 
800 | 
801 | 
--------------------------------------------------------------------------------
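The park.key column links HomeGames to Parks; a sketch joining the two and computing average attendance per date played (assumes the comma-delimited files):

import pandas as pd

home_games = pd.read_csv('HomeGames.csv')
parks = pd.read_csv('Parks.csv')

# attach the ballpark name, then compute attendance per opening
hg = home_games.merge(parks[['park.key', 'park.name']], on='park.key', how='left')
hg['avg_attendance'] = hg['attendance'] / hg['openings']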