├── .gitignore
├── .includepath
├── .project
├── LICENSE.md
├── README.md
└── src
├── main
├── perl
│ ├── .gitignore
│ ├── Kruser
│ │ └── MLB
│ │ │ ├── AtBat.pm
│ │ │ ├── HitAdjuster.pm
│ │ │ └── Storage
│ │ │ └── Mongo.pm
│ ├── atbatETL.pl
│ ├── atbatETL.properties
│ └── log4perl.conf
└── resources
│ ├── stadiumImages
│ ├── 1.svg
│ ├── 10.svg
│ ├── 12.svg
│ ├── 13.svg
│ ├── 14.svg
│ ├── 15.svg
│ ├── 16.svg
│ ├── 17.svg
│ ├── 19.svg
│ ├── 2.svg
│ ├── 20.png
│ ├── 22.svg
│ ├── 2392.svg
│ ├── 2394.svg
│ ├── 2395.svg
│ ├── 2397.svg
│ ├── 25.png
│ ├── 2504.svg
│ ├── 2535.png
│ ├── 2602.svg
│ ├── 2680-2013.svg
│ ├── 2681.svg
│ ├── 2889.svg
│ ├── 3.svg
│ ├── 31.svg
│ ├── 32.svg
│ ├── 3289.svg
│ ├── 3309.svg
│ ├── 3312.svg
│ ├── 3313.svg
│ ├── 4.svg
│ ├── 4169.svg
│ ├── 5.svg
│ ├── 680.svg
│ ├── 7.svg
│ └── 8.png
│ └── stadiums.xml
└── test
└── perl
└── HitAdjuster.t
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 |
--------------------------------------------------------------------------------
/.includepath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | atbat-mongodb
4 |
5 |
6 |
7 |
8 |
9 | org.epic.perleditor.perlbuilder
10 |
11 |
12 |
13 |
14 |
15 | org.epic.perleditor.perlnature
16 |
17 |
18 |
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright 2013 Ryan Kruse
2 |
3 | Licensed under the Apache License, Version 2.0 (the "License");
4 | you may not use this file except in compliance with the License.
5 | You may obtain a copy of the License at
6 |
7 | http://www.apache.org/licenses/LICENSE-2.0
8 |
9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | atbat-mongodb
2 | =============
3 |
4 | ## The Gist
5 | This is a Perl project that pulls game, at-bat and pitch data from MLB's AtBat servers and shoves them into a local Mongo Database.
6 |
7 | When you first get set up you can pull an entire year or month of data. From then on, each time you
8 | run the program it will pick up where it left off, keeping your database up to date with the baseball season.
9 |
10 | Disclaimer: You'll probably need some software development background to get up and running with this project. At the least a comfort
11 | with databases.
12 |
13 | ---
14 |
15 | ## Prerequisites
16 |
17 | ### Perl
18 | You'll need to install Perl and a few external modules from CPAN. Getting Perl will be different for all of the Operating Systems
19 | so I won't go into it here, but I'll list a few notes..
20 |
21 | * *Windows*: Google ActiveState or StrawberryPerl
22 | * *MacOS*: You already have Perl. Thanks Apple! You'll need the Developer Tools installed to install all of the modules required below. Search
23 | the App Store for the Developer Tools.
24 | * *Linux*: You know what you're doing. Continue.
25 |
26 | #### Perl Modules Required
27 | You'll need to install these modules if you don't have them installed already
28 |
29 | * Config::Properties
30 | * Log::Log4perl
31 | * File::Basename
32 | * Getopt::Long
33 | * LWP
34 | * XML::Simple
35 | * Data::Dumper
36 | * Date::Parse
37 | * DateTime
38 | * Storable
39 | * MongoDB
40 |
41 | Normally you would use cpan to install each module. Something like...
42 |
43 | $ sudo cpan install Config::Properties
44 |
45 | Or if you're on MacOS you may need to run it through Perl like...
46 |
47 | $ perl -MCPAN -e 'install Config::Properties'
48 |
49 |
50 | ### MongoDB
51 | You need a MongoDB installation.
52 |
53 | http://www.mongodb.org
54 |
55 | You don't need to configure anything, just install Mongo and start the mongod process.
56 |
57 | ---
58 |
59 | ## Your First Run
60 | When you're first getting setup and your database is empty, you'll first need to sync a specific day, month or year.
61 | I suggest you sync the current month, which takes about 5-10 minutes depending on your Internet connection.
62 |
63 | ./atbatETL.pl --year=2013 --month=06
64 |
65 | If you're on a fast pipe, you might as well just do a full year. I can grab a year in about 40 minutes. Running
66 | the program like this will grab an entire year.
67 |
68 | ./atbatETL.pl --year=2013
69 |
70 | Note that the program logs quite a bit of interesting output to the log filename listed in the *log4perl.conf* file. By default this is a file
71 | called *mlbatbat.log*. I suggest you tail the log file and watch the days and games roll by. A snippet of the output is...
72 |
73 | 2013/06/29 15:42:53 DEBUG [Kruser.MLB.AtBat] Getting game roster details from http://gd2.mlb.com/components/game/mlb/year_2013/month_06/day_28/gid_2013_06_28_slnmlb_oakmlb_1/players.xml
74 | 2013/06/29 15:42:53 DEBUG [Kruser.MLB.AtBat] Getting at-bat details from http://gd2.mlb.com/components/game/mlb/year_2013/month_06/day_28/gid_2013_06_28_chnmlb_seamlb_1/inning/inning_all.xml
75 | 2013/06/29 15:42:54 DEBUG [Kruser.MLB.Storage.Mongo] Saved 80 at bats to the 'atbats' collection
76 | 2013/06/29 15:42:55 DEBUG [Kruser.MLB.Storage.Mongo] Saved 287 pitches to the 'pitches' collection
77 | 2013/06/29 15:42:55 DEBUG [Kruser.MLB.AtBat] Getting game roster details from http://gd2.mlb.com/components/game/mlb/year_2013/month_06/day_28/gid_2013_06_28_chnmlb_seamlb_1/players.xml
78 | 2013/06/29 15:42:55 DEBUG [Kruser.MLB.AtBat] Getting at-bat details from http://gd2.mlb.com/components/game/mlb/year_2013/month_06/day_28/gid_2013_06_28_phimlb_lanmlb_1/inning/inning_all.xml
79 | 2013/06/29 15:42:58 DEBUG [Kruser.MLB.Storage.Mongo] Saved 88 at bats to the 'atbats' collection
80 | 2013/06/29 15:42:59 DEBUG [Kruser.MLB.Storage.Mongo] Saved 332 pitches to the 'pitches' collection
81 | 2013/06/29 15:42:59 DEBUG [Kruser.MLB.AtBat] Getting game roster details from http://gd2.mlb.com/components/game/mlb/year_2013/month_06/day_28/gid_2013_06_28_phimlb_lanmlb_1/players.xml
82 | 2013/06/29 15:43:00 INFO [Kruser.MLB.AtBat] Finished retrieving data for 2013-06-28.
83 | 2013/06/29 15:43:00 INFO [Kruser.MLB.AtBat] The target date for 2013-06-29 is today, in the future, or late last night. Exiting soon....
84 | 2013/06/29 15:43:02 DEBUG [Kruser.MLB.Storage.Mongo] Saved 62 players to the 'players' collection
85 |
86 | Once your initial run finishes, the next time you run it without args it will pickup where it left off. I suggest running it on a cron or
87 | scheduled task for noon eastern time daily. I won't let it read before 8AM as a precaution against crazy rain-out days.
88 |
89 | ./atbatETL.pl
90 |
91 | ---
92 |
93 | ## Your New Database!!
94 | Start up the *mongo* shell program found in your install's bin directory.
95 |
96 | RYANs-MacBook-Pro:dsire kruser$ /Applications/mongodb-osx-x86_64-2.2.0/bin/mongo
97 | MongoDB shell version: 2.2.0
98 | connecting to: test
99 | >
100 |
101 | ### Collections
102 | Collections in MongoDB are analogous to tables in a relational database. You'll have five of them which you can see from the *show collections*
103 | command below. Note that when you first open the mongo shell you'll need to switch the context to the *mlbatbat* database using the *use mlbatbat*
104 | command as you see below.
105 |
106 | > use mlbatbat
107 | switched to db mlbatbat
108 | > show collections
109 | atbats
110 | games
111 | pitches
112 | players
113 | system.indexes
114 | >
115 |
116 | You should have lots of data in your four collections as you can see below using the *count()* function. If you don't see lots of records then
117 | start over at the beginning as something went wrong with the data collection.
118 |
119 | > db.games.count()
120 | 1222
121 | > db.players.count()
122 | 1166
123 | > db.atbats.count()
124 | 90444
125 | > db.pitches.count()
126 | 346822
127 | >
128 |
129 | ### Indexes
130 | Note that I haven't created indexes on any of your database collections by default. You may wish to add indexes to your collections
131 | depending on the type of research you're doing. Of course, this is all optional, but it would provide performance boosts if you're
132 | doing a lot of queries.
133 |
134 | Read up on MongoDB indexes for more information.
135 | http://docs.mongodb.org/manual/core/indexes/
136 |
137 | For the http://PitchFX.org site I have started with these indexes. You can't go wrong with these if you don't care about the slight storage overhead.
138 |
139 | db.players.ensureIndex({'first':1,'last':1});
140 | db.pitches.ensureIndex({'atbat.pitcher':1,'tfs_zulu':1});
141 | db.pitches.ensureIndex({'atbat.pitcher':1});
142 | db.pitches.ensureIndex({'atbat.batter':1,'tfs_zulu':1});
143 | db.pitches.ensureIndex({'atbat.batter':1});
144 | db.pitches.ensureIndex({'atbat.p_throws':1});
145 | db.pitches.ensureIndex({'atbat.stand':1});
146 | db.pitches.ensureIndex({'atbat.o_start':1});
147 | db.pitches.ensureIndex({'game.game_type':1});
148 | db.pitches.ensureIndex({'inning.number':1});
149 | db.pitches.ensureIndex({'on_1b':1});
150 | db.pitches.ensureIndex({'on_2b':1});
151 | db.pitches.ensureIndex({'on_3b':1});
152 | db.pitches.ensureIndex({'tfs_zulu':1});
153 | db.atbats.ensureIndex({'pitcher':1,'start_tfs_zulu':1});
154 | db.atbats.ensureIndex({'batter':1,'start_tfs_zulu':1});
155 | db.atbats.ensureIndex({'pitcher':1});
156 | db.atbats.ensureIndex({'batter':1});
157 | db.atbats.ensureIndex({'p_throws':1});
158 | db.atbats.ensureIndex({'stand':1});
159 | db.atbats.ensureIndex({'o_start':1});
160 | db.atbats.ensureIndex({'game.game_type':1});
161 | db.atbats.ensureIndex({'start_tfs_zulu':1});
162 | db.atbats.ensureIndex({'inning.number':1});
163 | db.atbats.ensureIndex({'pitch.on_1b':1});
164 | db.atbats.ensureIndex({'pitch.on_2b':1});
165 | db.atbats.ensureIndex({'pitch.on_3b':1});
166 |
167 | ### Some sample functions
168 | I won't have a lot of information here. This part is mostly up to you, but I want to give you some foo to get you excited.
169 |
170 | #### How many 100+ MPH pitches were thrown in May 2013? How many were thrown for balls and how many for strikes?
171 | To find this data we'll query the *pitches* collection. Note that we're specifying the months in an
172 | array of 0-11 instead of 1-12. So 3=April, 4=May, etc.
173 |
174 | > db.pitches.find({"start_speed":{$gte:100}, "tfs_zulu":{$gte:new Date(2013,4,1), $lt:new Date(2013,5,1)}}).count();
175 | 42
176 |
177 | We see that there were *42* total in the month of May 2013. Let's split them up and see how many were thrown for strikes, how many were balls
178 | and how many were hit into play. To do this, we'll use a *group()* function instead of a *find()*.
179 |
180 | > db.pitches.group (
181 | {
182 | key: {"type": true},
183 | cond: {"start_speed":{$gte:100}, "tfs_zulu":{$gte:new Date(2013,4,1), $lt:new Date(2013,5,1)}},
184 | initial: {sum: 0},
185 | reduce: function(doc, prev) { prev.sum += 1}
186 | });
187 |
188 | The results of the query above are...
189 |
190 | [
191 | {
192 | "type" : "B",
193 | "sum" : 15
194 | },
195 | {
196 | "type" : "X",
197 | "sum" : 9
198 | },
199 | {
200 | "type" : "S",
201 | "sum" : 18
202 | }
203 | ]
204 |
205 | By using *group()* we can see the breakdown of the league's 100+MPH pitches
206 | * 15 balls (B)
207 | * 18 strikes (S)
208 | * 9 hit into play (X)
209 |
210 | #### What is Joe Mauer's Batting Average with 2 strikes in all of 2013?
211 | First we'll need to find Joe Mauer's AtBat ID.
212 |
213 | > db.players.find({'last':'Mauer'}).pretty();
214 | {
215 | "_id" : ObjectId("51ceff10d0930a21010016ad"),
216 | "first" : "Joe",
217 | "last" : "Mauer",
218 | "id" : NumberLong(408045)
219 | }
220 | >
221 |
222 | Now that we know his ID is *408045*, we can query the *atbats* collection for the data we need. Notice that I preserved the *id* property
223 | from the MLB data and didn't try to fit that in the MongoDB *_id* field.
224 |
225 | We'll run two queries, one for total at-bats with two strikes and one for total hits.
226 |
227 | > db.atbats.find({"batter":408045,"start_tfs_zulu":{$gte:new Date(2013,0,1), $lt:new Date(2014,0,1)}, "s":{$gte:2}, "event":/Single|Double|Triple|Home Run/}).count();
228 | 54
229 | > db.atbats.find({"batter":408045,"start_tfs_zulu":{$gte:new Date(2013,0,1), $lt:new Date(2014,0,1)}, "s":{$gte:2}, "event":{$not:/Walk|Sacrifice/}}).count();
230 | 183
231 |
232 | The queries tell us that Joe Mauer is 54 for 183, or *.295* in 2013 when he has two strikes. Notice that we used *$gte:2* since the at-bat
233 | will be reported to have three strikes when the batter strikes out, and we certainly want to include that.
234 |
235 | The example above would have been much more performant with a MongoDB aggregate $match and a $group that aggregated
236 | the at-bats and hits together. I kept this as two queries for simplicity. For more information on MongoDB aggregation,
237 | go here http://docs.mongodb.org/manual/reference/aggregation/
238 |
239 |
240 | ---
241 |
242 | ## Why MongoDB?
243 | MongoDB is a document based "nosql" database. Baseball data is particularly relational, but I was interested in seeing if
244 | we could make it a little less so and take advantage of the speed of MongoDB. When I say "speed" I'm speaking of the speed
245 | of both development and usage. You see, I've defined no schema. Instead, I've pretty much taken the XML documents from
246 | the At-Bat servers, sucked them into a POPO (plain old Perl object), and fed them into Mongo. It was simple and FUN!
247 |
248 | Now I did shuffle some data around, making sure a pitch document contained enough information about the at-bat and game to be useful and the same
249 | for at-bats, but for the most part the data stayed with the property names that you find in the MLB At-Bat documents.
250 |
251 | Additionally MongoDB has built-in support for cloud scaling and map-reduce functions. Unlike MySQL, SQLServer, etc., we can run Javascript functions
252 | in the Mongo shell, and even in a map-reduce setup.
253 |
254 | ---
255 | ## Contribute
256 | Fork my repo, please! I accept pull requests so let's chat if you're interested in contributing.
257 |
258 | ---
259 | ## Future
260 | ### Speed
261 | MongoDB is fast on inserts, 99% of the time in running this program is spent waiting for HTTP GET requests to return from the mlb servers.
262 | I would like to put the *_save_game_data* method in AtBat.pm into a thread pool. Originally I had it this way but Perl's LWP is a little
263 | flaky across threads and I didn't want to spend too much time on the issue. If we were able to startup each *_save_game_data* in a thread it would
264 | cut down the runtime of the program to 10% or less. That said, once the initial sync is in a place you like it, you simply run it without args
265 | on a cron/daily schedule and you'll maintain an up-to-date database and you don't really care about runtime speed, only database speed.
266 |
267 | ### Python?
268 | I think Python might have been a wiser choice than Perl for this project, but I can slap Perl together a little faster so I went with that.
269 | I'm thinking a port to Python would be great, provided I'm able to give into the whitespace rules of the language. So maybe I'll do that
270 | soon, maybe not.
271 |
272 | ### ElasticSearch Storage
273 | I would like to have other storage options in addition to MongoDB. I would especially like to see an ElasticSearch.pm module in *Storage*.
274 | ElasticSearch offers some faceting capabilities that would give us extra quick looks without the overhead of the mongo group function. Before starting
275 | an ElasticSearch option though I think it would be wise to look at using a Mongo River that stores to ElasticSearch downstream of Mongo.
276 |
277 | ### MongoDB Options
278 | Right now the program only connects to mongod running on the localhost, default port, without credentials. If this were a commercial product,
279 | this would be quite ridiculous. As it stands, I don't need more than that. But yes, eventually I'd like to support running against a remote
280 | MongoDB instance.
281 |
--------------------------------------------------------------------------------
/src/main/perl/.gitignore:
--------------------------------------------------------------------------------
1 | /mlbatbat.log
2 | /nohup.out
3 |
--------------------------------------------------------------------------------
/src/main/perl/Kruser/MLB/AtBat.pm:
--------------------------------------------------------------------------------
1 | package Kruser::MLB::AtBat;
2 |
3 | ##
4 | # A module that provides a way to get Perl data structures
5 | # from the MLB AtBat XML APIs
6 | #
7 | # @author kruser
8 | ##
9 | use strict;
10 | use LWP;
11 | use Log::Log4perl;
12 | use XML::Simple;
13 | use Data::Dumper;
14 | use Date::Parse;
15 | use DateTime;
16 | use Storable 'dclone';
17 | use Kruser::MLB::HitAdjuster;
18 |
# File-scoped LWP user agent.
# NOTE(review): SSL hostname verification is explicitly disabled via ssl_opts;
# confirm this is still intentional.
my $browser = LWP::UserAgent->new( ssl_opts => { verify_hostname => 0 } );

# File-scoped Log4perl logger for the "Kruser::MLB::AtBat" category.
my $logger = Log::Log4perl->get_logger("Kruser::MLB::AtBat");
21 |
##
# Constructor.
#
# Starts from a fixed set of default attributes (apibase, storage,
# beforetoday, year, month, day, players) and overlays every key/value pair
# supplied by the caller, so caller-provided values win over the defaults and
# unknown keys are kept as-is.
#
# @param proto  - a class name, or an existing instance whose class is reused
# @param params - key/value pairs copied onto the new object
# @returns a hash reference blessed into the resolved class
##
sub new {
    my ( $proto, %params ) = @_;
    my $class = ref($proto) || $proto;

    my %defaults = (
        apibase     => undef,
        storage     => undef,
        beforetoday => 1,
        year        => undef,
        month       => undef,
        day         => undef,
        players     => {},
    );

    # later pairs win, so %params overrides %defaults
    my $self = bless { %defaults, %params }, $class;
    return $self;
}
46 |
##
# Entry point for a sync run.
#
# Uses the narrowest range described by the instance's year/month/day
# settings: a single day, a whole month, or a whole year. When none of them
# is set, storage is asked for the last sync date and everything after it is
# retrieved; if storage has no such date, a hint is logged instead.
#
# Whichever path is taken, the players gathered on the instance are passed to
# storage's save_players at the end.
##
sub initiate_sync {
    my $this = shift;
    my ( $year, $month, $day ) = @{$this}{qw(year month day)};

    if ( $year && $month && $day ) {
        $this->_retrieve_day( $year, $month, $day );
    }
    elsif ( $year && $month ) {
        $this->_retrieve_month( $year, $month );
    }
    elsif ($year) {
        $this->_retrieve_year($year);
    }
    elsif ( my $lastDate = $this->{storage}->get_last_sync_date() ) {
        $this->_retrieve_since($lastDate);
    }
    else {
        $logger->info(
"Your database doesn't have any data so we're not sure when to sync to. Try seeding it with a year or month."
        );
    }

    $this->{storage}->save_players( $this->{players} );
}
74 |
##
# Retrieves every day after the given date, one day at a time, for as long
# as the day's epoch is earlier than the current time.
#
# @param lastDate - the last date already synced; it is converted with
#                   _convert_to_datetime and retrieval starts 24 hours later
# @private
##
sub _retrieve_since {
    my ( $this, $lastDate ) = @_;

    my $secondsPerDay = 86400;
    my $cursor   = _convert_to_datetime($lastDate)->epoch() + $secondsPerDay;
    my $nowEpoch = DateTime->now()->epoch();

    for ( ; $cursor < $nowEpoch ; $cursor += $secondsPerDay ) {
        my $date = DateTime->from_epoch( epoch => $cursor );
        $this->_retrieve_day( $date->year(), $date->month(), $date->day() );
    }
}
91 |
##
# Retrieves a full year by walking months 3 through 11 in order.
#
# The walk stops as soon as the instance's 'beforetoday' flag is false.
#
# @param year - in YYYY format
# @private
##
sub _retrieve_year {
    my ( $this, $year ) = @_;

    $logger->info(
        "Retrieving a full year for $year. Sit tight, this could take a few minutes."
    );

    foreach my $month ( 3 .. 11 ) {
        last unless $this->{'beforetoday'};
        $this->_retrieve_month( $year, $month );
    }
}
107 |
##
# Retrieves one month of data by trying days 1 through 31 in order.
#
# Months outside 2..11 are logged as having no MLB games and skipped. The
# day loop stops as soon as the instance's 'beforetoday' flag is false.
#
# @param year  - in YYYY format
# @param month - the month number
# @private
##
sub _retrieve_month {
    my ( $this, $year, $month ) = @_;

    $logger->info("Retrieving data for the month $year-$month.");

    if ( $month <= 1 || $month >= 12 ) {
        $logger->info(
            "skipping analyzing $year-$month since there aren't MLB games");
    }
    else {
        foreach my $day ( 1 .. 31 ) {
            last unless $this->{'beforetoday'};
            $this->_retrieve_day( $year, $month, $day );
        }
    }
}
126 |
##
# Retrieves and stores every game for a single day.
#
# Nothing is fetched when:
#   * the date does not exist on the calendar,
#   * the end of that day (23:59:59) is less than eight hours before now, in
#     which case the instance's 'beforetoday' flag is also cleared so the
#     month/year loops stop, or
#   * storage reports that it already has data for the day.
#
# @param year  - in YYYY format
# @param month - month number, with or without a leading zero
# @param day   - day of month, with or without a leading zero
# @private
##
sub _retrieve_day {
    my ( $this, $year, $month, $day ) = @_;

    # DateTime->new dies on dates that don't exist (e.g. June 31st). That is
    # expected here because _retrieve_month tries days 1..31 for every month.
    my $targetDay = eval {
        DateTime->new(
            year   => $year,
            month  => $month,
            day    => $day,
            hour   => 23,
            minute => 59,
            second => 59
        );
    } or return;

    # timestamp handed down for games/at-bats that lack a usable one
    my $fallbackDate = DateTime->new(
        year   => $year,
        month  => $month,
        day    => $day,
        hour   => 20,
        minute => 0,
        second => 0
    );

    # Two-digit strings for the URL and the storage day key. sprintf is used
    # rather than prepending '0' so a value that already has a leading zero
    # (e.g. '06') stays '06' instead of becoming '006'.
    $month = sprintf( '%02d', $month );
    $day   = sprintf( '%02d', $day );
    my $dayString = "$year-$month-$day";

    my $secondsSinceDayEnd = DateTime->now()->epoch() - $targetDay->epoch();
    if ( $secondsSinceDayEnd < 60 * 60 * 8 ) {
        $logger->info(
"The target date for $dayString is today, in the future, or late last night. Exiting soon...."
        );
        $this->{beforetoday} = 0;
        return;
    }
    elsif ( $this->{storage}->already_have_day($dayString) ) {
        $logger->info(
            "We already have some game data for $dayString. Skipping this day."
        );
        return;
    }

    my $dayUrl = $this->{apibase} . "/year_$year/month_$month/day_$day";
    $logger->info("Starting retrieving data for $dayString.");

    my @games = $this->_get_games_for_day($dayUrl);
    foreach my $game (@games) {
        $game->{'source_day'} = $dayString;
        $game->{'start'} =
          _convert_to_datetime( $game->{'start'}, $fallbackDate );
        $this->_save_game_data( $dayUrl, $game, $fallbackDate );
    }
    $logger->info("Finished retrieving data for $dayString.");
}
194 |
##
# Fetches and persists everything for one game: the roster, the game record
# itself, and (when both the innings and hits documents are available) its
# at-bats and pitches.
#
# Players found in the roster are recorded on the instance's 'players' hash,
# keyed by player id, for the caller to persist later.
#
# The innings and hits XML are each parsed twice on purpose: one parsed copy
# is handed to _save_at_bats and a separate one to _save_pitches.
#
# @param {string} dayUrl - the URL for all games that day
# @param {Object} game - the top level game data
# @param {Object} fallbackDate - used when a game or at-bat has no good
#                                timestamp of its own
# @private
##
sub _save_game_data {
    my ( $this, $dayUrl, $game, $fallbackDate ) = @_;

    $game->{start} = _convert_to_datetime( $game->{start}, $fallbackDate );

    my $gameId  = $game->{gameday};
    my $gameUrl = "$dayUrl/gid_$gameId";

    # the small slice of game data that gets embedded in at-bats and pitches
    my $shallowGameInfo = {
        id        => $gameId,
        time      => $game->{time},
        away_team => $game->{'away_code'},
        home_team => $game->{'home_code'},
        venue_id  => $game->{'venue_id'},
        game_type => $game->{'game_type'},
    };

    my $rosterUrl = "$gameUrl/players.xml";
    $logger->debug("Getting game roster details from $rosterUrl");

    if ( my $rosterXml = $this->_get_xml_page($rosterUrl) ) {
        my $roster = XMLin(
            $rosterXml,
            KeyAttr    => {},
            ForceArray => [ 'team', 'player', 'coach' ]
        );
        if ( $roster && $roster->{team} ) {
            $game->{team} = $roster->{team};

            for my $team ( @{ $roster->{team} } ) {
                next unless $team->{'player'};
                for my $player ( @{ $team->{'player'} } ) {
                    my $playerId = $player->{id};
                    $this->{players}->{$playerId} = {
                        id       => $playerId,
                        first    => $player->{first},
                        last     => $player->{last},
                        lastSeen => $game->{start},
                    };
                }
            }
        }
    }

    $this->{storage}->save_game($game);

    my $inningsUrl = "$gameUrl/inning/inning_all.xml";
    $logger->debug("Getting at-bat details from $inningsUrl");
    my $inningsXml = $this->_get_xml_page($inningsUrl);

    my $hitsUrl = "$gameUrl/inning/inning_hit.xml";
    $logger->debug("Getting hit details from $hitsUrl");
    my $hitsXml = $this->_get_xml_page($hitsUrl);

    if ( $inningsXml && $hitsXml ) {

        # first pass: at-bats
        my $atBatHits =
          $this->_add_hit_angles(
            XMLin( $hitsXml, KeyAttr => {}, ForceArray => ['hip'] ) );

        $this->_save_at_bats(
            XMLin(
                $inningsXml,
                KeyAttr    => {},
                ForceArray =>
                  [ 'inning', 'atbat', 'runner', 'action', 'pitch', 'po' ]
            ),
            $atBatHits,
            $shallowGameInfo,
            $fallbackDate
        );

        # second pass: pitches, from freshly parsed copies
        my $pitchHits =
          $this->_add_hit_angles(
            XMLin( $hitsXml, KeyAttr => {}, ForceArray => ['hip'] ) );

        $this->_save_pitches(
            XMLin(
                $inningsXml,
                KeyAttr    => {},
                ForceArray => [ 'inning', 'atbat', 'runner', 'pitch' ]
            ),
            $pitchHits,
            $shallowGameInfo,
            $fallbackDate
        );
    }

}
295 |
##
# Walks the list of hit balls and stores an 'angle' on each one, as computed
# by Kruser::MLB::HitAdjuster::get_hit_angle.
#
# Per the original author's notes, the angle is derived from the hit's X/Y
# coordinates: 0 degrees is straight up the middle of the field, -45 degrees
# is the left foul pole and 45 degrees is the right foul pole.
#
# @param hipList - hash reference whose optional 'hip' entry is an array
#                  reference of hit-ball hashes; each is modified in place
# @returns the same hipList reference that was passed in
# @private
##
sub _add_hit_angles {
    my ( $this, $hipList ) = @_;

    # direct method call instead of the indirect-object form "new Class()",
    # which Perl can mis-parse depending on what is in scope
    my $hitAdjuster = Kruser::MLB::HitAdjuster->new();

    if ( $hipList->{hip} ) {
        for my $hip ( @{ $hipList->{hip} } ) {
            $hip->{angle} = $hitAdjuster->get_hit_angle($hip);

            # don't insert distance as they aren't reliable just yet
            #$hip->{estimatedDistance} = $hitAdjuster->estimate_hit_distance($hip);
        }
    }
    return $hipList;
}
317 |
##
# Collects the pitches from the top and bottom of every inning of a game and
# hands the whole list to storage's save_pitches in a single call.
#
# The per-half-inning work (including embedding game, inning and at-bat info
# in each pitch) is done by _save_pitches_from_half_inning.
#
# TODO: this could probably be refactored together with _save_at_bats to
# reduce a little code redundancy.
#
# @param innings - the object representing all innings
# @param hitBalls - the object representing all hit balls
# @param shallowGame - the shallow game data that we'll embed in each pitch
# @param fallbackDate - the day to use if we don't have one per pitch
# @private
##
sub _save_pitches {
    my ( $this, $inningsObj, $hitBalls, $shallowGameInfo, $fallbackDate ) = @_;

    my @collected;

    if ($inningsObj) {
        for my $inning ( @{ $inningsObj->{inning} } ) {
            for my $side (qw(top bottom)) {
                $this->_save_pitches_from_half_inning( $inning, $side,
                    $hitBalls, $shallowGameInfo, $fallbackDate, \@collected );
            }
        }
    }

    $this->{storage}->save_pitches( \@collected );
}
351 |
##
# Gathers all pitches from the at-bats of one half inning.
#
# Each at-bat is first annotated in place (batter/pitcher team, converted
# start time, outs at the start of the at-bat). Every pitch then receives the
# shallow game info, the inning type/number and a pitch-less copy of its
# at-bat, and is pushed onto the caller's aggregate array. When a hit ball is
# found for the at-bat it is attached to the at-bat's last pitch.
#
# @param inning - the inning details
# @param inningSide - (top|bottom), the side of the inning
# @param hitBalls - the hit balls for the game
# @param shallowGameInfo - game data embedded in each pitch
# @param fallbackDate - used when an at-bat or pitch has no usable timestamp
# @param aggregatePitches - array reference the pitches are appended to
# @private
##
sub _save_pitches_from_half_inning {
    my (
        $this,            $inning,       $inningSide, $hitBalls,
        $shallowGameInfo, $fallbackDate, $aggregatePitches
    ) = @_;

    my $half = $inning->{$inningSide};
    if ( $half && $half->{atbat} ) {

        # the away team bats in the top half, the home team in the bottom
        my ( $battingTeam, $pitchingTeam ) =
            $inningSide eq 'top'
          ? @{$inning}{qw(away_team home_team)}
          : @{$inning}{qw(home_team away_team)};

        my $outsBefore = 0;
        for my $atbat ( @{ $half->{atbat} } ) {
            $atbat->{'batter_team'}  = $battingTeam;
            $atbat->{'pitcher_team'} = $pitchingTeam;
            $atbat->{'start_tfs_zulu'} =
              _convert_to_datetime( $atbat->{'start_tfs_zulu'}, $fallbackDate );
            $atbat->{'o_start'} = $outsBefore;
            $outsBefore = $atbat->{'o'};

            # copy of the at-bat with its pitch list blanked, for embedding
            my $atBatSummary = dclone($atbat);
            undef $atBatSummary->{'pitch'};

            next unless $atbat->{pitch};
            my @pitchList = @{ $atbat->{pitch} };

            if (
                my $hip = $this->_get_hip_for_atbat(
                    $hitBalls, $inning->{num}, $atbat->{batter}
                )
              )
            {
                $hip->{'trajectory'} = _get_trajectory( $atbat->{'des'} );

                # inject the hit ball on the last pitch of the at-bat
                $pitchList[-1]->{'hip'} = $hip;
            }

            for my $pitch (@pitchList) {
                $pitch->{'tfs_zulu'} =
                  _convert_to_datetime( $pitch->{'tfs_zulu'}, $fallbackDate );
                $pitch->{'game'}   = $shallowGameInfo;
                $pitch->{'inning'} = {
                    type   => $inningSide,
                    number => $inning->{num},
                };
                $pitch->{'atbat'} = $atBatSummary;
                push( @{$aggregatePitches}, $pitch );
            }
        }
    }

}
414 |
##
# Maps a hit ball description to a trajectory label by looking for known
# phrases, case-insensitively. The first matching rule wins; a description
# that matches none of them is labelled 'grounder'.
#
# @param description - the string description of the hit ball
# @returns one of (grounder|flyball|popup|liner)
##
sub _get_trajectory {
    my ($hitDescription) = @_;

    return
        $hitDescription =~ /pop up|pops out/i      ? 'popup'
      : $hitDescription =~ /line drive|lines out/i ? 'liner'
      : $hitDescription =~ /fly ball|flies out/i   ? 'flyball'
      :                                              'grounder';
}
435 |
##
# Collects the at-bats from every half inning that has any and hands the
# whole list to storage's save_at_bats in a single call.
#
# The per-half-inning work is done by _save_at_bats_for_inning, which pushes
# its results onto the array passed to it. Pitches are saved separately.
#
# @param inningsObj - the object representing all innings
# @param hitsObj - the object representing all hit balls
# @param shallowGame - the shallow game data that we'll embed in each at-bat
# @param fallbackDate - the date to use on the atbats if we don't have one from MLB
# @private
##
sub _save_at_bats {
    my ( $this, $inningsObj, $hitsObj, $shallowGameInfo, $fallbackDate ) = @_;

    my @collected;

    if ( $inningsObj && $inningsObj->{'inning'} ) {
        for my $inning ( @{ $inningsObj->{inning} } ) {
            for my $side (qw(top bottom)) {
                next unless $inning->{$side} && $inning->{$side}->{atbat};
                $this->_save_at_bats_for_inning( $inning, $hitsObj, $side,
                    $shallowGameInfo, \@collected, $fallbackDate );
            }
        }
    }

    $this->{storage}->save_at_bats( \@collected );
}
470 |
471 | ##
472 | # Handles persisting all at bats in an array that represents
473 | # the top or bottom half of an inning.
474 | #
475 | # The processed results are pushed on the $aggregateAtBats array
476 | # and are assumed to be persisted by the calling method
477 | #
478 | # Note that we're not persisting at-bats and runners like a game log. Instead, we're storing the
479 | # at-bat as the first class citizen and retrofitting 'runners' to be exactly what the batter
480 | # had on base at the time of their event. This takes out stolen bases that happened during the at-bat.
481 | #
482 | # @param atBats - the array of bats
483 | # @param inning - the inning details
484 | # @param hitBalls - the hit balls for the game so we can pull each hit an inject it as needed
485 | # @param inningSide - (top|bottom), the side of the inning
486 | # @param shallowGameInfo - an arbitrary game object that we'll stick in each at-bat
487 | # @param aggregateAtBats - an array for all of the at-bats that the caller will be aggregating, presumedly for storage
488 | # @param fallbackDate
489 | ##
490 | sub _save_at_bats_for_inning {
491 | my $this = shift;
492 | my $inning = shift;
493 | my $hitBalls = shift;
494 | my $inningSide = shift;
495 | my $shallowGameInfo = shift;
496 | my $aggregateAtBats = shift;
497 | my $fallbackDate = shift;
498 |
499 | my $startingOuts = 0;
500 | my @atbats = @{ $inning->{$inningSide}->{'atbat'} };
501 | foreach my $atbat (@atbats) {
502 | my $atBatEvent = $atbat->{'event'};
503 |
504 | $atbat->{'batter_team'} =
505 | $inningSide eq 'top'
506 | ? $inning->{'away_team'}
507 | : $inning->{'home_team'};
508 | $atbat->{'pitcher_team'} =
509 | $inningSide eq 'top'
510 | ? $inning->{'home_team'}
511 | : $inning->{'away_team'};
512 | $atbat->{'inning'} = {
513 | type => $inningSide,
514 | number => $inning->{num},
515 | };
516 | $atbat->{'o_start'} = $startingOuts;
517 | $atbat->{'game'} = $shallowGameInfo,;
518 | $atbat->{'start_tfs_zulu'} =
519 | _convert_to_datetime( $atbat->{'start_tfs_zulu'}, $fallbackDate );
520 |
521 | my $hip =
522 | $this->_get_hip_for_atbat( $hitBalls, $inning->{num},
523 | $atbat->{batter} );
524 | if ($hip) {
525 | $atbat->{'hip'} = $hip;
526 | $atbat->{'hip'}->{'trajectory'} =
527 | _get_trajectory( $atbat->{'des'} );
528 | }
529 |
530 | my $runnersPotentialBases = 0;
531 | if ( $atbat->{'pitch'} ) {
532 | my @pitches = @{ $atbat->{'pitch'} };
533 | my $lastPitch = $pitches[-1];
534 | if ($lastPitch) {
535 | if ( $lastPitch->{'on_1b'} ) {
536 | $runnersPotentialBases += 3;
537 | }
538 | if ( $lastPitch->{'on_2b'} ) {
539 | $runnersPotentialBases += 2;
540 | }
541 | if ( $lastPitch->{'on_3b'} ) {
542 | $runnersPotentialBases += 1;
543 | }
544 | }
545 | }
546 | $atbat->{'runnersPotentialBases'} = $runnersPotentialBases;
547 |
548 | my $runnersMovedBases = 0;
549 | if ( $atbat->{'runner'} ) {
550 | my @runners = @{ $atbat->{'runner'} };
551 | foreach my $runner (@runners) {
552 | $runnersMovedBases += _get_runners_moved($runner);
553 | }
554 | }
555 | $atbat->{'runnersMovedBases'} = $runnersMovedBases;
556 | push( @{$aggregateAtBats}, $atbat );
557 | $startingOuts = $atbat->{'o'};
558 | }
559 | }
560 |
##
# Hand me a list of hit balls and we'll pick the one for your batter/inning
# (the first one for that inning). Entries whose 'des' is 'Error' are never
# matched.
#
# Note that the inbound list will be altered: the matched entry is removed
# from $hitBalls->{'hip'}, so a later call for the same batter and inning
# picks up the next hit and later scans have less to walk through.
#
# A missing $hitBalls, or one without a 'hip' list, simply yields no match.
#
# @param hitBalls - a hash containing an array of hits at $hitBalls->{'hip'}
# @param inning - the inning number
# @param batterId - the ID of the batter
# @returns a hip instance, or undef if there wasn't a match
# @private
##
sub _get_hip_for_atbat {
    my ( $this, $hitBalls, $inning, $batterId ) = @_;

    my $hipMatch;

    # Work on the caller's array directly (no copy) so the splice below
    # removes the match from the shared list.
    my $hips = $hitBalls ? $hitBalls->{'hip'} : undef;
    if ($hips) {
        for my $i ( 0 .. $#{$hips} ) {
            my $hip = $hips->[$i];
            next
              unless $hip->{'inning'} == $inning
              && $hip->{'batter'} == $batterId
              && $hip->{'des'} ne 'Error';

            $hipMatch = $hip;
            splice( @{$hips}, $i, 1 );

            # Stop right away: the indexes are stale after the splice
            last;
        }
    }

    # Always a single scalar (undef when nothing matched), as before
    return $hipMatch;
}
601 |
##
# Get a list of the game folders for a day
#
# Fetches "<dayUrl>/epg.xml", parses it and returns the 'game' entries after
# they have been run through _cleanup_games.
#
# @param dayUrl - the base URL of the day
# @returns the list of games, or an empty list when the page could not be
#          fetched or holds no games
# @private
##
sub _get_games_for_day {
    my ( $this, $dayUrl ) = @_;

    my $url = "$dayUrl/epg.xml";
    $logger->debug("Getting gameday lists from $url");

    my $gamesXml = $this->_get_xml_page($url);
    return () unless $gamesXml;

    # ForceArray keeps 'game' an array even when the day has a single game
    my $gamesObj = XMLin( $gamesXml, KeyAttr => {}, ForceArray => ['game'] );
    return () unless $gamesObj && $gamesObj->{game};

    my $games = $gamesObj->{game};
    $this->_cleanup_games($games);
    return @{$games};
}
623 |
##
# cleanup the data within the games
#
# Any game carrying a truthy 'game_media' entry gets that value reset to
# undef. The key itself stays in the hash; games without it are untouched.
#
# @param {Object[]} games - the array of games, modified in place
# @private
##
sub _cleanup_games {
    my ( $this, $games ) = @_;

    for my $game ( @{$games} ) {
        next unless $game->{game_media};
        $game->{game_media} = undef;
    }
}
640 |
##
# Gets the XML file from the given URL and returns the content
# string or undefined if the retrieval failed
#
# A failed retrieval is logged as a warning, not raised.
#
# @param {string} url
# @returns the response content, or undef when the request was not successful
# @private
##
sub _get_xml_page {
    my ( $this, $url ) = @_;

    my $response = $browser->get($url);
    unless ( $response->is_success ) {
        $logger->warn("No content found at $url");
        return undef;
    }

    return scalar $response->content();
}
662 |
##
# Get the number of bases that a runner moved in the at-bat
#
# Bases are numbered 1B => 1, 2B => 2, 3B => 3 and home => 4. A runner entry
# without a 'start' base (the batter) always counts as zero. An empty 'end'
# means the runner either scored (score eq 'T') or did not advance. Any
# other unrecognized base label counts as 0.
#
# @param {runner} - the runner as it comes from the atbat schema
# @returns the number of bases moved by a runner that isn't the batter
# @static
# @private
##
sub _get_runners_moved {
    my ($runner) = @_;

    my %baseNumber = ( '1B' => 1, '2B' => 2, '3B' => 3 );

    # No starting base: nothing to count for this entry
    my $startBase = $runner->{'start'};
    return 0 unless $startBase;

    my $from    = $baseNumber{$startBase} || 0;
    my $endBase = $runner->{'end'};

    my $to;
    if ( $endBase eq '' ) {

        # Off the bases: either crossed home plate or gained nothing
        $to = $runner->{'score'} eq 'T' ? 4 : $from;
    }
    else {
        $to = $baseNumber{$endBase} || 0;
    }

    return $to - $from;
}
709 |
##
# Converts a date string to a DateTime object
#
# The string is parsed with str2time and the resulting epoch is turned into
# a DateTime. When the string is missing, cannot be parsed, or DateTime
# rejects the epoch, an error is logged and $fallbackDate is returned.
#
# @param {string} datetimeString
# @param fallbackDate - returned as-is when the conversion fails
# @returns the converted DateTime object, or $fallbackDate
# @static
# @private
##
sub _convert_to_datetime {
    my ( $datetimeString, $fallbackDate ) = @_;

    # The eval only yields a value; the decision to return is made outside
    # of it, because a 'return' inside eval {} leaves the eval block only.
    my $conversion = eval {
        my $epoch =
          defined $datetimeString ? str2time($datetimeString) : undef;

        # Without this check an unparseable string reaches from_epoch as an
        # undefined epoch.
        die "unparseable date string\n" unless defined $epoch;

        DateTime->from_epoch( epoch => $epoch );
    };
    return $conversion if $conversion;

    $logger->error(
"The string '$datetimeString' can't be converted to a DateTime object. Using $fallbackDate"
    );
    return $fallbackDate;
}
732 |
733 | 1;
734 |
--------------------------------------------------------------------------------
/src/main/perl/Kruser/MLB/HitAdjuster.pm:
--------------------------------------------------------------------------------
1 | package Kruser::MLB::HitAdjuster;
2 |
3 | ##
4 | # A module that provides methods for converting
5 | # the MLB X/Y hit coordinates into angles and distances
6 | #
7 | # TODO: the methods here may really require homeX, homeY and distanceMultiplier settings
8 | # per ballpark.
9 | #
10 | # @author kruser
11 | ##
12 | use strict;
13 |
14 | my $PI = atan2 0, -1;
15 |
##
# Construct an instance
#
# Defaults for homeX, homeY and distanceMultiplier can be overridden (and
# extra settings added) through the named parameters.
#
# @param params - key/value pairs copied onto the instance
##
sub new {
    my $proto  = shift;
    my %params = @_;

    # Later pairs win, so caller-supplied values replace the defaults
    my $this = {
        homeX              => 125.1,
        homeY              => 204.5,
        distanceMultiplier => 2.3142,
        %params,
    };

    # Works both as a class method and when called on an existing instance
    return bless( $this, ref($proto) || $proto );
}
36 |
##
# Given a hip instance, returns an angle of the hit from home plate, assuming home plate is at the center of a circle
# and zero degrees is due left of home plate.
#
# 45 degrees = left field foul line
# 90 degrees = up the middle
# 135 degrees = right field foul line
#
# The angle is measured from the instance's homeX/homeY to the hit's x/y.
#
# @param hit - an instance of the hip - see here for an example: http://gd2.mlb.com/components/game/mlb/year_2013/month_07/day_25/gid_2013_07_25_minmlb_seamlb_1/inning/inning_hit.xml
# @returns the angle in degrees, formatted with two decimals
##
sub get_hit_angle {
    my ( $this, $hit ) = @_;

    # Both deltas are "home minus hit"
    my $deltaX = $this->{homeX} - $hit->{x};
    my $deltaY = $this->{homeY} - $hit->{y};

    # atan2 yields radians; convert to degrees before rounding
    return sprintf( "%.2f", atan2( $deltaY, $deltaX ) * 180 / $PI );
}
62 |
##
# Given a hip instance, returns an estimation of the distance between home plate and the x,y coordinates of the hit.
#
# The straight-line distance in coordinate units is scaled by the
# instance's distanceMultiplier.
#
# WARNING: THIS METHOD ISN'T RELIABLE. IT NEEDS TO TAKE THE FIELD INTO ACCOUNT AS THE IMAGES AND X/Y COORDINATES
# ARE NOT TO THE SAME SCALE ON FIELD TO FIELD.
#
# @param hit - an instance of the hip - see here for an example: http://gd2.mlb.com/components/game/mlb/year_2013/month_07/day_25/gid_2013_07_25_minmlb_seamlb_1/inning/inning_hit.xml
# @returns the estimated distance, formatted with two decimals
##
sub estimate_hit_distance {
    my ( $this, $hit ) = @_;

    # Squaring discards the sign, so the direction of each delta is irrelevant
    my $squaredX = ( $this->{homeX} - $hit->{x} )**2;
    my $squaredY = ( $this->{homeY} - $hit->{y} )**2;

    my $scaled = sqrt( $squaredX + $squaredY ) * $this->{distanceMultiplier};
    return sprintf( "%.2f", $scaled );
}
88 |
89 | 1;
90 |
--------------------------------------------------------------------------------
/src/main/perl/Kruser/MLB/Storage/Mongo.pm:
--------------------------------------------------------------------------------
1 | package Kruser::MLB::Storage::Mongo;
2 |
3 | ##
4 | # Provides storage to a MongoDB database where the source files are
5 | # standard Perl data structures
6 | #
7 | # All structures will be converted to BSON/JSON before persisting
8 | #
9 | # @author kruser
10 | ##
11 | use strict;
12 | use Log::Log4perl;
13 | use MongoDB;
14 | use Data::Dumper;
15 |
16 | $MongoDB::BSON::looks_like_number = 1;
17 |
18 | my $logger = Log::Log4perl->get_logger("Kruser::MLB::Storage::Mongo");
19 | my $mongoClient;
20 | my $mongoDB;
21 |
##
# construct an instance
#
# Connects to MongoDB and selects the database named by dbName. The client
# and database handles live in module-level variables, so they are shared
# by every instance and replaced each time the constructor runs.
#
# @param dbName - the name of the database to use
# @param dbHost - the MongoDB host to connect to, defaults to 'localhost'
##
sub new {
    my ( $proto, %params ) = @_;
    my $package = ref($proto) || $proto;
    my $this    = {
        dbName => undef,
        dbHost => 'localhost'
    };

    foreach my $key ( keys %params ) {
        $this->{$key} = $params{$key};
    }

    # Only hand a host to the driver when one is actually configured; an
    # explicit undef or empty dbHost falls back to the driver's default.
    my %clientArgs;
    if ( defined $this->{dbHost} && length $this->{dbHost} ) {
        $clientArgs{host} = $this->{dbHost};
    }

    $mongoClient = MongoDB::MongoClient->new(%clientArgs);
    $mongoDB     = $mongoClient->get_database( $this->{dbName} );

    bless( $this, $package );
    return $this;
}
44 |
##
# Save a game and its rosters
#
# The game structure is inserted as one document into the 'games' collection.
#
# @param game - the game object
##
sub save_game {
    my ( $this, $game ) = @_;

    return $mongoDB->get_collection('games')->insert($game);
}
58 |
##
# Saves an array of at-bats
#
# The at-bats go to the 'atbats' collection in one batch insert. An empty
# array is a no-op.
#
# @param {Object[]} atbats
##
sub save_at_bats {
    my ( $this, $atbats ) = @_;

    my $collectionName = 'atbats';

    if ( my $pending = scalar @{$atbats} ) {
        my @ids =
          $mongoDB->get_collection($collectionName)->batch_insert($atbats);

        # Report what the driver returned, not what was requested
        my $saved = scalar @ids;
        $logger->debug(
            "Saved $saved at bats to the '$collectionName' collection");
    }
}
##
# Saves an array of pitches
#
# The pitches go to the 'pitches' collection in one batch insert. An empty
# array is a no-op.
#
# @param {Object[]} pitches
##
sub save_pitches {
    my ( $this, $pitches ) = @_;

    my $collectionName = 'pitches';

    if ( my $pending = scalar @{$pitches} ) {
        my @ids =
          $mongoDB->get_collection($collectionName)->batch_insert($pitches);

        # Report what the driver returned, not what was requested
        my $saved = scalar @ids;
        $logger->debug(
            "Saved $saved pitches to the '$collectionName' collection");
    }
}
101 |
##
# This method will be called to save or update any players. Each object will have an
# 'id' property.
#
# A player whose id is already in the 'players' collection only gets its
# 'lastSeen' field refreshed. All other players are inserted in one batch
# at the end.
#
# @param {Object%} players - key is the MLB ID of the player
##
sub save_players {
    my ( $this, $players ) = @_;

    my $collectionName = 'players';
    my $collection     = $mongoDB->get_collection($collectionName);

    my @unseenPlayers;
    my $refreshedCount = 0;

    for my $playerId ( keys %{$players} ) {
        my $player = $players->{$playerId};

        # Unknown id: queue the full record for the batch insert
        unless ( $collection->find_one( { id => $playerId } ) ) {
            push( @unseenPlayers, $player );
            next;
        }

        # Known id: only the lastSeen date is touched
        $refreshedCount++;
        $collection->update( { id => $playerId },
            { '$set' => { 'lastSeen' => $player->{'lastSeen'} } } );
    }

    $logger->debug("Updated $refreshedCount players with new lastSeen dates");

    if ( my $pending = scalar @unseenPlayers ) {
        my @ids = $collection->batch_insert( \@unseenPlayers );

        my $saved = scalar @ids;
        $logger->debug(
            "Saved $saved new players to the '$collectionName' collection");
    }
}
141 |
##
# Get the date when the database was last sync'd to
# MLB data
#
# A cli query for this might look like...
# db.games.find().sort({'source_day':-1}).limit(1).pretty();
#
# @returns the 'source_day' value of the game with the highest source_day,
#          or 0 when the 'games' collection has no documents
##
sub get_last_sync_date {
    my $this = shift;

    # Newest source_day first, and only that one document
    my $newestFirst =
      $mongoDB->get_collection('games')->find()
      ->sort( { 'source_day' => -1 } )->limit(1);

    return 0 unless $newestFirst->count() > 0;
    return $newestFirst->next()->{'source_day'};
}
165 |
##
# Checks if we already have games for that day.
#
# A cli query for this might look like...
# db.games.find({'source_day':'2013-06-01'}).limit(1).pretty();
#
# @param {string} day in YYYY-MM-DD format
# @returns the number of stored games for that day, which is true
#          if we already have persisted data for this day
##
sub already_have_day {
    my ( $this, $dayString ) = @_;

    return scalar $mongoDB->get_collection('games')
      ->find( { 'source_day' => $dayString } )->count();
}
184 | 1;
185 |
--------------------------------------------------------------------------------
/src/main/perl/atbatETL.pl:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 | #
3 | # ETL script for taking MLB AtBat data and moving it into a set of different MongoDB collections
4 | #
5 | # @author: kruser
6 | #
7 | use strict;
8 | use Kruser::MLB::AtBat;
9 | use Kruser::MLB::Storage::Mongo;
10 | use Config::Properties;
11 | use Log::Log4perl;
12 | use Data::Dumper;
13 | use File::Basename;
14 | use Getopt::Long;
15 |
16 | my $properties;
17 | my $year;
18 | my $month;
19 | my $day;
20 | my $path = dirname(__FILE__); # where the script lives
21 | Log::Log4perl->init( $path . '/log4perl.conf' );
22 | my $logger = Log::Log4perl->get_logger("atbatETL");
23 |
##
# Main
#
# Reads the command line and the properties file, wires the Mongo storage
# into the AtBat client and starts the sync.
##
# fills $year/$month/$day from the command line; --help prints usage and exits
load_options();
# populates $properties from atbatETL.properties next to this script
load_properties();
my $storage = Kruser::MLB::Storage::Mongo->new(
dbName => $properties->getProperty('db.name'),
dbHost => $properties->getProperty('db.host'),
);
# year/month/day stay undefined unless they were given on the command line
my $atbat = Kruser::MLB::AtBat->new(
storage => $storage,
apibase => $properties->getProperty('apibase'),
year => $year,
month => $month,
day => $day,
);
$atbat->initiate_sync();
41 |
##
# loads the properties from the script configuration file
#
# Reads 'atbatETL.properties' from the directory this script lives in and
# stores the parsed result in the file-level $properties variable. A missing
# file is logged first; the script then dies when the file cannot be opened.
##
sub load_properties {
    my $configFile = $path . '/atbatETL.properties';
    if ( !-e $configFile ) {
        $logger->error("The config file '$configFile' does not exist");
    }

    # Three-argument open with a lexical handle: the file name can never be
    # mistaken for an open mode and the handle does not leak into the package.
    open( my $propsHandle, '<', $configFile )
      or die "Unable to open configuration file $configFile: $!";

    $properties = Config::Properties->new();
    $properties->load($propsHandle);

    close($propsHandle);
    return;
}
56 |
##
# load all of the startup options
#
# Fills the file-level $year, $month and $day variables from the command
# line. -h/--help prints the usage text and exits. Arguments GetOptions
# rejects (unknown options, non-integer values) stop the script instead of
# silently falling back to a default sync.
##
sub load_options {
    my $help;
    GetOptions(
        "help|h"  => \$help,
        "year=i"  => \$year,
        "month=i" => \$month,
        "day=i"   => \$day,
    ) or die "Invalid command line arguments; run with --help for usage\n";

    usage() if $help;
    return;
}
74 |
##
# Prints out some help
#
# Writes the usage text to standard output and then exits the script.
##
sub usage {
    my @helpText = (
"With no args, this program will sync from the last date the program was run\n",
"When you initially run it, you should sync an entire month or year to seed your database.\n\n",
        "Optional args\n",
        " --year=YYYY (the year to sync with)\n",
        " --month=MM (the month to sync with, must be used with --year)\n",
" --day=DD (the day to sync with, must be used with --year and --month)\n",
        "\nFor example, this will sync June 2013\n",
        "\tperl atbatETL.pl --year=2013 --month=06\n",
    );

    print join( '', @helpText );
    exit;
}
92 |
93 |
--------------------------------------------------------------------------------
/src/main/perl/atbatETL.properties:
--------------------------------------------------------------------------------
1 | # the base URL for the at-bat API site
2 | apibase=http://gd2.mlb.com/components/game/mlb
3 |
4 | # database connection information
5 | db.name=mlbatbat
6 | db.host=localhost
7 | db.username=
8 | db.password=
--------------------------------------------------------------------------------
/src/main/perl/log4perl.conf:
--------------------------------------------------------------------------------
1 | log4perl.rootLogger=DEBUG, LOGFILE
2 |
3 | log4perl.appender.LOGFILE=Log::Log4perl::Appender::File
4 | log4perl.appender.LOGFILE.filename=mlbatbat.log
5 | log4perl.appender.LOGFILE.layout=PatternLayout
6 | log4perl.appender.LOGFILE.layout.ConversionPattern=%d %p [%c] %m%n
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/1.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
70 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/10.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
75 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/12.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
71 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/13.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
69 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/14.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
79 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/15.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
75 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/16.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
76 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/17.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 | ]>
6 |
57 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/19.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
70 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/2.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
78 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kruser/atbat-mongodb/12b8dbbcde6f0caa0e7a8d627c55f31046f45bd7/src/main/resources/stadiumImages/20.png
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/22.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
70 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/2392.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
70 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/2394.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
68 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/2395.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
83 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/2397.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 | ]>
6 |
69 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/25.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kruser/atbat-mongodb/12b8dbbcde6f0caa0e7a8d627c55f31046f45bd7/src/main/resources/stadiumImages/25.png
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/2504.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
69 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/2535.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kruser/atbat-mongodb/12b8dbbcde6f0caa0e7a8d627c55f31046f45bd7/src/main/resources/stadiumImages/2535.png
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/2602.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
72 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/2680-2013.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
48 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/2681.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
71 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/2889.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 | ]>
6 |
62 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/3.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
73 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/31.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
68 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/32.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
72 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/3289.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 | ]>
6 |
65 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/3309.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 | ]>
6 |
64 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/3312.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
68 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/3313.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
70 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/4.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
62 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/4169.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
5 | ]>
6 |
45 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/5.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
70 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/680.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
53 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/7.svg:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
72 |
--------------------------------------------------------------------------------
/src/main/resources/stadiumImages/8.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kruser/atbat-mongodb/12b8dbbcde6f0caa0e7a8d627c55f31046f45bd7/src/main/resources/stadiumImages/8.png
--------------------------------------------------------------------------------
/src/main/resources/stadiums.xml:
--------------------------------------------------------------------------------
1 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 |
44 |
45 |
46 |
47 |
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
94 |
95 |
96 |
97 |
98 |
99 |
100 |
101 |
102 |
103 |
104 |
105 |
106 |
107 |
108 |
109 |
110 |
111 |
112 |
113 |
114 |
--------------------------------------------------------------------------------
/src/test/perl/HitAdjuster.t:
--------------------------------------------------------------------------------
#
# Tests for the HitAdjuster module
#
use strict;
use warnings;

use Kruser::MLB::HitAdjuster;

# 4 angle assertions + 4 distance assertions
use Test::More tests => 8;

# One shared adjuster built with the module's default settings
my $hitAdjuster = Kruser::MLB::HitAdjuster->new();

diamond_cutter_test();
left_field_line_test();
left_field_gap_test();
right_field_line_test();
distance_test();
14 |
# A hit with the same x as home plate must come out as straight up the middle
sub diamond_cutter_test
{
    my $expected = 90;
    my $angle    = $hitAdjuster->get_hit_angle(
        {
            des => 'single',
            x   => 125.1,
            y   => 99.40,
        }
    );

    ok( $angle == $expected, "expecting $expected, got $angle" );
}
27 |
# A hit far to the left of home plate (small x) yields an angle below 90
sub left_field_line_test
{
    my $expected = 52.06;
    my $angle    = $hitAdjuster->get_hit_angle(
        {
            des => 'Double',
            x   => 43.17,
            y   => 99.40,
        }
    );

    ok( $angle == $expected, "expecting $expected, got $angle" );
}
40 |
# A hit between the left field line and dead center
sub left_field_gap_test
{
    my $expected = 69.09;
    my $angle    = $hitAdjuster->get_hit_angle(
        {
            des => 'Double',
            x   => 72.29,
            y   => 66.27,
        }
    );

    ok( $angle == $expected, "expecting $expected, got $angle" );
}
53 |
# A hit to the right of home plate (large x) yields an angle above 90
sub right_field_line_test
{
    my $expected = 135.25;
    my $angle    = $hitAdjuster->get_hit_angle(
        {
            des => 'Double',
            x   => 172,
            y   => 158,
        }
    );

    ok( $angle == $expected, "expecting $expected, got $angle" );
}
66 |
# Distance estimates for four hit locations, checked in a fixed order
sub distance_test
{
    # Each entry: [ hit coordinates, expected distance ]
    my @cases = (
        [ { x => 136,    y => 32, },     400 ],
        [ { x => 225.50, y => 102.50, }, 331.21 ],
        [ { x => 27.30,  y => 104.50, }, 323.70 ],
        [ { x => 66,     y => 62, },     357.01 ],
    );

    foreach my $case (@cases) {
        my ( $hit, $expected ) = @{$case};
        my $distance = $hitAdjuster->estimate_hit_distance($hit);
        ok( $distance == $expected, "expecting $expected, got $distance" );
    }
}
85 |
86 |
--------------------------------------------------------------------------------