├── README ├── bin └── highdiv.pl ├── data ├── NYSE_daily ├── NYSE_dividends ├── README ├── baseball └── webcrawl ├── examples ├── README ├── ch2 │ └── average_dividend.pig ├── ch4 │ ├── no_schema.pig │ ├── no_schema_filter.pig │ ├── no_schema_join.pig │ ├── total_trade_estimate.pig │ ├── unintended_walks.pig │ └── unintended_walks_cast.pig ├── ch5 │ ├── batting_production.pig │ ├── count.pig │ ├── countall.pig │ ├── defaultparallel.pig │ ├── define.pig │ ├── define_constructor_args.pig │ ├── distinct.pig │ ├── filter_matches.pig │ ├── filter_not_matches.pig │ ├── group.pig │ ├── invoker.pig │ ├── join.pig │ ├── join2key.pig │ ├── leftjoin.pig │ ├── limit.pig │ ├── order.pig │ ├── order2key.pig │ ├── orderdesc.pig │ ├── parallel.pig │ ├── register.pig │ ├── sample.pig │ ├── selfjoin.pig │ ├── twokey.pig │ └── udf_in_foreach.pig ├── ch6 │ ├── analyze_stock.pig │ ├── cross.pig │ ├── daily.params │ ├── daily.pig │ ├── distinct_symbols.pig │ ├── dividend_analysis.pig │ ├── double_distinct.pig │ ├── flatten.pig │ ├── flatten_noempty.pig │ ├── highest_dividend.pig │ ├── macro.pig │ ├── main.pig │ ├── mergejoin.pig │ ├── multiquery.pig │ ├── repljoin.pig │ ├── semijoin.pig │ ├── sort_for_mergejoin.pig │ ├── streamship.pig │ ├── streamsimple.pig │ └── thetajoin.pig ├── ch7 │ ├── describe.pig │ ├── expected.out │ ├── explain.pig │ ├── illustrate.pig │ ├── java │ │ ├── build.xml │ │ └── example │ │ │ └── PigUnitExample.java │ ├── pigunit.pig │ ├── pigunitwithparams.pig │ └── stats.pig └── ch9 │ ├── pagerank.py │ └── pagerankbindnoarg.py ├── setup ├── README ├── baseball.pig └── tomap.java └── udfs ├── java ├── README ├── build.xml └── com │ └── acme │ ├── financial │ ├── AnalyzeStock.java │ ├── AnalyzeStockV2.java │ └── CurrencyConverter.java │ ├── io │ ├── JsonLoader.java │ └── JsonStorage.java │ ├── marketing │ ├── CloseEnough.java │ ├── MetroResolver.java │ └── MetroResolverV2.java │ └── math │ ├── Pow.java │ └── PowV2.java └── python ├── production.py └── square.py 
/README: -------------------------------------------------------------------------------- 1 | This repository contains the example data and Pig Latin scripts for the book 2 | _Programming Pig_ by Alan F. Gates, published by O'Reilly. All data used in 3 | the examples is in the public domain. All Pig Latin scripts and associated 4 | user defined functions are released under the Apache 2.0 license. 5 | 6 | In this repository you will find the code used to take the data from its source 7 | and prepare it for the examples in the setup directory. The data directory 8 | contains the cleansed data, ready for use in the examples. The examples 9 | directory contains the example Pig Latin scripts, divided by chapters. The 10 | udfs directory contains the UDFs used in the examples. 11 | -------------------------------------------------------------------------------- /bin/highdiv.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | # This code is made available under the Apache License, Version 2.0 (the 4 | # "License"); you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 11 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 12 | # License for the specific language governing permissions and limitations 13 | # under the License. 14 | 15 | use strict; 16 | 17 | while (<>) { 18 | my @fields = split; 19 | if ($fields[3] > 1.0) { 20 | print join("\t", @fields) . 
"\n" 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /data/NYSE_dividends: -------------------------------------------------------------------------------- 1 | NYSE CPO 2009-12-30 0.14 2 | NYSE CPO 2009-09-28 0.14 3 | NYSE CPO 2009-06-26 0.14 4 | NYSE CPO 2009-03-27 0.14 5 | NYSE CPO 2009-01-06 0.14 6 | NYSE CCS 2009-10-28 0.414 7 | NYSE CCS 2009-07-29 0.414 8 | NYSE CCS 2009-04-29 0.414 9 | NYSE CCS 2009-01-28 0.414 10 | NYSE CIF 2009-12-09 0.029 11 | NYSE CIF 2009-11-10 0.019 12 | NYSE CIF 2009-10-13 0.019 13 | NYSE CIF 2009-09-10 0.019 14 | NYSE CIF 2009-08-10 0.02 15 | NYSE CIF 2009-07-13 0.02 16 | NYSE CIF 2009-06-10 0.02 17 | NYSE CIF 2009-05-11 0.021 18 | NYSE CIF 2009-04-13 0.022 19 | NYSE CIF 2009-03-09 0.022 20 | NYSE CIF 2009-02-09 0.022 21 | NYSE CIF 2009-01-12 0.025 22 | NYSE CYE 2009-12-29 0.055 23 | NYSE CYE 2009-12-11 0.055 24 | NYSE CYE 2009-11-10 0.055 25 | NYSE CYE 2009-10-13 0.055 26 | NYSE CYE 2009-09-11 0.055 27 | NYSE CYE 2009-08-12 0.055 28 | NYSE CYE 2009-07-13 0.055 29 | NYSE CYE 2009-06-11 0.055 30 | NYSE CYE 2009-05-13 0.06 31 | NYSE CYE 2009-04-13 0.06 32 | NYSE CYE 2009-03-12 0.06 33 | NYSE CYE 2009-02-11 0.06 34 | NYSE CWF 2009-12-15 0.034 35 | NYSE CWF 2009-11-17 0.034 36 | NYSE CWF 2009-10-20 0.034 37 | NYSE CWF 2009-09-15 0.034 38 | NYSE CWF 2009-08-18 0.034 39 | NYSE CWF 2009-07-21 0.034 40 | NYSE CWF 2009-06-16 0.034 41 | NYSE CWF 2009-05-19 0.034 42 | NYSE CWF 2009-04-21 0.034 43 | NYSE CWF 2009-03-17 0.034 44 | NYSE CWF 2009-02-17 0.034 45 | NYSE CWF 2009-01-20 0.034 46 | NYSE CME 2009-12-08 1.15 47 | NYSE CME 2009-09-08 1.15 48 | NYSE CME 2009-06-08 1.15 49 | NYSE CME 2009-03-06 1.15 50 | NYSE CM 2009-12-24 0.828 51 | NYSE CM 2009-09-24 0.815 52 | NYSE CM 2009-06-25 0.76 53 | NYSE CM 2009-03-25 0.709 54 | NYSE CYD 2009-10-02 0.1 55 | NYSE CGO 2009-12-29 0.1 56 | NYSE CGO 2009-12-03 0.1 57 | NYSE CGO 2009-11-06 0.1 58 | NYSE CGO 2009-10-07 0.1 59 | NYSE CGO 2009-09-08 0.1 60 | 
NYSE CGO 2009-08-06 0.1 61 | NYSE CGO 2009-07-08 0.1 62 | NYSE CGO 2009-06-08 0.1 63 | NYSE CGO 2009-05-07 0.1 64 | NYSE CGO 2009-04-08 0.1 65 | NYSE CGO 2009-03-09 0.1 66 | NYSE CGO 2009-02-06 0.1 67 | NYSE CBU 2009-12-11 0.22 68 | NYSE CBU 2009-09-11 0.22 69 | NYSE CBU 2009-06-11 0.22 70 | NYSE CBU 2009-03-12 0.22 71 | NYSE CE 2009-10-13 0.04 72 | NYSE CE 2009-07-13 0.04 73 | NYSE CE 2009-04-13 0.04 74 | NYSE CE 2009-01-13 0.04 75 | NYSE CIB 2009-12-17 0.311 76 | NYSE CIB 2009-09-17 0.311 77 | NYSE CIB 2009-06-12 0.274 78 | NYSE CIB 2009-03-17 0.244 79 | NYSE CCT 2009-10-28 0.438 80 | NYSE CCT 2009-07-29 0.438 81 | NYSE CCT 2009-04-29 0.438 82 | NYSE CCT 2009-01-28 0.438 83 | NYSE CEE 2009-12-29 0.651 84 | NYSE COO 2009-07-16 0.03 85 | NYSE COO 2009-01-14 0.03 86 | NYSE CMP 2009-11-27 0.355 87 | NYSE CMP 2009-08-28 0.355 88 | NYSE CMP 2009-05-28 0.355 89 | NYSE CMP 2009-02-25 0.355 90 | NYSE CNL 2009-11-05 0.225 91 | NYSE CNL 2009-08-06 0.225 92 | NYSE CNL 2009-04-30 0.225 93 | NYSE CNL 2009-02-05 0.225 94 | NYSE CATO 2009-12-17 0.165 95 | NYSE CATO 2009-09-10 0.165 96 | NYSE CATO 2009-06-04 0.165 97 | NYSE CATO 2009-03-05 0.165 98 | NYSE CPL 2009-08-18 1.963 99 | NYSE CPL 2009-03-03 1.617 100 | NYSE CP 2009-12-29 0.236 101 | NYSE CP 2009-09-23 0.232 102 | NYSE CP 2009-06-24 0.215 103 | NYSE CP 2009-03-25 0.202 104 | NYSE CPA 2009-05-27 0.37 105 | NYSE CNW 2009-11-10 0.1 106 | NYSE CNW 2009-08-12 0.1 107 | NYSE CNW 2009-05-13 0.1 108 | NYSE CNW 2009-02-11 0.1 109 | NYSE CVB 2009-08-27 0.388 110 | NYSE CVB 2009-02-25 0.388 111 | NYSE CLC 2009-10-07 0.098 112 | NYSE CLC 2009-07-08 0.09 113 | NYSE CLC 2009-04-07 0.09 114 | NYSE CLC 2009-01-07 0.09 115 | NYSE CLNY 2009-12-29 0.07 116 | NYSE CNP 2009-11-12 0.19 117 | NYSE CNP 2009-08-12 0.19 118 | NYSE CNP 2009-05-13 0.19 119 | NYSE CNP 2009-02-11 0.19 120 | NYSE CSA 2009-12-29 0.1 121 | NYSE CSA 2009-09-23 0.1 122 | NYSE CSA 2009-06-23 0.1 123 | NYSE CSA 2009-03-27 0.225 124 | NYSE CSA 2009-01-12 0.225 125 | NYSE CDI 
2009-11-09 0.13 126 | NYSE CDI 2009-08-11 0.13 127 | NYSE CDI 2009-05-12 0.13 128 | NYSE CDI 2009-03-10 0.13 129 | NYSE CSL 2009-11-12 0.16 130 | NYSE CSL 2009-08-12 0.16 131 | NYSE CSL 2009-05-14 0.155 132 | NYSE CSL 2009-02-18 0.155 133 | NYSE CPK 2009-12-10 0.315 134 | NYSE CPK 2009-09-10 0.315 135 | NYSE CPK 2009-06-10 0.315 136 | NYSE CPK 2009-03-11 0.305 137 | NYSE CMU 2009-12-09 0.029 138 | NYSE CMU 2009-11-10 0.028 139 | NYSE CMU 2009-10-13 0.028 140 | NYSE CMU 2009-09-10 0.028 141 | NYSE CMU 2009-08-10 0.028 142 | NYSE CMU 2009-07-13 0.028 143 | NYSE CMU 2009-06-10 0.028 144 | NYSE CMU 2009-05-11 0.028 145 | NYSE CMU 2009-04-13 0.028 146 | NYSE CMU 2009-03-09 0.028 147 | NYSE CMU 2009-02-09 0.027 148 | NYSE CMU 2009-01-12 0.026 149 | NYSE CDR 2009-12-29 0.09 150 | NYSE CDR 2009-02-06 0.113 151 | NYSE CYS 2009-12-29 0.55 152 | NYSE CYS 2009-10-01 0.35 153 | NYSE COY 2009-12-29 0.061 154 | NYSE COY 2009-12-11 0.061 155 | NYSE COY 2009-11-10 0.061 156 | NYSE COY 2009-10-13 0.061 157 | NYSE COY 2009-09-11 0.061 158 | NYSE COY 2009-08-12 0.061 159 | NYSE COY 2009-07-13 0.061 160 | NYSE COY 2009-06-11 0.061 161 | NYSE COY 2009-05-13 0.061 162 | NYSE COY 2009-04-13 0.061 163 | NYSE COY 2009-03-12 0.061 164 | NYSE COY 2009-02-11 0.061 165 | NYSE CHI 2009-12-29 0.095 166 | NYSE CHI 2009-12-03 0.095 167 | NYSE CHI 2009-11-06 0.095 168 | NYSE CHI 2009-10-07 0.095 169 | NYSE CHI 2009-09-08 0.095 170 | NYSE CHI 2009-08-06 0.095 171 | NYSE CHI 2009-07-08 0.095 172 | NYSE CHI 2009-06-08 0.095 173 | NYSE CHI 2009-05-07 0.095 174 | NYSE CHI 2009-04-08 0.095 175 | NYSE CHI 2009-03-09 0.095 176 | NYSE CHI 2009-02-06 0.095 177 | NYSE COF 2009-11-06 0.05 178 | NYSE COF 2009-08-07 0.05 179 | NYSE COF 2009-05-07 0.05 180 | NYSE COF 2009-02-06 0.375 181 | NYSE CRT 2009-12-29 0.178 182 | NYSE CRT 2009-11-25 0.195 183 | NYSE CRT 2009-10-28 0.226 184 | NYSE CRT 2009-09-28 0.194 185 | NYSE CRT 2009-08-27 0.16 186 | NYSE CRT 2009-07-29 0.107 187 | NYSE CRT 2009-06-26 0.139 188 | NYSE 
CRT 2009-05-27 0.125 189 | NYSE CRT 2009-04-28 0.118 190 | NYSE CRT 2009-03-27 0.118 191 | NYSE CRT 2009-02-25 0.104 192 | NYSE CRT 2009-01-28 0.223 193 | NYSE CHA 2009-04-21 1.097 194 | NYSE CLF 2009-11-18 0.088 195 | NYSE CLF 2009-08-12 0.04 196 | NYSE CLF 2009-05-20 0.04 197 | NYSE CLF 2009-02-12 0.088 198 | NYSE CEG 2009-12-08 0.24 199 | NYSE CEG 2009-09-08 0.24 200 | NYSE CEG 2009-06-08 0.24 201 | NYSE CEG 2009-03-06 0.24 202 | NYSE CSE 2009-12-14 0.01 203 | NYSE CSE 2009-09-14 0.01 204 | NYSE CSE 2009-06-12 0.01 205 | NYSE CSE 2009-03-19 0.01 206 | NYSE COH 2009-12-03 0.075 207 | NYSE COH 2009-09-03 0.075 208 | NYSE COH 2009-06-04 0.075 209 | NYSE CPB 2009-12-28 0.275 210 | NYSE CPB 2009-10-01 0.25 211 | NYSE CPB 2009-07-01 0.25 212 | NYSE CPB 2009-04-02 0.25 213 | NYSE CPT 2009-12-17 0.45 214 | NYSE CPT 2009-09-28 0.45 215 | NYSE CPT 2009-06-26 0.45 216 | NYSE CPT 2009-03-27 0.7 217 | NYSE CVS 2009-10-20 0.076 218 | NYSE CVS 2009-07-21 0.076 219 | NYSE CVS 2009-04-20 0.076 220 | NYSE CVS 2009-01-21 0.076 221 | NYSE CWZ 2009-10-09 1.109 222 | NYSE CWZ 2009-04-09 1.109 223 | NYSE CLI 2009-10-01 0.45 224 | NYSE CLI 2009-07-01 0.45 225 | NYSE CLI 2009-04-01 0.45 226 | NYSE CLI 2009-01-02 0.64 227 | NYSE CAG 2009-10-28 0.2 228 | NYSE CAG 2009-07-29 0.19 229 | NYSE CAG 2009-04-29 0.19 230 | NYSE CAG 2009-01-28 0.19 231 | NYSE CFR 2009-11-27 0.43 232 | NYSE CFR 2009-08-28 0.43 233 | NYSE CFR 2009-05-28 0.43 234 | NYSE CFR 2009-02-25 0.42 235 | NYSE CMK 2009-12-09 0.041 236 | NYSE CMK 2009-11-10 0.038 237 | NYSE CMK 2009-10-13 0.038 238 | NYSE CMK 2009-09-10 0.038 239 | NYSE CMK 2009-08-10 0.041 240 | NYSE CMK 2009-07-13 0.041 241 | NYSE CMK 2009-06-10 0.041 242 | NYSE CMK 2009-05-11 0.041 243 | NYSE CMK 2009-04-13 0.041 244 | NYSE CMK 2009-03-09 0.041 245 | NYSE CMK 2009-02-09 0.041 246 | NYSE CMK 2009-01-12 0.041 247 | NYSE CWT 2009-11-05 0.295 248 | NYSE CWT 2009-08-06 0.295 249 | NYSE CWT 2009-05-06 0.295 250 | NYSE CWT 2009-02-05 0.295 251 | NYSE CPY 2009-11-25 
0.16 252 | NYSE CPY 2009-08-13 0.16 253 | NYSE CPY 2009-05-19 0.16 254 | NYSE CPY 2009-02-25 0.16 255 | NYSE CSQ 2009-12-29 0.053 256 | NYSE CSQ 2009-12-03 0.053 257 | NYSE CSQ 2009-11-06 0.053 258 | NYSE CSQ 2009-10-07 0.063 259 | NYSE CSQ 2009-09-08 0.063 260 | NYSE CSQ 2009-08-06 0.063 261 | NYSE CSQ 2009-07-08 0.063 262 | NYSE CSQ 2009-06-08 0.075 263 | NYSE CSQ 2009-05-07 0.075 264 | NYSE CSQ 2009-04-08 0.075 265 | NYSE CSQ 2009-03-09 0.075 266 | NYSE CSQ 2009-02-06 0.075 267 | NYSE CRR 2009-10-29 0.18 268 | NYSE CRR 2009-07-30 0.18 269 | NYSE CRR 2009-04-29 0.17 270 | NYSE CRR 2009-01-29 0.17 271 | NYSE COL 2009-11-12 0.24 272 | NYSE COL 2009-08-13 0.24 273 | NYSE COL 2009-05-14 0.24 274 | NYSE COL 2009-02-11 0.24 275 | NYSE CTL 2009-11-30 0.7 276 | NYSE CTL 2009-09-03 0.7 277 | NYSE CTL 2009-06-12 0.7 278 | NYSE CTL 2009-03-13 0.7 279 | NYSE CHT 2009-07-30 1.16269 280 | NYSE CHT 2009-03-03 0.0 281 | NYSE CBK 2009-10-06 0.06 282 | NYSE CBK 2009-06-16 0.06 283 | NYSE CBK 2009-03-19 0.06 284 | NYSE CRH 2009-09-02 0.265 285 | NYSE CRH 2009-03-11 0.609 286 | NYSE CSS 2009-11-27 0.15 287 | NYSE CSS 2009-08-28 0.15 288 | NYSE CSS 2009-06-09 0.15 289 | NYSE CSS 2009-02-26 0.15 290 | NYSE CIG 2009-04-30 0.5456 291 | NYSE CF 2009-11-12 0.1 292 | NYSE CF 2009-08-12 0.1 293 | NYSE CF 2009-05-12 0.1 294 | NYSE CF 2009-02-12 0.1 295 | NYSE CJB 2009-11-25 0.688 296 | NYSE CJB 2009-10-16 0.595 297 | NYSE CJB 2009-08-27 0.688 298 | NYSE CJB 2009-05-27 0.688 299 | NYSE CJB 2009-04-16 0.621 300 | NYSE CJB 2009-02-25 0.688 301 | NYSE CBL 2009-12-28 0.05 302 | NYSE CBL 2009-09-28 0.05 303 | NYSE CBL 2009-06-26 0.11 304 | NYSE CBL 2009-03-11 0.37 305 | NYSE CMI 2009-11-18 0.175 306 | NYSE CMI 2009-08-19 0.175 307 | NYSE CMI 2009-05-20 0.175 308 | NYSE CMI 2009-02-18 0.175 309 | NYSE CAH 2009-12-29 0.175 310 | NYSE CAH 2009-09-29 0.175 311 | NYSE CAH 2009-09-02 9.75 312 | NYSE CAH 2009-06-29 0.175 313 | NYSE CAH 2009-03-30 0.14 314 | NYSE CAE 2009-12-11 0.029 315 | NYSE CAE 
2009-09-11 0.028 316 | NYSE CAE 2009-06-11 0.027 317 | NYSE CAE 2009-03-12 0.023 318 | NYSE CJS 2009-11-25 0.844 319 | NYSE CJS 2009-10-16 0.723 320 | NYSE CJS 2009-08-27 0.581 321 | NYSE CBY 2009-09-16 0.376 322 | NYSE CBY 2009-04-22 0.639 323 | NYSE CHG 2009-10-08 0.54 324 | NYSE CHG 2009-07-08 0.54 325 | NYSE CHG 2009-04-07 0.54 326 | NYSE CHG 2009-01-07 0.54 327 | NYSE CHN 2009-12-22 0.209 328 | NYSE CAS 2009-03-17 0.06 329 | NYSE CCE 2009-11-24 0.08 330 | NYSE CCE 2009-09-09 0.08 331 | NYSE CCE 2009-06-10 0.07 332 | NYSE CCE 2009-03-11 0.07 333 | NYSE CUB 2009-08-19 0.09 334 | NYSE CUB 2009-03-06 0.09 335 | NYSE CJA 2009-11-25 0.688 336 | NYSE CJA 2009-10-16 0.501 337 | NYSE CJA 2009-08-27 0.688 338 | NYSE CJA 2009-05-27 0.688 339 | NYSE CJA 2009-04-16 0.524 340 | NYSE CJA 2009-02-25 0.688 341 | NYSE CIX 2009-12-08 0.125 342 | NYSE CIX 2009-09-10 0.125 343 | NYSE CIX 2009-06-08 0.125 344 | NYSE CIX 2009-03-06 0.125 345 | NYSE CBD 2009-11-19 0.077 346 | NYSE CBD 2009-08-12 0.148 347 | NYSE CBD 2009-05-04 0.247 348 | NYSE CMS 2009-11-04 0.125 349 | NYSE CMS 2009-08-06 0.125 350 | NYSE CMS 2009-05-06 0.125 351 | NYSE CMS 2009-02-04 0.125 352 | NYSE CMO 2009-12-29 0.54 353 | NYSE CMO 2009-09-28 0.56 354 | NYSE CMO 2009-06-26 0.58 355 | NYSE CMO 2009-03-27 0.56 356 | NYSE CTB 2009-11-30 0.105 357 | NYSE CTB 2009-08-31 0.105 358 | NYSE CTB 2009-05-29 0.105 359 | NYSE CTB 2009-03-05 0.105 360 | NYSE CFT 2009-12-29 0.397 361 | NYSE CFT 2009-12-01 0.395 362 | NYSE CFT 2009-11-02 0.421 363 | NYSE CFT 2009-10-01 0.399 364 | NYSE CFT 2009-07-01 0.414 365 | NYSE CFT 2009-06-01 0.437 366 | NYSE CFT 2009-05-01 0.421 367 | NYSE CFT 2009-04-01 0.425 368 | NYSE CFT 2009-03-02 0.41 369 | NYSE CFT 2009-02-02 0.428 370 | NYSE CJT 2009-11-25 0.844 371 | NYSE CJT 2009-08-27 0.581 372 | NYSE CBT 2009-11-24 0.18 373 | NYSE CBT 2009-08-26 0.18 374 | NYSE CBT 2009-05-27 0.18 375 | NYSE CBT 2009-02-25 0.18 376 | NYSE CL 2009-10-22 0.44 377 | NYSE CL 2009-07-22 0.44 378 | NYSE CL 
2009-04-22 0.44 379 | NYSE CL 2009-01-22 0.4 380 | NYSE CSP 2009-12-28 0.098 381 | NYSE CSP 2009-12-01 0.098 382 | NYSE CSP 2009-11-02 0.055 383 | NYSE CSP 2009-10-01 0.055 384 | NYSE CSP 2009-09-02 0.055 385 | NYSE CSP 2009-08-03 0.055 386 | NYSE CSP 2009-07-01 0.06 387 | NYSE CSP 2009-06-01 0.06 388 | NYSE CSP 2009-05-01 0.06 389 | NYSE CSP 2009-04-01 0.06 390 | NYSE CSP 2009-03-02 0.06 391 | NYSE CSP 2009-02-02 0.065 392 | NYSE CI 2009-03-09 0.04 393 | NYSE CHK 2009-12-30 0.075 394 | NYSE CHK 2009-09-29 0.075 395 | NYSE CHK 2009-06-29 0.075 396 | NYSE CHK 2009-03-30 0.075 397 | NYSE CASC 2009-12-31 0.01 398 | NYSE CASC 2009-09-30 0.01 399 | NYSE CASC 2009-06-29 0.05 400 | NYSE CASC 2009-04-27 0.05 401 | NYSE CBC 2009-02-11 0.05 402 | NYSE CA 2009-11-13 0.04 403 | NYSE CA 2009-08-06 0.04 404 | NYSE CA 2009-05-27 0.04 405 | NYSE CA 2009-02-12 0.04 406 | NYSE CMA 2009-12-11 0.05 407 | NYSE CMA 2009-09-11 0.05 408 | NYSE CMA 2009-06-11 0.05 409 | NYSE CMA 2009-03-11 0.05 410 | NYSE CHH 2009-12-30 0.185 411 | NYSE CHH 2009-09-30 0.185 412 | NYSE CHH 2009-06-30 0.185 413 | NYSE CHH 2009-04-01 0.185 414 | NYSE CRS 2009-10-23 0.18 415 | NYSE CRS 2009-08-28 0.18 416 | NYSE CRS 2009-05-01 0.18 417 | NYSE CRS 2009-01-30 0.18 418 | NYSE CCJ 2009-12-29 0.057 419 | NYSE CCJ 2009-09-28 0.055 420 | NYSE CCJ 2009-06-26 0.052 421 | NYSE CCJ 2009-02-27 0.047 422 | NYSE CBE 2009-11-25 0.25 423 | NYSE CBE 2009-08-27 0.25 424 | NYSE CBE 2009-05-27 0.25 425 | NYSE CBE 2009-02-25 0.25 426 | NYSE CVE 2009-12-17 0.2 427 | NYSE CVE 2009-12-09 23.549999 428 | NYSE CV 2009-10-30 0.23 429 | NYSE CV 2009-07-31 0.23 430 | NYSE CV 2009-05-01 0.23 431 | NYSE CPV 2009-12-09 0.422 432 | NYSE CPV 2009-09-09 0.422 433 | NYSE CPV 2009-06-10 0.422 434 | NYSE CPV 2009-03-10 0.422 435 | NYSE CIR 2009-11-10 0.038 436 | NYSE CIR 2009-08-12 0.038 437 | NYSE CIR 2009-05-13 0.038 438 | NYSE CIR 2009-03-11 0.038 439 | NYSE CLX 2009-10-26 0.5 440 | NYSE CLX 2009-07-23 0.5 441 | NYSE CLX 2009-04-23 0.46 442 | 
NYSE CLX 2009-01-26 0.46 443 | NYSE CHE 2009-11-17 0.12 444 | NYSE CHE 2009-08-13 0.12 445 | NYSE CHE 2009-06-04 0.06 446 | NYSE CHE 2009-02-26 0.06 447 | NYSE CSX 2009-11-25 0.22 448 | NYSE CSX 2009-08-27 0.22 449 | NYSE CSX 2009-05-27 0.22 450 | NYSE CSX 2009-02-25 0.22 451 | NYSE COG 2009-11-06 0.03 452 | NYSE COG 2009-08-04 0.03 453 | NYSE COG 2009-05-08 0.03 454 | NYSE COG 2009-02-03 0.03 455 | NYSE CHU 2009-06-03 0.293 456 | NYSE CCH 2009-06-22 0.375 457 | NYSE CCW 2009-11-27 0.438 458 | NYSE CCW 2009-08-28 0.438 459 | NYSE CCW 2009-05-28 0.438 460 | NYSE CCW 2009-02-25 0.438 461 | NYSE COV 2009-10-02 0.18 462 | NYSE COV 2009-07-31 0.16 463 | NYSE COV 2009-03-26 0.16 464 | NYSE COV 2009-02-04 0.16 465 | NYSE CJR 2009-12-11 0.05 466 | NYSE CJR 2009-11-12 0.048 467 | NYSE CJR 2009-10-13 0.048 468 | NYSE CJR 2009-09-11 0.046 469 | NYSE CJR 2009-08-13 0.045 470 | NYSE CJR 2009-07-13 0.043 471 | NYSE CJR 2009-06-11 0.04 472 | NYSE CJR 2009-05-13 0.043 473 | NYSE CJR 2009-04-13 0.041 474 | NYSE CJR 2009-03-11 0.039 475 | NYSE CJR 2009-02-11 0.041 476 | NYSE CJR 2009-01-13 0.042 477 | NYSE CCU 2009-12-29 0.596 478 | NYSE CCU 2009-04-20 0.895 479 | NYSE CAT 2009-10-22 0.42 480 | NYSE CAT 2009-07-16 0.42 481 | NYSE CAT 2009-04-16 0.42 482 | NYSE CAT 2009-01-15 0.42 483 | NYSE CHY 2009-12-29 0.085 484 | NYSE CHY 2009-12-03 0.085 485 | NYSE CHY 2009-11-06 0.085 486 | NYSE CHY 2009-10-07 0.085 487 | NYSE CHY 2009-09-08 0.085 488 | NYSE CHY 2009-08-06 0.085 489 | NYSE CHY 2009-07-08 0.085 490 | NYSE CHY 2009-06-08 0.085 491 | NYSE CHY 2009-05-07 0.085 492 | NYSE CHY 2009-04-08 0.085 493 | NYSE CHY 2009-03-09 0.085 494 | NYSE CHY 2009-02-06 0.085 495 | NYSE CVC 2009-11-10 0.1 496 | NYSE CVC 2009-08-06 0.1 497 | NYSE CVC 2009-05-14 0.1 498 | NYSE CVC 2009-03-05 0.1 499 | NYSE CXE 2009-12-09 0.032 500 | NYSE CXE 2009-11-10 0.031 501 | NYSE CXE 2009-10-13 0.031 502 | NYSE CXE 2009-09-10 0.031 503 | NYSE CXE 2009-08-10 0.031 504 | NYSE CXE 2009-07-13 0.031 505 | NYSE CXE 
2009-06-10 0.031 506 | NYSE CXE 2009-05-11 0.031 507 | NYSE CXE 2009-04-13 0.031 508 | NYSE CXE 2009-03-09 0.03 509 | NYSE CXE 2009-02-09 0.028 510 | NYSE CXE 2009-01-12 0.027 511 | NYSE CNQ 2009-12-09 0.1 512 | NYSE CNQ 2009-09-09 0.098 513 | NYSE CNQ 2009-06-10 0.095 514 | NYSE CNQ 2009-03-11 0.081 515 | NYSE CW 2009-11-23 0.08 516 | NYSE CW 2009-10-07 0.08 517 | NYSE CW 2009-06-30 0.08 518 | NYSE CW 2009-04-01 0.08 519 | NYSE CYT 2009-11-06 0.013 520 | NYSE CYT 2009-08-06 0.013 521 | NYSE CYT 2009-05-07 0.013 522 | NYSE CYT 2009-02-06 0.125 523 | NYSE CIM 2009-12-29 0.17 524 | NYSE CIM 2009-09-29 0.12 525 | NYSE CIM 2009-05-28 0.08 526 | NYSE CIM 2009-04-02 0.06 527 | NYSE CLB 2009-10-21 0.1 528 | NYSE CLB 2009-07-22 0.85 529 | NYSE CLB 2009-04-23 0.1 530 | NYSE CLB 2009-01-29 0.1 531 | NYSE CHD 2009-11-05 0.14 532 | NYSE CHD 2009-08-11 0.14 533 | NYSE CHD 2009-05-07 0.09 534 | NYSE CHD 2009-02-05 0.09 535 | NYSE CXH 2009-12-09 0.054 536 | NYSE CXH 2009-11-10 0.054 537 | NYSE CXH 2009-10-13 0.054 538 | NYSE CXH 2009-09-10 0.053 539 | NYSE CXH 2009-08-10 0.053 540 | NYSE CXH 2009-07-13 0.053 541 | NYSE CXH 2009-06-10 0.052 542 | NYSE CXH 2009-05-11 0.052 543 | NYSE CXH 2009-04-13 0.051 544 | NYSE CXH 2009-03-09 0.049 545 | NYSE CXH 2009-02-09 0.045 546 | NYSE CXH 2009-01-12 0.043 547 | NYSE CR 2009-11-25 0.2 548 | NYSE CR 2009-08-27 0.2 549 | NYSE CR 2009-05-27 0.2 550 | NYSE CR 2009-02-25 0.2 551 | NYSE CRE 2009-12-10 0.17 552 | NYSE CRE 2009-09-09 0.17 553 | NYSE CRE 2009-06-19 0.17 554 | NYSE CRE 2009-03-24 0.17 555 | NYSE CNI 2009-12-08 0.238 556 | NYSE CNI 2009-09-04 0.229 557 | NYSE CNI 2009-06-05 0.228 558 | NYSE CNI 2009-03-06 0.196 559 | NYSE CUZ 2009-10-22 0.03 560 | NYSE CUZ 2009-07-30 0.15 561 | NYSE CUZ 2009-04-29 0.25 562 | NYSE CUZ 2009-02-05 0.25 563 | NYSE CLP 2009-11-05 0.15 564 | NYSE CLP 2009-07-30 0.15 565 | NYSE CLP 2009-04-30 0.15 566 | NYSE CLP 2009-02-05 0.25 567 | NYSE CTS 2009-12-22 0.03 568 | NYSE CTS 2009-09-23 0.03 569 | NYSE CTS 
2009-06-24 0.03 570 | NYSE CTS 2009-03-25 0.03 571 | NYSE CNX 2009-11-02 0.1 572 | NYSE CNX 2009-08-04 0.1 573 | NYSE CNX 2009-05-01 0.1 574 | NYSE CNX 2009-02-05 0.1 575 | NYSE CSJ 2009-12-29 0.246 576 | NYSE CSJ 2009-12-01 0.276 577 | NYSE CSJ 2009-11-02 0.277 578 | NYSE CSJ 2009-10-01 0.304 579 | NYSE CSJ 2009-07-01 0.331 580 | NYSE CSJ 2009-06-01 0.342 581 | NYSE CSJ 2009-05-01 0.359 582 | NYSE CSJ 2009-04-01 0.374 583 | NYSE CSJ 2009-03-02 0.367 584 | NYSE CSJ 2009-02-02 0.355 585 | NYSE CEL 2009-11-30 0.77 586 | NYSE CEL 2009-08-27 0.8 587 | NYSE CEL 2009-06-04 0.84 588 | NYSE CEL 2009-03-12 0.66 589 | NYSE CIU 2009-12-29 0.374 590 | NYSE CIU 2009-12-01 0.386 591 | NYSE CIU 2009-11-02 0.409 592 | NYSE CIU 2009-10-01 0.395 593 | NYSE CIU 2009-07-01 0.402 594 | NYSE CIU 2009-06-01 0.436 595 | NYSE CIU 2009-05-01 0.394 596 | NYSE CIU 2009-04-01 0.425 597 | NYSE CIU 2009-03-02 0.406 598 | NYSE CIU 2009-02-02 0.418 599 | NYSE CCZ 2009-12-29 0.406 600 | NYSE CCZ 2009-10-13 0.406 601 | NYSE CCZ 2009-07-15 0.392 602 | NYSE CCZ 2009-04-09 0.392 603 | NYSE CCZ 2009-01-08 0.392 604 | NYSE CNS 2009-12-02 0.05 605 | NYSE CNS 2009-09-02 0.05 606 | NYSE CNS 2009-06-02 0.05 607 | NYSE CNS 2009-03-27 0.05 608 | NYSE CPP 2009-12-10 1.006 609 | NYSE CPP 2009-06-10 1.006 610 | NYSE CNK 2009-11-23 0.18 611 | NYSE CNK 2009-08-13 0.18 612 | NYSE CNK 2009-05-29 0.18 613 | NYSE CNK 2009-03-03 0.18 614 | NYSE CVX 2009-11-16 0.68 615 | NYSE CVX 2009-08-17 0.68 616 | NYSE CVX 2009-05-15 0.65 617 | NYSE CVX 2009-02-12 0.65 618 | NYSE CII 2009-12-18 0.485 619 | NYSE CII 2009-09-11 0.485 620 | NYSE CII 2009-06-11 0.485 621 | NYSE CII 2009-03-12 0.485 622 | NYSE CBS 2009-12-09 0.05 623 | NYSE CBS 2009-09-08 0.05 624 | NYSE CBS 2009-06-08 0.05 625 | NYSE CBS 2009-03-09 0.05 626 | NYSE CKR 2009-10-29 0.06 627 | NYSE CKR 2009-08-06 0.06 628 | NYSE CKR 2009-05-14 0.06 629 | NYSE CKR 2009-01-22 0.06 630 | NYSE CB 2009-12-16 0.35 631 | NYSE CB 2009-09-16 0.35 632 | NYSE CB 2009-06-24 0.35 633 | 
NYSE CB 2009-03-18 0.35 634 | NYSE CSH 2009-11-02 0.035 635 | NYSE CSH 2009-08-03 0.035 636 | NYSE CSH 2009-05-04 0.035 637 | NYSE CSH 2009-02-09 0.035 638 | NYSE CHW 2009-12-29 0.05 639 | NYSE CHW 2009-12-03 0.05 640 | NYSE CHW 2009-11-06 0.05 641 | NYSE CHW 2009-10-07 0.06 642 | NYSE CHW 2009-09-08 0.06 643 | NYSE CHW 2009-08-06 0.06 644 | NYSE CHW 2009-07-08 0.06 645 | NYSE CHW 2009-06-08 0.08 646 | NYSE CHW 2009-05-07 0.08 647 | NYSE CHW 2009-04-08 0.08 648 | NYSE CHW 2009-03-09 0.08 649 | NYSE CHW 2009-02-06 0.08 650 | NYSE CRP 2009-12-08 0.494 651 | NYSE CRP 2009-09-08 0.494 652 | NYSE CRP 2009-06-08 0.494 653 | NYSE CRP 2009-03-09 0.494 654 | NYSE CHL 2009-09-10 0.868 655 | NYSE CHL 2009-05-11 0.906 656 | NYSE CYN 2009-11-02 0.1 657 | NYSE CYN 2009-08-03 0.1 658 | NYSE CYN 2009-05-04 0.1 659 | NYSE CYN 2009-02-02 0.25 660 | NYSE COP 2009-10-28 0.5 661 | NYSE COP 2009-07-29 0.47 662 | NYSE COP 2009-05-21 0.47 663 | NYSE COP 2009-02-19 0.47 664 | NYSE CEO 2009-09-04 2.58 665 | NYSE CEO 2009-05-14 2.581 666 | NYSE CS 2009-04-27 0.096 667 | NYSE CMC 2009-09-30 0.12 668 | NYSE CMC 2009-06-30 0.12 669 | NYSE CMC 2009-04-01 0.12 670 | NYSE CMC 2009-01-06 0.12 671 | -------------------------------------------------------------------------------- /data/README: -------------------------------------------------------------------------------- 1 | This directory contains data sets used in the examples. 2 | 3 | The baseball data has a schema of: 4 | 5 | name:chararray, team:chararray, position:bag{t:(p:chararray)}, bat:map[] 6 | 7 | The possible keys in the bat map are: games, at_bats, hits, runs, 8 | doubles, triples, home_runs, grand_slams, rbis, base_on_balls, ibbs, strikeouts, 9 | sacrifice_hits, sacrifice_flies, hit_by_pitch, gdb, batting_average, 10 | on_base_percentage, and slugging_percentage. 
11 | 12 | The baseball data was obtained at 13 | http://download.freebase.com/datadumps/2011-04-14/browse/baseball.tar.bz2 14 | 15 | and was found via http://www.infochimps.com/ 16 | 17 | NYSE_daily data has a schema of: 18 | 19 | exchange:chararray, symbol:chararray, date:chararray, open:float, high:float, 20 | low:float, close:float, volume:int, adj_close:float 21 | 22 | The NYSE_dividends data has a schema of 23 | 24 | exchange:chararray, symbol:chararray, date:chararray, dividends:float 25 | 26 | The NYSE data was obtained at 27 | http://www.infochimps.com/datasets/nyse-daily-1970-2010-open-close-high-low-and-volume 28 | -------------------------------------------------------------------------------- /examples/README: -------------------------------------------------------------------------------- 1 | This directory contains all the examples, divided by chapters. All of these 2 | examples assume that the data being read is in the current working directory. 3 | So if you are running in local mode you will need to start pig in the data 4 | directory. If you load the data into your cluster, you will need to place 5 | the data files in your home directory. 6 | -------------------------------------------------------------------------------- /examples/ch2/average_dividend.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 
12 | 13 | dividends = load 'NYSE_dividends' as (exchange, symbol, date, dividend); 14 | grouped = group dividends by symbol; 15 | avg = foreach grouped generate group, AVG(dividends.dividend); 16 | store avg into 'average_dividend'; 17 | -------------------------------------------------------------------------------- /examples/ch4/no_schema.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | daily = load 'NYSE_daily'; 14 | calcs = foreach daily generate $7 / 1000000, $3 * 100.0, SUBSTRING($0, 0, 1), $6 - $3; 15 | dump calcs; 16 | -------------------------------------------------------------------------------- /examples/ch4/no_schema_filter.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | daily = load 'NYSE_daily'; 14 | fltrd = filter daily by $6 > $3; 15 | dump fltrd; 16 | -------------------------------------------------------------------------------- /examples/ch4/no_schema_join.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | divs = load 'NYSE_dividends' as (exchange, stock_symbol, date, dividends); 14 | daily = load 'NYSE_daily'; 15 | jnd = join divs by stock_symbol, daily by $1; 16 | dump jnd; 17 | -------------------------------------------------------------------------------- /examples/ch4/total_trade_estimate.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | daily = load 'NYSE_daily' as (exchange:chararray, symbol:chararray, 14 | date:chararray, open:float, high:float, low:float, close:float, 15 | volume:int, adj_close:float); 16 | rough = foreach daily generate volume * close; 17 | dump rough; 18 | -------------------------------------------------------------------------------- /examples/ch4/unintended_walks.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | player = load 'baseball' as (name:chararray, team:chararray, 14 | pos:bag{t:(p:chararray)}, bat:map[]); 15 | unintended = foreach player generate bat#'base_on_balls' - bat#'ibbs'; 16 | dump unintended; 17 | -------------------------------------------------------------------------------- /examples/ch4/unintended_walks_cast.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 
3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | player = load 'baseball' as (name:chararray, team:chararray, 14 | pos:bag{t:(p:chararray)}, bat:map[]); 15 | unintended = foreach player generate (int)bat#'base_on_balls' - (int)bat#'ibbs'; 16 | dump unintended; 17 | -------------------------------------------------------------------------------- /examples/ch5/batting_production.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 
12 | 13 | register 'production.py' using jython as bballudfs; 14 | players = load 'baseball' as (name:chararray, team:chararray, 15 | pos:bag{t:(p:chararray)}, bat:map[]); 16 | nonnull = filter players by bat#'slugging_percentage' is not null and 17 | bat#'on_base_percentage' is not null; 18 | calcprod = foreach nonnull generate name, bballudfs.production( 19 | (float)bat#'slugging_percentage', 20 | (float)bat#'on_base_percentage'); 21 | dump calcprod; 22 | -------------------------------------------------------------------------------- /examples/ch5/count.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | daily = load 'NYSE_daily' as (exchange, stock); 14 | grpd = group daily by stock; 15 | cnt = foreach grpd generate group, COUNT(daily); 16 | dump cnt; 17 | 18 | -------------------------------------------------------------------------------- /examples/ch5/countall.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 
3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | daily = load 'NYSE_daily' as (exchange, stock); 14 | grpd = group daily all; 15 | cnt = foreach grpd generate COUNT(daily); 16 | dump cnt; 17 | 18 | -------------------------------------------------------------------------------- /examples/ch5/defaultparallel.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 
12 | 13 | set default_parallel 10; 14 | daily = load 'NYSE_daily' as (exchange, symbol, date, open, high, low, close, 15 | volume, adj_close); 16 | bysymbl = group daily by symbol; 17 | average = foreach bysymbl generate group, AVG(daily.close) as avg; 18 | sorted = order average by avg desc; 19 | dump sorted; 20 | 21 | -------------------------------------------------------------------------------- /examples/ch5/define.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | register '/piggybank.jar'; 14 | define reverse org.apache.pig.piggybank.evaluation.string.Reverse(); 15 | divs = load 'NYSE_dividends' as (exchange:chararray, symbol:chararray, 16 | date:chararray, dividends:float); 17 | backwards = foreach divs generate reverse(symbol); 18 | dump backwards; 19 | -------------------------------------------------------------------------------- /examples/ch5/define_constructor_args.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 
3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | register 'acme.jar'; 14 | define convert com.acme.financial.CurrencyConverter('dollar', 'euro'); 15 | divs = load 'NYSE_dividends' as (exchange:chararray, symbol:chararray, 16 | date:chararray, dividends:float); 17 | backwards = foreach divs generate convert(dividends); 18 | dump backwards; 19 | -------------------------------------------------------------------------------- /examples/ch5/distinct.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | -- find a distinct list of ticker symbols for each exchange 14 | -- This load will truncate the records, picking up just the first two fields. 
15 | daily = load 'NYSE_daily' as (exchange:chararray, symbol:chararray); 16 | uniq = distinct daily; 17 | dump uniq; 18 | -------------------------------------------------------------------------------- /examples/ch5/filter_matches.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | -- filter_matches.pig 14 | divs = load 'NYSE_dividends' as (exchange:chararray, symbol:chararray, 15 | date:chararray, dividends:float); 16 | startswithcm = filter divs by symbol matches 'CM.*'; 17 | dump startswithcm; 18 | -------------------------------------------------------------------------------- /examples/ch5/filter_not_matches.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 
12 | 13 | -- filter_not_matches.pig 14 | divs = load 'NYSE_dividends' as (exchange:chararray, symbol:chararray, 15 | date:chararray, dividends:float); 16 | notstartswithcm = filter divs by not symbol matches 'CM.*'; 17 | dump notstartswithcm; 18 | -------------------------------------------------------------------------------- /examples/ch5/group.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | daily = load 'NYSE_daily' as (exchange, stock); 14 | grpd = group daily by stock; 15 | store grpd into 'by_group'; 16 | -------------------------------------------------------------------------------- /examples/ch5/invoker.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 
12 | 13 | define hex InvokeForString('java.lang.Integer.toHexString', 'int'); 14 | divs = load 'NYSE_daily' as (exchange, symbol, date, open, high, low, 15 | close, volume, adj_close); 16 | nonnull = filter divs by volume is not null; 17 | inhex = foreach nonnull generate symbol, hex((int)volume); 18 | dump inhex; 19 | 20 | -------------------------------------------------------------------------------- /examples/ch5/join.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | daily = load 'NYSE_daily' as (exchange, symbol, date, open, high, low, close, 14 | volume, adj_close); 15 | divs = load 'NYSE_dividends' as (exchange, symbol, date, dividends); 16 | jnd = join daily by symbol, divs by symbol; 17 | dump jnd; 18 | -------------------------------------------------------------------------------- /examples/ch5/join2key.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 
3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | daily = load 'NYSE_daily' as (exchange, symbol, date, open, high, low, close, 14 | volume, adj_close); 15 | divs = load 'NYSE_dividends' as (exchange, symbol, date, dividends); 16 | jnd = join daily by (symbol, date), divs by (symbol, date); 17 | dump jnd; 18 | -------------------------------------------------------------------------------- /examples/ch5/leftjoin.pig: -------------------------------------------------------------------------------- 1 | 2 | -- This code is made available under the Apache License, Version 2.0 (the 3 | -- "License"); you may not use this file except in compliance with the License. 4 | -- You may obtain a copy of the License at 5 | -- 6 | -- http://www.apache.org/licenses/LICENSE-2.0 7 | -- 8 | -- Unless required by applicable law or agreed to in writing, software 9 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | -- License for the specific language governing permissions and limitations 12 | -- under the License. 
13 | 14 | daily = load 'NYSE_daily' as (exchange, symbol, date, open, high, low, close, 15 | volume, adj_close); 16 | divs = load 'NYSE_dividends' as (exchange, symbol, date, dividends); 17 | jnd = join daily by (symbol, date) left outer, divs by (symbol, date); 18 | dump jnd; 19 | -------------------------------------------------------------------------------- /examples/ch5/limit.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | divs = load 'NYSE_dividends'; 14 | first10 = limit divs 10; 15 | dump first10; 16 | -------------------------------------------------------------------------------- /examples/ch5/order.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 
12 | 13 | daily = load 'NYSE_daily' as (exchange:chararray, symbol:chararray, 14 | date:chararray, open:float, high:float, low:float, close:float, 15 | volume:int, adj_close:float); 16 | bydate = order daily by date; 17 | dump bydate; 18 | -------------------------------------------------------------------------------- /examples/ch5/order2key.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | daily = load 'NYSE_daily' as (exchange:chararray, symbol:chararray, 14 | date:chararray, open:float, high:float, low:float, 15 | close:float, volume:int, adj_close:float); 16 | bydatensymbol = order daily by date, symbol; 17 | dump bydatensymbol; 18 | -------------------------------------------------------------------------------- /examples/ch5/orderdesc.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 
3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | daily = load 'NYSE_daily' as (exchange:chararray, symbol:chararray, 14 | date:chararray, open:float, high:float, low:float, close:float, 15 | volume:int, adj_close:float); 16 | byclose = order daily by close desc, open; 17 | dump byclose; 18 | -------------------------------------------------------------------------------- /examples/ch5/parallel.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 
12 | 13 | daily = load 'NYSE_daily' as (exchange, symbol, date, open, high, low, close, 14 | volume, adj_close); 15 | bysymbl = group daily by symbol parallel 10; 16 | average = foreach bysymbl generate group, AVG(daily.close) as avg; 17 | sorted = order average by avg desc parallel 2; 18 | dump sorted; 19 | 20 | -------------------------------------------------------------------------------- /examples/ch5/register.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | register '/piggybank.jar'; 14 | divs = load 'NYSE_dividends' as (exchange:chararray, symbol:chararray, 15 | date:chararray, dividends:float); 16 | backwards = foreach divs generate 17 | org.apache.pig.piggybank.evaluation.string.Reverse(symbol); 18 | dump backwards; 19 | -------------------------------------------------------------------------------- /examples/ch5/sample.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 
3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | divs = load 'NYSE_dividends'; 14 | some = sample divs 0.1; 15 | dump some; 16 | -------------------------------------------------------------------------------- /examples/ch5/selfjoin.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 
12 | 13 | -- For each stock, find all dividends that increased between two dates. dividends is declared as float in both loads so that the comparison in the filter is numeric rather than a byte-wise bytearray comparison. 14 | divs1 = load 'NYSE_dividends' as (exchange:chararray, symbol:chararray, 15 | date:chararray, dividends:float); 16 | divs2 = load 'NYSE_dividends' as (exchange:chararray, symbol:chararray, 17 | date:chararray, dividends:float); 18 | jnd = join divs1 by symbol, divs2 by symbol; 19 | increased = filter jnd by divs1::date < divs2::date and 20 | divs1::dividends < divs2::dividends; 21 | dump increased; 22 | -------------------------------------------------------------------------------- /examples/ch5/twokey.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | daily = load 'NYSE_daily' as (exchange, stock, date, dividends); 14 | grpd = group daily by (exchange, stock); 15 | avg = foreach grpd generate group, AVG(daily.dividends); 16 | --dump avg; 17 | describe grpd; 18 | 19 | -------------------------------------------------------------------------------- /examples/ch5/udf_in_foreach.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License.
3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | divs = load 'NYSE_dividends' as (exchange, symbol, date, dividends); 14 | --upper-case every symbol so that differently cased spellings group together 15 | upped = foreach divs generate UPPER(symbol) as symbol, dividends; 16 | grpd = group upped by symbol; --output a bag upped for each value of symbol 17 | --take the bag of dividend values in each group, produce one sum per group 18 | sums = foreach grpd generate group, SUM(upped.dividends); 19 | dump sums; 20 | -------------------------------------------------------------------------------- /examples/ch6/analyze_stock.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License.
12 | 13 | register 'acme.jar'; 14 | define analyze com.acme.financial.AnalyzeStock(); 15 | daily = load 'NYSE_daily' as (exchange:chararray, symbol:chararray, 16 | date:chararray, open:float, high:float, low:float, 17 | close:float, volume:int, adj_close:float); 18 | grpd = group daily by symbol; 19 | analyzed = foreach grpd { 20 | sorted = order daily by date; 21 | generate group, analyze(sorted); 22 | }; 23 | dump analyzed; 24 | -------------------------------------------------------------------------------- /examples/ch6/cross.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 
12 | 13 | daily = load 'NYSE_daily' as (exchange:chararray, symbol:chararray, 14 | date:chararray, open:float, high:float, low:float, 15 | close:float, volume:int, adj_close:float); 16 | divs = load 'NYSE_dividends' as (exchange:chararray, symbol:chararray, 17 | date:chararray, dividends:float); 18 | tonsodata = cross daily, divs parallel 10; 19 | store tonsodata into 'crossed'; 20 | 21 | -------------------------------------------------------------------------------- /examples/ch6/daily.params: -------------------------------------------------------------------------------- 1 | DATE=2009-12-17 2 | -------------------------------------------------------------------------------- /examples/ch6/daily.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 
12 | 13 | daily = load 'NYSE_daily' as (exchange:chararray, symbol:chararray, 14 | date:chararray, open:float, high:float, low:float, close:float, 15 | volume:int, adj_close:float); 16 | yesterday = filter daily by date == '$DATE'; 17 | grpd = group yesterday all; 18 | minmax = foreach grpd generate MAX(yesterday.high), MIN(yesterday.low); 19 | dump minmax; 20 | 21 | -------------------------------------------------------------------------------- /examples/ch6/distinct_symbols.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | daily = load 'NYSE_daily' as (exchange, symbol); -- not interested in other fields 14 | grpd = group daily by exchange; 15 | uniqcnt = foreach grpd { 16 | sym = daily.symbol; 17 | uniq_sym = distinct sym; 18 | generate group, COUNT(uniq_sym); 19 | }; 20 | dump uniqcnt; 21 | 22 | -------------------------------------------------------------------------------- /examples/ch6/dividend_analysis.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 
3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | -- Given daily input and a particular year, analyze how 14 | -- stock prices changed on days dividends were paid out. 15 | define dividend_analysis (daily, year, daily_symbol, daily_open, daily_close) 16 | returns analyzed { 17 | divs = load 'NYSE_dividends' as (exchange:chararray, 18 | symbol:chararray, date:chararray, dividends:float); 19 | divsthisyear = filter divs by date matches '$year-.*'; 20 | dailythisyear = filter $daily by date matches '$year-.*'; 21 | jnd = join divsthisyear by symbol, dailythisyear by $daily_symbol; 22 | $analyzed = foreach jnd generate dailythisyear::$daily_symbol, 23 | $daily_close - $daily_open; 24 | }; 25 | -------------------------------------------------------------------------------- /examples/ch6/double_distinct.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 
12 | 13 | divs = load 'NYSE_dividends' as (exchange:chararray, symbol:chararray); 14 | grpd = group divs all; -- a single group containing every dividend record 15 | uniq = foreach grpd { -- two independent distincts computed inside one nested foreach 16 | exchanges = divs.exchange; 17 | uniq_exchanges = distinct exchanges; 18 | symbols = divs.symbol; 19 | uniq_symbols = distinct symbols; 20 | generate COUNT(uniq_exchanges), COUNT(uniq_symbols); -- (distinct exchange count, distinct symbol count) 21 | }; 22 | dump uniq; 23 | 24 | -------------------------------------------------------------------------------- /examples/ch6/flatten.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | players = load 'baseball' as (name:chararray, team:chararray, 14 | position:bag{t:(p:chararray)}, bat:map[]); 15 | pos = foreach players generate name, flatten(position) as position; -- one output row per (name, position) pair; a player whose position bag is empty produces no row 16 | bypos = group pos by position; -- players gathered under each position 17 | dump bypos; 18 | -------------------------------------------------------------------------------- /examples/ch6/flatten_noempty.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License.
3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | players = load 'baseball' as (name:chararray, team:chararray, 14 | position:bag{t:(p:chararray)}, bat:map[]); 15 | noempty = foreach players generate name, 16 | ((position is null or IsEmpty(position)) ? {('unknown')} : position) -- swap a null or empty bag for a one-tuple bag holding 'unknown' 17 | as position; 18 | pos = foreach noempty generate name, flatten(position) as position; -- every player now contributes at least one row 19 | bypos = group pos by position; 20 | dump bypos; 21 | -------------------------------------------------------------------------------- /examples/ch6/highest_dividend.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License.
12 | 13 | divs = load 'NYSE_dividends' as (exchange:chararray, symbol:chararray, 14 | date:chararray, dividends:float); 15 | grpd = group divs by symbol; 16 | top3 = foreach grpd { 17 | sorted = order divs by dividends desc; -- largest dividends first within each symbol 18 | top = limit sorted 3; -- keep at most three records per symbol 19 | generate group, flatten(top); -- the symbol followed by the fields of each kept record 20 | }; 21 | dump top3; 22 | -------------------------------------------------------------------------------- /examples/ch6/macro.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | -- Given daily input and a particular year, analyze how 14 | -- stock prices changed on days dividends were paid out.
15 | define dividend_analysis (daily, year, daily_symbol, daily_open, daily_close) 16 | returns analyzed { 17 | divs = load 'NYSE_dividends' as (exchange:chararray, 18 | symbol:chararray, date:chararray, dividends:float); 19 | divsthisyear = filter divs by date matches '$year-.*'; -- dividend rows whose date starts with the requested year 20 | dailythisyear = filter $daily by date matches '$year-.*'; -- the caller's relation must expose a field named date 21 | jnd = join divsthisyear by symbol, dailythisyear by $daily_symbol; -- NOTE(review): the key is symbol only, so each dividend row is paired with every daily row of that symbol in the year, not just the payout date - confirm whether date belongs in the key 22 | $analyzed = foreach jnd generate dailythisyear::$daily_symbol, 23 | $daily_close - $daily_open; -- close minus open taken from the daily side of the join 24 | }; 25 | 26 | daily = load 'NYSE_daily' as (exchange:chararray, symbol:chararray, 27 | date:chararray, open:float, high:float, low:float, close:float, 28 | volume:int, adj_close:float); 29 | results = dividend_analysis(daily, '2009', 'symbol', 'open', 'close'); -- expand the macro for 2009 using the symbol, open and close fields of daily 30 | dump results; 31 | 32 | -------------------------------------------------------------------------------- /examples/ch6/main.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | -- Given daily input and a particular year, analyze how 14 | -- stock prices changed on days dividends were paid out.
15 | import '../examples/ch6/dividend_analysis.pig'; -- brings the dividend_analysis macro into scope; NOTE(review): the relative path presumably assumes Pig is started from a sibling of examples/ such as data/ - confirm 16 | 17 | daily = load 'NYSE_daily' as (exchange:chararray, symbol:chararray, 18 | date:chararray, open:float, high:float, low:float, close:float, 19 | volume:int, adj_close:float); 20 | results = dividend_analysis(daily, '2009', 'symbol', 'open', 'close'); -- expand the imported macro for 2009 21 | dump results; 22 | 23 | -------------------------------------------------------------------------------- /examples/ch6/mergejoin.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | daily = load 'NYSE_daily_sorted' as (exchange:chararray, symbol:chararray, 14 | date:chararray, open:float, high:float, low:float, 15 | close:float, volume:int, adj_close:float); 16 | divs = load 'NYSE_dividends_sorted' as (exchange:chararray, symbol:chararray, 17 | date:chararray, dividends:float); 18 | jnd = join daily by symbol, divs by symbol using 'merge'; -- merge join over the two symbol-sorted inputs that sort_for_mergejoin.pig stores 19 | store jnd into 'daily_with_dividends'; 20 | 21 | -------------------------------------------------------------------------------- /examples/ch6/multiquery.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License.
3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | players = load 'baseball' as (name:chararray, team:chararray, 14 | position:bag{t:(p:chararray)}, bat:map[]); 15 | pwithba = foreach players generate name, team, position, 16 | bat#'batting_average' as batavg; -- look up the 'batting_average' key in the bat map 17 | byteam = group pwithba by team; 18 | avgbyteam = foreach byteam generate group, AVG(pwithba.batavg); 19 | store avgbyteam into 'by_team'; -- first output: average per team 20 | flattenpos = foreach pwithba generate name, team, -- second branch starts again from pwithba, so both stores share the load and projection 21 | flatten(position) as position, batavg; 22 | bypos = group flattenpos by position; 23 | avgbypos = foreach bypos generate group, AVG(flattenpos.batavg); 24 | store avgbypos into 'by_position'; -- second output: average per position 25 | -------------------------------------------------------------------------------- /examples/ch6/repljoin.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License.
12 | 13 | daily = load 'NYSE_daily' as (exchange:chararray, symbol:chararray, 14 | date:chararray, open:float, high:float, low:float, 15 | close:float, volume:int, adj_close:float); 16 | divs = load 'NYSE_dividends' as (exchange:chararray, symbol:chararray, 17 | date:chararray, dividends:float); 18 | jnd = join daily by (exchange, symbol), divs by (exchange, symbol) using 'replicated'; -- two-column key; 'replicated' asks for a fragment-replicate join, where the last input listed (divs) is the one held in memory 19 | store jnd into 'daily_with_dividends'; 20 | 21 | -------------------------------------------------------------------------------- /examples/ch6/semijoin.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License.
12 | 13 | daily = load 'NYSE_daily' as (exchange:chararray, symbol:chararray, 14 | date:chararray, open:float, high:float, low:float, 15 | close:float, volume:int, adj_close:float); 16 | divs = load 'NYSE_dividends' as (exchange:chararray, symbol:chararray, 17 | date:chararray, dividends:float); 18 | grpd = cogroup daily by (exchange, symbol), divs by (exchange, symbol); -- one row per key, carrying a bag of daily rows and a bag of divs rows 19 | sjnd = filter grpd by not IsEmpty(divs); -- keep keys that have at least one dividend row 20 | final = foreach sjnd generate flatten(daily); -- emit only the daily rows, so no divs columns reach the output 21 | dump final; 22 | 23 | -------------------------------------------------------------------------------- /examples/ch6/sort_for_mergejoin.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License.
12 | 13 | daily = load 'NYSE_daily' as (exchange:chararray, symbol:chararray, 14 | date:chararray, open:float, high:float, low:float, close:float, 15 | volume:int, adj_close:float); 16 | srtd = order daily by symbol; 17 | store srtd into 'NYSE_daily_sorted'; -- the input name that mergejoin.pig loads as daily 18 | 19 | divs = load 'NYSE_dividends' as (exchange:chararray, symbol:chararray, 20 | date:chararray, dividends:float); 21 | dsrtd = order divs by symbol; 22 | store dsrtd into 'NYSE_dividends_sorted'; -- the input name that mergejoin.pig loads as divs 23 | -------------------------------------------------------------------------------- /examples/ch6/streamship.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | define hd `highdiv.pl` ship('highdiv.pl'); -- hd runs highdiv.pl; the ship clause sends the script file along with the job 14 | divs = load 'NYSE_dividends' as (exchange, symbol, date, dividends); 15 | highdivs = stream divs through hd as (exchange, symbol, date, dividends); -- pipe each record through hd and re-apply the four-field schema to what it prints 16 | dump highdivs; 17 | 18 | -------------------------------------------------------------------------------- /examples/ch6/streamsimple.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License.
3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | divs = load 'NYSE_dividends' as (exchange, symbol, date, dividends); 14 | highdivs = stream divs through `highdiv.pl` as (exchange, symbol, date, dividends); -- invokes highdiv.pl directly, with no define or ship clause; that script prints only rows whose fourth field exceeds 1.0 15 | dump highdivs; 16 | 17 | -------------------------------------------------------------------------------- /examples/ch6/thetajoin.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License.
12 | 13 | daily = load 'NYSE_daily' as (exchange:chararray, symbol:chararray, 14 | date:chararray, open:float, high:float, low:float, 15 | close:float, volume:int, adj_close:float); 16 | divs = load 'NYSE_dividends' as (exchange:chararray, symbol:chararray, 17 | date:chararray, dividends:float); 18 | crossed = cross daily, divs; -- every daily row paired with every divs row 19 | tjnd = filter crossed by daily::date < divs::date; -- the non-equality join condition, applied after the cross; both dates are chararray, so this is a string comparison 20 | store tjnd into 'thetajoin'; 21 | 22 | -------------------------------------------------------------------------------- /examples/ch7/describe.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License.
12 | 13 | divs = load 'NYSE_dividends' as (exchange:chararray, symbol:chararray, 14 | date:chararray, dividends:float); 15 | trimmed = foreach divs generate symbol, dividends; 16 | grpd = group trimmed by symbol; 17 | avgdiv = foreach grpd generate group, AVG(trimmed.dividends); 18 | 19 | describe trimmed; -- the three describes print the schema of each intermediate relation in turn 20 | describe grpd; 21 | describe avgdiv; 22 | -------------------------------------------------------------------------------- /examples/ch7/expected.out: -------------------------------------------------------------------------------- 1 | (0.27305267014925455) -------------------------------------------------------------------------------- /examples/ch7/explain.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | divs = load 'NYSE_dividends' as (exchange, symbol, date, dividends); 14 | grpd = group divs by symbol; 15 | avgdiv = foreach grpd generate group, AVG(divs.dividends); -- average dividend per symbol 16 | store avgdiv into 'average_dividend'; 17 | -------------------------------------------------------------------------------- /examples/ch7/illustrate.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License.
3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | divs = load 'NYSE_dividends' as (e:chararray, s:chararray, d:chararray, div:float); 14 | recent = filter divs by d > '2009-01-01'; -- d is chararray, so this is a string comparison against the literal 15 | trimmd = foreach recent generate s, div; 16 | grpd = group trimmd by s; 17 | avgdiv = foreach grpd generate group, AVG(trimmd.div); -- average of div for each value of s 18 | store avgdiv into 'average_dividends'; 19 | -------------------------------------------------------------------------------- /examples/ch7/java/build.xml: -------------------------------------------------------------------------------- 1 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | -------------------------------------------------------------------------------- /examples/ch7/java/example/PigUnitExample.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is made available under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License.
4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations 12 | * under the License. 13 | */ 14 | package example; 15 | 16 | import java.io.File; 17 | import java.io.IOException; 18 | 19 | import junit.framework.TestCase; 20 | 21 | import org.apache.hadoop.fs.Path; 22 | import org.apache.pig.pigunit.Cluster; 23 | import org.apache.pig.pigunit.PigTest; 24 | import org.apache.pig.tools.parameters.ParseException; 25 | import org.junit.BeforeClass; 26 | import org.junit.Test; 27 | 28 | public class PigUnitExample { // PigUnit examples: each test runs a Pig script and checks one alias or the stored output 29 | private PigTest test; // assigned afresh at the start of every test method 30 | private static Cluster cluster; // NOTE(review): never referenced in this class - confirm it is still needed 31 | 32 | @Test 33 | public void testDataInFile() throws ParseException, IOException { 34 | // Construct an instance of PigTest that will use the script 35 | // pigunit.pig 36 | test = new PigTest("../pigunit.pig"); 37 | 38 | // Specify our expected output. The format is a string for each line. 39 | // In this particular case we only expect one line of output. 40 | String[] output = { "(0.27305267014925455)" }; 41 | 42 | // Run the test and check that the output matches our expectation. 43 | // The "avgdiv" tells PigUnit what alias to check the output value 44 | // against. It inserts a store for that alias and then checks the 45 | // contents of the stored file against output 46 | test.assertOutput("avgdiv", output); 47 | } 48 | 49 | @Test 50 | public void testTextInput() throws ParseException, IOException { 51 | test = new PigTest("../pigunit.pig"); 52 | 53 | // Rather than read from a file, generate synthetic input. 54 | // Format is one record per line, tab separated.
55 | String[] input = { 56 | "NYSE\tCPO\t2009-12-30\t0.14", 57 | "NYSE\tCPO\t2009-01-06\t0.14", 58 | "NYSE\tCCS\t2009-10-28\t0.414", 59 | "NYSE\tCCS\t2009-01-28\t0.414", 60 | "NYSE\tCIF\t2009-12-09\t0.029", 61 | }; 62 | 63 | String[] output = { "(0.22739999999999996)" }; // the mean of the five dividend values above (1.137 / 5), in the exact text form being compared 64 | 65 | // Run the example script using the input we constructed 66 | // rather than loading whatever the load statement says. 67 | // "divs" is the alias to override with the input data 68 | // As with the previous example "avgdiv" is the alias 69 | // to test against the value(s) in output. 70 | test.assertOutput("divs", input, "avgdiv", output); 71 | } 72 | 73 | @Test 74 | public void testFileOutput() throws ParseException, IOException { 75 | // The script as an array of strings, one line per string. 76 | String[] script = { 77 | "divs = load '../../../data/NYSE_dividends' as (exchange, symbol, date, dividends);", 78 | "grpd = group divs all;", 79 | "avgdiv = foreach grpd generate AVG(divs.dividends);", 80 | "store avgdiv into 'average_dividend';", 81 | }; 82 | test = new PigTest(script); // the script is supplied inline here instead of by file path 83 | 84 | // Test output against an existing file that contains the 85 | // expected output. 86 | test.assertOutput(new File("../expected.out")); 87 | } 88 | 89 | @Test 90 | public void testWithParams() throws ParseException, IOException { 91 | // Parameters to be substituted in Pig Latin script before the 92 | // test is run.
Format is one string for each parameter, 93 | // parameter=value 94 | String[] params = { 95 | "input=../../../data/NYSE_dividends", 96 | "output=average_dividend2" 97 | }; 98 | test = new PigTest("../pigunitwithparams.pig", params); // params supply the input and output names that script refers to 99 | 100 | String[] output = { "(0.27305267014925455)" }; 101 | 102 | // Test output in stored file against specified result 103 | test.assertOutput(output); 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /examples/ch7/pigunit.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | divs = load '../../../data/NYSE_dividends' as (exchange, symbol, date, dividends); -- PigUnitExample.testTextInput overrides this alias with in-memory rows 14 | grpd = group divs all; 15 | avgdiv = foreach grpd generate AVG(divs.dividends); -- the alias the PigUnit tests assert on 16 | store avgdiv into 'average_dividend'; 17 | -------------------------------------------------------------------------------- /examples/ch7/pigunitwithparams.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License.
3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | divs = load '$input' as (exchange, symbol, date, dividends); -- the load path is the input parameter; PigUnitExample.testWithParams passes it 14 | grpd = group divs all; 15 | avgdiv = foreach grpd generate AVG(divs.dividends); -- a single average over every dividend row 16 | store avgdiv into '$output'; -- the store path is the output parameter 17 | -------------------------------------------------------------------------------- /examples/ch7/stats.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 3 | -- You may obtain a copy of the License at 4 | -- 5 | -- http://www.apache.org/licenses/LICENSE-2.0 6 | -- 7 | -- Unless required by applicable law or agreed to in writing, software 8 | -- distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 9 | -- WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 10 | -- License for the specific language governing permissions and limitations 11 | -- under the License. 12 | 13 | -- Note, this script will not run with the sample input data.
I selected a 14 | -- script from Pig's e2e test suite that runs a larger data set to give a 15 | -- slightly more realistic (though still quite small) example 16 | a = load '/user/pig/tests/data/singlefile/studenttab20m' as (name, age, gpa); 17 | b = load '/user/pig/tests/data/singlefile/votertab10k' as (name, age, registration, contributions); 18 | c = filter a by age < '50'; -- NOTE(review): '50' is a quoted literal and age has no declared type, so this looks like a string comparison - confirm that is intended 19 | d = filter b by age < '50'; 20 | e = cogroup c by (name, age), d by (name, age) parallel 20; -- parallel 20 asks for 20 reducers on this step 21 | f = foreach e generate flatten(c), flatten(d); -- flattening both bags keeps only keys that have rows in both c and d 22 | g = group f by registration parallel 20; 23 | h = foreach g generate group, SUM(f.d::contributions); -- total contributions per registration value 24 | i = order h by $1, $0 parallel 20; -- sort by the sum, then by registration 25 | store i into 'student_voter_info'; 26 | -------------------------------------------------------------------------------- /examples/ch9/pagerank.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # This code is made available under the Apache License, Version 2.0 (the 3 | # "License"); you may not use this file except in compliance with the License. 4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | # License for the specific language governing permissions and limitations 12 | # under the License. 13 | 14 | from org.apache.pig.scripting import * 15 | 16 | P = Pig.compile(""" 17 | -- PR(A) = (1-d) + d (PR(T1)/C(T1) + ...
+ PR(Tn)/C(Tn)) 18 | 19 | previous_pagerank = load '$docs_in' as (url:chararray, pagerank:float, 20 | links:{link:(url:chararray)}); 21 | outbound_pagerank = foreach previous_pagerank generate 22 | pagerank / COUNT(links) as pagerank, 23 | flatten(links) as to_url; 24 | cogrpd = cogroup outbound_pagerank by to_url, 25 | previous_pagerank by url; 26 | new_pagerank = foreach cogrpd generate group as url, 27 | (1 - $d) + $d * SUM (outbound_pagerank.pagerank) 28 | as pagerank, 29 | flatten(previous_pagerank.links) as links, 30 | flatten(previous_pagerank.pagerank) AS previous_pagerank; 31 | store new_pagerank into '$docs_out'; 32 | nonulls = filter new_pagerank by previous_pagerank is not null and 33 | pagerank is not null; 34 | pagerank_diff = foreach nonulls generate ABS (previous_pagerank - pagerank); 35 | grpall = group pagerank_diff all; 36 | max_diff = foreach grpall generate MAX (pagerank_diff); 37 | store max_diff into '$max_diff'; 38 | """) 39 | 40 | params = { 'd': '0.5', 'docs_in': 'webcrawl' } 41 | 42 | for i in range(10): 43 | out = "out/pagerank_data_" + str(i + 1) 44 | max_diff = "out/max_diff_" + str(i + 1) 45 | params["docs_out"] = out 46 | params["max_diff"] = max_diff 47 | Pig.fs("rmr " + out) 48 | Pig.fs("rmr " + max_diff) 49 | bound = P.bind(params) 50 | stats = bound.runSingle() 51 | if not stats.isSuccessful(): 52 | raise 'failed' 53 | mdv = float(str(stats.result("max_diff").iterator().next().get(0))) 54 | print "max_diff_value = " + str(mdv) 55 | if mdv < 0.01: 56 | print "done at iteration " + str(i) 57 | break 58 | params["docs_in"] = out 59 | -------------------------------------------------------------------------------- /examples/ch9/pagerankbindnoarg.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # This code is made available under the Apache License, Version 2.0 (the 3 | # "License"); you may not use this file except in compliance with the License. 
4 | # You may obtain a copy of the License at 5 | # 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | # License for the specific language governing permissions and limitations 12 | # under the License. 13 | 14 | from org.apache.pig.scripting import * 15 | 16 | P = Pig.compile(""" 17 | -- PR(A) = (1-d) + d (PR(T1)/C(T1) + ... + PR(Tn)/C(Tn)) 18 | 19 | previous_pagerank = load '$docs_in' using PigStorage('\t') 20 | as (url:chararray, pagerank:float, 21 | links:{link:(url:chararray)}); 22 | outbound_pagerank = foreach previous_pagerank generatE 23 | pagerank / COUNT(links) as pagerank, 24 | flatten(links) as to_url; 25 | cogrpd = cogroup outbound_pagerank by to_url, 26 | previous_pagerank by url; 27 | new_pagerank = foreach cogrpd generate group as url, 28 | (1 - $d) + $d * SUM (outbound_pagerank.pagerank) 29 | as pagerank, 30 | flatten(previous_pagerank.links) as links, 31 | flatten(previous_pagerank.pagerank) as previous_pagerank; 32 | store new_pagerank into '$docs_out' using PigStorage('\t'); 33 | nonulls = filter new_pagerank by previous_pagerank is not null and 34 | pagerank is not null; 35 | pagerank_diff = foreach nonulls generate ABS (previous_pagerank - pagerank); 36 | grpall = group pagerank_diff all; 37 | max_diff = foreach grpall generate MAX (pagerank_diff); 38 | store max_diff into '$max_diff'; 39 | """) 40 | 41 | d = 0.5 42 | docs_in = 'webcrawl' 43 | 44 | for i in range(10): 45 | docs_out = "out/pagerank_data_" + str(i + 1) 46 | max_diff = "out/max_diff_" + str(i + 1) 47 | Pig.fs("rmr " + docs_out) 48 | Pig.fs("rmr " + max_diff) 49 | bound = P.bind() 50 | stats = bound.runSingle() 51 | if not stats.isSuccessful(): 52 | raise 'failed' 53 | mdv = float(str(stats.result("max_diff").iterator().next().get(0))) 54 | 
print "max_diff_value = " + str(mdv) 55 | if mdv < 0.01: 56 | print "done at iteration " + str(i) 57 | break 58 | docs_in = docs_out 59 | -------------------------------------------------------------------------------- /setup/README: -------------------------------------------------------------------------------- 1 | This directory contains Pig Latin scripts and code for UDFs used to prepare the 2 | data for these examples. 3 | 4 | Baseball setup: 5 | The tomap UDF, used to convert batting statistics into a map, can be compiled 6 | by doing: 7 | 8 | javac -cp PIG_HOME/pig.jar tomap.java 9 | 10 | where PIG_HOME is the directory containing a copy of pig.jar. It can then be placed in a 11 | jar by doing 12 | 13 | jar -cf udf.jar tomap.class 14 | 15 | Once you have downloaded the baseball data (see ../data/README for information 16 | on obtaining the data) you can run 17 | 18 | PIG_HOME/bin/pig -x local baseball.pig 19 | 20 | NYSE setup: 21 | The total NYSE data was too large to host and would take too long for convenient 22 | examples. So the example data contains only information for ticker symbols 23 | beginning with 'C' from the year 2009. 24 | 25 | If you would like to load the entire data set into your cluster 26 | for slightly more realistic testing, the examples should work once you adjust 27 | the file paths for load and store. I did not change the schema of the data. 28 | -------------------------------------------------------------------------------- /setup/baseball.pig: -------------------------------------------------------------------------------- 1 | -- This code is made available under the Apache License, Version 2.0 (the 2 | -- "License"); you may not use this file except in compliance with the License. 
-- Prepares the baseball example data: joins players to their current team
-- and to their lifetime batting statistics (converted to a map by the tomap
-- UDF), then stores the result in 'baseball'.

register 'udf.jar';
-- Load teams and flatten out the list of players on each team so I can
-- determine which team each player is from.
teams = load 'baseball_team.tsv' as (name, id, division,
            league, current_roster, current_manager,
            historical_roster, historical_managers,
            current_coaches, historical_coaches, team_stats);
teams_with_players = filter teams by current_roster is not null;
-- Replace the comma separator with white space so I can use TOKENIZE to
-- turn the list into a bag.
temp = foreach teams_with_players generate name,
            REPLACE(current_roster, ',', ' ') as current_roster;
team_players = foreach temp generate name,
            flatten(TOKENIZE(current_roster)) as id;

-- Load players, turn their positions into a bag, and join them against
-- the team data generated above
players = load 'baseball_player.tsv' as (name, id,
            hall_of_fame_induction, current_team, position_s,
            bats, former_teams, batting_stats, baseball_almanac_id,
            lifetime_batting_statistics);
player_position = filter players by position_s is not null;
-- Replace spaces with underscores so that "relief pitcher" ends up
-- as one position and not two.
temp2 = foreach player_position generate name, current_team,
            REPLACE(position_s, ' ', '_') as position,
            batting_stats, lifetime_batting_statistics;
-- Replace the comma separator with white space so I can use TOKENIZE to
-- turn the list into a bag.
temp1 = foreach temp2 generate name, current_team,
            REPLACE(position, ',', ' ') as position, batting_stats,
            lifetime_batting_statistics;
positioned_players = foreach temp1 generate name, current_team,
            TOKENIZE(position) as position,
            lifetime_batting_statistics;
joined = join positioned_players by current_team,
            team_players by id;
players_with_teams = foreach joined generate positioned_players::name as name,
            team_players::name as team, position,
            lifetime_batting_statistics;

-- Load lifetime batting stats, turn them into a map, and then join them
-- with the players for our final result
batting = load 'lifetime_batting_statistics.tsv' as (name,id, player,
            starting_season, ending_season, games, at_bats, hits, runs, doubles,
            triples, home_runs, grand_slams, rbis, bases_on_balls, ibbs,
            strikeouts, sacrifice_hits, sacrifice_flies, hit_by_pitch, gdp,
            batting_average, on_base_percentage, slugging_percentage,
            last_statistics_season);
-- tomap takes exactly these 25 fields, in this order.  (The comma after
-- strikeouts was missing, which made the statement unparsable.)
in_map = foreach batting generate id, tomap(name, id, player, starting_season,
            ending_season, games, at_bats, hits, runs, doubles, triples,
            home_runs, grand_slams, rbis, bases_on_balls, ibbs, strikeouts,
            sacrifice_hits, sacrifice_flies, hit_by_pitch, gdp, batting_average,
            on_base_percentage, slugging_percentage,
            last_statistics_season) as lifetime_batting;
almost = join players_with_teams by lifetime_batting_statistics, in_map by id;
final = foreach almost generate players_with_teams::name as name, team,
            position, lifetime_batting;
store final into 'baseball';
the License. You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 13 | */ 14 | 15 | import java.io.IOException; 16 | import java.util.Map; 17 | import java.util.HashMap; 18 | 19 | import org.apache.pig.EvalFunc; 20 | import org.apache.pig.backend.executionengine.ExecException; 21 | import org.apache.pig.data.DataType; 22 | import org.apache.pig.data.Tuple; 23 | import org.apache.pig.impl.logicalLayer.schema.Schema; 24 | 25 | public class tomap extends EvalFunc { 26 | 27 | private Tuple current = null; 28 | private Map output = null; 29 | 30 | @Override 31 | public Map exec(Tuple input) throws IOException { 32 | // Make sure the input isn't null and is of the rigth size. 
33 | if (input == null || input.size() != 25) return null; 34 | current = input; 35 | try { 36 | output = new HashMap(); 37 | 38 | // for each field we're interested in, only put it in 39 | // the output if it is non-null 40 | // ignore name, id, player, starting_season, ending_season 41 | ifNotNull("games", 5); 42 | ifNotNull("at_bats", 6); 43 | ifNotNull("hits", 7); 44 | ifNotNull("runs", 8); 45 | ifNotNull("doubles", 9); 46 | ifNotNull("triples", 10); 47 | ifNotNull("home_runs", 11); 48 | ifNotNull("grand_slams", 12); 49 | ifNotNull("rbis", 13); 50 | ifNotNull("base_on_balls", 14); 51 | ifNotNull("ibbs", 15); 52 | ifNotNull("strikeouts", 16); 53 | ifNotNull("sacrifice_hits", 17); 54 | ifNotNull("sacrifice_flies", 18); 55 | ifNotNull("hit_by_pitch", 19); 56 | ifNotNull("gdb", 20); 57 | ifNotNull("batting_average", 21); 58 | ifNotNull("on_base_percentage", 22); 59 | ifNotNull("slugging_percentage", 23); 60 | 61 | return output; 62 | } catch (ArrayIndexOutOfBoundsException e){ 63 | throw new RuntimeException("Function input must have even number of parameters"); 64 | } catch (Exception e) { 65 | throw new RuntimeException("Error while creating a map", e); 66 | } 67 | } 68 | 69 | @Override 70 | public Schema outputSchema(Schema input) { 71 | return new Schema(new Schema.FieldSchema(null, DataType.MAP)); 72 | } 73 | 74 | private void ifNotNull(String key, int pos) throws ExecException{ 75 | Object o = current.get(pos); 76 | if (o == null || "0".equals(o.toString()) || "0.0".equals(o.toString())) return; 77 | output.put(key, o); 78 | } 79 | 80 | } 81 | -------------------------------------------------------------------------------- /udfs/java/README: -------------------------------------------------------------------------------- 1 | You can build the java udfs by running 'ant' in this directory. 
All of the 2 | example scripts assume that the acme.jar is in the current working directory, 3 | so you will need to copy the acme.jar to the data directory before running 4 | the examples. 5 | -------------------------------------------------------------------------------- /udfs/java/build.xml: -------------------------------------------------------------------------------- 1 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | -------------------------------------------------------------------------------- /udfs/java/com/acme/financial/AnalyzeStock.java: -------------------------------------------------------------------------------- 1 | /* 2 | * This code is made available under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance 4 | * with the License. You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, 10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | * See the License for the specific language governing permissions and 12 | * limitations under the License. 
package com.acme.financial;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Random;

import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.schema.Schema;

/**
 * This UDF takes a bag of information about a stock and produces a
 * floating point score, low meaning sell and high meaning buy.
 *
 * This is a demonstration stub: the bag is iterated but not analyzed, and
 * the returned score is a random value in the range [0, 100).
 */
public class AnalyzeStock extends EvalFunc<Float> {

    Random r = new Random();

    /**
     * @param input tuple holding exactly one field, the bag of stock data
     * @return a score in [0, 100), or null when the input is null, does not
     * have exactly one field, or the bag itself is null
     */
    @Override
    public Float exec(Tuple input) throws IOException {
        // Make sure the input isn't null and is of the right size.
        if (input == null || input.size() != 1) return null;

        DataBag b = (DataBag)input.get(0);
        // A null bag has nothing to analyze; iterating it would throw an NPE.
        if (b == null) return null;

        for (Tuple t : b) {
            // Do some magic analysis...
        }
        // nextFloat() is in [0.0, 1.0), so the score is in [0, 100).
        return r.nextFloat() * 100;
    }

    @Override
    public Schema outputSchema(Schema input) {
        return new Schema(new Schema.FieldSchema(null, DataType.FLOAT));
    }

}
package com.acme.financial;

import java.io.FileInputStream;
import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.Random;

import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.apache.pig.impl.util.UDFContext;

/**
 * This UDF takes a bag of information about a stock and produces a
 * floating point score, low meaning sell and high meaning buy.
 *
 * Compared to AnalyzeStock, this version shows how to hand configuration to
 * exec() through UDFContext: outputSchema() copies /tmp/stock.properties
 * into the UDF's properties, and exec() fetches them on first use.  The
 * score itself is still a random value in [0, 100).
 */
public class AnalyzeStockV2 extends EvalFunc<Float> {

    Random r = new Random();
    Properties myProperties = null;

    /**
     * @param input tuple holding exactly one field, the bag of stock data
     * @return a score in [0, 100), or null when the input is null, does not
     * have exactly one field, or the bag itself is null
     */
    @Override
    public Float exec(Tuple input) throws IOException {
        if (myProperties == null) {
            // Retrieve our class specific properties from UDFContext
            myProperties =
                UDFContext.getUDFContext().getUDFProperties(this.getClass());
        }

        // Make sure the input isn't null and is of the right size.
        if (input == null || input.size() != 1) return null;

        DataBag b = (DataBag)input.get(0);
        // A null bag has nothing to analyze; iterating it would throw an NPE.
        if (b == null) return null;

        for (Tuple t : b) {
            // Do some magic analysis, using properties from myProperties to
            // decide how ...
        }
        return r.nextFloat() * 100;
    }

    /**
     * Loads /tmp/stock.properties into this UDF's UDFContext properties and
     * declares the output as a single float.
     *
     * @throws RuntimeException if the properties file cannot be read
     */
    @Override
    public Schema outputSchema(Schema input) {
        try {
            // Read our properties file, making sure the stream is closed
            // even when load() fails.
            Properties prop = new Properties();
            FileInputStream in = new FileInputStream("/tmp/stock.properties");
            try {
                prop.load(in);
            } finally {
                in.close();
            }
            // Get a properties object specific to this UDF class
            UDFContext context = UDFContext.getUDFContext();
            Properties udfProp = context.getUDFProperties(this.getClass());
            // Copy our properties into it.  There is no need to pass it
            // back to UDFContext
            for (Map.Entry<Object, Object> e : prop.entrySet()) {
                udfProp.setProperty((String)e.getKey(), (String)e.getValue());
            }
        } catch (Exception e) {
            throw new RuntimeException(e);
        }

        return new Schema(new Schema.FieldSchema(null, DataType.FLOAT));
    }

}
package com.acme.financial;

import java.io.IOException;
import java.util.Map;
import java.util.HashMap;

import org.apache.pig.EvalFunc;
import org.apache.pig.backend.executionengine.ExecException;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.logicalLayer.schema.Schema;

/**
 * Example UDF that takes constructor arguments.  The source and target
 * currencies are stored, but this demonstration does not consult them: every
 * amount is simply multiplied by a fixed rate of 1.5.
 */
public class CurrencyConverter extends EvalFunc<Float> {

    String from, to;

    /**
     * @param from currency being converted from
     * @param to currency being converted to
     */
    public CurrencyConverter(String from, String to) {
        super();
        this.from = from;
        this.to = to;
    }

    /**
     * @param input tuple holding exactly one Float field, the amount
     * @return the amount times 1.5, or null when the input is null, does not
     * have exactly one field, or the amount itself is null
     */
    @Override
    public Float exec(Tuple input) throws IOException {
        // Make sure the input isn't null and is of the right size.
        if (input == null || input.size() != 1) return null;

        // A null amount converts to null; unboxing it would throw an NPE.
        Object amount = input.get(0);
        if (amount == null) return null;

        // do some magic lookup in a table
        // ...
        return (Float)amount * 1.5f;
    }

    @Override
    public Schema outputSchema(Schema input) {
        return new Schema(new Schema.FieldSchema(null, DataType.FLOAT));
    }

}
package com.acme.io;

import java.io.ByteArrayInputStream;
import java.io.DataInputStream;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;

import org.codehaus.jackson.JsonFactory;
import org.codehaus.jackson.JsonParser;
import org.codehaus.jackson.JsonToken;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputFormat;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;

import org.apache.pig.Expression;
import org.apache.pig.LoadCaster;
import org.apache.pig.LoadFunc;
import org.apache.pig.LoadMetadata;
import org.apache.pig.LoadPushDown;
import org.apache.pig.ResourceSchema;
import org.apache.pig.ResourceSchema.ResourceFieldSchema;
import org.apache.pig.ResourceStatistics;
import org.apache.pig.LoadPushDown.OperatorSet;
import org.apache.pig.LoadPushDown.RequiredFieldList;
import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
import org.apache.pig.builtin.Utf8StorageConverter;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.util.UDFContext;
import org.apache.pig.impl.util.Utils;

/**
 * A loader for data stored using {@link JsonStorage}.  This is not a generic
 * JSON loader.  It depends on the schema being stored with the data when
 * conceivably you could write a loader that determines the schema from the
 * JSON.  It is also not well tested, for functionality or performance.  It
 * works for simple demonstrations.
 *
 * Also note that this loader and the associated storage function require a
 * version of Pig that has PIG-2112 to work with complex data.
 */
public class JsonLoader extends LoadFunc implements LoadMetadata {

    /** UDFContext property under which the schema string is passed along. */
    private static final String SCHEMA_PROPERTY = "pig.jsonloader.schema";

    /** Name, relative to the load location, of the file holding the schema. */
    private static final String SCHEMA_FILE = "/_schema";

    protected RecordReader reader = null;
    protected ResourceFieldSchema[] fields = null;
    protected final Log log = LogFactory.getLog(getClass());

    private String udfcSignature = null;
    private JsonFactory jsonFactory = null;
    private TupleFactory tupleFactory = TupleFactory.getInstance();
    private BagFactory bagFactory = BagFactory.getInstance();

    /**
     * Communicate to the loader the location of the object(s) being loaded.
     * Only sets the input path on the job, so repeated calls have no
     * inconsistent side effects.
     *
     * @param location Location as returned by
     * {@link LoadFunc#relativeToAbsolutePath(String, Path)}
     * @param job the {@link Job} object
     * @throws IOException if the location is not valid.
     */
    public void setLocation(String location, Job job) throws IOException {
        // Tell our input format where we will be reading from
        FileInputFormat.setInputPaths(job, location);
    }

    /**
     * @return the InputFormat associated with this loader.
     * @throws IOException if there is an exception during InputFormat
     * construction
     */
    @SuppressWarnings("unchecked")
    public InputFormat getInputFormat() throws IOException {
        // We will use TextInputFormat, the default Hadoop input format for
        // text.  It has a LongWritable key that we will ignore, and the value
        // is a Text (a string writable) that the JSON data is in.
        return new TextInputFormat();
    }

    /**
     * @return the {@link LoadCaster} associated with this loader; always
     * null here, indicating that casts from byte array are not supported.
     * @throws IOException if there is an exception during LoadCaster
     */
    public LoadCaster getLoadCaster() throws IOException {
        // We do not expect to do casting of byte arrays, because we will be
        // returning typed data.
        return null;
    }

    /**
     * Initializes the loader for reading: keeps the record reader and
     * restores the schema that {@link #getSchema(String, Job)} stored in the
     * UDFContext.
     *
     * @param reader {@link RecordReader} to be used by this instance of
     * the LoadFunc
     * @param split The input {@link PigSplit} to process
     * @throws IOException if no schema is found in the UDFContext
     */
    @SuppressWarnings("unchecked")
    public void prepareToRead(RecordReader reader, PigSplit split)
    throws IOException {
        this.reader = reader;

        // Get the schema string from the UDFContext object.
        UDFContext udfc = UDFContext.getUDFContext();
        Properties p =
            udfc.getUDFProperties(this.getClass(), new String[]{udfcSignature});
        String strSchema = p.getProperty(SCHEMA_PROPERTY);
        if (strSchema == null) {
            throw new IOException("Could not find schema in UDF context");
        }

        // Parse the schema from the string stored in the properties object.
        ResourceSchema schema =
            new ResourceSchema(Utils.getSchemaFromString(strSchema));
        fields = schema.getFields();

        jsonFactory = new JsonFactory();
    }

    /**
     * Retrieves the next tuple to be processed.  A new tuple object is
     * created for every call.  If a record cannot be parsed as expected, a
     * warning is logged and a tuple with null fields is returned rather than
     * an exception thrown, so that a few mangled lines don't fail the job.
     *
     * @return the next tuple to be processed or null if there are no more
     * tuples to be processed.
     * @throws IOException if there is an exception while retrieving the next
     * tuple
     */
    public Tuple getNext() throws IOException {
        Text val = null;
        try {
            // Read the next key value pair from the record reader.  If it's
            // finished, return null
            if (!reader.nextKeyValue()) return null;

            // Get the current value.  We don't use the key.
            val = (Text)reader.getCurrentValue();
        } catch (InterruptedException ie) {
            throw new IOException(ie);
        }

        // Create a parser specific for this input line.  This may not be the
        // most efficient approach.  Text.getBytes() exposes the backing
        // array, of which only the first getLength() bytes are valid, so the
        // stream must be bounded explicitly.
        ByteArrayInputStream bais =
            new ByteArrayInputStream(val.getBytes(), 0, val.getLength());
        JsonParser p = jsonFactory.createJsonParser(bais);
        try {
            // Create the tuple we will be returning.  We create it with the
            // right number of fields, as the Tuple object is optimized for
            // this case.
            Tuple t = tupleFactory.newTuple(fields.length);

            // Read the start object marker.
            if (p.nextToken() != JsonToken.START_OBJECT) {
                log.warn("Bad record, could not find start of record " +
                    val.toString());
                return t;
            }

            // Read each field in the record
            for (int i = 0; i < fields.length; i++) {
                t.set(i, readField(p, fields[i], i));
            }

            if (p.nextToken() != JsonToken.END_OBJECT) {
                log.warn("Bad record, could not find end of record " +
                    val.toString());
            }
            return t;
        } finally {
            // Release the parser on every path, including early returns.
            p.close();
        }
    }

    /**
     * Reads one "name": value pair from the parser and converts the value
     * according to the expected field type.
     *
     * @param p parser positioned just before the field name
     * @param field schema of the field to read
     * @param fieldnum position of the field, used in warnings only
     * @return the field value; null for a JSON null or when the input does
     * not have the expected shape
     * @throws IOException on parse errors or an unsupported type in the schema
     */
    private Object readField(JsonParser p,
                             ResourceFieldSchema field,
                             int fieldnum) throws IOException {
        // Read the field name
        JsonToken nameTok = p.nextToken();
        if (nameTok == null) {
            log.warn("Early termination of record, expected " + fields.length
                + " fields but found " + fieldnum);
            return null;
        }

        // Read the value.  The null check has to look at this token, not at
        // the field name token read above.
        JsonToken valueTok = p.nextToken();
        if (valueTok == null) {
            log.warn("Early termination of record, expected " + fields.length
                + " fields but found " + fieldnum);
            return null;
        }
        if (valueTok == JsonToken.VALUE_NULL) return null;

        // Read based on our expected type
        switch (field.getType()) {
        case DataType.INTEGER:
            return p.getValueAsInt();

        case DataType.LONG:
            return p.getValueAsLong();

        case DataType.FLOAT:
            return (float)p.getValueAsDouble();

        case DataType.DOUBLE:
            return p.getValueAsDouble();

        case DataType.BYTEARRAY:
            byte[] b = p.getBinaryValue();
            // Use the DBA constructor that copies the bytes so that we own
            // the memory
            return new DataByteArray(b, 0, b.length);

        case DataType.CHARARRAY:
            return p.getText();

        case DataType.MAP:
            // Should be a start of the map object
            if (valueTok != JsonToken.START_OBJECT) {
                log.warn("Bad map field, could not find start of object, field "
                    + fieldnum);
                return null;
            }
            return readMapBody(p);

        case DataType.TUPLE:
            if (valueTok != JsonToken.START_OBJECT) {
                log.warn("Bad tuple field, could not find start of object, "
                    + "field " + fieldnum);
                return null;
            }
            Tuple t = readTupleBody(p, field.getSchema().getFields());
            if (p.nextToken() != JsonToken.END_OBJECT) {
                log.warn("Bad tuple field, could not find end of object, "
                    + "field " + fieldnum);
                return null;
            }
            return t;

        case DataType.BAG:
            if (valueTok != JsonToken.START_ARRAY) {
                log.warn("Bad bag field, could not find start of array, "
                    + "field " + fieldnum);
                return null;
            }
            return readBagBody(p, field, fieldnum);

        default:
            throw new IOException("Unknown type in input schema: " +
                field.getType());
        }
    }

    /**
     * Reads map entries up to the closing END_OBJECT; values are kept as
     * their text form.  The loop sees both the name token and the value
     * token of each entry; the value token is seen last, so its text is what
     * ends up stored under the key.
     */
    private Map<String, String> readMapBody(JsonParser p) throws IOException {
        Map<String, String> m = new HashMap<String, String>();
        while (p.nextToken() != JsonToken.END_OBJECT) {
            String k = p.getCurrentName();
            String v = p.getText();
            m.put(k, v);
        }
        return m;
    }

    /**
     * Reads the fields of a tuple whose START_OBJECT has already been
     * consumed.  The closing END_OBJECT is left for the caller to check.
     */
    private Tuple readTupleBody(JsonParser p, ResourceFieldSchema[] fs)
    throws IOException {
        Tuple t = tupleFactory.newTuple(fs.length);
        for (int j = 0; j < fs.length; j++) {
            t.set(j, readField(p, fs[j], j));
        }
        return t;
    }

    /**
     * Reads the tuples of a bag whose START_ARRAY has already been consumed,
     * up to and including the closing END_ARRAY.
     *
     * @return the bag, or null if an element is not a well-formed object
     */
    private DataBag readBagBody(JsonParser p, ResourceFieldSchema field,
                                int fieldnum) throws IOException {
        // Drill down the next level to the tuple's schema.
        ResourceFieldSchema[] bagFields = field.getSchema().getFields();
        ResourceFieldSchema[] fs = bagFields[0].getSchema().getFields();

        DataBag bag = bagFactory.newDefaultBag();

        JsonToken innerTok;
        while ((innerTok = p.nextToken()) != JsonToken.END_ARRAY) {
            if (innerTok != JsonToken.START_OBJECT) {
                log.warn("Bad bag tuple field, could not find start of "
                    + "object, field " + fieldnum);
                return null;
            }

            Tuple t = readTupleBody(p, fs);

            if (p.nextToken() != JsonToken.END_OBJECT) {
                log.warn("Bad bag tuple field, could not find end of "
                    + "object, field " + fieldnum);
                return null;
            }
            bag.add(t);
        }
        return bag;
    }

    //------------------------------------------------------------------------

    /**
     * Receives the unique signature Pig assigns to this loader instance.  It
     * is used as the key for this loader's UDFContext properties, so that the
     * schema stored by {@link #getSchema(String, Job)} can be found again in
     * {@link #prepareToRead(RecordReader, PigSplit)}.
     *
     * @param signature a unique signature to identify this LoadFunc
     */
    public void setUDFContextSignature(String signature) {
        udfcSignature = signature;
    }

    /**
     * Get a schema for the data to be loaded.  The schema is read from the
     * first line of the "_schema" file under the load location and is also
     * stored in the UDFContext for later use.
     *
     * @param location Location as returned by
     * {@link LoadFunc#relativeToAbsolutePath(String, org.apache.hadoop.fs.Path)}
     * @param job The {@link Job} object - this should be used only to obtain
     * cluster properties through {@link Job#getConfiguration()} and not to
     * set/query any runtime job information.
     * @return schema for the data to be loaded.
     * @throws IOException if the schema file cannot be read, is empty, or
     * cannot be parsed
     */
    public ResourceSchema getSchema(String location, Job job)
    throws IOException {
        // Open the schema file and read the schema
        // Get an HDFS handle.
        FileSystem fs = FileSystem.get(job.getConfiguration());
        DataInputStream in = fs.open(new Path(location + SCHEMA_FILE));
        String line;
        try {
            line = in.readLine();
        } finally {
            // Close the stream even when the read fails.
            in.close();
        }
        if (line == null) {
            // An empty schema file gives us nothing to parse.
            throw new IOException("Unable to parse schema found in file " +
                location + "/_schema");
        }

        // Parse the schema
        ResourceSchema s =
            new ResourceSchema(Utils.getSchemaFromString(line));

        // Now that we have determined the schema, store it in our
        // UDFContext properties object so we have it when we need it on the
        // backend
        UDFContext udfc = UDFContext.getUDFContext();
        Properties p =
            udfc.getUDFProperties(this.getClass(), new String[]{udfcSignature});
        p.setProperty(SCHEMA_PROPERTY, line);

        return s;
    }

    /**
     * Get statistics about the data to be loaded.
     *
     * @return always null; this loader provides no statistics
     * @throws IOException if an exception occurs while retrieving statistics
     */
    public ResourceStatistics getStatistics(String location, Job job)
    throws IOException {
        // We don't implement this one.
        return null;
    }

    /**
     * Find what columns are partition keys for this input.
     *
     * @return always null; this loader has no partition keys
     * @throws IOException if an exception occurs while retrieving partition keys
     */
    public String[] getPartitionKeys(String location, Job job)
    throws IOException {
        // We don't have partitions
        return null;
    }

    /**
     * Set the filter for partitioning.  Does nothing, because this loader
     * reports no partition keys.
     *
     * @param partitionFilter that describes filter for partitioning
     * @throws IOException if the filter is not compatible with the storage
     * mechanism or contains non-partition fields.
     */
    public void setPartitionFilter(Expression partitionFilter)
    throws IOException {
        // We don't have partitions
    }
}
13 | */ 14 | package com.acme.io; 15 | 16 | import java.io.ByteArrayOutputStream; 17 | import java.io.DataOutputStream; 18 | import java.io.IOException; 19 | import java.util.Map; 20 | import java.util.Properties; 21 | 22 | import org.codehaus.jackson.JsonEncoding; 23 | import org.codehaus.jackson.JsonFactory; 24 | import org.codehaus.jackson.JsonGenerator; 25 | 26 | import org.apache.hadoop.fs.FileSystem; 27 | import org.apache.hadoop.fs.Path; 28 | import org.apache.hadoop.io.LongWritable; 29 | import org.apache.hadoop.io.Text; 30 | import org.apache.hadoop.mapreduce.Job; 31 | import org.apache.hadoop.mapreduce.OutputFormat; 32 | import org.apache.hadoop.mapreduce.RecordWriter; 33 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 34 | import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; 35 | 36 | import org.apache.pig.ResourceSchema; 37 | import org.apache.pig.ResourceSchema.ResourceFieldSchema; 38 | import org.apache.pig.ResourceStatistics; 39 | import org.apache.pig.StoreMetadata; 40 | import org.apache.pig.StoreFunc; 41 | import org.apache.pig.data.DataByteArray; 42 | import org.apache.pig.data.DataType; 43 | import org.apache.pig.data.Tuple; 44 | import org.apache.pig.data.DataBag; 45 | import org.apache.pig.impl.logicalLayer.schema.Schema; 46 | import org.apache.pig.impl.util.UDFContext; 47 | import org.apache.pig.impl.util.Utils; 48 | 49 | /** 50 | * A JSON Pig store function. Each Pig tuple is stored on one line (as one 51 | * value for TextOutputFormat) so that it can be read easily using 52 | * TextInputFormat. Pig tuples are mapped to JSON objects. Pig bags are 53 | * mapped to JSON arrays. Pig maps are also mapped to JSON objects. Maps are 54 | * assumed to be string to string. A schema is stored in a side file to deal 55 | * with mapping between JSON and Pig types. This class is not well tested for 56 | * functionality or performance. 
57 | * 58 | * Also note that this store function and the associated loader require a 59 | * version of Pig that has PIG-2112 to work with complex data. 60 | */ 61 | public class JsonStorage extends StoreFunc implements StoreMetadata { 62 | 63 | protected RecordWriter writer = null; 64 | protected ResourceFieldSchema[] fields = null; 65 | 66 | private String udfcSignature = null; 67 | private JsonFactory jsonFactory = null; 68 | 69 | // Default size for the byte buffer, should fit most tuples. 70 | private static final int BUF_SIZE = 4 * 1024; 71 | 72 | /* 73 | * Methods called on the front end 74 | */ 75 | 76 | /** 77 | * Return the OutputFormat associated with StoreFunc. This will be called 78 | * on the front end during planning and on the backend during 79 | * execution. 80 | * @return the {@link OutputFormat} associated with StoreFunc 81 | * @throws IOException if an exception occurs while constructing the 82 | * OutputFormat 83 | * 84 | */ 85 | @Override 86 | public OutputFormat getOutputFormat() throws IOException { 87 | // We will use TextOutputFormat, the default Hadoop output format for 88 | // text. The key is unused and the value will be a 89 | // Text (a string writable type) that we store our JSON data in. 90 | return new TextOutputFormat(); 91 | } 92 | 93 | /** 94 | * Communicate to the storer the location where the data needs to be 95 | * stored. The location string passed to the {@link StoreFunc} here is the 96 | * return value of StoreFunc#relToAbsPathForStoreLocation(String, Path) 97 | * This method will be called in the frontend and backend multiple times. 98 | * Implementations should bear in mind that this method is called multiple 99 | * times and should ensure there are no inconsistent side effects due to 100 | * the multiple calls. checkSchema(ResourceSchema) will be called before 101 | * any call to setStoreLocation(String, Job). 
102 | * 103 | * @param location Location returned by 104 | * relToAbsPathForStoreLocation(String, Path) 105 | * @param job The Job object 106 | * @throws IOException if the location is not valid. 107 | */ 108 | 109 | @Override 110 | public void setStoreLocation(String location, Job job) throws IOException { 111 | // FileOutputFormat has a utility method for setting up the output 112 | // location. 113 | FileOutputFormat.setOutputPath(job, new Path(location)); 114 | } 115 | 116 | /** 117 | * This method will be called by Pig both in the front end and back end to 118 | * pass a unique signature to the {@link StoreFunc} which it can use to store 119 | * information in the {@link UDFContext} which it needs to store between 120 | * various method invocations in the front end and back end. This method 121 | * will be called before other methods in {@link StoreFunc}. This is necessary 122 | * because in a Pig Latin script with multiple stores, the different 123 | * instances of store functions need to be able to find their (and only their) 124 | * data in the UDFContext object. The default implementation is a no-op. 125 | * @param signature a unique signature to identify this StoreFunc 126 | */ 127 | @Override 128 | public void setStoreFuncUDFContextSignature(String signature) { 129 | // store the signature so we can use it later 130 | udfcSignature = signature; 131 | } 132 | 133 | /** 134 | * Set the schema for data to be stored. This will be called on the 135 | * front end during planning if the store is associated with a schema. 136 | * A Store function should implement this function to 137 | * check that a given schema is acceptable to it. For example, it 138 | * can check that the correct partition keys are included; 139 | * a storage function to be written directly to an OutputFormat can 140 | * make sure the schema will translate in a well defined way. Default implementation 141 | * is a no-op. 
142 | * @param s to be checked 143 | * @throws IOException if this schema is not acceptable. It should include 144 | * a detailed error message indicating what is wrong with the schema. 145 | */ 146 | @Override 147 | public void checkSchema(ResourceSchema s) throws IOException { 148 | // We won't really check the schema here, we'll store it in our 149 | // UDFContext properties object so we have it when we need it on the 150 | // backend 151 | 152 | UDFContext udfc = UDFContext.getUDFContext(); 153 | Properties p = 154 | udfc.getUDFProperties(this.getClass(), new String[]{udfcSignature}); 155 | p.setProperty("pig.jsonstorage.schema", s.toString()); 156 | } 157 | 158 | 159 | /* 160 | * Methods called on the back end 161 | */ 162 | 163 | /** 164 | * Initialize StoreFunc to write data. This will be called during 165 | * execution on the backend before the call to putNext. 166 | * @param writer RecordWriter to use. 167 | * @throws IOException if an exception occurs during initialization 168 | */ 169 | @Override 170 | public void prepareToWrite(RecordWriter writer) throws IOException { 171 | // Store the record writer reference so we can use it when it's time 172 | // to write tuples 173 | this.writer = writer; 174 | 175 | // Get the schema string from the UDFContext object. 176 | UDFContext udfc = UDFContext.getUDFContext(); 177 | Properties p = 178 | udfc.getUDFProperties(this.getClass(), new String[]{udfcSignature}); 179 | String strSchema = p.getProperty("pig.jsonstorage.schema"); 180 | if (strSchema == null) { 181 | throw new IOException("Could not find schema in UDF context"); 182 | } 183 | 184 | // Parse the schema from the string stored in the properties object. 
185 | ResourceSchema schema = 186 | new ResourceSchema(Utils.getSchemaFromString(strSchema)); 187 | fields = schema.getFields(); 188 | 189 | // Build a Json factory 190 | jsonFactory = new JsonFactory(); 191 | jsonFactory.configure( 192 | JsonGenerator.Feature.WRITE_NUMBERS_AS_STRINGS, false); 193 | } 194 | 195 | /** 196 | * Write a tuple to the data store. 197 | * @param t the tuple to store. 198 | * @throws IOException if an exception occurs during the write 199 | */ 200 | public void putNext(Tuple t) throws IOException { 201 | // Build a ByteArrayOutputStream to write the JSON into 202 | ByteArrayOutputStream baos = new ByteArrayOutputStream(BUF_SIZE); 203 | // Build the generator 204 | JsonGenerator json = 205 | jsonFactory.createJsonGenerator(baos, JsonEncoding.UTF8); 206 | 207 | // Write the beginning of the top level tuple object 208 | json.writeStartObject(); 209 | for (int i = 0; i < fields.length; i++) { 210 | writeField(json, fields[i], t.get(i)); 211 | } 212 | json.writeEndObject(); 213 | json.close(); 214 | 215 | // Hand a null key and our string to Hadoop 216 | try { 217 | writer.write(null, new Text(baos.toByteArray())); 218 | } catch (InterruptedException ie) { 219 | throw new IOException(ie); 220 | } 221 | } 222 | 223 | private void writeField(JsonGenerator json, 224 | ResourceFieldSchema field, 225 | Object d) throws IOException { 226 | 227 | // If the field is missing or the value is null, write a null 228 | if (d == null) { 229 | json.writeNullField(field.getName()); 230 | return; 231 | } 232 | 233 | // Based on the field's type, write it out 234 | switch (field.getType()) { 235 | case DataType.INTEGER: 236 | json.writeNumberField(field.getName(), (Integer)d); 237 | return; 238 | 239 | case DataType.LONG: 240 | json.writeNumberField(field.getName(), (Long)d); 241 | return; 242 | 243 | case DataType.FLOAT: 244 | json.writeNumberField(field.getName(), (Float)d); 245 | return; 246 | 247 | case DataType.DOUBLE: 248 | 
json.writeNumberField(field.getName(), (Double)d); 249 | return; 250 | 251 | case DataType.BYTEARRAY: 252 | json.writeBinaryField(field.getName(), ((DataByteArray)d).get()); 253 | return; 254 | 255 | case DataType.CHARARRAY: 256 | json.writeStringField(field.getName(), (String)d); 257 | return; 258 | 259 | case DataType.MAP: 260 | json.writeFieldName(field.getName()); 261 | json.writeStartObject(); 262 | for (Map.Entry e : ((Map)d).entrySet()) { 263 | json.writeStringField(e.getKey(), e.getValue().toString()); 264 | } 265 | json.writeEndObject(); 266 | return; 267 | 268 | case DataType.TUPLE: 269 | json.writeFieldName(field.getName()); 270 | json.writeStartObject(); 271 | 272 | ResourceSchema s = field.getSchema(); 273 | if (s == null) { 274 | throw new IOException("Schemas must be fully specified to use " 275 | + "this storage function. No schema found for field " + 276 | field.getName()); 277 | } 278 | ResourceFieldSchema[] fs = s.getFields(); 279 | 280 | for (int j = 0; j < fs.length; j++) { 281 | writeField(json, fs[j], ((Tuple)d).get(j)); 282 | } 283 | json.writeEndObject(); 284 | return; 285 | 286 | case DataType.BAG: 287 | json.writeFieldName(field.getName()); 288 | json.writeStartArray(); 289 | s = field.getSchema(); 290 | if (s == null) { 291 | throw new IOException("Schemas must be fully specified to use " 292 | + "this storage function. No schema found for field " + 293 | field.getName()); 294 | } 295 | fs = s.getFields(); 296 | if (fs.length != 1 || fs[0].getType() != DataType.TUPLE) { 297 | throw new IOException("Found a bag without a tuple " 298 | + "inside!"); 299 | } 300 | // Drill down the next level to the tuple's schema. 301 | s = fs[0].getSchema(); 302 | if (s == null) { 303 | throw new IOException("Schemas must be fully specified to use " 304 | + "this storage function. 
No schema found for field " + 305 | field.getName()); 306 | } 307 | fs = s.getFields(); 308 | for (Tuple t : (DataBag)d) { 309 | json.writeStartObject(); 310 | for (int j = 0; j < fs.length; j++) { 311 | writeField(json, fs[j], t.get(j)); 312 | } 313 | json.writeEndObject(); 314 | } 315 | json.writeEndArray(); 316 | return; 317 | } 318 | } 319 | 320 | /** 321 | * Store statistics about the data being written. 322 | * @param stats statistics to be recorded 323 | * @param location Location as returned by 324 | * {@link LoadFunc#relativeToAbsolutePath(String, org.apache.hadoop.fs.Path)} 325 | * @param job The {@link Job} object - this should be used only to obtain 326 | * cluster properties through {@link Job#getConfiguration()} and not to 327 | * set/query any runtime job information. 328 | * @throws IOException 329 | */ 330 | public void storeStatistics(ResourceStatistics stats, 331 | String location, 332 | Job job) throws IOException { 333 | // We don't implement this method 334 | } 335 | 336 | /** 337 | * Store schema of the data being written 338 | * @param schema Schema to be recorded 339 | * @param location Location as returned by 340 | * {@link LoadFunc#relativeToAbsolutePath(String, org.apache.hadoop.fs.Path)} 341 | * @param job The {@link Job} object - this should be used only to obtain 342 | * cluster properties through {@link Job#getConfiguration()} and not to 343 | * set/query any runtime job information. 344 | * @throws IOException 345 | */ 346 | public void storeSchema(ResourceSchema schema, String location, Job job) 347 | throws IOException { 348 | // Store the schema in a side file in the same directory. MapReduce 349 | // does not include files starting with "_" when reading data for a job. 
350 | FileSystem fs = FileSystem.get(job.getConfiguration()); 351 | DataOutputStream out = fs.create(new Path(location + "/_schema")); 352 | out.writeBytes(schema.toString()); 353 | out.writeByte('\n'); 354 | out.close(); 355 | } 356 | 357 | } 358 | -------------------------------------------------------------------------------- /udfs/java/com/acme/marketing/CloseEnough.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is made available under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations 12 | * under the License. 13 | */ 14 | package com.acme.marketing; 15 | 16 | import java.io.DataInputStream; 17 | import java.io.IOException; 18 | import java.util.Random; 19 | 20 | import org.apache.pig.FilterFunc; 21 | import org.apache.pig.data.Tuple; 22 | 23 | /** 24 | * A filter UDF that determines if two zip codes are within a given distance. 25 | */ 26 | public class CloseEnough extends FilterFunc { 27 | 28 | int distance; 29 | Random r = new Random(); 30 | 31 | /* 32 | * @param miles - Distance in miles that two zip codes can be apart and 33 | * still be considered close enough. 
34 | */ 35 | public CloseEnough(String miles) { 36 | // UDFs can only takes strings; convert to int here 37 | distance = Integer.valueOf(miles); 38 | } 39 | 40 | public Boolean exec(Tuple input) throws IOException { 41 | // expect two strings 42 | String zip1 = (String)input.get(0); 43 | String zip2 = (String)input.get(1); 44 | // do some lookup on zip code tables 45 | return r.nextBoolean(); 46 | } 47 | } 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /udfs/java/com/acme/marketing/MetroResolver.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is made available under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations 12 | * under the License. 13 | */ 14 | package com.acme.marketing; 15 | 16 | import java.io.DataInputStream; 17 | import java.io.IOException; 18 | import java.util.HashMap; 19 | import java.util.Map; 20 | 21 | import org.apache.hadoop.fs.FileSystem; 22 | import org.apache.hadoop.fs.Path; 23 | 24 | import org.apache.pig.EvalFunc; 25 | import org.apache.pig.data.Tuple; 26 | import org.apache.pig.impl.util.UDFContext; 27 | 28 | /** 29 | * A lookup UDF that maps cities to metropolatin areas. 30 | */ 31 | public class MetroResolver extends EvalFunc { 32 | 33 | String lookupFile; 34 | HashMap lookup = null; 35 | 36 | /* 37 | * @param file - File that contains a lookup table mapping cities to metro 38 | * areas. 
The file must be located on the file system where this UDF will 39 | * run. 40 | */ 41 | public MetroResolver(String file) { 42 | // Just store the filename, don't load the lookup table since we may 43 | // be on the front end or the back end. 44 | lookupFile = file; 45 | } 46 | 47 | public String exec(Tuple input) throws IOException { 48 | if (lookup == null) { 49 | // We have not been initialized yet, so do it now. 50 | lookup = new HashMap(); 51 | 52 | // Get an instance of the HDFS FileSystem class so 53 | // we can read a file from HDFS. We need a copy of 54 | // our configuration to do that. 55 | // Read the configuration from the UDFContext 56 | FileSystem fs = 57 | FileSystem.get(UDFContext.getUDFContext().getJobConf()); 58 | DataInputStream in = fs.open(new Path(lookupFile)); 59 | String line; 60 | while ((line = in.readLine()) != null) { 61 | String[] toks = new String[2]; 62 | toks = line.split(":", 2); 63 | lookup.put(toks[0], toks[1]); 64 | } 65 | in.close(); 66 | } 67 | return lookup.get((String)input.get(0)); 68 | } 69 | } 70 | 71 | 72 | 73 | -------------------------------------------------------------------------------- /udfs/java/com/acme/marketing/MetroResolverV2.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is made available under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations 12 | * under the License. 
13 | */ 14 | package com.acme.marketing; 15 | 16 | import java.io.BufferedReader; 17 | import java.io.FileReader; 18 | import java.io.IOException; 19 | import java.util.ArrayList; 20 | import java.util.HashMap; 21 | import java.util.List; 22 | import java.util.Map; 23 | 24 | import org.apache.pig.EvalFunc; 25 | import org.apache.pig.data.Tuple; 26 | 27 | /** 28 | * A lookup UDF that maps cities to metropolatin areas. This time using the 29 | * Distributed Cache. 30 | */ 31 | public class MetroResolverV2 extends EvalFunc { 32 | 33 | String lookupFile; 34 | HashMap lookup = null; 35 | 36 | /* 37 | * @param file - File that contains a lookup table mapping cities to metro 38 | * areas. The file must be located on the file system where this UDF will 39 | * run. 40 | */ 41 | public MetroResolverV2(String file) { 42 | // Just store the filename, don't load the lookup table since we may 43 | // be on the front end or the back end. 44 | lookupFile = file; 45 | } 46 | 47 | public String exec(Tuple input) throws IOException { 48 | if (lookup == null) { 49 | // We have not been initialized yet, so do it now. 50 | lookup = new HashMap(); 51 | 52 | // Open the file as a local file 53 | FileReader fr = new FileReader("./mrv2_lookup"); 54 | BufferedReader d = new BufferedReader(fr); 55 | String line; 56 | while ((line = d.readLine()) != null) { 57 | String[] toks = new String[2]; 58 | toks = line.split(":", 2); 59 | lookup.put(toks[0], toks[1]); 60 | } 61 | fr.close(); 62 | } 63 | return lookup.get((String)input.get(0)); 64 | } 65 | 66 | public List getCacheFiles() { 67 | List list = new ArrayList(1); 68 | // We were passed the name of the file on HDFS. Append a 69 | // name for the file on the task node. 
70 | list.add(lookupFile + "#mrv2_lookup"); 71 | return list; 72 | } 73 | } 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /udfs/java/com/acme/math/Pow.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is made available under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations 12 | * under the License. 13 | */ 14 | package com.acme.math; 15 | 16 | import java.io.IOException; 17 | 18 | import org.apache.pig.EvalFunc; 19 | import org.apache.pig.PigWarning; 20 | import org.apache.pig.data.DataType; 21 | import org.apache.pig.data.Tuple; 22 | import org.apache.pig.impl.logicalLayer.schema.Schema; 23 | import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; 24 | 25 | /** 26 | * A simple UDF that takes a value and raises it to the power of a second 27 | * value. It can be used in a Pig Latin script as Pow(x, y), where x and y 28 | * are both expected to be ints. 29 | */ 30 | public class Pow extends EvalFunc { 31 | 32 | public Long exec(Tuple input) throws IOException { 33 | try { 34 | /* Rather than give you explicit arguments UDFs are always handed 35 | * a tuple. The UDF must know the arguments it expects and pull 36 | * them out of the tuple. These next two lines get the first and 37 | * second fields out of the input tuple that was handed in. Since 38 | * Tuple.get returns Objects, we must cast them to Integers. 
If 39 | * the case fails an exception will be thrown. 40 | */ 41 | int base = (Integer)input.get(0); 42 | int exponent = (Integer)input.get(1); 43 | long result = 1; 44 | 45 | /* Probably not the most efficient method...*/ 46 | for (int i = 0; i < exponent; i++) { 47 | long preresult = result; 48 | result *= base; 49 | if (preresult > result) { 50 | // We overflowed. Give a warning, but do not throw an 51 | // exception. 52 | warn("Overflow!", PigWarning.TOO_LARGE_FOR_INT); 53 | // Returning null will indicate to Pig that we failed but 54 | // we want to continue execution 55 | return null; 56 | } 57 | } 58 | return result; 59 | } catch (Exception e) { 60 | // Throwing an exception will cause the task to fail. 61 | throw new IOException("Something bad happened!", e); 62 | } 63 | } 64 | 65 | public Schema outputSchema(Schema input) { 66 | // Check that we were passed two fields 67 | if (input.size() != 2) { 68 | throw new RuntimeException( 69 | "Expected (int, int), input does not have 2 fields"); 70 | } 71 | 72 | try { 73 | // Get the types for both columns and check them. If they are 74 | // wrong figure out what types were passed and give a good error 75 | // message. 
76 | if (input.getField(0).type != DataType.INTEGER || 77 | input.getField(1).type != DataType.INTEGER) { 78 | String msg = "Expected input (int, int), received schema ("; 79 | msg += DataType.findTypeName(input.getField(0).type); 80 | msg += ", "; 81 | msg += DataType.findTypeName(input.getField(1).type); 82 | msg += ")"; 83 | throw new RuntimeException(msg); 84 | } 85 | } catch (Exception e) { 86 | throw new RuntimeException(e); 87 | } 88 | 89 | // Construct our output schema which is one field, that is a long 90 | return new Schema(new FieldSchema(null, DataType.LONG)); 91 | } 92 | } 93 | -------------------------------------------------------------------------------- /udfs/java/com/acme/math/PowV2.java: -------------------------------------------------------------------------------- 1 | /** 2 | * This code is made available under the Apache License, Version 2.0 (the 3 | * "License"); you may not use this file except in compliance with the License. 4 | * You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software 9 | * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 10 | * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 11 | * License for the specific language governing permissions and limitations 12 | * under the License. 
13 | */ 14 | package com.acme.math; 15 | 16 | import java.io.IOException; 17 | import java.util.ArrayList; 18 | import java.util.List; 19 | 20 | import org.apache.pig.EvalFunc; 21 | import org.apache.pig.FuncSpec; 22 | import org.apache.pig.PigWarning; 23 | import org.apache.pig.data.DataType; 24 | import org.apache.pig.data.Tuple; 25 | import org.apache.pig.impl.logicalLayer.FrontendException; 26 | import org.apache.pig.impl.logicalLayer.schema.Schema; 27 | import org.apache.pig.impl.logicalLayer.schema.Schema.FieldSchema; 28 | 29 | /** 30 | * A simple UDF that takes a value and raises it to the power of a second 31 | * value. It can be used in a Pig Latin script as Pow(x, y), where x and y 32 | * are both expected to be ints. 33 | */ 34 | public class PowV2 extends EvalFunc { 35 | 36 | public Double exec(Tuple input) throws IOException { 37 | System.out.println("in pow"); 38 | try { 39 | return Math.pow((Double)input.get(0), (Double)input.get(1)); 40 | } catch (Exception e) { 41 | throw new IOException("Something bad happened!", e); 42 | } 43 | } 44 | 45 | public List getArgToFuncMapping() throws FrontendException { 46 | List funcList = new ArrayList(); 47 | Schema s = new Schema(); 48 | s.add(new Schema.FieldSchema(null, DataType.DOUBLE)); 49 | s.add(new Schema.FieldSchema(null, DataType.DOUBLE)); 50 | funcList.add(new FuncSpec(this.getClass().getName(), s)); 51 | s = new Schema(); 52 | s.add(new Schema.FieldSchema(null, DataType.LONG)); 53 | s.add(new Schema.FieldSchema(null, DataType.LONG)); 54 | funcList.add(new FuncSpec(LongPow.class.getName(), s)); 55 | return funcList; 56 | } 57 | 58 | 59 | public static class LongPow extends EvalFunc { 60 | 61 | public Long exec(Tuple input) throws IOException { 62 | System.out.println("in longpow"); 63 | try { 64 | long base = (Long)input.get(0); 65 | long exponent = (Long)input.get(1); 66 | long result = 1; 67 | 68 | for (long i = 0; i < exponent; i++) { 69 | long preresult = result; 70 | result *= base; 71 | if 
(preresult > result) { 72 | warn("Overflow!", PigWarning.TOO_LARGE_FOR_INT); 73 | return null; 74 | } 75 | } 76 | return result; 77 | } catch (Exception e) { 78 | throw new IOException("Something bad happened!", e); 79 | } 80 | } 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /udfs/python/production.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This code is made available under the Apache License, Version 2.0 (the 4 | # "License"); you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 11 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 12 | # License for the specific language governing permissions and limitations 13 | # under the License. 14 | 15 | @outputSchema("production:float") 16 | def production(slugging_pct, onbase_pct): 17 | return slugging_pct + onbase_pct 18 | 19 | 20 | -------------------------------------------------------------------------------- /udfs/python/square.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This code is made available under the Apache License, Version 2.0 (the 4 | # "License"); you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 11 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the 12 | # License for the specific language governing permissions and limitations 13 | # under the License. 14 | 15 | @outputSchemaFunction("schema") 16 | def square(num): 17 | return num * num 18 | 19 | 20 | @schemaFunction("schema") 21 | def schema(input): 22 | # Return whatever type we were handed 23 | return input 24 | --------------------------------------------------------------------------------