├── .gitignore ├── README.md ├── simple_examples.sql └── simple_sql_example_notebook.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .config 2 | .ipynb_checkpoints 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Simple examples of PL/Python syntax 2 | =================================== 3 | 4 | Requirements: 5 | PostgreSQL or Greenplum DB with PL/Python installed. 6 | NumPy and SciPy installed on all compute nodes (SciPy is used by the linear regression example). 7 | 8 | You can view the IPython notebook at http://nbviewer.ipython.org/github/ihuston/plpython_examples/blob/master/simple_sql_example_notebook.ipynb 9 | 10 | -------------------------------------------------------------------------------- /simple_examples.sql: -------------------------------------------------------------------------------- 1 | -- Some quick tests of the capability of PL/Python on PostgreSQL and Greenplum DB 2 | -- Create a schema to do some work in 3 | CREATE SCHEMA plp; 4 | 5 | -- Simple SQL User Defined Function to get started (OR REPLACE so the script can be re-run) 6 | CREATE OR REPLACE FUNCTION plp.times2(INT) 7 | RETURNS INT 8 | AS $$ 9 | SELECT 2 * $1; 10 | $$ LANGUAGE sql; 11 | 12 | --Try it out 13 | SELECT plp.times2(10); 14 | 15 | -- Test using Python code: returns the larger of the two integers (OR REPLACE so the script can be re-run) 16 | CREATE OR REPLACE FUNCTION plp.pymax (a integer, b integer) 17 | RETURNS integer 18 | AS $$ 19 | if a > b: 20 | return a 21 | return b 22 | $$ LANGUAGE plpythonu; 23 | 24 | --Test Python code 25 | SELECT plp.pymax(10, 5); 26 | 27 | 28 | -- Create a composite return type (CASCADE also drops functions left over from an earlier run that return this type; they are recreated below) 29 | DROP TYPE IF EXISTS plp.named_value CASCADE; 30 | CREATE TYPE plp.named_value AS ( 31 | name text, 32 | value integer 33 | ); 34 | 35 | --Simple function which returns a composite object 36 | CREATE OR REPLACE FUNCTION plp.make_pair (name text, value integer) 37 | RETURNS plp.named_value 38 | AS $$ 39 | return [ name, value ] 40 | # or alternatively, as tuple: return ( name, value ) 41 | #
or as dict: return { "name": name, "value": value } 42 | # or as an object with attributes .name and .value 43 | $$ LANGUAGE plpythonu; 44 | 45 | --Try out the function 46 | SELECT plp.make_pair('Zozimus', 1); 47 | 48 | --Using NumPy inside a PL/Python function 49 | CREATE OR REPLACE FUNCTION plp.make_pair (name text) 50 | RETURNS plp.named_value 51 | AS $$ 52 | import numpy as np 53 | a = np.arange(100) 54 | return [name, a[2]] 55 | $$ LANGUAGE plpythonu; 56 | 57 | --Try it out 58 | SELECT plp.make_pair('Horatio'); 59 | 60 | --Returning a set of results using SETOF 61 | CREATE OR REPLACE FUNCTION plp.make_pair_sets (name text) 62 | RETURNS SETOF plp.named_value 63 | AS $$ 64 | import numpy as np 65 | return ((name, i) for i in np.arange(3)) 66 | $$ LANGUAGE plpythonu; 67 | 68 | --Try it out 69 | SELECT plp.make_pair_sets('Gerald'); 70 | 71 | 72 | --Set up some data to show parallelisation (DISTRIBUTED BY is Greenplum-only syntax; omit that clause on plain PostgreSQL) 73 | DROP TABLE IF EXISTS plp.test_data; 74 | 75 | CREATE TABLE plp.test_data AS 76 | SELECT 'a'::text AS name, generate_series(0,1000000)::float AS x, generate_series(0,1000000)/100.0 AS y 77 | DISTRIBUTED BY (name); 78 | 79 | INSERT INTO plp.test_data (name, x, y) 80 | SELECT 'b'::text AS name, generate_series(0,1000000)::float AS x, sin(generate_series(0,1000000)/100.0) AS y; 81 | 82 | INSERT INTO plp.test_data (name, x, y) 83 | SELECT 'c'::text AS name, generate_series(0,1000000)::float AS x, 100.0 + sin(generate_series(0,1000000)/100.0) AS y; 84 | 85 | -- Create a function to find the mean of some numbers (returns NULL for a NULL array, e.g. array_agg over zero rows) 86 | DROP FUNCTION IF EXISTS plp.np_mean(double precision[]); 87 | CREATE OR REPLACE FUNCTION plp.np_mean(value_array double precision[]) 88 | RETURNS float 89 | AS $$ 90 | import numpy as np 91 | return None if value_array is None else np.mean(value_array) 92 | $$ LANGUAGE plpythonu; 93 | 94 | -- Need to pass the numbers as an array using array_agg 95 | SELECT plp.np_mean(array_agg(y)) FROM plp.test_data; 96 | 97 | -- Now try to do this for each type of data in parallel by grouping 98 | SELECT name, plp.np_mean(array_agg(y)) FROM
plp.test_data GROUP BY name ORDER BY name; 99 | 100 | -- Now try to do something even more interesting: a linear regression using SciPy, returned as [slope, intercept, r-value, p-value, stderr] 101 | DROP FUNCTION IF EXISTS plp.linregr(double precision[], double precision[]); 102 | CREATE OR REPLACE FUNCTION plp.linregr(x double precision[], y double precision[]) 103 | RETURNS float[] 104 | AS $$ 105 | from scipy import stats 106 | return stats.linregress(x, y) 107 | $$ LANGUAGE plpythonu; 108 | 109 | -- Do linear regression for all data 110 | SELECT plp.linregr(array_agg(x), array_agg(y)) 111 | FROM plp.test_data; 112 | 113 | -- Now do it separately for each 'name' 114 | SELECT name, plp.linregr(array_agg(x), array_agg(y)) 115 | FROM plp.test_data 116 | GROUP BY name ORDER BY name; -------------------------------------------------------------------------------- /simple_sql_example_notebook.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "name": "" 4 | }, 5 | "nbformat": 3, 6 | "nbformat_minor": 0, 7 | "worksheets": [ 8 | { 9 | "cells": [ 10 | { 11 | "cell_type": "heading", 12 | "level": 1, 13 | "metadata": {}, 14 | "source": [ 15 | "Simple Examples of using PL/Python in a SQL Database" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "Written by Ian Huston.\n", 23 | "\n", 24 | "These notes are a short introduction to using the procedural language PL/Python and how this can help speed up calculations by leveraging the power of a massively parallel processing database like the [Pivotal Greenplum Database](http://gopivotal.com/products/pivotal-greenplum-database).\n", 25 | "\n", 26 | "Requirements: [PostgreSQL](http://postgresql.org) or [Greenplum DB](http://gopivotal.com/products/pivotal-greenplum-database), PL/Python + [NumPy](http://numpy.scipy.org) and [SciPy](http://scipy.org) installed on all nodes.\n", 27 | "\n", 28 | "For this notebook you will also need [ipython-sql](https://github.com/catherinedevlin/ipython-sql) by Catherine Devlin" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 |
"collapsed": false, 34 | "input": [ 35 | "#Load the ipython-sql magic command\n", 36 | "%load_ext sql" 37 | ], 38 | "language": "python", 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "output_type": "stream", 43 | "stream": "stdout", 44 | "text": [ 45 | "The sql extension is already loaded. To reload it, use:\n", 46 | " %reload_ext sql\n" 47 | ] 48 | } 49 | ], 50 | "prompt_number": 5 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "First we need to connect to the Greenplum database" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "collapsed": false, 62 | "input": [ 63 | "# Normally use the following:\n", 64 | "# %sql postgresql://user:passwd@server/db" 65 | ], 66 | "language": "python", 67 | "metadata": {}, 68 | "outputs": [], 69 | "prompt_number": 6 70 | }, 71 | { 72 | "cell_type": "code", 73 | "collapsed": false, 74 | "input": [ 75 | "# Use a workaround to not display password!\n", 76 | "with open(\"./.config\") as conn_config:\n", 77 | " conn_info = conn_config.readline().strip()\n", 78 | "ip = get_ipython()\n", 79 | "ip.magic(\"%sql \" + conn_info)" 80 | ], 81 | "language": "python", 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "metadata": {}, 86 | "output_type": "pyout", 87 | "prompt_number": 7, 88 | "text": [ 89 | "u'Connected: ihuston@hustondb'" 90 | ] 91 | } 92 | ], 93 | "prompt_number": 7 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "We will do our work in a separate schema (container inside a database) to keep things tidy." 
100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "collapsed": false, 105 | "input": [ 106 | "%%sql \n", 107 | "-- Some quick tests of the capability of PL/Python on \n", 108 | "-- PostgreSQL and Greenplum DB.\n", 109 | "-- Create a schema to do some work in\n", 110 | "\n", 111 | "DROP SCHEMA IF EXISTS plp CASCADE;\n", 112 | "CREATE SCHEMA plp;" 113 | ], 114 | "language": "python", 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "output_type": "stream", 119 | "stream": "stdout", 120 | "text": [ 121 | "Done.\n", 122 | "Done." 123 | ] 124 | }, 125 | { 126 | "output_type": "stream", 127 | "stream": "stdout", 128 | "text": [ 129 | "\n" 130 | ] 131 | }, 132 | { 133 | "metadata": {}, 134 | "output_type": "pyout", 135 | "prompt_number": 8, 136 | "text": [ 137 | "[]" 138 | ] 139 | } 140 | ], 141 | "prompt_number": 8 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "First demonstrate what a User Defined Function looks like in SQL." 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "collapsed": false, 153 | "input": [ 154 | "%%sql \n", 155 | "-- Simple SQL User Defined Function to get started\n", 156 | "\n", 157 | "CREATE FUNCTION plp.times2(INT)\n", 158 | "RETURNS INT\n", 159 | "AS $$\n", 160 | "SELECT 2 * $1;\n", 161 | "$$ LANGUAGE sql;" 162 | ], 163 | "language": "python", 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "output_type": "stream", 168 | "stream": "stdout", 169 | "text": [ 170 | "Done.\n" 171 | ] 172 | }, 173 | { 174 | "metadata": {}, 175 | "output_type": "pyout", 176 | "prompt_number": 9, 177 | "text": [ 178 | "[]" 179 | ] 180 | } 181 | ], 182 | "prompt_number": 9 183 | }, 184 | { 185 | "cell_type": "code", 186 | "collapsed": false, 187 | "input": [ 188 | "%%sql \n", 189 | "--Try it out\n", 190 | "\n", 191 | "SELECT plp.times2(10);" 192 | ], 193 | "language": "python", 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "output_type": "stream", 198 | "stream": "stdout", 199 | "text": [ 200 |
"1 rows affected.\n" 201 | ] 202 | }, 203 | { 204 | "html": [ 205 | "\n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | " \n", 212 | "
times2
20
" 213 | ], 214 | "metadata": {}, 215 | "output_type": "pyout", 216 | "prompt_number": 10, 217 | "text": [ 218 | "[(20,)]" 219 | ] 220 | } 221 | ], 222 | "prompt_number": 10 223 | }, 224 | { 225 | "cell_type": "code", 226 | "collapsed": false, 227 | "input": [ 228 | "%%sql\n", 229 | "-- Test using Python code\n", 230 | "\n", 231 | "CREATE FUNCTION plp.pymax (a integer, b integer)\n", 232 | "RETURNS integer\n", 233 | "AS $$\n", 234 | "if a > b:\n", 235 | " return a\n", 236 | "return b\n", 237 | "$$ LANGUAGE plpythonu;" 238 | ], 239 | "language": "python", 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "output_type": "stream", 244 | "stream": "stdout", 245 | "text": [ 246 | "Done.\n" 247 | ] 248 | }, 249 | { 250 | "metadata": {}, 251 | "output_type": "pyout", 252 | "prompt_number": 11, 253 | "text": [ 254 | "[]" 255 | ] 256 | } 257 | ], 258 | "prompt_number": 11 259 | }, 260 | { 261 | "cell_type": "code", 262 | "collapsed": false, 263 | "input": [ 264 | "%%sql\n", 265 | "--Test Python code\n", 266 | "\n", 267 | "SELECT plp.pymax(10, 5);" 268 | ], 269 | "language": "python", 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "output_type": "stream", 274 | "stream": "stdout", 275 | "text": [ 276 | "1 rows affected.\n" 277 | ] 278 | }, 279 | { 280 | "html": [ 281 | "\n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | "
pymax
10
" 289 | ], 290 | "metadata": {}, 291 | "output_type": "pyout", 292 | "prompt_number": 12, 293 | "text": [ 294 | "[(10,)]" 295 | ] 296 | } 297 | ], 298 | "prompt_number": 12 299 | }, 300 | { 301 | "cell_type": "code", 302 | "collapsed": false, 303 | "input": [ 304 | "%%sql\n", 305 | "-- Create a composite return type\n", 306 | "\n", 307 | "CREATE TYPE plp.named_value AS (\n", 308 | " name text,\n", 309 | " value integer\n", 310 | ");" 311 | ], 312 | "language": "python", 313 | "metadata": {}, 314 | "outputs": [ 315 | { 316 | "output_type": "stream", 317 | "stream": "stdout", 318 | "text": [ 319 | "Done.\n" 320 | ] 321 | }, 322 | { 323 | "metadata": {}, 324 | "output_type": "pyout", 325 | "prompt_number": 13, 326 | "text": [ 327 | "[]" 328 | ] 329 | } 330 | ], 331 | "prompt_number": 13 332 | }, 333 | { 334 | "cell_type": "code", 335 | "collapsed": false, 336 | "input": [ 337 | "%%sql\n", 338 | "--Simple function which returns a composite object\n", 339 | "\n", 340 | "CREATE OR REPLACE FUNCTION plp.make_pair (name text, value integer)\n", 341 | "RETURNS plp.named_value\n", 342 | "AS $$\n", 343 | "return [ name, value ]\n", 344 | " # or alternatively, as tuple: return ( name, value )\n", 345 | " # or as dict: return { \"name\": name, \"value\": value }\n", 346 | " # or as an object with attributes .name and .value\n", 347 | "$$ LANGUAGE plpythonu;" 348 | ], 349 | "language": "python", 350 | "metadata": {}, 351 | "outputs": [ 352 | { 353 | "output_type": "stream", 354 | "stream": "stdout", 355 | "text": [ 356 | "Done.\n" 357 | ] 358 | }, 359 | { 360 | "metadata": {}, 361 | "output_type": "pyout", 362 | "prompt_number": 14, 363 | "text": [ 364 | "[]" 365 | ] 366 | } 367 | ], 368 | "prompt_number": 14 369 | }, 370 | { 371 | "cell_type": "code", 372 | "collapsed": false, 373 | "input": [ 374 | "%%sql\n", 375 | "--Try out the function\n", 376 | "-- (See https://en.wikipedia.org/wiki/Zozimus)\n", 377 | "\n", 378 | "SELECT plp.make_pair('Zozimus', 1);" 379 | ], 380 | "language":
"python", 381 | "metadata": {}, 382 | "outputs": [ 383 | { 384 | "output_type": "stream", 385 | "stream": "stdout", 386 | "text": [ 387 | "1 rows affected.\n" 388 | ] 389 | }, 390 | { 391 | "html": [ 392 | "\n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | "
make_pair
(Zozimus,1)
" 400 | ], 401 | "metadata": {}, 402 | "output_type": "pyout", 403 | "prompt_number": 15, 404 | "text": [ 405 | "[('(Zozimus,1)',)]" 406 | ] 407 | } 408 | ], 409 | "prompt_number": 15 410 | }, 411 | { 412 | "cell_type": "markdown", 413 | "metadata": {}, 414 | "source": [ 415 | "Note that UDFs and PL/X functions allow for multiple function signatures." 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "collapsed": false, 421 | "input": [ 422 | "%%sql\n", 423 | "--Using NumPy inside a PL/Python function\n", 424 | "\n", 425 | "CREATE OR REPLACE FUNCTION plp.make_pair (name text)\n", 426 | "RETURNS plp.named_value\n", 427 | "AS $$\n", 428 | "import numpy as np\n", 429 | "a = np.arange(100)\n", 430 | "return [name, a[2]]\n", 431 | "$$ LANGUAGE plpythonu;" 432 | ], 433 | "language": "python", 434 | "metadata": {}, 435 | "outputs": [ 436 | { 437 | "output_type": "stream", 438 | "stream": "stdout", 439 | "text": [ 440 | "Done.\n" 441 | ] 442 | }, 443 | { 444 | "metadata": {}, 445 | "output_type": "pyout", 446 | "prompt_number": 16, 447 | "text": [ 448 | "[]" 449 | ] 450 | } 451 | ], 452 | "prompt_number": 16 453 | }, 454 | { 455 | "cell_type": "code", 456 | "collapsed": false, 457 | "input": [ 458 | "%%sql\n", 459 | "--Try it out\n", 460 | "\n", 461 | "SELECT plp.make_pair('Horatio');" 462 | ], 463 | "language": "python", 464 | "metadata": {}, 465 | "outputs": [ 466 | { 467 | "output_type": "stream", 468 | "stream": "stdout", 469 | "text": [ 470 | "1 rows affected.\n" 471 | ] 472 | }, 473 | { 474 | "html": [ 475 | "\n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | "
make_pair
(Horatio,2)
" 483 | ], 484 | "metadata": {}, 485 | "output_type": "pyout", 486 | "prompt_number": 17, 487 | "text": [ 488 | "[('(Horatio,2)',)]" 489 | ] 490 | } 491 | ], 492 | "prompt_number": 17 493 | }, 494 | { 495 | "cell_type": "markdown", 496 | "metadata": {}, 497 | "source": [ 498 | "Note that the SQL calling syntax affects how the results are returned. \n", 499 | "\n", 500 | "In SQL this is the difference between returning composite types and splitting out results into each subtype (name and value here)." 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "collapsed": false, 506 | "input": [ 507 | "%%sql\n", 508 | "\n", 509 | "SELECT * FROM plp.make_pair('Horatio');" 510 | ], 511 | "language": "python", 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "output_type": "stream", 516 | "stream": "stdout", 517 | "text": [ 518 | "1 rows affected.\n" 519 | ] 520 | }, 521 | { 522 | "html": [ 523 | "\n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | "
namevalue
Horatio2
" 533 | ], 534 | "metadata": {}, 535 | "output_type": "pyout", 536 | "prompt_number": 18, 537 | "text": [ 538 | "[(u'Horatio', 2)]" 539 | ] 540 | } 541 | ], 542 | "prompt_number": 18 543 | }, 544 | { 545 | "cell_type": "code", 546 | "collapsed": false, 547 | "input": [ 548 | "%%sql\n", 549 | "--Returning a set of results using SETOF\n", 550 | "\n", 551 | "CREATE OR REPLACE FUNCTION plp.make_pair_sets (name text)\n", 552 | "RETURNS SETOF plp.named_value\n", 553 | "AS $$\n", 554 | "import numpy as np\n", 555 | "return ((name, i) for i in np.arange(3))\n", 556 | "$$ LANGUAGE plpythonu;" 557 | ], 558 | "language": "python", 559 | "metadata": {}, 560 | "outputs": [ 561 | { 562 | "output_type": "stream", 563 | "stream": "stdout", 564 | "text": [ 565 | "Done.\n" 566 | ] 567 | }, 568 | { 569 | "metadata": {}, 570 | "output_type": "pyout", 571 | "prompt_number": 19, 572 | "text": [ 573 | "[]" 574 | ] 575 | } 576 | ], 577 | "prompt_number": 19 578 | }, 579 | { 580 | "cell_type": "code", 581 | "collapsed": false, 582 | "input": [ 583 | "%%sql\n", 584 | "--Try it out\n", 585 | "\n", 586 | "SELECT * FROM plp.make_pair_sets('Gerald');" 587 | ], 588 | "language": "python", 589 | "metadata": {}, 590 | "outputs": [ 591 | { 592 | "output_type": "stream", 593 | "stream": "stdout", 594 | "text": [ 595 | "3 rows affected.\n" 596 | ] 597 | }, 598 | { 599 | "html": [ 600 | "\n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | "
namevalue
Gerald0
Gerald1
Gerald2
" 618 | ], 619 | "metadata": {}, 620 | "output_type": "pyout", 621 | "prompt_number": 20, 622 | "text": [ 623 | "[(u'Gerald', 0), (u'Gerald', 1), (u'Gerald', 2)]" 624 | ] 625 | } 626 | ], 627 | "prompt_number": 20 628 | }, 629 | { 630 | "cell_type": "heading", 631 | "level": 2, 632 | "metadata": {}, 633 | "source": [ 634 | "Parallelisation" 635 | ] 636 | }, 637 | { 638 | "cell_type": "markdown", 639 | "metadata": {}, 640 | "source": [ 641 | "Now we will try to use parallelisation with PL/Python" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "collapsed": false, 647 | "input": [ 648 | "%%sql\n", 649 | "--Set up some data to show parallelisation\n", 650 | "\n", 651 | "DROP TABLE IF EXISTS plp.test_data;\n", 652 | "\n", 653 | "CREATE TABLE plp.test_data AS\n", 654 | "SELECT \n", 655 | " 'a'::text AS name\n", 656 | " , generate_series(0,1000000)::float AS x\n", 657 | " , generate_series(0,1000000)/100.0 AS y\n", 658 | "DISTRIBUTED BY (name);\n", 659 | "\n", 660 | "INSERT INTO plp.test_data \n", 661 | "SELECT \n", 662 | " 'b'::text AS name\n", 663 | " , generate_series(0,1000000)::float AS x\n", 664 | " , sin(generate_series(0,1000000)/100.0) AS y;\n", 665 | "\n", 666 | "INSERT INTO plp.test_data \n", 667 | "SELECT \n", 668 | " 'c'::text AS name\n", 669 | " , generate_series(0,1000000)::float AS x\n", 670 | " , 100.0 + sin(generate_series(0,1000000)/100.0) AS y;" 671 | ], 672 | "language": "python", 673 | "metadata": {}, 674 | "outputs": [ 675 | { 676 | "output_type": "stream", 677 | "stream": "stdout", 678 | "text": [ 679 | "Done.\n", 680 | "1000001 rows affected." 681 | ] 682 | }, 683 | { 684 | "output_type": "stream", 685 | "stream": "stdout", 686 | "text": [ 687 | "\n", 688 | "1000001 rows affected." 689 | ] 690 | }, 691 | { 692 | "output_type": "stream", 693 | "stream": "stdout", 694 | "text": [ 695 | "\n", 696 | "1000001 rows affected." 
697 | ] 698 | }, 699 | { 700 | "output_type": "stream", 701 | "stream": "stdout", 702 | "text": [ 703 | "\n" 704 | ] 705 | }, 706 | { 707 | "metadata": {}, 708 | "output_type": "pyout", 709 | "prompt_number": 21, 710 | "text": [ 711 | "[]" 712 | ] 713 | } 714 | ], 715 | "prompt_number": 21 716 | }, 717 | { 718 | "cell_type": "code", 719 | "collapsed": false, 720 | "input": [ 721 | "%%sql\n", 722 | "-- Create a function to find the mean of some numbers\n", 723 | "\n", 724 | "DROP FUNCTION IF EXISTS plp.np_mean(double precision[]);\n", 725 | "\n", 726 | "CREATE OR REPLACE FUNCTION plp.np_mean(value_array double precision[])\n", 727 | "RETURNS float\n", 728 | "AS $$\n", 729 | "import numpy as np\n", 730 | "return np.mean(value_array)\n", 731 | "$$ LANGUAGE plpythonu;" 732 | ], 733 | "language": "python", 734 | "metadata": {}, 735 | "outputs": [ 736 | { 737 | "output_type": "stream", 738 | "stream": "stdout", 739 | "text": [ 740 | "Done.\n", 741 | "Done." 742 | ] 743 | }, 744 | { 745 | "output_type": "stream", 746 | "stream": "stdout", 747 | "text": [ 748 | "\n" 749 | ] 750 | }, 751 | { 752 | "metadata": {}, 753 | "output_type": "pyout", 754 | "prompt_number": 22, 755 | "text": [ 756 | "[]" 757 | ] 758 | } 759 | ], 760 | "prompt_number": 22 761 | }, 762 | { 763 | "cell_type": "code", 764 | "collapsed": false, 765 | "input": [ 766 | "%%sql\n", 767 | "-- Need to pass the numbers as an array using array_agg\n", 768 | "\n", 769 | "SELECT plp.np_mean(array_agg(y)) FROM plp.test_data;" 770 | ], 771 | "language": "python", 772 | "metadata": {}, 773 | "outputs": [ 774 | { 775 | "output_type": "stream", 776 | "stream": "stdout", 777 | "text": [ 778 | "1 rows affected.\n" 779 | ] 780 | }, 781 | { 782 | "html": [ 783 | "\n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | "
np_mean
1700.00013004
" 791 | ], 792 | "metadata": {}, 793 | "output_type": "pyout", 794 | "prompt_number": 23, 795 | "text": [ 796 | "[(1700.00013004,)]" 797 | ] 798 | } 799 | ], 800 | "prompt_number": 23 801 | }, 802 | { 803 | "cell_type": "code", 804 | "collapsed": false, 805 | "input": [ 806 | "%%sql\n", 807 | "-- Now try to do this for each type of data in parallel by grouping\n", 808 | "\n", 809 | "SELECT \n", 810 | " name, \n", 811 | " plp.np_mean(array_agg(y)) \n", 812 | "FROM plp.test_data \n", 813 | "GROUP BY name ORDER BY name;" 814 | ], 815 | "language": "python", 816 | "metadata": {}, 817 | "outputs": [ 818 | { 819 | "output_type": "stream", 820 | "stream": "stdout", 821 | "text": [ 822 | "3 rows affected.\n" 823 | ] 824 | }, 825 | { 826 | "html": [ 827 | "\n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | "
namenp_mean
a5000.0
b0.000195060907772
c100.000195061
" 845 | ], 846 | "metadata": {}, 847 | "output_type": "pyout", 848 | "prompt_number": 24, 849 | "text": [ 850 | "[(u'a', 5000.0), (u'b', 0.000195060907772), (u'c', 100.000195061)]" 851 | ] 852 | } 853 | ], 854 | "prompt_number": 24 855 | }, 856 | { 857 | "cell_type": "code", 858 | "collapsed": false, 859 | "input": [ 860 | "%%sql\n", 861 | "-- Now try to do something even more interesting\n", 862 | "\n", 863 | "DROP FUNCTION IF EXISTS plp.linregr(double precision[], double precision[]);\n", 864 | "\n", 865 | "CREATE OR REPLACE FUNCTION \n", 866 | " plp.linregr(x double precision[], y double precision[])\n", 867 | "RETURNS float[]\n", 868 | "AS $$\n", 869 | "from scipy import stats\n", 870 | "return stats.linregress(x, y)\n", 871 | "$$ LANGUAGE plpythonu;" 872 | ], 873 | "language": "python", 874 | "metadata": {}, 875 | "outputs": [ 876 | { 877 | "output_type": "stream", 878 | "stream": "stdout", 879 | "text": [ 880 | "Done.\n", 881 | "Done." 882 | ] 883 | }, 884 | { 885 | "output_type": "stream", 886 | "stream": "stdout", 887 | "text": [ 888 | "\n" 889 | ] 890 | }, 891 | { 892 | "metadata": {}, 893 | "output_type": "pyout", 894 | "prompt_number": 25, 895 | "text": [ 896 | "[]" 897 | ] 898 | } 899 | ], 900 | "prompt_number": 25 901 | }, 902 | { 903 | "cell_type": "code", 904 | "collapsed": false, 905 | "input": [ 906 | "%%sql\n", 907 | "-- Do linear regression for all data\n", 908 | "\n", 909 | "SELECT plp.linregr(array_agg(x), array_agg(y)) \n", 910 | "FROM plp.test_data;" 911 | ], 912 | "language": "python", 913 | "metadata": {}, 914 | "outputs": [ 915 | { 916 | "output_type": "stream", 917 | "stream": "stdout", 918 | "text": [ 919 | "1 rows affected.\n" 920 | ] 921 | }, 922 | { 923 | "html": [ 924 | "\n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | "
linregr
[0.00333333331357, 33.3334732575, 0.335532910587, 0.0, 5.4031491216e-06]
" 932 | ], 933 | "metadata": {}, 934 | "output_type": "pyout", 935 | "prompt_number": 26, 936 | "text": [ 937 | "[([0.00333333331357, 33.3334732575, 0.335532910587, 0.0, 5.4031491216e-06],)]" 938 | ] 939 | } 940 | ], 941 | "prompt_number": 26 942 | }, 943 | { 944 | "cell_type": "code", 945 | "collapsed": false, 946 | "input": [ 947 | "%%sql\n", 948 | "-- Now do it separately for each 'name'\n", 949 | "\n", 950 | "SELECT name, plp.linregr(array_agg(x), array_agg(y)) \n", 951 | "FROM plp.test_data \n", 952 | "GROUP BY name ORDER BY name;" 953 | ], 954 | "language": "python", 955 | "metadata": {}, 956 | "outputs": [ 957 | { 958 | "output_type": "stream", 959 | "stream": "stdout", 960 | "text": [ 961 | "3 rows affected.\n" 962 | ] 963 | }, 964 | { 965 | "html": [ 966 | "\n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | "
namelinregr
a[0.0100000000001, -6.82430254528e-08, 1.0, 0.0, 0.0]
b[-2.96599680404e-11, 0.000209890891792, -1.210882548e-05, 0.990338798439, 2.44945167615e-09]
c[-2.96599680394e-11, 100.000209891, -1.21088254796e-05, 0.99033879844, 2.44945167615e-09]
" 984 | ], 985 | "metadata": {}, 986 | "output_type": "pyout", 987 | "prompt_number": 27, 988 | "text": [ 989 | "[(u'a', [0.0100000000001, -6.82430254528e-08, 1.0, 0.0, 0.0]),\n", 990 | " (u'b', [-2.96599680404e-11, 0.000209890891792, -1.210882548e-05, 0.990338798439, 2.44945167615e-09]),\n", 991 | " (u'c', [-2.96599680394e-11, 100.000209891, -1.21088254796e-05, 0.99033879844, 2.44945167615e-09])]" 992 | ] 993 | } 994 | ], 995 | "prompt_number": 27 996 | }, 997 | { 998 | "cell_type": "markdown", 999 | "metadata": {}, 1000 | "source": [ 1001 | "In this example we have shown how to run models separately for different data using the GROUP BY clause. It is important to have distributed your data in the correct way to utilise the parallel architecture.\n", 1002 | "\n", 1003 | "For further information [see these notes](http://gopivotal.github.io/gp-r/) on using PL/R in addition to PL/Python." 1004 | ] 1005 | } 1006 | ], 1007 | "metadata": {} 1008 | } 1009 | ] 1010 | } --------------------------------------------------------------------------------