├── .gitignore ├── Demo1-MemoryMapping.ipynb ├── Demo2-Flight.ipynb ├── README.md ├── fec-2012.parquet └── slides.pdf /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints -------------------------------------------------------------------------------- /Demo1-MemoryMapping.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 14, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pyarrow as pa\n", 11 | "import pyarrow.parquet as pq\n", 12 | "import pandas as pd" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "Here is a campaign contribution dataset from the 2012 presidential election as a Parquet file" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 15, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "fec = pq.read_table('fec-2012.parquet')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 16, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "cmte_id: string\n", 41 | "cand_id: string\n", 42 | "cand_nm: string\n", 43 | "contbr_nm: string\n", 44 | "contbr_city: string\n", 45 | "contbr_st: string\n", 46 | "contbr_zip: string\n", 47 | "contbr_employer: string\n", 48 | "contbr_occupation: string\n", 49 | "contb_receipt_amt: double\n", 50 | "contb_receipt_dt: string\n", 51 | "receipt_desc: string\n", 52 | "memo_cd: string\n", 53 | "memo_text: string\n", 54 | "form_tp: string\n", 55 | "file_num: int64\n", 56 | "-- schema metadata --\n", 57 | "pandas: '{\"index_columns\": [{\"kind\": \"range\", \"name\": null, \"start\": 0, \"' + 2218\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "print(fec.schema.to_string(show_field_metadata=False))" 63 | ] 64 | }, 65 | { 66 | "cell_type": 
"markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "The extra metadata is a pandas-specific detail (where the file was produced), so ignore that" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 17, 75 | "metadata": {}, 76 | "outputs": [ 77 | { 78 | "data": { 79 | "text/plain": [ 80 | "364795968" 81 | ] 82 | }, 83 | "execution_count": 17, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "pa.total_allocated_bytes()" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "I'm going to write 50 copies of the table end-to-end in a stream so we have a 7+ gigabyte file to work with" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 18, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "with open('fec.arrow', 'wb') as f:\n", 106 | " writer = pa.ipc.RecordBatchStreamWriter(f, fec.schema)\n", 107 | " for i in range(50):\n", 108 | " writer.write(fec)\n", 109 | " writer.close()" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 19, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "total 8921544\r\n", 122 | "-rw------- 1 wesm wesm 3921 Apr 3 11:11 Demo1-MemoryMapping.ipynb\r\n", 123 | "-rw------- 1 wesm wesm 9148 Apr 3 11:10 Demo2-Flight.ipynb\r\n", 124 | "-rw------- 1 wesm wesm 27867532 Feb 26 13:40 fec-2012.parquet\r\n", 125 | "-rw------- 1 wesm wesm 9107358528 Apr 3 11:32 fec.arrow\r\n", 126 | "-rw------- 1 wesm wesm 600 Feb 26 13:31 README.md\r\n", 127 | "-rw------- 1 wesm wesm 403260 Feb 26 13:31 slides.pdf\r\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "!ls -l" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 20, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "mmap = pa.memory_map('fec.arrow')\n", 142 | "f = pa.ipc.open_stream(mmap)" 143 | ] 144 | }, 145 | { 146 | 
"cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "Now we're going to \"parse\" the stream to obtain Arorw data structures referencing the memory map" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 21, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "CPU times: user 3.79 ms, sys: 0 ns, total: 3.79 ms\n", 162 | "Wall time: 3.05 ms\n" 163 | ] 164 | } 165 | ], 166 | "source": [ 167 | "%%time\n", 168 | "t = f.read_all()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 22, 174 | "metadata": {}, 175 | "outputs": [ 176 | { 177 | "data": { 178 | "text/plain": [ 179 | "50086550" 180 | ] 181 | }, 182 | "execution_count": 22, 183 | "metadata": {}, 184 | "output_type": "execute_result" 185 | } 186 | ], 187 | "source": [ 188 | "len(t)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 23, 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "data": { 198 | "text/plain": [ 199 | "364795968" 200 | ] 201 | }, 202 | "execution_count": 23, 203 | "metadata": {}, 204 | "output_type": "execute_result" 205 | } 206 | ], 207 | "source": [ 208 | "pa.total_allocated_bytes()" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "Note that the read does take a little bit of time (~700ms) because of the 50 table chunks referencing the memory map that have to be reconstructed " 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 24, 221 | "metadata": {}, 222 | "outputs": [ 223 | { 224 | "data": { 225 | "text/plain": [ 226 | "\n", 227 | "[\n", 228 | " [\n", 229 | " \"Bachmann, Michelle\",\n", 230 | " \"Bachmann, Michelle\",\n", 231 | " \"Bachmann, Michelle\",\n", 232 | " \"Bachmann, Michelle\",\n", 233 | " \"Bachmann, Michelle\",\n", 234 | " \"Bachmann, Michelle\",\n", 235 | " \"Bachmann, Michelle\",\n", 236 | " \"Bachmann, Michelle\",\n", 
237 | " \"Bachmann, Michelle\",\n", 238 | " \"Bachmann, Michelle\",\n", 239 | " ...\n", 240 | " \"Perry, Rick\",\n", 241 | " \"Perry, Rick\",\n", 242 | " \"Perry, Rick\",\n", 243 | " \"Perry, Rick\",\n", 244 | " \"Perry, Rick\",\n", 245 | " \"Perry, Rick\",\n", 246 | " \"Perry, Rick\",\n", 247 | " \"Perry, Rick\",\n", 248 | " \"Perry, Rick\",\n", 249 | " \"Perry, Rick\"\n", 250 | " ],\n", 251 | " [\n", 252 | " \"Bachmann, Michelle\",\n", 253 | " \"Bachmann, Michelle\",\n", 254 | " \"Bachmann, Michelle\",\n", 255 | " \"Bachmann, Michelle\",\n", 256 | " \"Bachmann, Michelle\",\n", 257 | " \"Bachmann, Michelle\",\n", 258 | " \"Bachmann, Michelle\",\n", 259 | " \"Bachmann, Michelle\",\n", 260 | " \"Bachmann, Michelle\",\n", 261 | " \"Bachmann, Michelle\",\n", 262 | " ...\n", 263 | " \"Perry, Rick\",\n", 264 | " \"Perry, Rick\",\n", 265 | " \"Perry, Rick\",\n", 266 | " \"Perry, Rick\",\n", 267 | " \"Perry, Rick\",\n", 268 | " \"Perry, Rick\",\n", 269 | " \"Perry, Rick\",\n", 270 | " \"Perry, Rick\",\n", 271 | " \"Perry, Rick\",\n", 272 | " \"Perry, Rick\"\n", 273 | " ],\n", 274 | " [\n", 275 | " \"Bachmann, Michelle\",\n", 276 | " \"Bachmann, Michelle\",\n", 277 | " \"Bachmann, Michelle\",\n", 278 | " \"Bachmann, Michelle\",\n", 279 | " \"Bachmann, Michelle\",\n", 280 | " \"Bachmann, Michelle\",\n", 281 | " \"Bachmann, Michelle\",\n", 282 | " \"Bachmann, Michelle\",\n", 283 | " \"Bachmann, Michelle\",\n", 284 | " \"Bachmann, Michelle\",\n", 285 | " ...\n", 286 | " \"Perry, Rick\",\n", 287 | " \"Perry, Rick\",\n", 288 | " \"Perry, Rick\",\n", 289 | " \"Perry, Rick\",\n", 290 | " \"Perry, Rick\",\n", 291 | " \"Perry, Rick\",\n", 292 | " \"Perry, Rick\",\n", 293 | " \"Perry, Rick\",\n", 294 | " \"Perry, Rick\",\n", 295 | " \"Perry, Rick\"\n", 296 | " ],\n", 297 | " [\n", 298 | " \"Bachmann, Michelle\",\n", 299 | " \"Bachmann, Michelle\",\n", 300 | " \"Bachmann, Michelle\",\n", 301 | " \"Bachmann, Michelle\",\n", 302 | " \"Bachmann, Michelle\",\n", 303 | " \"Bachmann, 
Michelle\",\n", 304 | " \"Bachmann, Michelle\",\n", 305 | " \"Bachmann, Michelle\",\n", 306 | " \"Bachmann, Michelle\",\n", 307 | " \"Bachmann, Michelle\",\n", 308 | " ...\n", 309 | " \"Perry, Rick\",\n", 310 | " \"Perry, Rick\",\n", 311 | " \"Perry, Rick\",\n", 312 | " \"Perry, Rick\",\n", 313 | " \"Perry, Rick\",\n", 314 | " \"Perry, Rick\",\n", 315 | " \"Perry, Rick\",\n", 316 | " \"Perry, Rick\",\n", 317 | " \"Perry, Rick\",\n", 318 | " \"Perry, Rick\"\n", 319 | " ],\n", 320 | " [\n", 321 | " \"Bachmann, Michelle\",\n", 322 | " \"Bachmann, Michelle\",\n", 323 | " \"Bachmann, Michelle\",\n", 324 | " \"Bachmann, Michelle\",\n", 325 | " \"Bachmann, Michelle\",\n", 326 | " \"Bachmann, Michelle\",\n", 327 | " \"Bachmann, Michelle\",\n", 328 | " \"Bachmann, Michelle\",\n", 329 | " \"Bachmann, Michelle\",\n", 330 | " \"Bachmann, Michelle\",\n", 331 | " ...\n", 332 | " \"Perry, Rick\",\n", 333 | " \"Perry, Rick\",\n", 334 | " \"Perry, Rick\",\n", 335 | " \"Perry, Rick\",\n", 336 | " \"Perry, Rick\",\n", 337 | " \"Perry, Rick\",\n", 338 | " \"Perry, Rick\",\n", 339 | " \"Perry, Rick\",\n", 340 | " \"Perry, Rick\",\n", 341 | " \"Perry, Rick\"\n", 342 | " ],\n", 343 | " [\n", 344 | " \"Bachmann, Michelle\",\n", 345 | " \"Bachmann, Michelle\",\n", 346 | " \"Bachmann, Michelle\",\n", 347 | " \"Bachmann, Michelle\",\n", 348 | " \"Bachmann, Michelle\",\n", 349 | " \"Bachmann, Michelle\",\n", 350 | " \"Bachmann, Michelle\",\n", 351 | " \"Bachmann, Michelle\",\n", 352 | " \"Bachmann, Michelle\",\n", 353 | " \"Bachmann, Michelle\",\n", 354 | " ...\n", 355 | " \"Perry, Rick\",\n", 356 | " \"Perry, Rick\",\n", 357 | " \"Perry, Rick\",\n", 358 | " \"Perry, Rick\",\n", 359 | " \"Perry, Rick\",\n", 360 | " \"Perry, Rick\",\n", 361 | " \"Perry, Rick\",\n", 362 | " \"Perry, Rick\",\n", 363 | " \"Perry, Rick\",\n", 364 | " \"Perry, Rick\"\n", 365 | " ],\n", 366 | " [\n", 367 | " \"Bachmann, Michelle\",\n", 368 | " \"Bachmann, Michelle\",\n", 369 | " \"Bachmann, Michelle\",\n", 370 | " 
\"Bachmann, Michelle\",\n", 371 | " \"Bachmann, Michelle\",\n", 372 | " \"Bachmann, Michelle\",\n", 373 | " \"Bachmann, Michelle\",\n", 374 | " \"Bachmann, Michelle\",\n", 375 | " \"Bachmann, Michelle\",\n", 376 | " \"Bachmann, Michelle\",\n", 377 | " ...\n", 378 | " \"Perry, Rick\",\n", 379 | " \"Perry, Rick\",\n", 380 | " \"Perry, Rick\",\n", 381 | " \"Perry, Rick\",\n", 382 | " \"Perry, Rick\",\n", 383 | " \"Perry, Rick\",\n", 384 | " \"Perry, Rick\",\n", 385 | " \"Perry, Rick\",\n", 386 | " \"Perry, Rick\",\n", 387 | " \"Perry, Rick\"\n", 388 | " ],\n", 389 | " [\n", 390 | " \"Bachmann, Michelle\",\n", 391 | " \"Bachmann, Michelle\",\n", 392 | " \"Bachmann, Michelle\",\n", 393 | " \"Bachmann, Michelle\",\n", 394 | " \"Bachmann, Michelle\",\n", 395 | " \"Bachmann, Michelle\",\n", 396 | " \"Bachmann, Michelle\",\n", 397 | " \"Bachmann, Michelle\",\n", 398 | " \"Bachmann, Michelle\",\n", 399 | " \"Bachmann, Michelle\",\n", 400 | " ...\n", 401 | " \"Perry, Rick\",\n", 402 | " \"Perry, Rick\",\n", 403 | " \"Perry, Rick\",\n", 404 | " \"Perry, Rick\",\n", 405 | " \"Perry, Rick\",\n", 406 | " \"Perry, Rick\",\n", 407 | " \"Perry, Rick\",\n", 408 | " \"Perry, Rick\",\n", 409 | " \"Perry, Rick\",\n", 410 | " \"Perry, Rick\"\n", 411 | " ],\n", 412 | " [\n", 413 | " \"Bachmann, Michelle\",\n", 414 | " \"Bachmann, Michelle\",\n", 415 | " \"Bachmann, Michelle\",\n", 416 | " \"Bachmann, Michelle\",\n", 417 | " \"Bachmann, Michelle\",\n", 418 | " \"Bachmann, Michelle\",\n", 419 | " \"Bachmann, Michelle\",\n", 420 | " \"Bachmann, Michelle\",\n", 421 | " \"Bachmann, Michelle\",\n", 422 | " \"Bachmann, Michelle\",\n", 423 | " ...\n", 424 | " \"Perry, Rick\",\n", 425 | " \"Perry, Rick\",\n", 426 | " \"Perry, Rick\",\n", 427 | " \"Perry, Rick\",\n", 428 | " \"Perry, Rick\",\n", 429 | " \"Perry, Rick\",\n", 430 | " \"Perry, Rick\",\n", 431 | " \"Perry, Rick\",\n", 432 | " \"Perry, Rick\",\n", 433 | " \"Perry, Rick\"\n", 434 | " ],\n", 435 | " [\n", 436 | " \"Bachmann, 
Michelle\",\n", 437 | " \"Bachmann, Michelle\",\n", 438 | " \"Bachmann, Michelle\",\n", 439 | " \"Bachmann, Michelle\",\n", 440 | " \"Bachmann, Michelle\",\n", 441 | " \"Bachmann, Michelle\",\n", 442 | " \"Bachmann, Michelle\",\n", 443 | " \"Bachmann, Michelle\",\n", 444 | " \"Bachmann, Michelle\",\n", 445 | " \"Bachmann, Michelle\",\n", 446 | " ...\n", 447 | " \"Perry, Rick\",\n", 448 | " \"Perry, Rick\",\n", 449 | " \"Perry, Rick\",\n", 450 | " \"Perry, Rick\",\n", 451 | " \"Perry, Rick\",\n", 452 | " \"Perry, Rick\",\n", 453 | " \"Perry, Rick\",\n", 454 | " \"Perry, Rick\",\n", 455 | " \"Perry, Rick\",\n", 456 | " \"Perry, Rick\"\n", 457 | " ],\n", 458 | "...\n", 459 | " [\n", 460 | " \"Bachmann, Michelle\",\n", 461 | " \"Bachmann, Michelle\",\n", 462 | " \"Bachmann, Michelle\",\n", 463 | " \"Bachmann, Michelle\",\n", 464 | " \"Bachmann, Michelle\",\n", 465 | " \"Bachmann, Michelle\",\n", 466 | " \"Bachmann, Michelle\",\n", 467 | " \"Bachmann, Michelle\",\n", 468 | " \"Bachmann, Michelle\",\n", 469 | " \"Bachmann, Michelle\",\n", 470 | " ...\n", 471 | " \"Perry, Rick\",\n", 472 | " \"Perry, Rick\",\n", 473 | " \"Perry, Rick\",\n", 474 | " \"Perry, Rick\",\n", 475 | " \"Perry, Rick\",\n", 476 | " \"Perry, Rick\",\n", 477 | " \"Perry, Rick\",\n", 478 | " \"Perry, Rick\",\n", 479 | " \"Perry, Rick\",\n", 480 | " \"Perry, Rick\"\n", 481 | " ],\n", 482 | " [\n", 483 | " \"Bachmann, Michelle\",\n", 484 | " \"Bachmann, Michelle\",\n", 485 | " \"Bachmann, Michelle\",\n", 486 | " \"Bachmann, Michelle\",\n", 487 | " \"Bachmann, Michelle\",\n", 488 | " \"Bachmann, Michelle\",\n", 489 | " \"Bachmann, Michelle\",\n", 490 | " \"Bachmann, Michelle\",\n", 491 | " \"Bachmann, Michelle\",\n", 492 | " \"Bachmann, Michelle\",\n", 493 | " ...\n", 494 | " \"Perry, Rick\",\n", 495 | " \"Perry, Rick\",\n", 496 | " \"Perry, Rick\",\n", 497 | " \"Perry, Rick\",\n", 498 | " \"Perry, Rick\",\n", 499 | " \"Perry, Rick\",\n", 500 | " \"Perry, Rick\",\n", 501 | " \"Perry, Rick\",\n", 502 | " 
\"Perry, Rick\",\n", 503 | " \"Perry, Rick\"\n", 504 | " ],\n", 505 | " [\n", 506 | " \"Bachmann, Michelle\",\n", 507 | " \"Bachmann, Michelle\",\n", 508 | " \"Bachmann, Michelle\",\n", 509 | " \"Bachmann, Michelle\",\n", 510 | " \"Bachmann, Michelle\",\n", 511 | " \"Bachmann, Michelle\",\n", 512 | " \"Bachmann, Michelle\",\n", 513 | " \"Bachmann, Michelle\",\n", 514 | " \"Bachmann, Michelle\",\n", 515 | " \"Bachmann, Michelle\",\n", 516 | " ...\n", 517 | " \"Perry, Rick\",\n", 518 | " \"Perry, Rick\",\n", 519 | " \"Perry, Rick\",\n", 520 | " \"Perry, Rick\",\n", 521 | " \"Perry, Rick\",\n", 522 | " \"Perry, Rick\",\n", 523 | " \"Perry, Rick\",\n", 524 | " \"Perry, Rick\",\n", 525 | " \"Perry, Rick\",\n", 526 | " \"Perry, Rick\"\n", 527 | " ],\n", 528 | " [\n", 529 | " \"Bachmann, Michelle\",\n", 530 | " \"Bachmann, Michelle\",\n", 531 | " \"Bachmann, Michelle\",\n", 532 | " \"Bachmann, Michelle\",\n", 533 | " \"Bachmann, Michelle\",\n", 534 | " \"Bachmann, Michelle\",\n", 535 | " \"Bachmann, Michelle\",\n", 536 | " \"Bachmann, Michelle\",\n", 537 | " \"Bachmann, Michelle\",\n", 538 | " \"Bachmann, Michelle\",\n", 539 | " ...\n", 540 | " \"Perry, Rick\",\n", 541 | " \"Perry, Rick\",\n", 542 | " \"Perry, Rick\",\n", 543 | " \"Perry, Rick\",\n", 544 | " \"Perry, Rick\",\n", 545 | " \"Perry, Rick\",\n", 546 | " \"Perry, Rick\",\n", 547 | " \"Perry, Rick\",\n", 548 | " \"Perry, Rick\",\n", 549 | " \"Perry, Rick\"\n", 550 | " ],\n", 551 | " [\n", 552 | " \"Bachmann, Michelle\",\n", 553 | " \"Bachmann, Michelle\",\n", 554 | " \"Bachmann, Michelle\",\n", 555 | " \"Bachmann, Michelle\",\n", 556 | " \"Bachmann, Michelle\",\n", 557 | " \"Bachmann, Michelle\",\n", 558 | " \"Bachmann, Michelle\",\n", 559 | " \"Bachmann, Michelle\",\n", 560 | " \"Bachmann, Michelle\",\n", 561 | " \"Bachmann, Michelle\",\n", 562 | " ...\n", 563 | " \"Perry, Rick\",\n", 564 | " \"Perry, Rick\",\n", 565 | " \"Perry, Rick\",\n", 566 | " \"Perry, Rick\",\n", 567 | " \"Perry, Rick\",\n", 568 | " 
\"Perry, Rick\",\n", 569 | " \"Perry, Rick\",\n", 570 | " \"Perry, Rick\",\n", 571 | " \"Perry, Rick\",\n", 572 | " \"Perry, Rick\"\n", 573 | " ],\n", 574 | " [\n", 575 | " \"Bachmann, Michelle\",\n", 576 | " \"Bachmann, Michelle\",\n", 577 | " \"Bachmann, Michelle\",\n", 578 | " \"Bachmann, Michelle\",\n", 579 | " \"Bachmann, Michelle\",\n", 580 | " \"Bachmann, Michelle\",\n", 581 | " \"Bachmann, Michelle\",\n", 582 | " \"Bachmann, Michelle\",\n", 583 | " \"Bachmann, Michelle\",\n", 584 | " \"Bachmann, Michelle\",\n", 585 | " ...\n", 586 | " \"Perry, Rick\",\n", 587 | " \"Perry, Rick\",\n", 588 | " \"Perry, Rick\",\n", 589 | " \"Perry, Rick\",\n", 590 | " \"Perry, Rick\",\n", 591 | " \"Perry, Rick\",\n", 592 | " \"Perry, Rick\",\n", 593 | " \"Perry, Rick\",\n", 594 | " \"Perry, Rick\",\n", 595 | " \"Perry, Rick\"\n", 596 | " ],\n", 597 | " [\n", 598 | " \"Bachmann, Michelle\",\n", 599 | " \"Bachmann, Michelle\",\n", 600 | " \"Bachmann, Michelle\",\n", 601 | " \"Bachmann, Michelle\",\n", 602 | " \"Bachmann, Michelle\",\n", 603 | " \"Bachmann, Michelle\",\n", 604 | " \"Bachmann, Michelle\",\n", 605 | " \"Bachmann, Michelle\",\n", 606 | " \"Bachmann, Michelle\",\n", 607 | " \"Bachmann, Michelle\",\n", 608 | " ...\n", 609 | " \"Perry, Rick\",\n", 610 | " \"Perry, Rick\",\n", 611 | " \"Perry, Rick\",\n", 612 | " \"Perry, Rick\",\n", 613 | " \"Perry, Rick\",\n", 614 | " \"Perry, Rick\",\n", 615 | " \"Perry, Rick\",\n", 616 | " \"Perry, Rick\",\n", 617 | " \"Perry, Rick\",\n", 618 | " \"Perry, Rick\"\n", 619 | " ],\n", 620 | " [\n", 621 | " \"Bachmann, Michelle\",\n", 622 | " \"Bachmann, Michelle\",\n", 623 | " \"Bachmann, Michelle\",\n", 624 | " \"Bachmann, Michelle\",\n", 625 | " \"Bachmann, Michelle\",\n", 626 | " \"Bachmann, Michelle\",\n", 627 | " \"Bachmann, Michelle\",\n", 628 | " \"Bachmann, Michelle\",\n", 629 | " \"Bachmann, Michelle\",\n", 630 | " \"Bachmann, Michelle\",\n", 631 | " ...\n", 632 | " \"Perry, Rick\",\n", 633 | " \"Perry, Rick\",\n", 634 | " 
\"Perry, Rick\",\n", 635 | " \"Perry, Rick\",\n", 636 | " \"Perry, Rick\",\n", 637 | " \"Perry, Rick\",\n", 638 | " \"Perry, Rick\",\n", 639 | " \"Perry, Rick\",\n", 640 | " \"Perry, Rick\",\n", 641 | " \"Perry, Rick\"\n", 642 | " ],\n", 643 | " [\n", 644 | " \"Bachmann, Michelle\",\n", 645 | " \"Bachmann, Michelle\",\n", 646 | " \"Bachmann, Michelle\",\n", 647 | " \"Bachmann, Michelle\",\n", 648 | " \"Bachmann, Michelle\",\n", 649 | " \"Bachmann, Michelle\",\n", 650 | " \"Bachmann, Michelle\",\n", 651 | " \"Bachmann, Michelle\",\n", 652 | " \"Bachmann, Michelle\",\n", 653 | " \"Bachmann, Michelle\",\n", 654 | " ...\n", 655 | " \"Perry, Rick\",\n", 656 | " \"Perry, Rick\",\n", 657 | " \"Perry, Rick\",\n", 658 | " \"Perry, Rick\",\n", 659 | " \"Perry, Rick\",\n", 660 | " \"Perry, Rick\",\n", 661 | " \"Perry, Rick\",\n", 662 | " \"Perry, Rick\",\n", 663 | " \"Perry, Rick\",\n", 664 | " \"Perry, Rick\"\n", 665 | " ],\n", 666 | " [\n", 667 | " \"Bachmann, Michelle\",\n", 668 | " \"Bachmann, Michelle\",\n", 669 | " \"Bachmann, Michelle\",\n", 670 | " \"Bachmann, Michelle\",\n", 671 | " \"Bachmann, Michelle\",\n", 672 | " \"Bachmann, Michelle\",\n", 673 | " \"Bachmann, Michelle\",\n", 674 | " \"Bachmann, Michelle\",\n", 675 | " \"Bachmann, Michelle\",\n", 676 | " \"Bachmann, Michelle\",\n", 677 | " ...\n", 678 | " \"Perry, Rick\",\n", 679 | " \"Perry, Rick\",\n", 680 | " \"Perry, Rick\",\n", 681 | " \"Perry, Rick\",\n", 682 | " \"Perry, Rick\",\n", 683 | " \"Perry, Rick\",\n", 684 | " \"Perry, Rick\",\n", 685 | " \"Perry, Rick\",\n", 686 | " \"Perry, Rick\",\n", 687 | " \"Perry, Rick\"\n", 688 | " ]\n", 689 | "]" 690 | ] 691 | }, 692 | "execution_count": 24, 693 | "metadata": {}, 694 | "output_type": "execute_result" 695 | } 696 | ], 697 | "source": [ 698 | "t[2]" 699 | ] 700 | }, 701 | { 702 | "cell_type": "code", 703 | "execution_count": 25, 704 | "metadata": {}, 705 | "outputs": [ 706 | { 707 | "data": { 708 | "text/plain": [ 709 | "\n", 710 | "[\n", 711 | " 
\"Bachmann, Michelle\",\n", 712 | " \"Romney, Mitt\",\n", 713 | " \"Obama, Barack\",\n", 714 | " \"Roemer, Charles E. 'Buddy' III\",\n", 715 | " \"Pawlenty, Timothy\",\n", 716 | " \"Johnson, Gary Earl\",\n", 717 | " \"Paul, Ron\",\n", 718 | " \"Santorum, Rick\",\n", 719 | " \"Cain, Herman\",\n", 720 | " \"Gingrich, Newt\",\n", 721 | " \"McCotter, Thaddeus G\",\n", 722 | " \"Huntsman, Jon\",\n", 723 | " \"Perry, Rick\"\n", 724 | "]" 725 | ] 726 | }, 727 | "execution_count": 25, 728 | "metadata": {}, 729 | "output_type": "execute_result" 730 | } 731 | ], 732 | "source": [ 733 | "t[2].unique()" 734 | ] 735 | }, 736 | { 737 | "cell_type": "markdown", 738 | "metadata": {}, 739 | "source": [ 740 | "The amount of allocated memory is unchanged because of memory mapping" 741 | ] 742 | }, 743 | { 744 | "cell_type": "code", 745 | "execution_count": null, 746 | "metadata": {}, 747 | "outputs": [], 748 | "source": [ 749 | "pa.total_allocated_bytes()" 750 | ] 751 | }, 752 | { 753 | "cell_type": "code", 754 | "execution_count": null, 755 | "metadata": {}, 756 | "outputs": [], 757 | "source": [ 758 | "t[0].chunk(5)[1000]" 759 | ] 760 | }, 761 | { 762 | "cell_type": "code", 763 | "execution_count": null, 764 | "metadata": {}, 765 | "outputs": [], 766 | "source": [ 767 | "t[0].chunk(5).buffers()" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": null, 773 | "metadata": {}, 774 | "outputs": [], 775 | "source": [ 776 | "t[0].num_chunks" 777 | ] 778 | }, 779 | { 780 | "cell_type": "code", 781 | "execution_count": null, 782 | "metadata": {}, 783 | "outputs": [], 784 | "source": [] 785 | } 786 | ], 787 | "metadata": { 788 | "kernelspec": { 789 | "display_name": "Python 3", 790 | "language": "python", 791 | "name": "python3" 792 | }, 793 | "language_info": { 794 | "codemirror_mode": { 795 | "name": "ipython", 796 | "version": 3 797 | }, 798 | "file_extension": ".py", 799 | "mimetype": "text/x-python", 800 | "name": "python", 801 | "nbconvert_exporter": "python", 
802 | "pygments_lexer": "ipython3", 803 | "version": "3.7.6" 804 | } 805 | }, 806 | "nbformat": 4, 807 | "nbformat_minor": 2 808 | } 809 | -------------------------------------------------------------------------------- /Demo2-Flight.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pyarrow as pa\n", 10 | "import pyarrow.parquet as pq\n", 11 | "import pyarrow.flight as flight\n", 12 | "import numpy as np\n", 13 | "import pandas as pd\n", 14 | "import time\n", 15 | "import threading\n", 16 | "\n", 17 | "from pyarrow.util import find_free_port" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "# Implement a Flight server in Python\n", 25 | "\n", 26 | "This server has a few goals\n", 27 | "\n", 28 | "* Clients can send (\"put\") datasets, to be kept in memory by the server\n", 29 | "* Clients can request a list of cached datasets (\"list-tables\")\n", 30 | "* Clients can request (\"get\") a cached table\n", 31 | "\n", 32 | "Note that this server is very simple and does not show some of the more sophisticated \"query planning\" capabilities of Arrow Flight, nor does it show parallel or multi-part access. 
My goal is to show you that\n",
    "\n",
    "* It's easy to write a Flight service in Python\n",
    "* The performance of Flight is **very, very good**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "class DemoServer(flight.FlightServerBase):\n",
    "    \"\"\"Minimal in-memory Flight server for this demo.\n",
    "\n",
    "    Uploaded tables are kept in ``self._cache``, keyed by\n",
    "    ``descriptor.command``; ``do_get`` looks a table up by ``ticket.ticket``.\n",
    "    Two custom actions, 'list-tables' and 'drop-table', manage the cache.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self, location):\n",
    "        # Create the cache before initialising the base class.\n",
    "        self._cache = {}\n",
    "        super().__init__(location)\n",
    "\n",
    "    def list_actions(self, context):\n",
    "        \"\"\"Return the ActionType entries for the custom actions.\"\"\"\n",
    "        return [flight.ActionType('list-tables', 'List stored tables'),\n",
    "                flight.ActionType('drop-table', 'Drop a stored table')]\n",
    "\n",
    "    # -----------------------------------------------------------------\n",
    "    # Implement actions\n",
    "\n",
    "    def do_action(self, context, action):\n",
    "        \"\"\"Dispatch ``action`` to its handler, selected by ``action.type``.\n",
    "\n",
    "        Raises NotImplementedError when the action type has no handler.\n",
    "        \"\"\"\n",
    "        handlers = {\n",
    "            'list-tables': self._list_tables,\n",
    "            'drop-table': self._drop_table\n",
    "        }\n",
    "        handler = handlers.get(action.type)\n",
    "        if handler is None:\n",
    "            raise NotImplementedError\n",
    "        return handler(action)\n",
    "\n",
    "    def _drop_table(self, action):\n",
    "        \"\"\"Delete the cached table named by the action body.\n",
    "\n",
    "        Raises KeyError when no table is cached under that name.\n",
    "        \"\"\"\n",
    "        # action.body is an Arrow buffer, but the cache keys are the bytes\n",
    "        # stored by do_put, so convert to bytes before the lookup.\n",
    "        del self._cache[action.body.to_pybytes()]\n",
    "\n",
    "    def _list_tables(self, action):\n",
    "        \"\"\"Return an iterator with one flight.Result per cache key, sorted.\"\"\"\n",
    "        return iter([flight.Result(cache_key)\n",
    "                     for cache_key in sorted(self._cache)])\n",
    "\n",
    "    # -----------------------------------------------------------------\n",
    "    # Implement puts\n",
    "\n",
    "    def do_put(self, context, descriptor, reader, writer):\n",
    "        \"\"\"Read the whole uploaded stream and cache it under the command.\"\"\"\n",
    "        self._cache[descriptor.command] = reader.read_all()\n",
    "\n",
    "    # -----------------------------------------------------------------\n",
    "    # Implement gets\n",
    "\n",
    "    def do_get(self, context, ticket):\n",
    "        \"\"\"Stream back the cached table whose key equals ``ticket.ticket``.\n",
    "\n",
    "        Raises KeyError when no table is cached under that ticket.\n",
    "        \"\"\"\n",
    "        table = self._cache[ticket.ticket]\n",
    "        return flight.RecordBatchStream(table)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
"source": [ 92 | "Some helper utilities, you can ignore this part" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Start server in background, connect client" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "pa.ipc.IpcWriteOptions?" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 117, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "port = 1337\n", 118 | "location = flight.Location.for_grpc_tcp(\"localhost\", find_free_port())\n", 119 | "location\n", 120 | "\n", 121 | "server = DemoServer(location)\n", 122 | "\n", 123 | "thread = threading.Thread(target=lambda: server.serve(), daemon=True)\n", 124 | "thread.start()\n", 125 | "\n", 126 | "class DemoClient:\n", 127 | " \n", 128 | " def __init__(self, location, options=None):\n", 129 | " self.con = flight.connect(location)\n", 130 | " self.con.wait_for_available()\n", 131 | " self.options = options\n", 132 | " \n", 133 | " # Call \"list-tables\" RPC and return results as Python list\n", 134 | " def list_tables(self):\n", 135 | " action = flight.Action('list-tables', b'')\n", 136 | " return [x.body.to_pybytes().decode('utf8') for x in self.con.do_action(action)] \n", 137 | "\n", 138 | " # Send a pyarrow.Table to the server to be cached\n", 139 | " def cache_table_in_server(self, name, table):\n", 140 | " desc = flight.FlightDescriptor.for_command(name.encode('utf8'))\n", 141 | " put_writer, put_meta_reader = self.con.do_put(desc, table.schema,\n", 142 | " options=self.options)\n", 143 | " put_writer.write(table)\n", 144 | " put_writer.close()\n", 145 | "\n", 146 | " # Request a pyarrow.Table by name\n", 147 | " def get_table(self, name):\n", 148 | " reader = self.con.do_get(flight.Ticket(name.encode('utf8')),\n", 149 | " options=self.options)\n", 150 | " return reader.read_all()\n", 151 | "\n", 152 | " def 
list_actions(self):\n", 153 | " return self.con.list_actions()\n", 154 | "\n", 155 | "ipc_options = pa.ipc.IpcWriteOptions(compression='zstd')\n", 156 | "options = flight.FlightCallOptions(write_options=ipc_options)\n", 157 | "client = DemoClient(location, options=options)" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "### Ask server for supported actions" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 118, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "table = pa.table([pa.array([1,2,3,4,5])], names=['f0'])\n", 174 | "client.cache_table_in_server('table1', table)" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 119, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "data": { 184 | "text/plain": [ 185 | "['table1']" 186 | ] 187 | }, 188 | "execution_count": 119, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | } 192 | ], 193 | "source": [ 194 | "client.list_tables()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 120, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "client.cache_table_in_server('table2', table)\n", 204 | "client.cache_table_in_server('table3', table)\n", 205 | "client.cache_table_in_server('table4', table)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 121, 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "data": { 215 | "text/plain": [ 216 | "['table1', 'table2', 'table3', 'table4']" 217 | ] 218 | }, 219 | "execution_count": 121, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "client.list_tables()" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 122, 231 | "metadata": {}, 232 | "outputs": [ 233 | { 234 | "data": { 235 | "text/plain": [ 236 | "pyarrow.Table\n", 237 | "f0: int64" 238 | ] 239 | }, 240 | "execution_count": 122, 
"metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "client.get_table('table1')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Now let's make a much bigger table and test performance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# fec = pd.read_csv('/home/wesm/code/pydata-book/datasets/fec/P00000001-ALL.csv',\n",
    "#                   low_memory=False)\n",
    "# table = pa.table(fec)\n",
    "# pq.write_table(table, 'fec-2012.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep the single-copy table under its own name so the next cell can be\n",
    "# re-run without multiplying the data again.\n",
    "fec_single = pq.read_table('fec-2012.parquet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Ten copies of the table, concatenated into one bigger table.\n",
    "fec_table = pa.concat_tables([fec_single] * 10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {},
   "outputs": [],
   "source": [
    "# How big is it? Serialize to an in-memory IPC stream with the same\n",
    "# write options the client uses (ipc_options) and measure the result.\n",
    "out = pa.BufferOutputStream()\n",
    "with pa.ipc.RecordBatchStreamWriter(out, fec_table.schema,\n",
    "                                    options=ipc_options) as writer:\n",
    "    writer.write(fec_table)\n",
    "num_bytes = len(out.getvalue())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Table is 0.4677470475435257 gigabytes\n"
     ]
    }
   ],
   "source": [
    "print(f'Table is {num_bytes / (1 << 30)} gigabytes')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text":
[ 326 | "CPU times: user 8.6 s, sys: 878 ms, total: 9.48 s\n", 327 | "Wall time: 1.09 s\n" 328 | ] 329 | } 330 | ], 331 | "source": [ 332 | "%%time\n", 333 | "client.cache_table_in_server('fec_table', fec_table)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 128, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "data": { 343 | "text/plain": [ 344 | "['fec_table', 'table1', 'table2', 'table3', 'table4']" 345 | ] 346 | }, 347 | "execution_count": 128, 348 | "metadata": {}, 349 | "output_type": "execute_result" 350 | } 351 | ], 352 | "source": [ 353 | "client.list_tables()" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": 129, 359 | "metadata": {}, 360 | "outputs": [ 361 | { 362 | "name": "stdout", 363 | "output_type": "stream", 364 | "text": [ 365 | "CPU times: user 358 ms, sys: 718 ms, total: 1.08 s\n", 366 | "Wall time: 630 ms\n" 367 | ] 368 | } 369 | ], 370 | "source": [ 371 | "%%time \n", 372 | "fec_table_received = client.get_table('fec_table')" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": null, 378 | "metadata": {}, 379 | "outputs": [], 380 | "source": [] 381 | } 382 | ], 383 | "metadata": { 384 | "kernelspec": { 385 | "display_name": "Python 3", 386 | "language": "python", 387 | "name": "python3" 388 | }, 389 | "language_info": { 390 | "codemirror_mode": { 391 | "name": "ipython", 392 | "version": 3 393 | }, 394 | "file_extension": ".py", 395 | "mimetype": "text/x-python", 396 | "name": "python", 397 | "nbconvert_exporter": "python", 398 | "pygments_lexer": "ipython3", 399 | "version": "3.7.6" 400 | } 401 | }, 402 | "nbformat": 4, 403 | "nbformat_minor": 2 404 | } 405 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apache Arrow @ VLDB 2019 Workshop Materials 2 | 3 | This repo contains the slide deck and Jupyter notebooks from my 2 
hour [Apache 4 | Arrow][1] workshop at VLDB 2019 in Los Angeles, CA. The slides are also 5 | uploaded here for your convenience. 6 | 7 | * **SLIDES**: https://www.slideshare.net/wesm/apache-arrow-workshop-at-vldb-2019-boss-session-169065658 8 | * **Demo 1**: Simple example of writing an Arrow protocol stream, then 9 | memory-mapping and reading it 10 | * **Demo 2**: Implement a simple Arrow Flight service in Python and test 11 | throughput (~1.5 GByte/second over TCP in this demo) 12 | 13 | [1]: https://arrow.apache.org -------------------------------------------------------------------------------- /fec-2012.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wesm/vldb-2019-apache-arrow-workshop/90d1d79de6388d2bdbf1be134bd71acda006e5cf/fec-2012.parquet -------------------------------------------------------------------------------- /slides.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wesm/vldb-2019-apache-arrow-workshop/90d1d79de6388d2bdbf1be134bd71acda006e5cf/slides.pdf --------------------------------------------------------------------------------