├── .gitignore ├── LICENSE ├── README.md ├── blazingsql_demo.ipynb ├── colab_notebooks ├── blazingsql_demo.ipynb ├── federated_query_demo.ipynb ├── graphistry_netflow_demo.ipynb └── vs_pyspark_netflow.ipynb ├── data ├── Music.csv ├── cancer_data_00.csv ├── cancer_data_01.parquet ├── cancer_data_02.csv └── small-chunk2.csv ├── federated_query_demo.ipynb ├── graphistry_netflow_demo.ipynb ├── imgs └── bsql_main.png ├── requirements.txt ├── sample_use_cases ├── csv_to_parquet.ipynb └── python_scripts │ └── csv_to_parquet.py ├── taxi_fare_prediction.ipynb ├── utils ├── blazing_conda_test.ipynb └── env-check.py └── vs_pyspark_netflow.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints/ 2 | *.log 3 | *.csv 4 | *.parquet 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # BlazingSQL Demos 2 | Demo Python notebooks using BlazingSQL with the RAPIDS AI ecosystem. 
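Every notebook here follows the same core pattern: create a `BlazingContext`, register cuDF DataFrames (or files) as SQL tables, and run SQL on the GPU. Below is a minimal sketch of that workflow; it assumes a RAPIDS-capable GPU environment with `blazingsql` and `cudf` installed, and uses the bundled `data/Music.csv` purely for illustration.

```python
import cudf
from blazingsql import BlazingContext

# the BlazingContext stores registered filesystems and tables (akin to a SparkContext)
bc = BlazingContext()

# read a CSV into a GPU DataFrame with cuDF
gdf = cudf.read_csv('data/Music.csv')

# register the DataFrame as a SQL table (no copy is made; only the schema is registered)
bc.create_table('music', gdf)

# run SQL on the GPU; the result comes back as another cuDF DataFrame
print(bc.sql('SELECT * FROM music LIMIT 5'))
```

The notebooks below walk through this pattern end to end, from environment setup to visualization and benchmarking.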
3 | 4 | | Notebook Title | Description |Launch in Colab| 5 | |----------------|----------------|----------------| 6 | | Getting Started | How to set up and get started with BlazingSQL and the RAPIDS AI suite |[![Google Colab Badge](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BlazingDB/bsql-demos/blob/master/colab_notebooks/blazingsql_demo.ipynb)| 7 | | Netflow | Query 65M rows of network security data (netflow) with BlazingSQL and then pass to Graphistry to visualize and interact with the data |[![Google Colab Badge](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BlazingDB/bsql-demos/blob/master/colab_notebooks/graphistry_netflow_demo.ipynb)| 8 | | Taxi | Train a linear regression model with cuML on 55 million rows of public NYC Taxi Data loaded with BlazingSQL |[![Google Colab Badge](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BlazingDB/bsql-demos/blob/master/taxi_fare_prediction.ipynb)| 9 | | BlazingSQL vs. Apache Spark | Analyze 20 million rows of net flow data. Compare BlazingSQL and Apache Spark timings for the same workload |[![Google Colab Badge](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BlazingDB/bsql-demos/blob/master/colab_notebooks/vs_pyspark_netflow.ipynb)| 10 | | Federated Query | In a single query, join an Apache Parquet file, a CSV file, and a GPU DataFrame (GDF) in GPU memory. |[![Google Colab Badge](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/BlazingDB/bsql-demos/blob/master/colab_notebooks/federated_query_demo.ipynb)| -------------------------------------------------------------------------------- /blazingsql_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "8AdUt3HiUrc3" 8 | }, 9 | "source": [ 10 | "# Getting Started with BlazingSQL\n", 11 | "\n", 12 | "[BlazingSQL](https://github.com/BlazingDB/blazingsql) provides an open-source SQL interface to ETL massive datasets directly into GPU memory and the [RAPIDS.ai](https://github.com/rapidsai) Ecosystem. \n", 13 | "\n", 14 | "In this notebook, we will cover how to query cuDF (GPU) DataFrames with BlazingSQL. \n", 15 | "\n", 16 | "To learn more about the GPU DataFrame and how it enables end-to-end workloads on RAPIDS, check out our [blog post](https://blog.blazingdb.com/blazingsql-part-1-the-gpu-dataframe-gdf-and-cudf-in-rapids-ai-96ec15102240).\n", 17 | "\n", 18 | "## Imports" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "import cudf\n", 28 | "from blazingsql import BlazingContext" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": { 34 | "colab_type": "text", 35 | "id": "aMwNKxePSwOp" 36 | }, 37 | "source": [ 38 | "## Connect to BlazingSQL - Create BlazingContext\n", 39 | "You can think of the BlazingContext much like a SparkContext; this is where information such as FileSystems you have registered and Tables you have created will be stored." 
40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": { 46 | "colab": { 47 | "base_uri": "https://localhost:8080/", 48 | "height": 35 49 | }, 50 | "colab_type": "code", 51 | "id": "ZR_vWwtMcvvY", 52 | "outputId": "c78cc40a-f7d8-4ac5-c255-d99edd03b785" 53 | }, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "BlazingContext ready\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "bc = BlazingContext()" 65 | ] 66 | }, 67 | { 68 | "cell_type": "markdown", 69 | "metadata": { 70 | "colab_type": "text", 71 | "id": "N2bqpDEnZyQf" 72 | }, 73 | "source": [ 74 | "## cuDF -> BSQL\n", 75 | "In the next few cells, we'll generate a cuDF DataFrame and create a BlazingSQL table from it. " 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "data": { 85 | "text/html": [ 86 |
\n", 87 | "\n", 100 | "\n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | "
idrankscore
011a
173b
244c
323d
495e
\n", 142 | "
" 143 | ], 144 | "text/plain": [ 145 | " id rank score\n", 146 | "0 1 1 a\n", 147 | "1 7 3 b\n", 148 | "2 4 4 c\n", 149 | "3 2 3 d\n", 150 | "4 9 5 e" 151 | ] 152 | }, 153 | "execution_count": 3, 154 | "metadata": {}, 155 | "output_type": "execute_result" 156 | } 157 | ], 158 | "source": [ 159 | "# generate cuDF DataFrame\n", 160 | "df = cudf.DataFrame()\n", 161 | "\n", 162 | "# add id & value columns\n", 163 | "df['id'] = [1, 7, 4, 2, 9]\n", 164 | "df['rank'] = [1, 3, 4, 3, 5]\n", 165 | "df['score'] = ['a', 'b', 'c', 'd', 'e']\n", 166 | "\n", 167 | "# how's it look?\n", 168 | "df" 169 | ] 170 | }, 171 | { 172 | "cell_type": "markdown", 173 | "metadata": { 174 | "colab_type": "text", 175 | "id": "HJFz-mqZTJ5Z" 176 | }, 177 | "source": [ 178 | "#### Create a Table\n", 179 | "Now we can easily create a table with BlazingContext's `.create_table()` method. " 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 4, 185 | "metadata": {}, 186 | "outputs": [ 187 | { 188 | "data": { 189 | "text/plain": [ 190 | "" 191 | ] 192 | }, 193 | "execution_count": 4, 194 | "metadata": {}, 195 | "output_type": "execute_result" 196 | } 197 | ], 198 | "source": [ 199 | "# BlazingSQL table from DataFrame\n", 200 | "bc.create_table('table_a', df)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": { 206 | "colab_type": "text", 207 | "id": "98HJFrt5TRa0" 208 | }, 209 | "source": [ 210 | "## Query a Table\n", 211 | "We can can now execute SQL queries with `.sql()`, which processes data on GPU and returns results as cuDF DataFrames!" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 5, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/html": [ 222 | "
\n", 223 | "\n", 236 | "\n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | "
idrankscore
011a
173b
\n", 260 | "
" 261 | ], 262 | "text/plain": [ 263 | " id rank score\n", 264 | "0 1 1 a\n", 265 | "1 7 3 b" 266 | ] 267 | }, 268 | "execution_count": 5, 269 | "metadata": {}, 270 | "output_type": "execute_result" 271 | } 272 | ], 273 | "source": [ 274 | "# query everything from the first 2 instances \n", 275 | "bc.sql('select * from table_a LIMIT 2')" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 6, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "data": { 285 | "text/html": [ 286 | "
\n", 287 | "\n", 300 | "\n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | "
count(*)
05
\n", 314 | "
" 315 | ], 316 | "text/plain": [ 317 | " count(*)\n", 318 | "0 5" 319 | ] 320 | }, 321 | "execution_count": 6, 322 | "metadata": {}, 323 | "output_type": "execute_result" 324 | } 325 | ], 326 | "source": [ 327 | "# query table - how many instances are there?\n", 328 | "bc.sql('select count(*) from table_a')" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": 7, 334 | "metadata": { 335 | "colab": { 336 | "base_uri": "https://localhost:8080/", 337 | "height": 1000 338 | }, 339 | "colab_type": "code", 340 | "id": "14GwxmLsTV_p", 341 | "outputId": "144b7601-5363-49f8-d5af-13e80917672c" 342 | }, 343 | "outputs": [ 344 | { 345 | "data": { 346 | "text/html": [ 347 | "
\n", 348 | "\n", 361 | "\n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | "
idrankscore
073b
195e
\n", 385 | "
" 386 | ], 387 | "text/plain": [ 388 | " id rank score\n", 389 | "0 7 3 b\n", 390 | "1 9 5 e" 391 | ] 392 | }, 393 | "execution_count": 7, 394 | "metadata": {}, 395 | "output_type": "execute_result" 396 | } 397 | ], 398 | "source": [ 399 | "# query events with a value of at least 7\n", 400 | "bc.sql('SELECT * FROM table_a WHERE id >= 7')" 401 | ] 402 | }, 403 | { 404 | "cell_type": "markdown", 405 | "metadata": { 406 | "colab_type": "text", 407 | "id": "wygAeTIFTm2X" 408 | }, 409 | "source": [ 410 | "# You're Ready to Rock\n", 411 | "And... thats it! You are now live with BlazingSQL.\n", 412 | "\n", 413 | "\n", 414 | "Check out our [docs](https://docs.blazingdb.com) to get fancy or to learn more about how BlazingSQL works with the rest of [RAPIDS AI](https://rapids.ai/)." 415 | ] 416 | } 417 | ], 418 | "metadata": { 419 | "accelerator": "GPU", 420 | "colab": { 421 | "collapsed_sections": [], 422 | "name": "blazingsql_demo.ipynb", 423 | "provenance": [] 424 | }, 425 | "kernelspec": { 426 | "display_name": "winston@blazingdb.com", 427 | "language": "python", 428 | "name": "condaenv-winston_blazingdb.com" 429 | }, 430 | "language_info": { 431 | "codemirror_mode": { 432 | "name": "ipython", 433 | "version": 3 434 | }, 435 | "file_extension": ".py", 436 | "mimetype": "text/x-python", 437 | "name": "python", 438 | "nbconvert_exporter": "python", 439 | "pygments_lexer": "ipython3", 440 | "version": "3.7.3" 441 | } 442 | }, 443 | "nbformat": 4, 444 | "nbformat_minor": 4 445 | } 446 | -------------------------------------------------------------------------------- /colab_notebooks/blazingsql_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "8AdUt3HiUrc3" 8 | }, 9 | "source": [ 10 | "# Getting Started with BlazingSQL\n", 11 | "\n", 12 | "In this notebook, we will cover: \n", 13 | "- How to set up [BlazingSQL](https://blazingsql.com) and the [RAPIDS AI](https://rapids.ai/) suite.\n", 14 | "- How to read and query csv files with cuDF and BlazingSQL.\n", 15 | "![Impression](https://www.google-analytics.com/collect?v=1&tid=UA-39814657-5&cid=555&t=event&ec=guides&ea=bsql-quick-start-guide&dt=bsql-quick-start-guide)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "colab_type": "text", 22 | "id": "_h26epJpUeZP" 23 | }, 24 | "source": [ 25 | "## Setup\n", 26 | "### Environment Sanity Check \n", 27 | "\n", 28 | "RAPIDS packages (BlazingSQL included) require Pascal+ architecture to run. For Colab, this translates to a T4 GPU instance. \n", 29 | "\n", 30 | "The cell below will let you know what type of GPU you've been allocated, and how to proceed." 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 1, 36 | "metadata": { 37 | "colab": { 38 | "base_uri": "https://localhost:8080/", 39 | "height": 322 40 | }, 41 | "colab_type": "code", 42 | "id": "_lf6yKBoRYGy", 43 | "outputId": "8e9f7e7e-b89f-49bd-fd3c-c435ffb55c9c" 44 | }, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "\n", 51 | "\n", 52 | "***********************************\n", 53 | "GPU = b'Tesla T4'\n", 54 | "Woo! 
You got the right kind of GPU!\n", 55 | "***********************************\n", 56 | "\n", 57 | "\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/colab_env.py\n", 63 | "!python colab_env.py " 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": { 69 | "colab_type": "text", 70 | "id": "xM8xTlqeRi-g" 71 | }, 72 | "source": [ 73 | "## Installs \n", 74 | "The cell below pulls our Google Colab install script from the `bsql-demos` repo, then runs it. The script first installs miniconda, then uses miniconda to install BlazingSQL and RAPIDS AI. This takes a few minutes to run. " 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": { 81 | "colab": {}, 82 | "colab_type": "code", 83 | "id": "gfWF_lG1HqV7" 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/bsql-colab.sh \n", 88 | "!bash bsql-colab.sh\n", 89 | "\n", 90 | "import sys, os, time\n", 91 | "sys.path.append('/usr/local/lib/python3.6/site-packages/')\n", 92 | "os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n", 93 | "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'\n", 94 | "\n", 95 | "import subprocess\n", 96 | "subprocess.Popen(['blazingsql-orchestrator', '9100', '8889', '127.0.0.1', '8890'],stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)\n", 97 | "subprocess.Popen(['java', '-jar', '/usr/local/lib/blazingsql-algebra.jar', '-p', '8890'])\n", 98 | "\n", 99 | "import pyblazing.apiv2.context as cont\n", 100 | "cont.runRal()\n", 101 | "time.sleep(1) " 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": { 107 | "colab_type": "text", 108 | "id": "aMwNKxePSwOp" 109 | }, 110 | "source": [ 111 | "## Import packages and create Blazing Context\n", 112 | "You can think of the BlazingContext much like a Spark Context (i.e. where information such as FileSystems you have registered and Tables you have created will be stored). If you have issues running this cell, restart the runtime and try running it again.\n" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 1, 118 | "metadata": { 119 | "colab": { 120 | "base_uri": "https://localhost:8080/", 121 | "height": 35 122 | }, 123 | "colab_type": "code", 124 | "id": "ZR_vWwtMcvvY", 125 | "outputId": "c78cc40a-f7d8-4ac5-c255-d99edd03b785" 126 | }, 127 | "outputs": [ 128 | { 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "BlazingContext ready\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "import cudf\n", 138 | "from blazingsql import BlazingContext\n", 139 | "# start up BlazingSQL\n", 140 | "bc = BlazingContext()" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": { 146 | "colab_type": "text", 147 | "id": "N2bqpDEnZyQf" 148 | }, 149 | "source": [ 150 | "## Read CSV\n", 151 | "First, we need to download a CSV file. Then we use cuDF to read the CSV file, which gives us a GPU DataFrame (GDF). To learn more about the GDF and how it enables end-to-end workloads on RAPIDS, check out our [blog post](https://blog.blazingdb.com/blazingsql-part-1-the-gpu-dataframe-gdf-and-cudf-in-rapids-ai-96ec15102240)." 
152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 2, 157 | "metadata": { 158 | "colab": { 159 | "base_uri": "https://localhost:8080/", 160 | "height": 204 161 | }, 162 | "colab_type": "code", 163 | "id": "iqRDacOBOg44", 164 | "outputId": "dccb35e0-c284-498b-80b7-8cfc84a7a6a7" 165 | }, 166 | "outputs": [ 167 | { 168 | "name": "stdout", 169 | "output_type": "stream", 170 | "text": [ 171 | "--2020-01-23 02:59:55-- https://s3.amazonaws.com/blazingsql-colab/Music.csv\n", 172 | "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.0.133\n", 173 | "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.0.133|:443... connected.\n", 174 | "HTTP request sent, awaiting response... 200 OK\n", 175 | "Length: 10473 (10K) [text/csv]\n", 176 | "Saving to: ‘Music.csv’\n", 177 | "\n", 178 | "Music.csv 100%[===================>] 10.23K --.-KB/s in 0s \n", 179 | "\n", 180 | "2020-01-23 02:59:55 (190 MB/s) - ‘Music.csv’ saved [10473/10473]\n", 181 | "\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "#Download the test CSV\n", 187 | "!wget 'https://s3.amazonaws.com/blazingsql-colab/Music.csv'" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 3, 193 | "metadata": { 194 | "colab": {}, 195 | "colab_type": "code", 196 | "id": "HhRhj-ZvZygH" 197 | }, 198 | "outputs": [ 199 | { 200 | "data": { 201 | "text/html": [ 202 | "
\n", 203 | "\n", 216 | "\n", 217 | " \n", 218 | " \n", 219 | " \n", 220 | " \n", 221 | " \n", 222 | " \n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | "
ARTISTRATINGYEARLOCATIONFESTIVAL_SET
0Arcade Fire10.02018.0Las Vegas1.0
1Justice10.02018.0Las Vegas1.0
2Florence and The Machine10.02018.0Las Vegas1.0
3Odesza10.02018.0Indio1.0
4Bon Iver10.02017.0Indio1.0
\n", 270 | "
" 271 | ], 272 | "text/plain": [ 273 | " ARTIST RATING YEAR LOCATION FESTIVAL_SET\n", 274 | "0 Arcade Fire 10.0 2018.0 Las Vegas 1.0\n", 275 | "1 Justice 10.0 2018.0 Las Vegas 1.0\n", 276 | "2 Florence and The Machine 10.0 2018.0 Las Vegas 1.0\n", 277 | "3 Odesza 10.0 2018.0 Indio 1.0\n", 278 | "4 Bon Iver 10.0 2017.0 Indio 1.0" 279 | ] 280 | }, 281 | "execution_count": 3, 282 | "metadata": {}, 283 | "output_type": "execute_result" 284 | } 285 | ], 286 | "source": [ 287 | "# like pandas, cudf can simply read the csv\n", 288 | "gdf = cudf.read_csv('Music.csv')\n", 289 | "\n", 290 | "# let's see how it looks\n", 291 | "gdf.head()" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": { 297 | "colab_type": "text", 298 | "id": "HJFz-mqZTJ5Z" 299 | }, 300 | "source": [ 301 | "## Create a Table\n", 302 | "Now we just need to create a table. " 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 4, 308 | "metadata": { 309 | "colab": {}, 310 | "colab_type": "code", 311 | "id": "HJuvtJDYTMyb" 312 | }, 313 | "outputs": [ 314 | { 315 | "data": { 316 | "text/plain": [ 317 | "" 318 | ] 319 | }, 320 | "execution_count": 4, 321 | "metadata": {}, 322 | "output_type": "execute_result" 323 | } 324 | ], 325 | "source": [ 326 | "bc.create_table('music', gdf, header=0)" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": { 332 | "colab_type": "text", 333 | "id": "98HJFrt5TRa0" 334 | }, 335 | "source": [ 336 | "## Query a Table\n", 337 | "That's it! Now when you can write a SQL query the data will get processed on the GPU with BlazingSQL, and the output will be a GPU DataFrame (GDF) inside RAPIDS!" 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 5, 343 | "metadata": { 344 | "colab": { 345 | "base_uri": "https://localhost:8080/", 346 | "height": 1000 347 | }, 348 | "colab_type": "code", 349 | "id": "14GwxmLsTV_p", 350 | "outputId": "144b7601-5363-49f8-d5af-13e80917672c" 351 | }, 352 | "outputs": [ 353 | { 354 | "data": { 355 | "text/html": [ 356 | "
\n", 357 | "\n", 370 | "\n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | "
ARTISTRATINGLOCATION
0Arcade Fire10.0Las Vegas
1Justice10.0Las Vegas
2Florence and The Machine10.0Las Vegas
3Odesza10.0Indio
4Bon Iver10.0Indio
5LA Philharmonic + Sigur Ros10.0LA
6Sigur Ros10.0Malmo
7Arcade Fire10.0Indio
8Escort9.0San Francisco
9Phoenix9.0Berkeley
\n", 442 | "
" 443 | ], 444 | "text/plain": [ 445 | " ARTIST RATING LOCATION\n", 446 | "0 Arcade Fire 10.0 Las Vegas\n", 447 | "1 Justice 10.0 Las Vegas\n", 448 | "2 Florence and The Machine 10.0 Las Vegas\n", 449 | "3 Odesza 10.0 Indio\n", 450 | "4 Bon Iver 10.0 Indio\n", 451 | "5 LA Philharmonic + Sigur Ros 10.0 LA\n", 452 | "6 Sigur Ros 10.0 Malmo\n", 453 | "7 Arcade Fire 10.0 Indio\n", 454 | "8 Escort 9.0 San Francisco\n", 455 | "9 Phoenix 9.0 Berkeley" 456 | ] 457 | }, 458 | "execution_count": 5, 459 | "metadata": {}, 460 | "output_type": "execute_result" 461 | } 462 | ], 463 | "source": [ 464 | "# query 10 events with a rating of at least 7\n", 465 | "gdf = bc.sql('select ARTIST, RATING, LOCATION from music where RATING >= 7 limit 10')\n", 466 | "\n", 467 | "# display GDF (just like pandas)\n", 468 | "gdf" 469 | ] 470 | }, 471 | { 472 | "cell_type": "markdown", 473 | "metadata": { 474 | "colab_type": "text", 475 | "id": "wygAeTIFTm2X" 476 | }, 477 | "source": [ 478 | "# You're Ready to Rock\n", 479 | "And... thats it! You are now live with BlazingSQL.\n", 480 | "\n", 481 | "\n", 482 | "Check out our [docs](https://docs.blazingdb.com) to get fancy or to learn more about how BlazingSQL works with the rest of [RAPIDS AI](https://rapids.ai/)." 483 | ] 484 | } 485 | ], 486 | "metadata": { 487 | "accelerator": "GPU", 488 | "colab": { 489 | "collapsed_sections": [], 490 | "name": "blazingsql_demo.ipynb", 491 | "provenance": [] 492 | }, 493 | "kernelspec": { 494 | "display_name": "Python 3", 495 | "language": "python", 496 | "name": "python3" 497 | }, 498 | "language_info": { 499 | "codemirror_mode": { 500 | "name": "ipython", 501 | "version": 3 502 | }, 503 | "file_extension": ".py", 504 | "mimetype": "text/x-python", 505 | "name": "python", 506 | "nbconvert_exporter": "python", 507 | "pygments_lexer": "ipython3", 508 | "version": "3.6.7" 509 | } 510 | }, 511 | "nbformat": 4, 512 | "nbformat_minor": 4 513 | } 514 | -------------------------------------------------------------------------------- /colab_notebooks/graphistry_netflow_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "kJyD4oSbugE0" 8 | }, 9 | "source": [ 10 | "# Graphistry Netflow Demo\n", 11 | "\n", 12 | "In this example we are taking millions of rows of netflow (network traffic flow) data in order to search for anomalous activity within a network.\n", 13 | "\n", 14 | "In this notebook, we will: \n", 15 | "- Set up [BlazingSQL](https://blazingsql.com) and the [RAPIDS AI](https://rapids.ai/) suite.\n", 16 | "- Query 20M rows of network security data (netflow) with BlazingSQL and then pass to Graphistry to visualize and interact with the data.\n", 17 | "![Impression](https://www.google-analytics.com/collect?v=1&tid=UA-39814657-5&cid=555&t=event&ec=guides&ea=graphistry-netflow-demo&dt=graphistry-netflow-demo)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "## Setup\n", 25 | "### Environment Sanity Check \n", 26 | "\n", 27 | "RAPIDS packages (BlazingSQL included) require Pascal+ architecture to run. For Colab, this translates to a T4 GPU instance. \n", 28 | "\n", 29 | "The cell below will let you know what type of GPU you've been allocated, and how to proceed." 
30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 1, 35 | "metadata": { 36 | "colab": { 37 | "base_uri": "https://localhost:8080/", 38 | "height": 312 39 | }, 40 | "colab_type": "code", 41 | "id": "zxhxwrfI7aoT", 42 | "outputId": "0880eafa-a0b1-4f39-d3dc-bab9d4e8b127" 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "\n", 50 | "\n", 51 | "***********************************\n", 52 | "GPU = b'Tesla T4'\n", 53 | "Woo! You got the right kind of GPU!\n", 54 | "***********************************\n", 55 | "\n", 56 | "\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/colab_env.py\n", 62 | "!python colab_env.py " 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": {}, 68 | "source": [ 69 | "## Installs \n", 70 | "The cell below pulls our Google Colab install script from the `bsql-demos` repo then runs it. The script first installs miniconda, then uses miniconda to install BlazingSQL and RAPIDS AI. This takes a few minutes to run. " 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 3, 76 | "metadata": { 77 | "colab": { 78 | "base_uri": "https://localhost:8080/", 79 | "height": 35 80 | }, 81 | "colab_type": "code", 82 | "id": "a7RprJxtZZtQ", 83 | "outputId": "5ed256e4-93ee-4295-914d-c5c75c9d6059" 84 | }, 85 | "outputs": [], 86 | "source": [ 87 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/bsql-colab.sh \n", 88 | "!bash bsql-colab.sh\n", 89 | "\n", 90 | "import sys, os, time\n", 91 | "sys.path.append('/usr/local/lib/python3.6/site-packages/')\n", 92 | "os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n", 93 | "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'\n", 94 | "\n", 95 | "import subprocess\n", 96 | "subprocess.Popen(['blazingsql-orchestrator', '9100', '8889', '127.0.0.1', '8890'],stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)\n", 97 | "subprocess.Popen(['java', '-jar', '/usr/local/lib/blazingsql-algebra.jar', '-p', '8890'])\n", 98 | "\n", 99 | "import pyblazing.apiv2.context as cont\n", 100 | "cont.runRal()\n", 101 | "time.sleep(1) \n", 102 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/colab_env.py\n", 103 | "!python colab_env.py " 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": { 109 | "colab_type": "text", 110 | "id": "4guM6G87ul8e" 111 | }, 112 | "source": [ 113 | "## Download CSV\n", 114 | "\n", 115 | "The cell below will download the data for this demo from AWS and store it locally as `nf-chunk2.csv`. " 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": null, 121 | "metadata": { 122 | "colab": { 123 | "base_uri": "https://localhost:8080/", 124 | "height": 208 125 | }, 126 | "colab_type": "code", 127 | "id": "F6teFkVGufUf", 128 | "outputId": "42fedd97-8baf-4d1a-ea41-95602cd8cb11" 129 | }, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "--2019-08-23 21:43:50-- https://blazingsql-colab.s3.amazonaws.com/netflow_data/nf-chunk2.csv\n", 136 | "Resolving blazingsql-colab.s3.amazonaws.com (blazingsql-colab.s3.amazonaws.com)... 52.216.137.76\n", 137 | "Connecting to blazingsql-colab.s3.amazonaws.com (blazingsql-colab.s3.amazonaws.com)|52.216.137.76|:443... connected.\n", 138 | "HTTP request sent, awaiting response... 
200 OK\n", 139 | "Length: 2725056295 (2.5G) [text/csv]\n", 140 | "Saving to: ‘nf-chunk2.csv’\n", 141 | "\n", 142 | "nf-chunk2.csv 100%[===================>] 2.54G 49.2MB/s in 56s \n", 143 | "\n", 144 | "2019-08-23 21:44:46 (46.2 MB/s) - ‘nf-chunk2.csv’ saved [2725056295/2725056295]\n", 145 | "\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "!wget https://blazingsql-colab.s3.amazonaws.com/netflow_data/nf-chunk2.csv" 151 | ] 152 | }, 153 | { 154 | "cell_type": "markdown", 155 | "metadata": {}, 156 | "source": [ 157 | "## Blazing Context\n", 158 | "Here we are importing cuDF and BlazingContext. You can think of the BlazingContext much like a Spark Context (i.e. where information such as FileSystems you have registered and Tables you have created will be stored). If you have issues running this cell, restart the runtime and try running it again." 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 1, 164 | "metadata": { 165 | "colab": { 166 | "base_uri": "https://localhost:8080/", 167 | "height": 69 168 | }, 169 | "colab_type": "code", 170 | "id": "pqQ8lqL8vb-8", 171 | "outputId": "4e5ebc46-6319-4d3a-851c-7d6a2ac2825d" 172 | }, 173 | "outputs": [ 174 | { 175 | "name": "stdout", 176 | "output_type": "stream", 177 | "text": [ 178 | "BlazingContext ready\n" 179 | ] 180 | } 181 | ], 182 | "source": [ 183 | "from blazingsql import BlazingContext\n", 184 | "import cudf\n", 185 | "# start up BlazingSQL\n", 186 | "bc = BlazingContext()" 187 | ] 188 | }, 189 | { 190 | "cell_type": "markdown", 191 | "metadata": { 192 | "colab_type": "text", 193 | "id": "yp7z8bfivbna" 194 | }, 195 | "source": [ 196 | "### Load & Query Tables\n", 197 | "\n", 198 | "In the cell below, we are first loading the CSV file into a GPU DataFrame (gdf), and then creating tables so that we can run SQL queries on those GDFs. \n", 199 | "\n", 200 | "Note: when you create a table off of a GDF there is no copy; it merely registers the schema." 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 2, 206 | "metadata": { 207 | "colab": {}, 208 | "colab_type": "code", 209 | "id": "lU-2wlwQntnq" 210 | }, 211 | "outputs": [ 212 | { 213 | "data": { 214 | "text/plain": [ 215 | "" 216 | ] 217 | }, 218 | "execution_count": 2, 219 | "metadata": {}, 220 | "output_type": "execute_result" 221 | } 222 | ], 223 | "source": [ 224 | "# Load CSVs into GPU DataFrames (gdf)\n", 225 | "netflow_gdf = cudf.read_csv('nf-chunk2.csv')\n", 226 | "\n", 227 | "# Create BlazingSQL Tables - There is no copy in this process\n", 228 | "bc.create_table('netflow', netflow_gdf)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": { 234 | "colab_type": "text", 235 | "id": "cgivbut9df-R" 236 | }, 237 | "source": [ 238 | "#### Query\n", 239 | "With the table made, we can simply run a SQL query.\n", 240 | "\n", 241 | "We are going to run some aggregations in order to condense these millions of rows into thousands of rows that represent nodes and edges." 
242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 3, 247 | "metadata": { 248 | "colab": { 249 | "base_uri": "https://localhost:8080/", 250 | "height": 277 251 | }, 252 | "colab_type": "code", 253 | "id": "umBG2Tp0wbQx", 254 | "outputId": "b89e3666-f85a-40e9-e7c4-cda9a80b7fe5" 255 | }, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "CPU times: user 32.3 ms, sys: 453 µs, total: 32.8 ms\n", 262 | "Wall time: 438 ms\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "%%time\n", 268 | "# make a query\n", 269 | "query = '''\n", 270 | " SELECT\n", 271 | " a.firstSeenSrcIp as source,\n", 272 | " a.firstSeenDestIp as destination,\n", 273 | " count(a.firstSeenDestPort) as targetPorts,\n", 274 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n", 275 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n", 276 | " SUM(a.durationSeconds) as durationSeconds,\n", 277 | " MIN(parsedDate) as firstFlowDate,\n", 278 | " MAX(parsedDate) as lastFlowDate,\n", 279 | " COUNT(*) as attemptCount\n", 280 | " FROM\n", 281 | " netflow a\n", 282 | " GROUP BY\n", 283 | " a.firstSeenSrcIp,\n", 284 | " a.firstSeenDestIp\n", 285 | " '''\n", 286 | "\n", 287 | "# query the table\n", 288 | "gdf = bc.sql(query)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 4, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "data": { 298 | "text/html": [ 299 | "
\n", 300 | "\n", 313 | "\n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | "
sourcedestinationtargetPortsbytesOutbytesIndurationSecondsfirstFlowDatelastFlowDateattemptCount
0172.10.1.226239.255.255.2503875062013-04-03 06:36:442013-04-03 06:36:513
1172.30.1.200239.255.255.250922750122013-04-03 06:35:522013-04-03 06:43:269
2172.30.1.225172.0.0.11909002013-04-03 06:36:142013-04-03 06:36:141
3172.30.1.46239.255.255.2501740250182013-04-03 06:35:222013-04-03 06:44:0817
4172.20.2.71239.255.255.2503875062013-04-03 06:37:112013-04-03 06:37:183
5172.10.1.233172.0.0.1118018002013-04-03 06:36:452013-04-03 06:36:451
6172.30.1.10210.0.0.10145463302013-04-03 06:48:052013-04-03 06:48:051
7172.20.1.39239.255.255.2501525062013-04-03 06:36:592013-04-03 06:36:591
8172.10.1.96172.0.0.11180002013-04-03 06:36:212013-04-03 06:36:211
9172.20.1.2239.255.255.250193675062013-04-03 06:36:502013-04-03 06:36:5919
\n", 451 | "
" 452 | ], 453 | "text/plain": [ 454 | " source destination targetPorts bytesOut bytesIn \\\n", 455 | "0 172.10.1.226 239.255.255.250 3 875 0 \n", 456 | "1 172.30.1.200 239.255.255.250 9 2275 0 \n", 457 | "2 172.30.1.225 172.0.0.1 1 90 90 \n", 458 | "3 172.30.1.46 239.255.255.250 17 4025 0 \n", 459 | "4 172.20.2.71 239.255.255.250 3 875 0 \n", 460 | "5 172.10.1.233 172.0.0.1 1 180 180 \n", 461 | "6 172.30.1.102 10.0.0.10 1 454 633 \n", 462 | "7 172.20.1.39 239.255.255.250 1 525 0 \n", 463 | "8 172.10.1.96 172.0.0.1 1 180 0 \n", 464 | "9 172.20.1.2 239.255.255.250 19 3675 0 \n", 465 | "\n", 466 | " durationSeconds firstFlowDate lastFlowDate attemptCount \n", 467 | "0 6 2013-04-03 06:36:44 2013-04-03 06:36:51 3 \n", 468 | "1 12 2013-04-03 06:35:52 2013-04-03 06:43:26 9 \n", 469 | "2 0 2013-04-03 06:36:14 2013-04-03 06:36:14 1 \n", 470 | "3 18 2013-04-03 06:35:22 2013-04-03 06:44:08 17 \n", 471 | "4 6 2013-04-03 06:37:11 2013-04-03 06:37:18 3 \n", 472 | "5 0 2013-04-03 06:36:45 2013-04-03 06:36:45 1 \n", 473 | "6 0 2013-04-03 06:48:05 2013-04-03 06:48:05 1 \n", 474 | "7 6 2013-04-03 06:36:59 2013-04-03 06:36:59 1 \n", 475 | "8 0 2013-04-03 06:36:21 2013-04-03 06:36:21 1 \n", 476 | "9 6 2013-04-03 06:36:50 2013-04-03 06:36:59 19 " 477 | ] 478 | }, 479 | "execution_count": 4, 480 | "metadata": {}, 481 | "output_type": "execute_result" 482 | } 483 | ], 484 | "source": [ 485 | "# how's the dataframe look?\n", 486 | "gdf.head(10)" 487 | ] 488 | } 489 | ], 490 | "metadata": { 491 | "file_extension": ".py", 492 | "kernelspec": { 493 | "display_name": "Python 3", 494 | "language": "python", 495 | "name": "python3" 496 | }, 497 | "language_info": { 498 | "codemirror_mode": { 499 | "name": "ipython", 500 | "version": 3 501 | }, 502 | "file_extension": ".py", 503 | "mimetype": "text/x-python", 504 | "name": "python", 505 | "nbconvert_exporter": "python", 506 | "pygments_lexer": "ipython3", 507 | "version": "3.6.7" 508 | }, 509 | "mimetype": "text/x-python", 510 | "name": "python", 511 | "npconvert_exporter": "python", 512 | "pygments_lexer": "ipython3", 513 | "version": 3 514 | }, 515 | "nbformat": 4, 516 | "nbformat_minor": 2 517 | } 518 | -------------------------------------------------------------------------------- /colab_notebooks/vs_pyspark_netflow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "d0hJ4z8rBOFC" 8 | }, 9 | "source": [ 10 | "# BlazingSQL vs. Apache Spark \n", 11 | "\n", 12 | "Below we have one of our popular workloads running with [BlazingSQL](https://blazingsql.com/) + [RAPIDS AI](https://rapids.ai) and then running the entire ETL phase again, only this time with Apache Spark + PySpark.\n", 13 | "\n", 14 | "In this notebook, we will cover: \n", 15 | "- How to set up BlazingSQL and the RAPIDS AI suite in Google Colab.\n", 16 | "- How to read and query csv files with cuDF and BlazingSQL.\n", 17 | "- How BlazingSQL compares against Apache Spark (analyzing over 20M records).\n", 18 | "\n", 19 | "![Impression](https://www.google-analytics.com/collect?v=1&tid=UA-39814657-5&cid=555&t=event&ec=guides&ea=bsql_vs_spark&dt=bsql_vs_spark)" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": { 25 | "colab_type": "text", 26 | "id": "kJyD4oSbugE0" 27 | }, 28 | "source": [ 29 | "## Setup\n", 30 | "### Environment Sanity Check \n", 31 | "\n", 32 | "RAPIDS packages (BlazingSQL included) require Pascal+ architecture to run. 
For Colab, this translates to a T4 GPU instance. \n", 33 | "\n", 34 | "The cell below will let you know what type of GPU you've been allocated, and how to proceed." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 14, 40 | "metadata": { 41 | "colab": { 42 | "base_uri": "https://localhost:8080/", 43 | "height": 35 44 | }, 45 | "colab_type": "code", 46 | "id": "QzVzojZ7tc9a", 47 | "outputId": "1c412c49-59fd-482b-83dc-1764af8fda12" 48 | }, 49 | "outputs": [ 50 | { 51 | "name": "stdout", 52 | "output_type": "stream", 53 | "text": [ 54 | "\n", 55 | "\n", 56 | "***********************************\n", 57 | "GPU = b'Tesla T4'\n", 58 | "Woo! You got the right kind of GPU!\n", 59 | "***********************************\n", 60 | "\n", 61 | "\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/colab_env.py\n", 67 | "!python colab_env.py " 68 | ] 69 | }, 70 | { 71 | "cell_type": "markdown", 72 | "metadata": { 73 | "colab": {}, 74 | "colab_type": "code", 75 | "id": "btG1BbSA1nLu" 76 | }, 77 | "source": [ 78 | "## Installs \n", 79 | "The cell below pulls our Google Colab install script from the `bsql-demos` repo then runs it. The script first installs miniconda, then uses miniconda to install BlazingSQL and RAPIDS AI. This takes a few minutes to run. " 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/bsql-colab.sh \n", 89 | "!bash bsql-colab.sh\n", 90 | "\n", 91 | "import sys, os, time\n", 92 | "sys.path.append('/usr/local/lib/python3.6/site-packages/')\n", 93 | "os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n", 94 | "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'\n", 95 | "\n", 96 | "import subprocess\n", 97 | "subprocess.Popen(['blazingsql-orchestrator', '9100', '8889', '127.0.0.1', '8890'],stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)\n", 98 | "subprocess.Popen(['java', '-jar', '/usr/local/lib/blazingsql-algebra.jar', '-p', '8890'])\n", 99 | "\n", 100 | "import pyblazing.apiv2.context as cont\n", 101 | "cont.runRal()\n", 102 | "time.sleep(1) " 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": { 108 | "colab_type": "text", 109 | "id": "0guvG6Ws_zmX" 110 | }, 111 | "source": [ 112 | "## Import packages and create Blazing Context\n", 113 | "You can think of the BlazingContext much like a Spark Context (i.e. where information such as FileSystems you have registered and Tables you have created will be stored). If you have issues running this cell, restart runtime and try running it again." 
114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 1, 119 | "metadata": { 120 | "colab": { 121 | "base_uri": "https://localhost:8080/", 122 | "height": 35 123 | }, 124 | "colab_type": "code", 125 | "id": "ojm_V-WAtz0f", 126 | "outputId": "a46625f4-1494-4a13-eb13-2f38efd80ccf" 127 | }, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "BlazingContext ready\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "from blazingsql import BlazingContext\n", 139 | "import cudf\n", 140 | "# start up BlazingSQL\n", 141 | "bc = BlazingContext()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": { 147 | "colab_type": "text", 148 | "id": "yp7z8bfivbna" 149 | }, 150 | "source": [ 151 | "### Load & Query Table\n", 152 | "First, we need to download the netflow data (20 million records) from AWS." 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": { 159 | "colab": {}, 160 | "colab_type": "code", 161 | "id": "2dAt6DfG37KH" 162 | }, 163 | "outputs": [], 164 | "source": [ 165 | "# takes a few minutes to download\n", 166 | "!wget https://blazingsql-colab.s3.amazonaws.com/netflow_data/nf-chunk2.csv" 167 | ] 168 | }, 169 | { 170 | "cell_type": "markdown", 171 | "metadata": { 172 | "colab_type": "text", 173 | "id": "OTEaAsp2_zmf" 174 | }, 175 | "source": [ 176 | "#### BlazingSQL + cuDF \n", 177 | "Data in hand, we can test the performance of cuDF and BlazingSQL on this dataset." 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 2, 183 | "metadata": { 184 | "colab": { 185 | "base_uri": "https://localhost:8080/", 186 | "height": 52 187 | }, 188 | "colab_type": "code", 189 | "id": "rirBsYQU3NH5", 190 | "outputId": "51ced2b1-b930-4173-bbfa-09672e751d3f" 191 | }, 192 | "outputs": [ 193 | { 194 | "name": "stdout", 195 | "output_type": "stream", 196 | "text": [ 197 | "CPU times: user 138 ms, sys: 142 ms, total: 280 ms\n", 198 | "Wall time: 304 ms\n" 199 | ] 200 | } 201 | ], 202 | "source": [ 203 | "%%time\n", 204 | "# Load CSVs into GPU DataFrames (GDF)\n", 205 | "netflow_gdf = cudf.read_csv('nf-chunk2.csv')" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 3, 211 | "metadata": { 212 | "colab": { 213 | "base_uri": "https://localhost:8080/", 214 | "height": 52 215 | }, 216 | "colab_type": "code", 217 | "id": "zCzLEFfB3N4k", 218 | "outputId": "10ff9097-2736-423e-969d-de75983fbdda" 219 | }, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "CPU times: user 27.5 ms, sys: 747 µs, total: 28.2 ms\n", 226 | "Wall time: 55.9 ms\n" 227 | ] 228 | }, 229 | { 230 | "data": { 231 | "text/plain": [ 232 | "" 233 | ] 234 | }, 235 | "execution_count": 3, 236 | "metadata": {}, 237 | "output_type": "execute_result" 238 | } 239 | ], 240 | "source": [ 241 | "%%time\n", 242 | "# Create BlazingSQL table from GDF - There is no copy in this process\n", 243 | "bc.create_table('netflow', netflow_gdf)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": 4, 249 | "metadata": { 250 | "colab": { 251 | "base_uri": "https://localhost:8080/", 252 | "height": 295 253 | }, 254 | "colab_type": "code", 255 | "id": "umBG2Tp0wbQx", 256 | "outputId": "0975395e-7f5b-4244-afa3-45c8658ce61c" 257 | }, 258 | "outputs": [ 259 | { 260 | "name": "stdout", 261 | "output_type": "stream", 262 | "text": [ 263 | "CPU times: user 30.8 ms, sys: 0 ns, total: 30.8 ms\n", 264 | "Wall time: 429 
ms\n" 265 | ] 266 | } 267 | ], 268 | "source": [ 269 | "%%time\n", 270 | "# make a query\n", 271 | "query = '''\n", 272 | " SELECT\n", 273 | " a.firstSeenSrcIp as source,\n", 274 | " a.firstSeenDestIp as destination,\n", 275 | " count(a.firstSeenDestPort) as targetPorts,\n", 276 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n", 277 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n", 278 | " SUM(a.durationSeconds) as durationSeconds,\n", 279 | " MIN(parsedDate) as firstFlowDate,\n", 280 | " MAX(parsedDate) as lastFlowDate,\n", 281 | " COUNT(*) as attemptCount\n", 282 | " FROM\n", 283 | " netflow a\n", 284 | " GROUP BY\n", 285 | " a.firstSeenSrcIp,\n", 286 | " a.firstSeenDestIp\n", 287 | " '''\n", 288 | "\n", 289 | "# query the table\n", 290 | "gdf = bc.sql(query)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 5, 296 | "metadata": { 297 | "colab": {}, 298 | "colab_type": "code", 299 | "id": "48_W2v8q_zmq", 300 | "outputId": "db0394f1-e082-49b0-c477-e3bba8d3d0f4" 301 | }, 302 | "outputs": [ 303 | { 304 | "data": { 305 | "text/html": [ 306 | "
\n", 307 | "\n", 320 | "\n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | "
sourcedestinationtargetPortsbytesOutbytesIndurationSecondsfirstFlowDatelastFlowDateattemptCount
0172.30.2.48172.0.0.1190002013-04-03 06:36:092013-04-03 06:36:091
1172.10.2.81239.255.255.250142800062013-04-03 06:36:412013-04-03 06:36:4814
2172.30.2.58172.0.0.1190002013-04-03 06:36:092013-04-03 06:36:091
3172.30.1.17110.0.0.13145463302013-04-03 06:48:022013-04-03 06:48:021
4172.30.1.1710.0.0.7145363202013-04-03 06:47:562013-04-03 06:47:561
\n", 398 | "
" 399 | ], 400 | "text/plain": [ 401 | " source destination targetPorts bytesOut bytesIn \\\n", 402 | "0 172.30.2.48 172.0.0.1 1 90 0 \n", 403 | "1 172.10.2.81 239.255.255.250 14 2800 0 \n", 404 | "2 172.30.2.58 172.0.0.1 1 90 0 \n", 405 | "3 172.30.1.171 10.0.0.13 1 454 633 \n", 406 | "4 172.30.1.17 10.0.0.7 1 453 632 \n", 407 | "\n", 408 | " durationSeconds firstFlowDate lastFlowDate attemptCount \n", 409 | "0 0 2013-04-03 06:36:09 2013-04-03 06:36:09 1 \n", 410 | "1 6 2013-04-03 06:36:41 2013-04-03 06:36:48 14 \n", 411 | "2 0 2013-04-03 06:36:09 2013-04-03 06:36:09 1 \n", 412 | "3 0 2013-04-03 06:48:02 2013-04-03 06:48:02 1 \n", 413 | "4 0 2013-04-03 06:47:56 2013-04-03 06:47:56 1 " 414 | ] 415 | }, 416 | "execution_count": 5, 417 | "metadata": {}, 418 | "output_type": "execute_result" 419 | } 420 | ], 421 | "source": [ 422 | "# how's it look?\n", 423 | "gdf.head()" 424 | ] 425 | }, 426 | { 427 | "cell_type": "markdown", 428 | "metadata": { 429 | "colab_type": "text", 430 | "id": "6PXbjW1hTxrD" 431 | }, 432 | "source": [ 433 | "## Apache Spark\n", 434 | "The cell below installs Apache Spark ([PySpark](https://spark.apache.org/docs/latest/api/python/index.html))." 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 4, 440 | "metadata": { 441 | "colab": {}, 442 | "colab_type": "code", 443 | "id": "pnEEvVEtT8xi" 444 | }, 445 | "outputs": [], 446 | "source": [ 447 | "# Note: This installs Spark (version 2.4.1, as tested in Jan 2020)\n", 448 | "!pip install pyspark" 449 | ] 450 | }, 451 | { 452 | "cell_type": "markdown", 453 | "metadata": { 454 | "colab_type": "text", 455 | "id": "W3-XmZkz_zmw" 456 | }, 457 | "source": [ 458 | "#### PyBlazing vs PySpark\n", 459 | "With everything installed we can launch a SparkSession and see how BlazingSQL stacks up." 
460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 6, 465 | "metadata": { 466 | "colab": { 467 | "base_uri": "https://localhost:8080/", 468 | "height": 51 469 | }, 470 | "colab_type": "code", 471 | "id": "nioEt2MqT9B0", 472 | "outputId": "f75b9823-5dbd-45b1-9282-562d3d6ddaf0" 473 | }, 474 | "outputs": [ 475 | { 476 | "name": "stdout", 477 | "output_type": "stream", 478 | "text": [ 479 | "CPU times: user 50.2 ms, sys: 12.9 ms, total: 63.1 ms\n", 480 | "Wall time: 3.88 s\n" 481 | ] 482 | } 483 | ], 484 | "source": [ 485 | "%%time\n", 486 | "# I copied this cell's snippet from another Google Colab by Luca Canali here: https://colab.research.google.com/github/LucaCanali/sparkMeasure/blob/master/examples/SparkMeasure_Jupyter_Colab_Example.ipynb\n", 487 | "\n", 488 | "from pyspark.sql import SparkSession\n", 489 | "\n", 490 | "# Create Spark Session\n", 491 | "# This example uses a local cluster, you can modify master to use YARN or K8S if available \n", 492 | "# This example downloads sparkMeasure 0.13 for scala 2_11 from maven central\n", 493 | "\n", 494 | "spark = SparkSession \\\n", 495 | " .builder \\\n", 496 | " .master(\"local[*]\") \\\n", 497 | " .appName(\"PySpark Netflow Benchmark code\") \\\n", 498 | " .config(\"spark.jars.packages\",\"ch.cern.sparkmeasure:spark-measure_2.11:0.13\") \\\n", 499 | " .getOrCreate()" 500 | ] 501 | }, 502 | { 503 | "cell_type": "markdown", 504 | "metadata": { 505 | "colab_type": "text", 506 | "id": "G8XSppQiUdLY" 507 | }, 508 | "source": [ 509 | "### Load & Query Table" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": 5, 515 | "metadata": { 516 | "colab": { 517 | "base_uri": "https://localhost:8080/", 518 | "height": 51 519 | }, 520 | "colab_type": "code", 521 | "id": "ZSLuSYSOUDtf", 522 | "outputId": "2b93169b-63c5-4c46-da14-af87645bf51b" 523 | }, 524 | "outputs": [ 525 | { 526 | "name": "stdout", 527 | "output_type": "stream", 528 | "text": [ 529 | "CPU times: user 2.73 ms, sys: 0 ns, total: 2.73 ms\n", 530 | "Wall time: 2.91 s\n" 531 | ] 532 | } 533 | ], 534 | "source": [ 535 | "%%time\n", 536 | "# load CSV into Spark\n", 537 | "netflow_df = spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('nf-chunk2.csv')" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 6, 543 | "metadata": { 544 | "colab": { 545 | "base_uri": "https://localhost:8080/", 546 | "height": 51 547 | }, 548 | "colab_type": "code", 549 | "id": "iT3BwLn8UDwE", 550 | "outputId": "4eeff800-489f-4230-adb9-f3a1c16ede66" 551 | }, 552 | "outputs": [ 553 | { 554 | "name": "stdout", 555 | "output_type": "stream", 556 | "text": [ 557 | "CPU times: user 1.06 ms, sys: 611 µs, total: 1.67 ms\n", 558 | "Wall time: 120 ms\n" 559 | ] 560 | } 561 | ], 562 | "source": [ 563 | "%%time\n", 564 | "# create table for querying\n", 565 | "netflow_df.createOrReplaceTempView('netflow')" 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": 7, 571 | "metadata": { 572 | "colab": { 573 | "base_uri": "https://localhost:8080/", 574 | "height": 493 575 | }, 576 | "colab_type": "code", 577 | "id": "9SBhahA5UD2k", 578 | "outputId": "accc1938-6470-44df-ab7f-70058c755b2b" 579 | }, 580 | "outputs": [ 581 | { 582 | "name": "stdout", 583 | "output_type": "stream", 584 | "text": [ 585 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n", 586 | "| source| 
destination|targetPorts|bytesOut|bytesIn|durationSeconds| firstFlowDate| lastFlowDate|attemptCount|\n", 587 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n", 588 | "| 172.10.1.13|239.255.255.250| 15| 2975| 0| 6|2013-04-03 06:36:19|2013-04-03 06:36:27| 15|\n", 589 | "|172.10.1.232| 172.0.0.1| 1| 180| 180| 0|2013-04-03 06:36:45|2013-04-03 06:36:45| 1|\n", 590 | "|172.10.1.238|239.255.255.250| 2| 700| 0| 6|2013-04-03 06:36:44|2013-04-03 06:36:51| 2|\n", 591 | "| 172.10.1.35| 172.0.0.1| 1| 270| 0| 0|2013-04-03 06:36:21|2013-04-03 06:36:21| 1|\n", 592 | "|172.10.2.137| 172.0.0.1| 1| 90| 90| 0|2013-04-03 06:36:42|2013-04-03 06:36:42| 1|\n", 593 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n", 594 | "only showing top 5 rows\n", 595 | "\n", 596 | "CPU times: user 1.5 ms, sys: 861 µs, total: 2.36 ms\n", 597 | "Wall time: 1.14 s\n" 598 | ] 599 | } 600 | ], 601 | "source": [ 602 | "%%time\n", 603 | "# make a query\n", 604 | "query = '''\n", 605 | " SELECT\n", 606 | " a.firstSeenSrcIp as source,\n", 607 | " a.firstSeenDestIp as destination,\n", 608 | " count(a.firstSeenDestPort) as targetPorts,\n", 609 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n", 610 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n", 611 | " SUM(a.durationSeconds) as durationSeconds,\n", 612 | " MIN(parsedDate) as firstFlowDate,\n", 613 | " MAX(parsedDate) as lastFlowDate,\n", 614 | " COUNT(*) as attemptCount\n", 615 | " FROM\n", 616 | " netflow a\n", 617 | " GROUP BY\n", 618 | " a.firstSeenSrcIp,\n", 619 | " a.firstSeenDestIp\n", 620 | " '''\n", 621 | "\n", 622 | "# query with Spark\n", 623 | "edges_df = spark.sql(query)\n", 624 | "\n", 625 | "# set/display results\n", 626 | "edges_df.show(5)" 627 | ] 628 | } 629 | ], 630 | "metadata": { 631 | "accelerator": "GPU", 632 | "colab": { 633 | "collapsed_sections": [], 634 | "name": "vs_pyspark_netflow.ipynb", 635 | "provenance": [], 636 | "toc_visible": true 637 | }, 638 | "kernelspec": { 639 | "display_name": "Python 3", 640 | "language": "python", 641 | "name": "python3" 642 | }, 643 | "language_info": { 644 | "codemirror_mode": { 645 | "name": "ipython", 646 | "version": 3 647 | }, 648 | "file_extension": ".py", 649 | "mimetype": "text/x-python", 650 | "name": "python", 651 | "nbconvert_exporter": "python", 652 | "pygments_lexer": "ipython3", 653 | "version": "3.6.7" 654 | } 655 | }, 656 | "nbformat": 4, 657 | "nbformat_minor": 2 658 | } 659 | -------------------------------------------------------------------------------- /data/Music.csv: -------------------------------------------------------------------------------- 1 | ARTIST,RATING,YEAR,LOCATION,FESTIVAL_SET 2 | Arcade Fire,10,2018,Las Vegas,1 3 | Justice,10,2018,Las Vegas,1 4 | Florence and The Machine,10,2018,Las Vegas,1 5 | Odesza,10,2018,Indio,1 6 | Bon Iver,10,2017,Indio,1 7 | LA Philharmonic + Sigur Ros,10,2017,LA,0 8 | Sigur Ros,10,2014,Malmo,0 9 | Arcade Fire,10,2014,Indio,1 10 | Escort,9,2018,San Francisco,0 11 | Phoenix,9,2018,Berkeley,0 12 | Jamie XX,9,2018,Golden Gate Park,1 13 | Beyonce,10,2018,Indio,1 14 | Soulwax,9,2018,Indio,1 15 | The XX,9,2017,Las Vegas,1 16 | Justice,9,2017,Indio,1 17 | Sigur Ros,9,2017,LA,0 18 | The XX,9,2017,London,0 19 | Porter Robinson and Madeon,9,2017,London,0 20 | Garden City Movement,9,2018,Tel Aviv,0 21 | ACDC,9,2015,Indio,1 22 | Porter Robinson,9,2015,Las Vegas,1 23 | Alt-J,9,2015,Barcelona,1 24 | Arcade 
Fire,9,2014,LA,0 25 | Phoenix,9,2013,Indio,1 26 | Chvrches,9,2013,Copenhagen,0 27 | Red Hot Chili Peppers,9,2006,Oakland,0 28 | Jungle,8,2018,Las Vegas,1 29 | Sylvan Esso,8,2018,Las Vegas,1 30 | Lake Street Dive,8,2018,San Francisco,0 31 | Elohim,8,2018,Golden Gate Park,1 32 | Tash Sultana,8,2018,Golden Gate Park,1 33 | David Byrne,8,2018,Indio,1 34 | Eminem,8,2018,Indio,1 35 | Tank and the Bangas,8,2018,Indio,1 36 | The Blaze,8,2018,Indio,1 37 | Jungle,8,2018,San Francisco,0 38 | Chance The Rapper,8,2017,Las Vegas,1 39 | Goldroom,8,2017,Las Vegas,1 40 | Mura Masa,8,2017,Las Vegas,1 41 | ZHU,8,2017,Las Vegas,1 42 | Goldroom,8,2017,San Francisco,1 43 | Phoenix,8,2017,Mountain View,1 44 | Hans Zimmer,8,2017,Indio,1 45 | Moderat,8,2017,Indio,1 46 | The XX,8,2017,Indio,1 47 | BORNS,8,2016,Indio,1 48 | Chvrches,8,2016,Indio,1 49 | Gallant,8,2016,Indio,1 50 | Matt & Kim,8,2016,Indio,1 51 | The Lumineers,8,2016,Las Vegas,1 52 | Flume,8,2016,Las Vegas,1 53 | Griz ,8,2016,San Francisco,1 54 | James Vincent McMorrow,8,2016,London,0 55 | Mura Masa,8,2016,San Francisco,0 56 | Alt-J,8,2015,Indio,1 57 | Jamie XX,8,2015,Indio,1 58 | ODESZA,8,2015,Indio,1 59 | Porter Robinson,8,2015,Indio,1 60 | Yelle,8,2015,Indio,1 61 | Sylvan Esso,8,2015,Indio,1 62 | ODESZA,8,2015,Indio,1 63 | Imagine Dragons,8,2015,LA,0 64 | Ben Howard,8,2015,Berkeley ,0 65 | Imagine Dragons,8,2015,Las Vegas,1 66 | Elton John,8,2015,San Francisco,1 67 | Garden City Movement,8,2015,Barcelona,1 68 | Jungle,8,2015,Barcelona,1 69 | Matt and Kim,8,2015,LA,0 70 | Daughter,8,2014,Indio,1 71 | Chromeo,8,2014,Indio,1 72 | Flume,8,2014,Indio,1 73 | Phantogram,8,2014,Monterey,1 74 | Major Lazer,8,2013,Indio,1 75 | The XX,8,2013,Indio,1 76 | Yeasayer,8,2013,Indio,1 77 | The Floor is Made of Lava,8,2013,Copenhagen,0 78 | Taylor Swift,8,2012,Claremont,0 79 | Elton John,8,2010,Ontario,0 80 | First Aid Kit,7,2018,Las Vegas,1 81 | Cut Copy,7,2018,Berkeley,0 82 | Rainbow Kitten Surprise\,7,2018,Golden Gate Park,1 83 | LP,7,2018,Golden Gate Park,1 84 | Chvrches,7,2018,Golden Gate Park,1 85 | Bon Iver,7,2018,Golden Gate Park,1 86 | Bleachers,7,2018,Indio,1 87 | Lola Marsh,7,2018,Tel Aviv,0 88 | The Paz Band,7,2017,Tel Aviv,0 89 | The Revivalists (Acoustic),7,2017,Las Vegas,1 90 | Lorde,7,2017,Las Vegas,1 91 | Treehouse Dubstep,7,2017,Las Vegas,1 92 | Sofi Tukker (Tukker DJ set),7,2017,Las Vegas,1 93 | Tycho,7,2017,Las Vegas,1 94 | Pretty Lights,7,2017,Las Vegas,1 95 | Tokimonsta,7,2017,Las Vegas,0 96 | San Fermin,7,2017,San Francisco,1 97 | Franz Ferdinand,7,2017,Mountain View,1 98 | Ezra Furman,7,2017,Indio,1 99 | FKJ,7,2017,Indio,1 100 | GoldLink,7,2017,Indio,1 101 | Jack Garratt,7,2017,Indio,1 102 | Oh Wonder,7,2017,Indio,1 103 | Phantogram,7,2017,Indio,1 104 | Sam Gellaitry,7,2017,Indio,1 105 | Sigur Ros,7,2017,Oakland,0 106 | LA Philharmonic,7,2017,LA,0 107 | Despacio,7,2016,Indio,1 108 | Goldroom,7,2016,Indio,1 109 | LCD soundsystem,7,2016,Indio,1 110 | Lido,7,2016,Indio,1 111 | Lord Huron,7,2016,Indio,1 112 | Major Lazer,7,2016,Indio,1 113 | Rufus du sol ,7,2016,Indio,1 114 | Spacewench,7,2016,Las Vegas,1 115 | Big Grams,7,2016,San Francisco,1 116 | Rufus Du Sol,7,2016,San Francisco,1 117 | Yellow Claw,7,2015,Indio,1 118 | St. 
Lucia,7,2015,Indio,1 119 | Jamie XX,7,2015,Indio,1 120 | Klingande,7,2015,Las Vegas,1 121 | Major Lazer,7,2015,Las Vegas,1 122 | Jauz,7,2015,Las Vegas,1 123 | Walk the Moon,7,2015,Las Vegas,1 124 | Madeon,7,2015,Las Vegas,1 125 | Chvrches,7,2015,Oakland,1 126 | Death Cab for Cutie,7,2015,Oakland,1 127 | X Ambassadors,7,2015,Oakland,1 128 | Porter Robinson,7,2015,San Francisco,1 129 | James Bay,7,2015,San Francisco,1 130 | Sam Smith,7,2015,San Francisco,1 131 | ACollective,7,2015,Barcelona,1 132 | Chet faker,7,2015,Barcelona,1 133 | Sylvan Esso,7,2015,Barcelona,1 134 | Chvrches,7,2014,Indio,1 135 | Krewella,7,2014,Indio,1 136 | St Lucia,7,2014,Indio,1 137 | The Naked and Famous,7,2014,Monterey,1 138 | Future Islands,7,2014,Monterey,1 139 | Tokyo Police Club,7,2014,Monterey,1 140 | Macklemore,7,2014,San Francisco,1 141 | Watsky,7,2014,San Francisco,1 142 | The Kooks,7,2014,San Francisco,1 143 | Yeah Yeah Yeahs,7,2013,Indio,1 144 | Passion Pit,7,2013,Indio,1 145 | Purity Ring,7,2013,Indio,1 146 | Red Hot Chili Peppers,7,2013,Indio,1 147 | The Postal Service,7,2013,Indio,1 148 | Vampire weekend,7,2013,Indio,1 149 | Scavenger Hunt,7,2013,LA,0 150 | Ms MR,7,2013,Copenhagen,0 151 | The Fratellis,7,2016,London,0 152 | St . Lucia,7,2015,LA,0 153 | Anderson Paak,7,2016,London,0 154 | Poolside,6,2018,Las Vegas,1 155 | St. Vincent,6,2018,Las Vegas,1 156 | Superorganism,6,2018,Las Vegas,1 157 | Sofi Tukker,6,2018,Las Vegas,1 158 | Sir Sly,6,2018,Berkeley,0 159 | Carly Rae Jepsen,6,2018,Golden Gate Park,1 160 | Alt-j,6,2018,Indio,1 161 | Nile Rogers and CHIC,6,2018,Indio,1 162 | Sudan Archives,6,2018,Indio,1 163 | Petit Biscuit,6,2018,Indio,1 164 | Elohim,6,2018,Indio,1 165 | St. Vincent,6,2018,Indio,1 166 | Nessi Gomes,6,2018,Israel,0 167 | RAC (DJ Set),6,2017,Las Vegas,1 168 | Two Door Cinema Club,6,2017,Las Vegas,1 169 | Milky Chance,6,2017,Las Vegas,1 170 | Alt-J,6,2017,San Francisco,1 171 | RAC,6,2017,San Francisco,1 172 | SOHN,6,2017,San Francisco,1 173 | RAC,6,2017,San Francisco,1 174 | Joseph,6,2017,San Francisco,1 175 | James Vincent McMorrow,6,2017,San Francisco,1 176 | Young the Giant,6,2017,San Francisco,1 177 | Lorde,6,2017,San Francisco,1 178 | Andre McMahon in the Wilderness,6,2017,Mountain View,1 179 | Joseph,6,2017,Indio,1 180 | Nao,6,2017,Indio,1 181 | Porter & Madeon,6,2017,Indio,1 182 | Two Door Cinema Club,6,2017,Indio,1 183 | Tycho,6,2017,Indio,1 184 | Of Monsters and Men,6,2017,Indio,1 185 | Flume,6,2017,Indio,1 186 | Lapsley,6,2017,Indio,1 187 | Jimmy Eat World,6,2012,Las Vegas,1 188 | Keys N Krates,6,2012,Las Vegas,1 189 | Leon Bridges,6,2012,Las Vegas,1 190 | Oh Wonder,6,2012,Las Vegas,1 191 | The Wombats,6,2012,San Francisco,1 192 | Oh Wonder,6,2012,San Francisco,1 193 | The War on Drugs,6,2012,Indio,1 194 | Andre McMahon in the Wilderness,6,2012,Indio,1 195 | Phox,6,2012,Indio,1 196 | Metric,6,2012,Las Vegas,1 197 | Bastille,6,2012,Oakland,1 198 | Halsey,6,2012,Oakland,1 199 | George Ezra,6,2012,San Francisco,1 200 | Mumford and Sons,6,2012,San Francisco,1 201 | Benjamin Booker,6,2012,San Francisco,1 202 | Mac Demarco,6,2012,Barcelona,1 203 | Bastille,6,2012,Indio,1 204 | Ellie Goulding,6,2012,Indio,1 205 | STRFKR,6,2012,Indio,1 206 | The National,6,2012,Monterey,1 207 | Blind Pilot,6,2012,Monterey,1 208 | Beck,6,2012,Monterey,1 209 | Flume,6,2012,San Francisco,1 210 | Lykke Li,6,2012,San Francisco,1 211 | Haim,6,2012,San Francisco,1 212 | Tycho,6,2012,San Francisco,1 213 | Earth Wind & Fire,6,2012,Claremont,0 214 | Of Monsters and Men,6,2012,Indio,1 215 | 
Japandroids,6,2012,Indio,1 216 | Lumineers,6,2012,Indio,1 217 | Chvrches,6,2012,LA,0 218 | Jack Johnson ,6,2012,Berkeley ,0 219 | Daughter,6,2012,Berkeley ,0 220 | Tom Misch,5,2018,Oakland,0 221 | Chvrches,5,2018,Las Vegas,1 222 | Two Feet,5,2018,Las Vegas,1 223 | Odesza,5,2018,Golden Gate Park,1 224 | Rezz,5,2018,Indio,1 225 | Jacob Banks,5,2017,Las Vegas,1 226 | Future Islands,5,2017,San Francisco,1 227 | Cold War Kids,5,2017,Mountain View,1 228 | Big Gigantic,5,2017,Indio,1 229 | Glass Animals,5,2017,Indio,1 230 | The Head and the Heart,5,2017,Indio,1 231 | What So 0t,5,2017,Indio,1 232 | Calvin Harris,5,2017,Indio,1 233 | Halsey,5,2017,Indio,1 234 | Snails,5,2017,Indio,1 235 | The 1975,5,2017,Indio,1 236 | Mr. Carmack,5,2017,Las Vegas,1 237 | Halsey,5,2017,San Francisco,1 238 | MO,5,2017,Indio,1 239 | Tycho,5,2017,Indio,1 240 | Coasts,5,2011,Indio,1 241 | Alessia Cara,5,2011,Las Vegas,1 242 | Halsey,5,2011,Las Vegas,1 243 | Run the Jewels,5,2011,Las Vegas,1 244 | Silversun Pickups,5,2011,Oakland,1 245 | First Aid Kit,5,2011,San Francisco,1 246 | Broods,5,2011,San Francisco,1 247 | RL Grime,5,2011,San Francisco,1 248 | Belle and Sebastian,5,2011,Barcelona,1 249 | Run the Jewels,5,2011,Barcelona,1 250 | The Strokes,5,2011,Barcelona,1 251 | Haim,5,2011,Indio,1 252 | The Head and the Heart,5,2011,Indio,1 253 | MGMT,5,2011,Indio,1 254 | Empire of the Sun,5,2011,Indio,1 255 | Grouplove,5,2011,Indio,1 256 | The 1975,5,2011,Indio,1 257 | Mr Little Jeans,5,2011,Monterey,1 258 | Atmosphere,5,2011,San Francisco,1 259 | The Chainsmokers,5,2011,Claremont,0 260 | Jessie Ware,5,2011,Indio,1 261 | Van Halen,5,2011,MOuntain View, 262 | Tycho,5,2011,London,0 263 | Foster The People,4,2018,Las Vegas,1 264 | Brasstracks,4,2018,Las Vegas,1 265 | Olivia O'Brien,4,2018,Golden Gate Park,1 266 | Slow Magic,4,2018,Indio,1 267 | Blink-182,4,2017,Las Vegas,1 268 | Classixx,4,2017,Las Vegas,1 269 | Local Natives,4,2017,Las Vegas,1 270 | The Japanese House,4,2017,San Francisco,1 271 | Above and Beyond,4,2017,San Francisco,1 272 | Milky Chance,4,2017,Mountain View,1 273 | Honne,4,2017,Indio,1 274 | RL Grime,4,2014,Indio,1 275 | James Bay,4,2014,Indio,1 276 | Ellie Goulding,4,2014,Indio,1 277 | Louis the child,4,2014,Indio,1 278 | Mr Carmack,4,2014,Indio,1 279 | ZHU,4,2014,Indio,1 280 | Gryffin,4,2014,Las Vegas,1 281 | ZHU,4,2014,Las Vegas,1 282 | Jauz,4,2014,San Francisco,1 283 | Chance The Rapper,4,2014,San Francisco,1 284 | George Ezra,4,2014,Indio,1 285 | Alabama Shakes,4,2014,Indio,1 286 | Kaskade,4,2014,Indio,1 287 | Madeon,4,2014,Indio,1 288 | Milky chance,4,2014,Indio,1 289 | Ryn Weaver,4,2014,Indio,1 290 | The weeknd,4,2014,Indio,1 291 | What So 0t,4,2014,Indio,1 292 | Lindsey Stirling,4,2014,Las Vegas,1 293 | Glass Animals,4,2014,Las Vegas,1 294 | Odesza,4,2014,San Francisco,1 295 | Black keys,4,2014,Barcelona,1 296 | Lorde,4,2014,Indio,1 297 | Adrian Lux,4,2014,Indio,1 298 | Outkast,4,2014,Indio,1 299 | Alesso,4,2014,Indio,1 300 | Beach House,4,2014,Monterey,1 301 | Kanye West,4,2014,San Francisco,1 302 | Disclosure,4,2014,San Francisco,1 303 | Chromeo,4,2014,San Francisco,1 304 | RAC,4,2014,LA,0 305 | Passion Pit,4,2014,San Francisco,0 306 | Banners,4,2014,San Francisco,0 307 | 98 Degrees,4,2014,LA,1 308 | Broken Social Scene,3,2018,Golden Gate Park,1 309 | Portugal the Man,3,2018,Indio,1 310 | Thundercat,3,2017,San Francisco,1 311 | Arkells,3,2017,Indio,1 312 | Jack U,3,2017,Indio,1 313 | Third Eye Blind,3,2017,Las Vegas,1 314 | Years and Years,3,2017,San Francisco,1 315 | Zedd,3,2017,San Francisco,1 316 
| Angus and Julia Stone,3,2017,Indio,1 317 | Clean Bandit,3,2017,Indio,1 318 | Ratatat,3,2017,Indio,1 319 | Kaskade,3,2017,Indio,1 320 | Peking Duk,3,2017,Las Vegas,1 321 | Foals ,3,2017,Oakland,1 322 | The Neighborhood,3,2017,Indio,1 323 | Tyler the Creator / Earl Sweatshirt,3,2017,Indio,1 324 | Childish Gambi0,3,2017,Claremont,0 325 | Two Friends,3,2017,San Francisco,0 326 | Digitalism,3,2017,Copenhagen,0 327 | Deorro,2,2017,Las Vegas,1 328 | Bearson,2,2017,Las Vegas,1 329 | Whethan,2,2017,Mountain View,1 330 | Kungs,2,2017,Indio,1 331 | Thomas Jack,2,2017,Indio,1 332 | Vanic,2,2017,Indio,1 333 | Drake,2,2017,Indio,1 334 | Kygo,2,2017,Indio,1 335 | Health,2,2017,Barcelona,1 336 | Muse,2,2017,Indio,1 337 | Modest Mouse,2,2017,Indio,1 338 | The K0cks,2,2017,LA,0 339 | The Chainsmokers,2,2017,London,0 340 | Avicii,2,2017,Vegas,0 341 | 2 Chainz,1,2017,Las Vegas,1 342 | Nick cave and the bad seeds,1,2017,Indio,1 343 | ,,,, 344 | ,,,, 345 | ,,,, 346 | ,,,, 347 | ,,,", ", 348 | -------------------------------------------------------------------------------- /data/cancer_data_00.csv: -------------------------------------------------------------------------------- 1 | 1,23,12,151 2 | 0,9,13,133 3 | 1,21,27,130 4 | 1,14,16,78 5 | 1,9,19,135 6 | 0,25,25,83 7 | 1,16,26,120 8 | 1,15,18,90 9 | 1,19,24,88 10 | 1,25,11,84 11 | 1,24,21,103 12 | 1,17,15,104 13 | 0,14,15,132 14 | 1,12,22,104 15 | 1,12,13,94 16 | 1,22,19,97 17 | 1,10,16,95 18 | 1,15,14,108 19 | 1,20,14,130 20 | 0,17,11,87 21 | 0,16,14,86 22 | 0,17,24,60 23 | 1,20,27,103 24 | 1,19,12,137 25 | 1,9,13,110 26 | 1,19,27,116 27 | 1,10,24,97 28 | 1,16,24,122 29 | 1,15,15,102 30 | 1,11,16,115 31 | 1,11,22,125 32 | 1,23,26,78 33 | 1,20,18,113 34 | 1,11,21,128 35 | 1,16,23,107 36 | 1,10,13,110 37 | 1,18,12,94 38 | 0,21,11,83 39 | 1,11,15,96 40 | 1,10,14,88 41 | 1,24,16,86 42 | 1,19,27,72 43 | 1,11,11,128 44 | 1,15,21,87 45 | 1,10,15,85 46 | 1,18,11,124 47 | 0,22,12,52 48 | 1,20,14,86 49 | 0,20,21,78 50 | 0,25,11,87 51 | 0,19,25,75 52 | 0,19,22,87 53 | 0,25,15,76 54 | 1,14,26,120 55 | 1,18,25,97 56 | 0,18,13,73 57 | 1,10,19,126 58 | 1,17,20,96 59 | 0,22,15,83 60 | 0,23,26,54 61 | 0,15,18,65 62 | 0,25,15,55 63 | 1,12,22,96 64 | 0,24,17,59 65 | 1,16,19,83 66 | 1,11,21,97 67 | 0,12,13,60 68 | 0,18,12,72 69 | 0,16,17,59 70 | 0,17,21,81 71 | 1,21,18,124 72 | 0,9,26,59 73 | 1,21,12,114 74 | 1,22,25,90 75 | 0,18,13,79 76 | 1,21,18,104 77 | 0,10,17,88 78 | 1,11,21,120 79 | 1,16,18,144 80 | 0,22,16,83 81 | 0,10,18,74 82 | 0,17,21,86 83 | 1,10,15,172 84 | 1,20,14,129 85 | 0,25,21,77 86 | 1,14,13,121 87 | 1,19,26,94 88 | 1,19,11,122 89 | 0,11,11,80 90 | 0,12,23,96 91 | 0,23,27,95 92 | 1,10,12,100 93 | 0,14,14,85 94 | 0,10,17,87 95 | 1,22,26,100 96 | 1,23,16,132 97 | 0,22,14,78 98 | 0,19,27,62 99 | 0,21,24,74 100 | 1,16,27,94 -------------------------------------------------------------------------------- /data/cancer_data_01.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BlazingDB/bsql-demos/ebee8a606a272f3e2ab7a38587a6092fe2018d93/data/cancer_data_01.parquet -------------------------------------------------------------------------------- /data/cancer_data_02.csv: -------------------------------------------------------------------------------- 1 | 0.278,0.242,0.079 2 | 0.079,0.181,0.057 3 | 0.16,0.207,0.06 4 | 0.284,0.26,0.097 5 | 0.133,0.181,0.059 6 | 0.17,0.209,0.076 7 | 0.109,0.179,0.057 8 | 0.165,0.22,0.075 9 | 0.193,0.235,0.074 10 | 0.24,0.203,0.082 11 | 0.067,0.153,0.057 12 | 
0.129,0.184,0.061 13 | 0.246,0.24,0.078 14 | 0.1,0.185,0.053 15 | 0.229,0.207,0.077 16 | 0.16,0.23,0.071 17 | 0.072,0.159,0.059 18 | 0.202,0.216,0.074 19 | 0.103,0.158,0.054 20 | 0.081,0.189,0.058 21 | 0.127,0.197,0.068 22 | 0.065,0.182,0.069 23 | 0.214,0.252,0.07 24 | 0.102,0.177,0.053 25 | 0.146,0.2,0.063 26 | 0.228,0.304,0.074 27 | 0.187,0.225,0.069 28 | 0.107,0.17,0.057 29 | 0.17,0.193,0.065 30 | 0.116,0.174,0.061 31 | 0.189,0.218,0.062 32 | 0.152,0.23,0.078 33 | 0.15,0.225,0.064 34 | 0.172,0.185,0.063 35 | 0.156,0.2,0.065 36 | 0.134,0.19,0.057 37 | 0.11,0.189,0.061 38 | 0.038,0.147,0.059 39 | 0.051,0.157,0.055 40 | 0.126,0.172,0.064 41 | 0.06,0.178,0.056 42 | 0.122,0.19,0.069 43 | 0.219,0.231,0.063 44 | 0.144,0.197,0.068 45 | 0.105,0.175,0.062 46 | 0.169,0.191,0.06 47 | 0.059,0.177,0.065 48 | 0.123,0.213,0.068 49 | 0.091,0.168,0.06 50 | 0.077,0.181,0.057 51 | 0.05,0.15,0.059 52 | 0.061,0.135,0.06 53 | 0.048,0.187,0.061 54 | 0.149,0.209,0.063 55 | 0.071,0.162,0.057 56 | 0.055,0.192,0.059 57 | 0.127,0.192,0.06 58 | 0.137,0.203,0.068 59 | 0.038,0.182,0.055 60 | 0.053,0.168,0.072 61 | 0.081,0.274,0.07 62 | 0.09,0.183,0.068 63 | 0.201,0.195,0.073 64 | 0.088,0.234,0.07 65 | 0.126,0.191,0.066 66 | 0.148,0.195,0.067 67 | 0.078,0.172,0.069 68 | 0.047,0.152,0.057 69 | 0.141,0.211,0.08 70 | 0.052,0.159,0.057 71 | 0.103,0.158,0.055 72 | 0.153,0.19,0.09 73 | 0.183,0.193,0.065 74 | 0.128,0.166,0.066 75 | 0.068,0.172,0.059 76 | 0.084,0.18,0.054 77 | 0.105,0.24,0.066 78 | 0.215,0.215,0.067 79 | 0.345,0.291,0.081 80 | 0.095,0.172,0.06 81 | 0.094,0.184,0.07 82 | 0.154,0.194,0.069 83 | 0.267,0.183,0.068 84 | 0.179,0.163,0.072 85 | 0.072,0.208,0.06 86 | 0.105,0.213,0.06 87 | 0.099,0.208,0.056 88 | 0.121,0.195,0.056 89 | 0.094,0.193,0.064 90 | 0.134,0.212,0.063 91 | 0.086,0.169,0.059 92 | 0.104,0.172,0.061 93 | 0.051,0.139,0.053 94 | 0.082,0.164,0.057 95 | 0.155,0.186,0.063 96 | 0.131,0.21,0.056 97 | 0.071,0.19,0.066 98 | 0.053,0.135,0.069 99 | 0.075,0.162,0.066 100 | 0.114,0.188,0.064 -------------------------------------------------------------------------------- /federated_query_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "8AdUt3HiUrc3" 8 | }, 9 | "source": [ 10 | "# Querying Multiple Data Formats \n", 11 | "In this notebook, we will cover: \n", 12 | "- How to create and then join BlazingSQL tables from CSV, Parquet, and GPU DataFrame (GDF) sources. \n", 13 | "\n", 14 | "## Imports" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 1, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "import os\n", 24 | "import cudf\n", 25 | "from blazingsql import BlazingContext" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": { 31 | "colab_type": "text", 32 | "id": "aMwNKxePSwOp" 33 | }, 34 | "source": [ 35 | "## Import packages and create BlazingContext\n", 36 | "You can think of the BlazingContext much like a SparkContext; this is where information such as FileSystems you have registered and Tables you have created will be stored. 
" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": 2, 42 | "metadata": { 43 | "colab": { 44 | "base_uri": "https://localhost:8080/", 45 | "height": 35 46 | }, 47 | "colab_type": "code", 48 | "id": "azZ7l2q7odYT", 49 | "outputId": "a5302d6e-307e-45c5-a682-c786cc999a40" 50 | }, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "BlazingContext ready\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "# start up BlazingSQL\n", 62 | "bc = BlazingContext()" 63 | ] 64 | }, 65 | { 66 | "cell_type": "markdown", 67 | "metadata": { 68 | "colab_type": "text", 69 | "id": "N2bqpDEnZyQf" 70 | }, 71 | "source": [ 72 | "### Create Table from CSV\n", 73 | "Here we create a BlazingSQL table directly from a comma-separated values (CSV) file." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 3, 79 | "metadata": { 80 | "colab": {}, 81 | "colab_type": "code", 82 | "id": "HhRhj-ZvZygH" 83 | }, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/plain": [ 88 | "" 89 | ] 90 | }, 91 | "execution_count": 3, 92 | "metadata": {}, 93 | "output_type": "execute_result" 94 | } 95 | ], 96 | "source": [ 97 | "# define column names and types\n", 98 | "column_names = ['diagnosis_result', 'radius', 'texture', 'perimeter']\n", 99 | "column_types = ['float32', 'float32', 'float32', 'float32']\n", 100 | "\n", 101 | "# identify local directory path \n", 102 | "cwd = os.getcwd()\n", 103 | "# add path to data\n", 104 | "data_path = cwd + '/data/cancer_data_00.csv'\n", 105 | "\n", 106 | "# create table from CSV file\n", 107 | "bc.create_table('data_00', data_path, dtype=column_types, names=column_names)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": { 113 | "colab_type": "text", 114 | "id": "HJFz-mqZTJ5Z" 115 | }, 116 | "source": [ 117 | "### Create Table from Parquet\n", 118 | "Here we create a BlazingSQL table directly from an Apache Parquet file." 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 4, 124 | "metadata": { 125 | "colab": {}, 126 | "colab_type": "code", 127 | "id": "HJuvtJDYTMyb" 128 | }, 129 | "outputs": [ 130 | { 131 | "data": { 132 | "text/plain": [ 133 | "" 134 | ] 135 | }, 136 | "execution_count": 4, 137 | "metadata": {}, 138 | "output_type": "execute_result" 139 | } 140 | ], 141 | "source": [ 142 | "# create table from Parquet file\n", 143 | "bc.create_table('data_01', cwd + '/data/cancer_data_01.parquet')" 144 | ] 145 | }, 146 | { 147 | "cell_type": "markdown", 148 | "metadata": { 149 | "colab_type": "text", 150 | "id": "98HJFrt5TRa0" 151 | }, 152 | "source": [ 153 | "### Create Table from GPU DataFrame\n", 154 | "Here we use cuDF to create a GPU DataFrame (GDF), then use BlazingSQL to create a table from that GDF.\n", 155 | "\n", 156 | "The GDF is the standard memory representation for the RAPIDS AI ecosystem." 
157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 5, 162 | "metadata": { 163 | "colab": {}, 164 | "colab_type": "code", 165 | "id": "14GwxmLsTV_p", 166 | "scrolled": true 167 | }, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/plain": [ 172 | "" 173 | ] 174 | }, 175 | "execution_count": 5, 176 | "metadata": {}, 177 | "output_type": "execute_result" 178 | } 179 | ], 180 | "source": [ 181 | "# define column names and types (one dtype per column)\n", 182 | "column_names = ['compactness', 'symmetry', 'fractal_dimension']\n", 183 | "column_types = ['float32', 'float32', 'float32']\n", 184 | "\n", 185 | "# make GDF with cuDF (uses relative path)\n", 186 | "gdf_02 = cudf.read_csv('data/cancer_data_02.csv', dtype=column_types, names=column_names)\n", 187 | "\n", 188 | "# create BlazingSQL table from GDF\n", 189 | "bc.create_table('data_02', gdf_02)" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": { 195 | "colab_type": "text", 196 | "id": "9DAZShZ2y-Nx" 197 | }, 198 | "source": [ 199 | "# Join Tables Together \n", 200 | "\n", 201 | "Now we can use BlazingSQL to join all three data formats in a single federated query. " 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 6, 207 | "metadata": { 208 | "colab": { 209 | "base_uri": "https://localhost:8080/", 210 | "height": 1000 211 | }, 212 | "colab_type": "code", 213 | "id": "HOYSFebvzGcX", 214 | "outputId": "ad133dfd-540e-4142-8f12-a4a70d803bb6", 215 | "scrolled": true 216 | }, 217 | "outputs": [ 218 | { 219 | "data": { 220 | "text/html": [ 221 | "
\n", 222 | "\n", 235 | "\n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | " \n", 247 | " \n", 248 | " \n", 249 | " \n", 250 | " \n", 251 | " \n", 252 | " \n", 253 | " \n", 254 | " \n", 255 | " \n", 256 | " \n", 257 | " \n", 258 | " \n", 259 | " \n", 260 | " \n", 261 | " \n", 262 | " \n", 263 | " \n", 264 | " \n", 265 | " \n", 266 | " \n", 267 | " \n", 268 | " \n", 269 | " \n", 270 | " \n", 271 | " \n", 272 | " \n", 273 | " \n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | " \n", 332 | " \n", 333 | " \n", 334 | " \n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 
507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | 
" \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | " \n", 826 | " \n", 827 | " \n", 828 | " \n", 829 | " \n", 830 | " \n", 831 | " \n", 832 | " \n", 833 | " \n", 834 | " \n", 835 | " \n", 836 | " \n", 837 | " \n", 838 | " \n", 839 | " \n", 840 | " \n", 841 | " \n", 842 | " \n", 843 | " \n", 844 | " \n", 845 | " \n", 846 | " \n", 847 | " \n", 848 | " \n", 849 | " \n", 850 | " \n", 851 | " \n", 852 | " \n", 853 | " \n", 854 | " \n", 855 | " \n", 856 | " \n", 857 | " \n", 858 | " \n", 859 | " \n", 860 | " \n", 861 | " \n", 862 | " \n", 863 | " \n", 864 | " \n", 865 | " \n", 866 | " \n", 867 | " \n", 868 | " \n", 869 | " \n", 870 | " \n", 871 | " \n", 872 | " \n", 873 | " \n", 874 | " \n", 875 | " \n", 876 | " \n", 877 | " \n", 878 | " \n", 879 | " \n", 880 | " \n", 881 | " \n", 882 | " \n", 883 | " \n", 884 | " \n", 885 | " \n", 886 | " \n", 887 | " \n", 888 | " \n", 889 | " \n", 890 | " \n", 891 | " \n", 892 | " \n", 893 | " \n", 894 | " \n", 895 | " \n", 896 | " \n", 897 | " \n", 898 | " \n", 899 | " \n", 900 | " \n", 901 | " \n", 902 | " \n", 903 | " \n", 904 | " \n", 905 | " \n", 906 | " \n", 907 | " \n", 908 | " \n", 909 | " \n", 910 | " \n", 911 | " \n", 912 | " \n", 913 | " \n", 914 | " \n", 915 | " \n", 916 | " \n", 917 | " \n", 918 | " \n", 919 | " \n", 920 | " \n", 921 | " \n", 922 | " \n", 923 | " \n", 924 | " \n", 925 | " \n", 926 | " \n", 927 | " \n", 928 | " \n", 929 | " \n", 930 | " \n", 931 | " \n", 932 | " \n", 933 | " \n", 934 | " \n", 935 | " \n", 936 | " \n", 937 | " \n", 938 | " \n", 939 | " \n", 940 | " \n", 941 | " \n", 942 | " \n", 943 | " \n", 944 | " \n", 945 | " \n", 946 | " \n", 947 | " \n", 948 | " \n", 949 | " \n", 950 | " \n", 951 | " \n", 952 | " \n", 953 | " \n", 954 | " \n", 955 | " \n", 956 | " \n", 957 | " \n", 958 | " \n", 959 | " \n", 960 | " \n", 961 | " \n", 962 | " \n", 963 | " \n", 964 | " \n", 965 | " \n", 966 | " \n", 967 | " \n", 968 | " \n", 969 | " \n", 970 | " \n", 971 | " \n", 972 | " \n", 973 | " \n", 974 | " \n", 975 | " \n", 976 | " \n", 977 | " \n", 978 | " \n", 979 | " \n", 980 | " \n", 981 | " \n", 982 | " \n", 983 | " \n", 984 | "
diagnosis_resultradiustextureperimeterareasmoothnesscompactnesssymmetryfractal_dimension
01.011.021.0120.01033.00.1150.1490000040.2090000060.063000001
10.017.021.086.0563.00.0820.0599999990.1780000030.056000002
21.019.026.094.0578.00.1130.2290000020.2070000020.077
31.019.011.0122.01094.00.0940.1070000010.1700000020.057
40.010.017.087.0566.00.0980.0810.189000010.058000002
51.016.019.083.0477.00.1280.1700000020.2090000060.075999998
60.022.016.083.0477.00.1280.1700000020.2090000060.075999998
70.017.021.086.0535.00.1160.1230000030.2130000140.067999996
80.010.017.087.0545.00.1040.1439999940.1969999970.067999996
91.023.016.0132.01123.00.0970.2460000070.240000010.078000002
101.016.019.083.0524.00.0900.0379999990.1470.059
111.021.018.0124.01076.00.1100.1690.1910.059999999
120.022.016.083.0524.00.0900.0379999990.1470.059
131.019.026.094.0633.00.0980.1099999990.189000010.060999997
141.016.019.083.0527.00.0810.0379999990.1470.059
150.022.016.083.0527.00.0810.0379999990.1470.059
161.010.012.0100.0706.00.1040.1550000010.1860000040.063000001
171.022.026.0100.0706.00.1040.1550000010.1860000040.063000001
180.010.017.087.0561.00.0880.0770.1810000090.057
190.012.023.096.0699.00.0940.0509999990.1570000050.055
200.018.012.072.0371.00.1230.1220000010.1899999980.068999998
210.012.023.096.0657.00.1140.1369999950.2030000090.067999996
220.012.023.096.0646.00.1050.2010000050.1949999930.072999999
230.010.018.074.0413.00.0900.0750000030.1620.066
241.011.021.097.0713.00.0910.0710000020.1620.057
250.022.016.083.0506.00.099nullnullnull
260.014.014.085.0532.00.097nullnullnull
270.010.017.087.0572.00.077nullnullnull
281.016.019.083.0477.00.1280.1700000020.1930000040.064999998
290.022.016.083.0477.00.1280.1700000020.1930000040.064999998
..............................
2861.011.021.097.0659.00.1140.1599999960.2070000020.059999999
2870.012.013.060.0274.00.1020.0649999980.1820000110.068999998
2880.010.017.088.0520.00.1270.1930000040.2349999990.074000001
2890.017.021.086.0520.00.1080.1270000040.1969999970.067999996
2901.019.026.094.0643.00.0980.1140.1880000080.063999996
2910.023.027.095.0685.00.0990.0719999970.1590000090.059
2921.011.021.097.0645.00.1050.1870000060.2249999940.068999998
2930.016.017.059.0261.00.0770.0880.2339999970.07
2940.09.026.059.0261.00.0770.0880.2339999970.07
2951.021.018.0104.0783.00.0840.1000000010.1850000020.052999999
2960.010.017.088.0559.00.1020.1260000020.1720000060.063999996
2971.014.013.0121.01075.00.099nullnullnull
2981.019.026.094.0648.00.094nullnullnull
2991.019.011.0122.01076.00.090nullnullnull
3000.011.011.080.0466.00.088nullnullnull
3010.012.023.096.0652.00.113nullnullnull
3020.023.027.095.0663.00.090nullnullnull
3030.010.017.087.0555.00.102nullnullnull
3040.016.017.059.0244.00.098nullnullnull
3050.09.026.059.0244.00.098nullnullnull
3061.021.018.0104.0781.00.097nullnullnull
3071.011.021.0120.01040.00.095nullnullnull
3081.016.019.083.0506.00.099nullnullnull
3091.022.025.090.0578.00.119nullnullnull
3101.011.021.097.0659.00.1140.1599999960.2300000040.071000002
3110.014.014.085.0552.00.0740.0509999990.1389999990.052999999
3120.025.021.077.0443.00.0970.0719999970.2080000040.059999999
3130.017.021.086.0520.00.1080.1270000040.1920000020.059999999
3140.023.027.095.0685.00.0990.0719999970.2080000040.059999999
3150.010.017.088.0559.00.1020.1260000020.1910.066
\n", 985 | "

316 rows × 9 columns

\n", 986 | "
" 987 | ], 988 | "text/plain": [ 989 | " diagnosis_result radius texture perimeter area smoothness \\\n", 990 | "0 1.0 11.0 21.0 120.0 1033.0 0.115 \n", 991 | "1 0.0 17.0 21.0 86.0 563.0 0.082 \n", 992 | "2 1.0 19.0 26.0 94.0 578.0 0.113 \n", 993 | "3 1.0 19.0 11.0 122.0 1094.0 0.094 \n", 994 | "4 0.0 10.0 17.0 87.0 566.0 0.098 \n", 995 | "5 1.0 16.0 19.0 83.0 477.0 0.128 \n", 996 | "6 0.0 22.0 16.0 83.0 477.0 0.128 \n", 997 | "7 0.0 17.0 21.0 86.0 535.0 0.116 \n", 998 | "8 0.0 10.0 17.0 87.0 545.0 0.104 \n", 999 | "9 1.0 23.0 16.0 132.0 1123.0 0.097 \n", 1000 | "10 1.0 16.0 19.0 83.0 524.0 0.090 \n", 1001 | "11 1.0 21.0 18.0 124.0 1076.0 0.110 \n", 1002 | "12 0.0 22.0 16.0 83.0 524.0 0.090 \n", 1003 | "13 1.0 19.0 26.0 94.0 633.0 0.098 \n", 1004 | "14 1.0 16.0 19.0 83.0 527.0 0.081 \n", 1005 | "15 0.0 22.0 16.0 83.0 527.0 0.081 \n", 1006 | "16 1.0 10.0 12.0 100.0 706.0 0.104 \n", 1007 | "17 1.0 22.0 26.0 100.0 706.0 0.104 \n", 1008 | "18 0.0 10.0 17.0 87.0 561.0 0.088 \n", 1009 | "19 0.0 12.0 23.0 96.0 699.0 0.094 \n", 1010 | "20 0.0 18.0 12.0 72.0 371.0 0.123 \n", 1011 | "21 0.0 12.0 23.0 96.0 657.0 0.114 \n", 1012 | "22 0.0 12.0 23.0 96.0 646.0 0.105 \n", 1013 | "23 0.0 10.0 18.0 74.0 413.0 0.090 \n", 1014 | "24 1.0 11.0 21.0 97.0 713.0 0.091 \n", 1015 | "25 0.0 22.0 16.0 83.0 506.0 0.099 \n", 1016 | "26 0.0 14.0 14.0 85.0 532.0 0.097 \n", 1017 | "27 0.0 10.0 17.0 87.0 572.0 0.077 \n", 1018 | "28 1.0 16.0 19.0 83.0 477.0 0.128 \n", 1019 | "29 0.0 22.0 16.0 83.0 477.0 0.128 \n", 1020 | ".. ... ... ... ... ... ... \n", 1021 | "286 1.0 11.0 21.0 97.0 659.0 0.114 \n", 1022 | "287 0.0 12.0 13.0 60.0 274.0 0.102 \n", 1023 | "288 0.0 10.0 17.0 88.0 520.0 0.127 \n", 1024 | "289 0.0 17.0 21.0 86.0 520.0 0.108 \n", 1025 | "290 1.0 19.0 26.0 94.0 643.0 0.098 \n", 1026 | "291 0.0 23.0 27.0 95.0 685.0 0.099 \n", 1027 | "292 1.0 11.0 21.0 97.0 645.0 0.105 \n", 1028 | "293 0.0 16.0 17.0 59.0 261.0 0.077 \n", 1029 | "294 0.0 9.0 26.0 59.0 261.0 0.077 \n", 1030 | "295 1.0 21.0 18.0 104.0 783.0 0.084 \n", 1031 | "296 0.0 10.0 17.0 88.0 559.0 0.102 \n", 1032 | "297 1.0 14.0 13.0 121.0 1075.0 0.099 \n", 1033 | "298 1.0 19.0 26.0 94.0 648.0 0.094 \n", 1034 | "299 1.0 19.0 11.0 122.0 1076.0 0.090 \n", 1035 | "300 0.0 11.0 11.0 80.0 466.0 0.088 \n", 1036 | "301 0.0 12.0 23.0 96.0 652.0 0.113 \n", 1037 | "302 0.0 23.0 27.0 95.0 663.0 0.090 \n", 1038 | "303 0.0 10.0 17.0 87.0 555.0 0.102 \n", 1039 | "304 0.0 16.0 17.0 59.0 244.0 0.098 \n", 1040 | "305 0.0 9.0 26.0 59.0 244.0 0.098 \n", 1041 | "306 1.0 21.0 18.0 104.0 781.0 0.097 \n", 1042 | "307 1.0 11.0 21.0 120.0 1040.0 0.095 \n", 1043 | "308 1.0 16.0 19.0 83.0 506.0 0.099 \n", 1044 | "309 1.0 22.0 25.0 90.0 578.0 0.119 \n", 1045 | "310 1.0 11.0 21.0 97.0 659.0 0.114 \n", 1046 | "311 0.0 14.0 14.0 85.0 552.0 0.074 \n", 1047 | "312 0.0 25.0 21.0 77.0 443.0 0.097 \n", 1048 | "313 0.0 17.0 21.0 86.0 520.0 0.108 \n", 1049 | "314 0.0 23.0 27.0 95.0 685.0 0.099 \n", 1050 | "315 0.0 10.0 17.0 88.0 559.0 0.102 \n", 1051 | "\n", 1052 | " compactness symmetry fractal_dimension \n", 1053 | "0 0.149000004 0.209000006 0.063000001 \n", 1054 | "1 0.059999999 0.178000003 0.056000002 \n", 1055 | "2 0.229000002 0.207000002 0.077 \n", 1056 | "3 0.107000001 0.170000002 0.057 \n", 1057 | "4 0.081 0.18900001 0.058000002 \n", 1058 | "5 0.170000002 0.209000006 0.075999998 \n", 1059 | "6 0.170000002 0.209000006 0.075999998 \n", 1060 | "7 0.123000003 0.213000014 0.067999996 \n", 1061 | "8 0.143999994 0.196999997 0.067999996 \n", 1062 | "9 0.246000007 0.24000001 0.078000002 \n", 1063 | 
"10 0.037999999 0.147 0.059 \n", 1064 | "11 0.169 0.191 0.059999999 \n", 1065 | "12 0.037999999 0.147 0.059 \n", 1066 | "13 0.109999999 0.18900001 0.060999997 \n", 1067 | "14 0.037999999 0.147 0.059 \n", 1068 | "15 0.037999999 0.147 0.059 \n", 1069 | "16 0.155000001 0.186000004 0.063000001 \n", 1070 | "17 0.155000001 0.186000004 0.063000001 \n", 1071 | "18 0.077 0.181000009 0.057 \n", 1072 | "19 0.050999999 0.157000005 0.055 \n", 1073 | "20 0.122000001 0.189999998 0.068999998 \n", 1074 | "21 0.136999995 0.203000009 0.067999996 \n", 1075 | "22 0.201000005 0.194999993 0.072999999 \n", 1076 | "23 0.075000003 0.162 0.066 \n", 1077 | "24 0.071000002 0.162 0.057 \n", 1078 | "25 null null null \n", 1079 | "26 null null null \n", 1080 | "27 null null null \n", 1081 | "28 0.170000002 0.193000004 0.064999998 \n", 1082 | "29 0.170000002 0.193000004 0.064999998 \n", 1083 | ".. ... ... ... \n", 1084 | "286 0.159999996 0.207000002 0.059999999 \n", 1085 | "287 0.064999998 0.182000011 0.068999998 \n", 1086 | "288 0.193000004 0.234999999 0.074000001 \n", 1087 | "289 0.127000004 0.196999997 0.067999996 \n", 1088 | "290 0.114 0.188000008 0.063999996 \n", 1089 | "291 0.071999997 0.159000009 0.059 \n", 1090 | "292 0.187000006 0.224999994 0.068999998 \n", 1091 | "293 0.088 0.233999997 0.07 \n", 1092 | "294 0.088 0.233999997 0.07 \n", 1093 | "295 0.100000001 0.185000002 0.052999999 \n", 1094 | "296 0.126000002 0.172000006 0.063999996 \n", 1095 | "297 null null null \n", 1096 | "298 null null null \n", 1097 | "299 null null null \n", 1098 | "300 null null null \n", 1099 | "301 null null null \n", 1100 | "302 null null null \n", 1101 | "303 null null null \n", 1102 | "304 null null null \n", 1103 | "305 null null null \n", 1104 | "306 null null null \n", 1105 | "307 null null null \n", 1106 | "308 null null null \n", 1107 | "309 null null null \n", 1108 | "310 0.159999996 0.230000004 0.071000002 \n", 1109 | "311 0.050999999 0.138999999 0.052999999 \n", 1110 | "312 0.071999997 0.208000004 0.059999999 \n", 1111 | "313 0.127000004 0.192000002 0.059999999 \n", 1112 | "314 0.071999997 0.208000004 0.059999999 \n", 1113 | "315 0.126000002 0.191 0.066 \n", 1114 | "\n", 1115 | "[316 rows x 9 columns]" 1116 | ] 1117 | }, 1118 | "execution_count": 6, 1119 | "metadata": {}, 1120 | "output_type": "execute_result" 1121 | } 1122 | ], 1123 | "source": [ 1124 | "# grab everything from 00 & 02, area & smoothness from 01\n", 1125 | "query = '''\n", 1126 | " SELECT \n", 1127 | " a.*, \n", 1128 | " b.area, b.smoothness, \n", 1129 | " c.* \n", 1130 | " FROM \n", 1131 | " data_00 AS a\n", 1132 | " LEFT JOIN \n", 1133 | " data_01 AS b\n", 1134 | " ON (a.perimeter = b.perimeter)\n", 1135 | " LEFT JOIN \n", 1136 | " data_02 AS c\n", 1137 | " ON (b.compactness = c.compactness)\n", 1138 | " '''\n", 1139 | "\n", 1140 | "# join the tables together (type(gdf)==cudf.core.dataframe.Dataframe)\n", 1141 | "gdf = bc.sql(query)\n", 1142 | "\n", 1143 | "# display result\n", 1144 | "gdf" 1145 | ] 1146 | }, 1147 | { 1148 | "cell_type": "markdown", 1149 | "metadata": { 1150 | "colab_type": "text", 1151 | "id": "wygAeTIFTm2X" 1152 | }, 1153 | "source": [ 1154 | "# You're Ready to Rock\n", 1155 | "And... thats it! You are now live with BlazingSQL.\n", 1156 | "\n", 1157 | "Check out our [docs](https://docs.blazingdb.com) to get fancy or to learn more about how BlazingSQL works with the rest of [RAPIDS AI](https://rapids.ai/)." 
1158 | ] 1159 | } 1160 | ], 1161 | "metadata": { 1162 | "accelerator": "GPU", 1163 | "colab": { 1164 | "collapsed_sections": [ 1165 | "McVBO7GHRDzz" 1166 | ], 1167 | "name": "BlazingSQL_Federated_Query_Demo.ipynb", 1168 | "provenance": [], 1169 | "toc_visible": true 1170 | }, 1171 | "kernelspec": { 1172 | "display_name": "Python 3", 1173 | "language": "python", 1174 | "name": "python3" 1175 | }, 1176 | "language_info": { 1177 | "codemirror_mode": { 1178 | "name": "ipython", 1179 | "version": 3 1180 | }, 1181 | "file_extension": ".py", 1182 | "mimetype": "text/x-python", 1183 | "name": "python", 1184 | "nbconvert_exporter": "python", 1185 | "pygments_lexer": "ipython3", 1186 | "version": "3.7.3" 1187 | } 1188 | }, 1189 | "nbformat": 4, 1190 | "nbformat_minor": 4 1191 | } 1192 | -------------------------------------------------------------------------------- /graphistry_netflow_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "kJyD4oSbugE0" 8 | }, 9 | "source": [ 10 | "# Graphistry Netflow Demo\n", 11 | "\n", 12 | "In this example we are taking millions of rows of netflow (network traffic flow) data in order to search for anomalous activity within a network. We will query 70M+ rows of network security data (netflow) with BlazingSQL and pass it to Graphistry for visualization." 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "## Blazing Context\n", 20 | "Here we import BlazingContext. You can think of the BlazingContext much like a Spark Context (i.e. where information such as FileSystems you have registered and Tables you have created will be stored). If you have issues running this cell, restart runtime and try running it again." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 12, 26 | "metadata": {}, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "Already connected to the Orchestrator\n", 33 | "BlazingContext ready\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "from blazingsql import BlazingContext \n", 39 | "\n", 40 | "bc = BlazingContext()" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": { 46 | "colab_type": "text", 47 | "id": "yp7z8bfivbna" 48 | }, 49 | "source": [ 50 | "### Create & Query Tables\n", 51 | "In this next cell we identify the full path to the data." 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 13, 57 | "metadata": {}, 58 | "outputs": [ 59 | { 60 | "data": { 61 | "text/plain": [ 62 | "'/home/winston/bsql-demos/data/*_0.parquet'" 63 | ] 64 | }, 65 | "execution_count": 13, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "# identify working directory path\n", 72 | "local_path = !pwd\n", 73 | "\n", 74 | "# make wildcard path to load all 4 parquet files into blazingsql\n", 75 | "path = str(local_path) + '/data/*_0.parquet'\n", 76 | "\n", 77 | "# what's the path? \n", 78 | "path" 79 | ] 80 | }, 81 | { 82 | "cell_type": "markdown", 83 | "metadata": {}, 84 | "source": [ 85 | "#### Create\n", 86 | "Here we use the path identified above to load all 4 parquet files into a single BlazingSQL table. This is done by using a wildcard (*) in the file path. \n", 87 | "\n", 88 | "Note: point path to `data/small-chunk2.csv` for pre-downloaded data."
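For that pre-downloaded alternative, the same call accepts the single CSV directly (a sketch):

```python
# equivalent table from the pre-downloaded sample CSV instead of the
# wildcard Parquet path
bc.create_table('netflow', 'data/small-chunk2.csv')
```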
89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 31, 94 | "metadata": { 95 | "colab": {}, 96 | "colab_type": "code", 97 | "id": "lU-2wlwQntnq" 98 | }, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "CPU times: user 4.16 ms, sys: 4.18 ms, total: 8.35 ms\n", 105 | "Wall time: 298 ms\n" 106 | ] 107 | }, 108 | { 109 | "data": { 110 | "text/plain": [ 111 | "" 112 | ] 113 | }, 114 | "execution_count": 31, 115 | "metadata": {}, 116 | "output_type": "execute_result" 117 | } 118 | ], 119 | "source": [ 120 | "%%time\n", 121 | "# blazingsql table from the wildcard parquet path\n", 122 | "bc.create_table('netflow', path)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": { 128 | "colab_type": "text", 129 | "id": "cgivbut9df-R" 130 | }, 131 | "source": [ 132 | "#### Query\n", 133 | "With the table made, we can simply run a SQL query.\n", 134 | "\n", 135 | "We are going to run some aggregations in order to condense these millions of rows into thousands of rows that represent nodes and edges." 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 32, 141 | "metadata": { 142 | "colab": { 143 | "base_uri": "https://localhost:8080/", 144 | "height": 277 145 | }, 146 | "colab_type": "code", 147 | "id": "umBG2Tp0wbQx", 148 | "outputId": "b89e3666-f85a-40e9-e7c4-cda9a80b7fe5" 149 | }, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "CPU times: user 29.3 ms, sys: 41.9 ms, total: 71.3 ms\n", 156 | "Wall time: 4.51 s\n" 157 | ] 158 | } 159 | ], 160 | "source": [ 161 | "%%time\n", 162 | "# what are we looking for \n", 163 | "query = '''\n", 164 | " SELECT\n", 165 | " a.firstSeenSrcIp as source,\n", 166 | " a.firstSeenDestIp as destination,\n", 167 | " count(a.firstSeenDestPort) as targetPorts,\n", 168 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n", 169 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n", 170 | " SUM(a.durationSeconds) as durationSeconds,\n", 171 | " MIN(parsedDate) as firstFlowDate,\n", 172 | " MAX(parsedDate) as lastFlowDate,\n", 173 | " COUNT(*) as attemptCount\n", 174 | " FROM\n", 175 | " netflow a\n", 176 | " GROUP BY\n", 177 | " a.firstSeenSrcIp,\n", 178 | " a.firstSeenDestIp\n", 179 | " '''\n", 180 | "\n", 181 | "# run sql query (returns cuDF DataFrame)\n", 182 | "gdf = bc.sql(query)\n", 183 | "\n", 184 | "# how do the results look?\n", 185 | "gdf.head(25)" 186 | ] 187 | } 188 | ], 189 | "metadata": { 190 | "file_extension": ".py", 191 | "kernelspec": { 192 | "display_name": "Python 3", 193 | "language": "python", 194 | "name": "python3" 195 | }, 196 | "language_info": { 197 | "codemirror_mode": { 198 | "name": "ipython", 199 | "version": 3 200 | }, 201 | "file_extension": ".py", 202 | "mimetype": "text/x-python", 203 | "name": "python", 204 | "nbconvert_exporter": "python", 205 | "pygments_lexer": "ipython3", 206 | "version": "3.7.3" 207 | }, 208 | "mimetype": "text/x-python", 209 | "name": "python", 210 | "npconvert_exporter": "python", 211 | "pygments_lexer": "ipython3", 212 | "version": 3 213 | }, 214 | "nbformat": 4, 215 | "nbformat_minor": 4 216 | } 217 | -------------------------------------------------------------------------------- /imgs/bsql_main.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BlazingDB/bsql-demos/ebee8a606a272f3e2ab7a38587a6092fe2018d93/imgs/bsql_main.png 
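A hedged sketch of the Graphistry hand-off described in `graphistry_netflow_demo.ipynb` above (assumes PyGraphistry is installed and an account has been registered with `graphistry.register`):

```python
import graphistry

# bind the SQL aliases from the netflow query as graph edge endpoints
g = graphistry.edges(gdf.to_pandas()).bind(source='source',
                                           destination='destination')
g.plot()  # renders the interactive network visualization
```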
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | blazingsql>=0.11 2 | cudf>=0.11 3 | cuml>=0.11 -------------------------------------------------------------------------------- /sample_use_cases/csv_to_parquet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# CSV to Parquet" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this demo we'll walk through querying a CSV file from an AWS S3 bucket and saving the results locally as a Parquet file." 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Identify the Dask Client (`client`) of your local GPUs, and pass it to BlazingContext (`bc`) upon initialization to activate distributed query execution with BlazingSQL." 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "BlazingContext ready\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "from dask_cuda import LocalCUDACluster\n", 39 | "cluster = LocalCUDACluster()\n", 40 | "\n", 41 | "from dask.distributed import Client\n", 42 | "client = Client(cluster)\n", 43 | "\n", 44 | "from blazingsql import BlazingContext\n", 45 | "bc = BlazingContext(dask_client=client, network_interface='lo')" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": {}, 51 | "source": [ 52 | "Register a public AWS S3 bucket and create a table (`taxi`) from it." 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "data": { 62 | "text/plain": [ 63 | "" 64 | ] 65 | }, 66 | "execution_count": 2, 67 | "metadata": {}, 68 | "output_type": "execute_result" 69 | } 70 | ], 71 | "source": [ 72 | "bc.s3('blazingsql-colab', bucket_name='blazingsql-colab')\n", 73 | "\n", 74 | "col_names = ['key', 'fare', 'pickup_x', 'pickup_y', 'dropoff_x', 'dropoff_y', 'passenger_count']\n", 75 | "bc.create_table('taxi', 's3://blazingsql-colab/taxi_data/taxi_00.csv', names=col_names)" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "Tag the path to the local directory where results will be saved as `data_dir`." 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": 3, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "from os import getcwd\n", 92 | "data_dir = getcwd().replace('/sample_use_cases', '/data')" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "\n", 100 | "\n", 101 | "As BlazingSQL returns a distributed query's results as a dask_cudf.DataFrame, we can write those results directly with [.to_parquet()](https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.to_parquet)." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 4, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "bc.sql('SELECT * FROM taxi').to_parquet(f'{data_dir}/yellow_cab')" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "Create a table from that newly written file, and run a simple query to see how it looks by `.compute()`ing to a cudf.DataFrame for display."
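The next cell chains three steps into one line; unpacked, the pattern it relies on is that `bc.sql` on a distributed BlazingContext returns a lazy dask_cudf.DataFrame, and `.compute()` gathers its partitions into a single cudf.DataFrame. A minimal sketch of that distinction, assuming `bc` and the `taxi` table from the cells above:

```python
# distributed result: a lazy dask_cudf.DataFrame partitioned across local GPUs
ddf = bc.sql('SELECT * FROM taxi')
print(type(ddf))

# materialize all partitions into one in-memory cudf.DataFrame
df = ddf.compute()
print(type(df), len(df))
```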
118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 5, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "data": { 127 | "text/plain": [ 284 | " key fare pickup_x pickup_y \\\n", 285 | "0 2012-02-02 22:30:19.0000002 8.9 -73.988703 40.758803 \n", 286 | "1 2014-09-20 07:19:24.0000001 4.0 -73.990208 40.746703 \n", 287 | "2 2013-02-23 07:18:05.0000001 5.5 -74.016757 40.709438 \n", 288 | "3 2015-04-18 23:49:27.0000009 13.5 -74.002708 40.733730 \n", 289 | "4 2010-03-04 08:15:59.0000001 10.5 -73.988356 40.737665 \n", 290 | "... ... ... ... ... \n", 291 | "4999995 2011-02-24 16:06:26.0000001 6.9 -73.966542 40.804975 \n", 292 | "4999996 2009-09-22 19:20:22.0000009 9.7 -73.980055 40.752535 \n", 293 | "4999997 2012-04-19 02:17:32.0000001 14.1 -73.998508 40.745305 \n", 294 | "4999998 2012-06-08 11:09:47.0000006 3.3 -73.953630 40.778797 \n", 295 | "4999999 2009-06-21 11:07:00.00000036 6.5 -73.981578 40.772575 \n", 296 | "\n", 297 | " dropoff_x dropoff_y passenger_count index \n", 298 | "0 -73.986517 40.737205 1 0 \n", 299 | "1 -73.994729 40.750512 1 1 \n", 300 | "2 -74.009 40.719496 3 2 \n", 301 | "3 -73.98609924 40.73477554 1 3 \n", 302 | "4 -74.012459 40.713934 1 4 \n", 303 | "... ... ... ... ... \n", 304 | "4999995 -73.949043 40.804227 2 4999995 \n", 305 | "4999996 -74.006443 40.739613 1 4999996 \n", 306 | "4999997 -73.953184 40.799361 2 4999997 \n", 307 | "4999998 -73.946068 40.775552 1 4999998 \n", 308 | "4999999 -73.963333 40.762132 1 4999999 \n", 309 | "\n", 310 | "[5000000 rows x 8 columns]" 311 | ] 312 | }, 313 | "execution_count": 5, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "bc.create_table('parquet_taxi', f'{data_dir}/yellow_cab/part.0.parquet')\n", 320 | "\n", 321 | "bc.sql('select * from parquet_taxi').compute()" 322 | ] 323 | }, 324 | { 325 | "cell_type": "markdown", 326 | "metadata": {}, 327 | "source": [ 328 | "You can find the Python script version of this Notebook at [/python_scripts/csv_to_parquet.py](python_scripts/csv_to_parquet.py)."
329 | ] 330 | } 331 | ], 332 | "metadata": { 333 | "kernelspec": { 334 | "display_name": "RAPIDS Nightly", 335 | "language": "python", 336 | "name": "python3" 337 | }, 338 | "language_info": { 339 | "codemirror_mode": { 340 | "name": "ipython", 341 | "version": 3 342 | }, 343 | "file_extension": ".py", 344 | "mimetype": "text/x-python", 345 | "name": "python", 346 | "nbconvert_exporter": "python", 347 | "pygments_lexer": "ipython3", 348 | "version": "3.7.6" 349 | } 350 | }, 351 | "nbformat": 4, 352 | "nbformat_minor": 4 353 | } 354 | -------------------------------------------------------------------------------- /sample_use_cases/python_scripts/csv_to_parquet.py: -------------------------------------------------------------------------------- 1 | from dask.distributed import Client 2 | from blazingsql import BlazingContext 3 | from dask_cuda import LocalCUDACluster 4 | 5 | # initialize BlazingContext with the Dask Client of local GPUs to distribute query execution 6 | bc = BlazingContext(dask_client=Client(LocalCUDACluster()), network_interface='lo') 7 | 8 | # register public AWS S3 bucket 9 | bc.s3('blazingsql-colab', bucket_name='blazingsql-colab') 10 | 11 | # create a table from that S3 bucket 12 | col_names = ['key', 'fare', 'pickup_x', 'pickup_y', 'dropoff_x', 'dropoff_y', 'passenger_count'] 13 | bc.create_table('taxi', 's3://blazingsql-colab/taxi_data/taxi_00.csv', names=col_names) 14 | 15 | # query the table & write results locally as parquet 16 | bc.sql('SELECT * FROM taxi').to_parquet('../../data/yellow_cab') 17 | -------------------------------------------------------------------------------- /taxi_fare_prediction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "l4fOFMjbRvkZ" 8 | }, 9 | "source": [ 10 | "# BlazingSQL + cuML NYC Taxi Cab Fare Prediction\n", 11 | "\n", 12 | "This demo uses publicly available [NYC Taxi Cab Data](https://www.kaggle.com/c/new-york-city-taxi-fare-prediction) to predict the total fare of a taxi ride in New York City given the pickup and dropoff locations. \n", 13 | "\n", 14 | "In this notebook, we will cover: \n", 15 | "- How to read and query multiple CSV files with BlazingSQL.\n", 16 | "- How to implement a linear regression model with cuML.\n", 17 | "\n", 18 | "### Imports\n", 19 | "This next cell will import all packages you need to run this notebook end-to-end." 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 1, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import os\n", 29 | "import urllib\n", 30 | "from cuml import LinearRegression\n", 31 | "from blazingsql import BlazingContext" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": {}, 37 | "source": [ 38 | "## Create BlazingContext\n", 39 | "You can think of the BlazingContext much like a Spark Context (i.e. where information such as FileSystems you have registered and Tables you have created will be stored). If you have issues running this cell, restart the runtime and try running it again."
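Since the BlazingContext holds both registered filesystems and created tables, the whole workflow runs through one object. A minimal sketch of that lifecycle, borrowing the public S3 bucket registered in csv_to_parquet.ipynb above (the table name `taxi_s3` is illustrative only):

```python
from blazingsql import BlazingContext

bc = BlazingContext()

# filesystems are registered once on the context...
bc.s3('blazingsql-colab', bucket_name='blazingsql-colab')

# ...and tables created on it are visible to every later query
cols = ['key', 'fare', 'pickup_x', 'pickup_y', 'dropoff_x', 'dropoff_y', 'passenger_count']
bc.create_table('taxi_s3', 's3://blazingsql-colab/taxi_data/taxi_00.csv', names=cols)

print(bc.sql('SELECT COUNT(*) FROM taxi_s3'))  # single-GPU context returns a cudf.DataFrame
```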
40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | "BlazingContext ready\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "# connect to BlazingSQL\n", 57 | "bc = BlazingContext()" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": { 63 | "colab_type": "text", 64 | "id": "Gt0TPBqif50q" 65 | }, 66 | "source": [ 67 | "### Download Data\n", 68 | "For this demo we will train our model with 25,000,000 rows of data from 5 CSV files (5M rows each).\n", 69 | "\n", 70 | "The cell below will check if you already have them, and, if you don't, will download them from AWS for you. " 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 3, 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "name": "stdout", 80 | "output_type": "stream", 81 | "text": [ 82 | "Downloading https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_00.csv to data/taxi_00.csv\n", 83 | "Downloading https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_01.csv to data/taxi_01.csv\n", 84 | "Downloading https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_02.csv to data/taxi_02.csv\n", 85 | "Downloading https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_03.csv to data/taxi_03.csv\n", 86 | "Downloading https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_04.csv to data/taxi_04.csv\n", 87 | "CPU times: user 4.19 s, sys: 5.16 s, total: 9.36 s\n", 88 | "Wall time: 26.8 s\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "%%time\n", 94 | "# download taxi data\n", 95 | "base_url = 'https://blazingsql-colab.s3.amazonaws.com/taxi_data/'\n", 96 | "for i in range(0, 5):\n", 97 | " fn = 'taxi_0' + str(i) + '.csv'\n", 98 | " # check if we already have the file\n", 99 | " if not os.path.isfile('data/' + fn):\n", 100 | " # we don't, so let us know we're downloading it now\n", 101 | " print(f'Downloading {base_url + fn} to data/{fn}')\n", 102 | " # download file\n", 103 | " urllib.request.urlretrieve(base_url + fn, 'data/' + fn)\n", 104 | " # we already have data\n", 105 | " else:\n", 106 | " # let us know\n", 107 | " print(f'data/{fn} already downloaded')" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": { 113 | "colab_type": "text", 114 | "id": "PXtydYrimQGt" 115 | }, 116 | "source": [ 117 | "## Extract, transform, load\n", 118 | "In order to train our Linear Regression model, we must first perform ETL to prepare our data.\n", 119 | "\n", 120 | "BlazingSQL currently requires the full file path to create tables; the cell below will identify that path for you." 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 4, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "'/home/jupyter-winston/bsql-demos/data/taxi_0*.csv'" 132 | ] 133 | }, 134 | "execution_count": 4, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "# identify current working directory\n", 141 | "cwd = os.getcwd()\n", 142 | "# add path to data w/ wildcard (*) so BSQL can read all 5 files at once\n", 143 | "data_path = cwd + '/data/taxi_0*.csv'\n", 144 | "# how's it look?\n", 145 | "data_path" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "### ETL: Create Table \n", 153 | "In this next cell we will create a single BlazingSQL table from all 5 CSVs."
154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 5, 159 | "metadata": { 160 | "colab": {}, 161 | "colab_type": "code", 162 | "id": "Gr7CUSrsEBmW" 163 | }, 164 | "outputs": [ 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "CPU times: user 3.13 ms, sys: 2.44 ms, total: 5.57 ms\n", 170 | "Wall time: 4.66 ms\n" 171 | ] 172 | }, 173 | { 174 | "data": { 175 | "text/plain": [ 176 | "" 177 | ] 178 | }, 179 | "execution_count": 5, 180 | "metadata": {}, 181 | "output_type": "execute_result" 182 | } 183 | ], 184 | "source": [ 185 | "%%time\n", 186 | "# tag column names and types\n", 187 | "col_names = ['key', 'fare_amount', 'pickup_longitude', 'pickup_latitude', \n", 188 | " 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']\n", 189 | "col_types = ['date64', 'float32', 'float32', 'float32',\n", 190 | " 'float32', 'float32', 'float32']\n", 191 | "\n", 192 | "# create a table from all 5 taxi files at once\n", 193 | "bc.create_table('train_taxi', data_path, names=col_names, dtype=col_types, header=0)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "markdown", 198 | "metadata": { 199 | "colab_type": "text", 200 | "id": "XnzjqEFnmDC5" 201 | }, 202 | "source": [ 203 | "### ETL: Query Tables for Training Data" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 7, 209 | "metadata": { 210 | "colab": { 211 | "base_uri": "https://localhost:8080/", 212 | "height": 425 213 | }, 214 | "colab_type": "code", 215 | "id": "_MDxz73ZMhhK", 216 | "outputId": "f2abeafc-0cdf-46b1-ddf5-a5cde3d37792" 217 | }, 218 | "outputs": [ 219 | { 220 | "data": { 221 | "text/plain": [ 305 | " hours days months years longitude_distance latitude_distance \\\n", 306 | "0 20.0 10.0 9.0 13.0 0.049057 0.003063 \n", 307 | "1 20.0 22.0 11.0 9.0 0.003464 0.007088 \n", 308 | "2 21.0 4.0 12.0 9.0 0.003151 0.007584 \n", 309 | "3 22.0 6.0 5.0 15.0 0.007141 0.011543 \n", 310 | "4 23.0 27.0 4.0 9.0 -0.014870 -0.033161 \n", 311 | "\n", 312 | " passenger_count \n", 313 | "0 1.0 \n", 314 | "1 1.0 \n", 315 | "2 1.0 \n", 316 | "3 1.0 \n", 317 | "4 1.0 " 318 | ] 319 | }, 320 | "execution_count": 7, 321 | "metadata": {}, 322 | "output_type": "execute_result" 323 | } 324 | ], 325 | "source": [ 326 | "# extract time columns, long & lat, # riders (all floats)\n", 327 | "query = '''\n", 328 | " select \n", 329 | " cast(hour(key) as float) hours, \n", 330 | " cast(dayofmonth(key) as float) days, \n", 331 | " cast(month(key) as float) months, \n", 332 | " cast(year(key) - 2000 as float) years, \n", 333 | " dropoff_longitude - pickup_longitude as longitude_distance, \n", 334 | " dropoff_latitude - pickup_latitude as latitude_distance, \n", 335 | " passenger_count \n", 336 | " from \n", 337 | " train_taxi\n", 338 | " '''\n", 339 | "\n", 340 | "# run query on table (returns cuDF DataFrame)\n", 341 | "X_train = bc.sql(query)\n", 342 | "\n", 343 | "# fill any null values \n", 344 | "X_train['longitude_distance'] = X_train['longitude_distance'].fillna(0)\n", 345 | "X_train['latitude_distance'] = X_train['latitude_distance'].fillna(0)\n", 346 | "X_train['passenger_count'] = X_train['passenger_count'].fillna(0)\n", 347 | "\n", 348 | "# how's it look? \n", 349 | "X_train.head()" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 8, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "data": { 359 | "text/plain": [ 407 | " fare_amount\n", 408 | "0 17.0\n", 409 | "1 3.3\n", 410 | "2 4.1\n", 411 | "3 6.0\n", 412 | "4 8.9" 413 | ] 414 | }, 415 | "execution_count": 8, 416 | "metadata": {}, 417 | "output_type": "execute_result" 418 | } 419 | ], 420 | "source": [ 421 | "# query dependent variable y\n", 422 | "y_train = bc.sql('SELECT fare_amount FROM train_taxi')\n", 423 | "# how's it look?\n", 424 | "y_train.head()" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "## Linear Regression\n", 432 | "To learn more about cuML's LinearRegression model, check out [Beginner’s Guide to Linear Regression in Google Colab with cuML](https://medium.com/future-vision/beginners-guide-to-linear-regression-in-python-with-cuml-30e2709c761?source=friends_link&sk=1da35920b9e2ffea59d5cb3c998bfeae).\n", 433 | "\n", 434 | "### LR: Train Model" 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": 9, 440 | "metadata": { 441 | "colab": { 442 | "base_uri": "https://localhost:8080/", 443 | "height": 531 444 | }, 445 | "colab_type": "code", 446 | "id": "tVUZvT9TB6Ii", 447 | "outputId": "d61c0249-47ee-40b8-a72f-9d62383f23dd" 448 | }, 449 | "outputs": [ 450 | { 451 | "name": "stdout", 452 | "output_type": "stream", 453 | "text": [ 454 | "Coefficients:\n", 455 | "0 -0.027069\n", 456 | "1 0.003295\n", 457 | "2 0.107198\n", 458 | "3 0.636705\n", 459 | "4 0.000932\n", 460 | "5 -0.000494\n", 461 | "6 0.092028\n", 462 | "dtype: float32\n", 463 | "\n", 464 | "Y intercept:\n", 465 | "3.3608126640319824\n", 466 | "\n", 467 | "CPU times: user 892 ms, sys: 412 ms, total: 1.3 s\n", 468 | "Wall time: 2.25 s\n" 469 | ] 470 | } 471 | ], 472 | "source": [ 473 | "%%time\n", 474 | "# create cuML model\n", 475 | "lr = LinearRegression(fit_intercept=True, normalize=False, algorithm=\"eig\")\n", 476 | "\n", 477 | "# train Linear Regression model \n", 478 | "reg = lr.fit(X_train, y_train)\n", 479 | "\n", 480 | "# display results\n", 481 | "print(f\"Coefficients:\\n{reg.coef_}\\n\")\n", 482 | "print(f\"Y intercept:\\n{reg.intercept_}\\n\")" 483 | ] 484 | }, 485 | { 486 | "cell_type": "markdown", 487 | "metadata": { 488 | "colab_type": "text", 489 | "id": "pHtni9xcl-ht" 490 | }, 491 | "source": [ 492 | "### LR: Use Model to Predict Future Taxi Fares \n", 493 | "\n", 494 | "#### Download Test Data\n", 495 | "The cell below will check to see if you've already got the test data, and, if you don't, will download it for you." 496 | ] 497 | }, 498 | { 499 | "cell_type": "code", 500 | "execution_count": 10, 501 | "metadata": {}, 502 | "outputs": [ 503 | { 504 | "name": "stdout", 505 | "output_type": "stream", 506 | "text": [ 507 | "--2020-01-23 04:49:37-- https://blazingsql-demos.s3-us-west-1.amazonaws.com/test.csv\n", 508 | "Resolving blazingsql-demos.s3-us-west-1.amazonaws.com (blazingsql-demos.s3-us-west-1.amazonaws.com)... 52.219.116.137\n", 509 | "Connecting to blazingsql-demos.s3-us-west-1.amazonaws.com (blazingsql-demos.s3-us-west-1.amazonaws.com)|52.219.116.137|:443... connected.\n", 510 | "HTTP request sent, awaiting response... 
200 OK\n", 511 | "Length: 982916 (960K) [text/csv]\n", 512 | "Saving to: ‘data/test.csv’\n", 513 | "\n", 514 | "test.csv 100%[===================>] 959.88K 2.22MB/s in 0.4s \n", 515 | "\n", 516 | "2020-01-23 04:49:38 (2.22 MB/s) - ‘data/test.csv’ saved [982916/982916]\n", 517 | "\n", 518 | "CPU times: user 8.09 ms, sys: 26.9 ms, total: 35 ms\n", 519 | "Wall time: 902 ms\n" 520 | ] 521 | } 522 | ], 523 | "source": [ 524 | "%%time\n", 525 | "# do we have the test taxi file?\n", 526 | "if not os.path.isfile('data/test.csv'):\n", 527 | " !wget -P data https://blazingsql-demos.s3-us-west-1.amazonaws.com/test.csv\n", 528 | "else:\n", 529 | " print('test data already downloaded')" 530 | ] 531 | }, 532 | { 533 | "cell_type": "code", 534 | "execution_count": 11, 535 | "metadata": { 536 | "colab": {}, 537 | "colab_type": "code", 538 | "id": "yRM5PosNiuGh" 539 | }, 540 | "outputs": [ 541 | { 542 | "name": "stdout", 543 | "output_type": "stream", 544 | "text": [ 545 | "CPU times: user 1.68 ms, sys: 5.19 ms, total: 6.87 ms\n", 546 | "Wall time: 5.42 ms\n" 547 | ] 548 | }, 549 | { 550 | "data": { 551 | "text/plain": [ 552 | "" 553 | ] 554 | }, 555 | "execution_count": 11, 556 | "metadata": {}, 557 | "output_type": "execute_result" 558 | } 559 | ], 560 | "source": [ 561 | "%%time\n", 562 | "# set column names and types\n", 563 | "col_names = ['key', 'fare_amount', 'pickup_longitude', 'pickup_latitude', \n", 564 | " 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']\n", 565 | "col_types = ['date64', 'float32', 'float32', 'float32', 'float32', 'float32', 'float32']\n", 566 | "\n", 567 | "# tag path to test data\n", 568 | "test_path = cwd + '/data/test.csv'\n", 569 | "\n", 570 | "# create test table directly from CSV\n", 571 | "bc.create_table('test_taxi', test_path, names=col_names, dtype=col_types)" 572 | ] 573 | }, 574 | { 575 | "cell_type": "code", 576 | "execution_count": 12, 577 | "metadata": { 578 | "colab": {}, 579 | "colab_type": "code", 580 | "id": "g4I8AJ51dpW5" 581 | }, 582 | "outputs": [ 583 | { 584 | "name": "stdout", 585 | "output_type": "stream", 586 | "text": [ 587 | "CPU times: user 61.8 ms, sys: 1.41 ms, total: 63.2 ms\n", 588 | "Wall time: 36.9 ms\n" 589 | ] 590 | }, 591 | { 592 | "data": { 593 | "text/plain": [ 677 | " hours days months years longitude_distance latitude_distance \\\n", 678 | "0 13.0 27.0 1.0 15.0 -0.008110 -0.019970 \n", 679 | "1 13.0 27.0 1.0 15.0 -0.012024 0.019814 \n", 680 | "2 11.0 8.0 10.0 11.0 0.002869 -0.005119 \n", 681 | "3 21.0 1.0 12.0 12.0 -0.009277 -0.016178 \n", 682 | "4 21.0 1.0 12.0 12.0 -0.022537 -0.045345 \n", 683 | "\n", 684 | " passenger_count \n", 685 | "0 1.0 \n", 686 | "1 1.0 \n", 687 | "2 1.0 \n", 688 | "3 1.0 \n", 689 | "4 1.0 " 690 | ] 691 | }, 692 | "execution_count": 12, 693 | "metadata": {}, 694 | "output_type": "execute_result" 695 | } 696 | ], 697 | "source": [ 698 | "%%time\n", 699 | "# extract time columns, long & lat, # riders (all floats)\n", 700 | "query = '''\n", 701 | " select \n", 702 | " cast(hour(key) as float) hours, \n", 703 | " cast(dayofmonth(key) as float) days, \n", 704 | " cast(month(key) as float) months, \n", 705 | " cast(year(key) - 2000 as float) years, \n", 706 | " dropoff_longitude - pickup_longitude as longitude_distance, \n", 707 | " dropoff_latitude - pickup_latitude as latitude_distance, \n", 708 | " passenger_count\n", 709 | " from \n", 710 | " test_taxi\n", 711 | " '''\n", 712 | "\n", 713 | "# run query on table (returns cuDF DataFrame)\n", 714 | "X_test = bc.sql(query)\n", 715 | "\n", 716 | "# fill null values \n", 717 | "X_test['longitude_distance'] = X_test['longitude_distance'].fillna(0)\n", 718 | "X_test['latitude_distance'] = X_test['latitude_distance'].fillna(0)\n", 719 | "X_test['passenger_count'] = X_test['passenger_count'].fillna(0)\n", 720 | "\n", 721 | "# how's it look? \n", 722 | "X_test.head()" 723 | ] 724 | }, 725 | { 726 | "cell_type": "code", 727 | "execution_count": 13, 728 | "metadata": { 729 | "colab": {}, 730 | "colab_type": "code", 731 | "id": "zCft6P5QkepN" 732 | }, 733 | "outputs": [ 734 | { 735 | "data": { 736 | "text/plain": [ 737 | "0 12.847689\n", 738 | "1 12.847666\n", 739 | "2 11.257179\n", 740 | "3 11.814514\n", 741 | "4 11.814518\n", 742 | "5 11.814510\n", 743 | "6 11.223505\n", 744 | "7 11.223265\n", 745 | "8 11.223516\n", 746 | "9 12.234369\n", 747 | "10 12.234383\n", 748 | "11 12.234411\n", 749 | "12 9.695659\n", 750 | "13 9.695644\n", 751 | "14 11.467134\n", 752 | "15 11.467148\n", 753 | "16 11.460003\n", 754 | "17 11.460035\n", 755 | "18 11.460011\n", 756 | "19 11.460001\n", 757 | "20 13.480091\n", 758 | "21 12.704147\n", 759 | "22 12.704123\n", 760 | "23 12.704136\n", 761 | "24 12.704132\n", 762 | "25 12.704119\n", 763 | "26 12.704292\n", 764 | "27 12.704145\n", 765 | "28 12.704140\n", 766 | "29 12.704115\n", 767 | " ... \n", 768 | "9884 12.641771\n", 769 | "9885 12.641808\n", 770 | "9886 12.641790\n", 771 | "9887 12.641766\n", 772 | "9888 12.641785\n", 773 | "9889 12.641790\n", 774 | "9890 12.641781\n", 775 | "9891 12.641809\n", 776 | "9892 12.641788\n", 777 | "9893 12.641804\n", 778 | "9894 12.641783\n", 779 | "9895 12.641851\n", 780 | "9896 12.641764\n", 781 | "9897 13.446104\n", 782 | "9898 13.204254\n", 783 | "9899 14.129877\n", 784 | "9900 13.363419\n", 785 | "9901 13.627535\n", 786 | "9902 14.162102\n", 787 | "9903 13.824402\n", 788 | "9904 13.664045\n", 789 | "9905 13.252615\n", 790 | "9906 14.129101\n", 791 | "9907 13.444111\n", 792 | "9908 13.710255\n", 793 | "9909 13.707689\n", 794 | "9910 13.150122\n", 795 | "9911 13.413801\n", 796 | "9912 13.645849\n", 797 | "9913 13.251087\n", 798 | "Length: 9914, dtype: float32" 799 | ] 800 | }, 801 | "execution_count": 13, 802 | "metadata": {}, 803 | "output_type": "execute_result" 804 | } 805 | ], 806 | "source": [ 807 | "# predict fares \n", 808 | "predictions = lr.predict(X_test)\n", 809 | "\n", 810 | "# display predictions\n", 811 | "predictions" 812 | ] 813 | }, 814 | { 815 | "cell_type": "code", 816 | "execution_count": 14, 817 | "metadata": { 818 | "colab": {}, 819 | "colab_type": "code", 820 | "id": "GdjUjJ42l2BI" 821 | }, 822 | "outputs": [ 823 | { 824 | "data": { 825 | "text/plain": [ 915 | " hours days months years longitude_distance latitude_distance \\\n", 916 | "0 13.0 27.0 1.0 15.0 -0.008110 -0.019970 \n", 917 | "1 13.0 27.0 1.0 15.0 -0.012024 0.019814 \n", 918 | "2 11.0 8.0 10.0 11.0 0.002869 -0.005119 \n", 919 | "3 21.0 1.0 12.0 12.0 -0.009277 -0.016178 \n", 920 | "4 21.0 1.0 12.0 12.0 -0.022537 -0.045345 \n", 921 | "\n", 922 | " passenger_count predicted_fare \n", 923 | "0 1.0 12.847689 \n", 924 | "1 1.0 12.847666 \n", 925 | "2 1.0 11.257179 \n", 926 | "3 1.0 11.814514 \n", 927 | "4 1.0 11.814518 " 928 | ] 929 | }, 930 | "execution_count": 14, 931 | "metadata": {}, 932 | "output_type": "execute_result" 933 | } 934 | ], 935 | "source": [ 936 | "# add predictions to test dataframe\n", 937 | "X_test['predicted_fare'] = predictions\n", 938 | "\n", 939 | "# how's that look?\n", 940 | "X_test.head()" 941 | ] 942 | } 943 | ], 944 | "metadata": { 945 | "accelerator": "GPU", 946 | "colab": { 947 | "collapsed_sections": [], 948 | "name": "BlazingSQL_cuML_Taxi_Fare_Prediction.ipynb", 949 | "provenance": [] 950 | }, 951 | "kernelspec": { 952 | "display_name": "Python 3", 953 | "language": "python", 954 | "name": "python3" 955 | }, 956 | "language_info": { 957 | "codemirror_mode": { 958 | "name": "ipython", 959 | "version": 3 960 | }, 961 | "file_extension": ".py", 962 | "mimetype": "text/x-python", 963 | "name": "python", 964 | "nbconvert_exporter": "python", 965 | "pygments_lexer": "ipython3", 966 | "version": "3.6.7" 967 | } 968 | }, 969 | "nbformat": 4, 970 | "nbformat_minor": 4 971 | } 972 | -------------------------------------------------------------------------------- /utils/env-check.py: -------------------------------------------------------------------------------- 1 | import sys, os 2 | 3 | sys.path.append('/usr/local/lib/python3.6/site-packages/') 4 | os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so' 5 | os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/' 6 | 7 | import pynvml 8 | 9 | pynvml.nvmlInit() 10 | handle = pynvml.nvmlDeviceGetHandleByIndex(0) 11 | device_name = pynvml.nvmlDeviceGetName(handle) 12 | 13 | if device_name != b'Tesla T4': 14 | raise Exception(""" 15 | Unfortunately Colab didn't give you a T4 GPU. 16 | 17 | Make sure you've configured Colab to request a GPU instance type. 18 | 19 | If you get a K80 GPU, try Runtime -> Reset all runtimes... 20 | """) 21 | else: 22 | print('*********************************************') 23 | print('Woo! Your instance has the right kind of GPU!') 24 | print('*********************************************') 25 | print() 26 | -------------------------------------------------------------------------------- /vs_pyspark_netflow.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "d0hJ4z8rBOFC" 8 | }, 9 | "source": [ 10 | "# BlazingSQL vs. Apache Spark \n", 11 | "\n", 12 | "Below we have one of our popular workloads running with [BlazingSQL](https://blazingsql.com/), and then with Apache Spark + PySpark.\n", 13 | "\n", 14 | "In this notebook, we will cover: \n", 15 | "- How to read and query CSV files with BlazingSQL.\n", 16 | "- How BlazingSQL compares against Apache Spark (analyzing over 20M records)."
17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "colab_type": "text", 23 | "id": "0guvG6Ws_zmX" 24 | }, 25 | "source": [ 26 | "## Import packages and create Blazing Context\n", 27 | "You can think of the BlazingContext much like a Spark Context (i.e. information such as FileSystems you have registered and Tables you have created will be stored here). " 28 | ] 29 | }, 30 | { 31 | "cell_type": "code", 32 | "execution_count": 1, 33 | "metadata": { 34 | "colab": { 35 | "base_uri": "https://localhost:8080/", 36 | "height": 35 37 | }, 38 | "colab_type": "code", 39 | "id": "ojm_V-WAtz0f", 40 | "outputId": "a46625f4-1494-4a13-eb13-2f38efd80ccf" 41 | }, 42 | "outputs": [ 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "BlazingContext ready\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "from blazingsql import BlazingContext\n", 53 | "# start up BlazingSQL\n", 54 | "bc = BlazingContext()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "colab_type": "text", 61 | "id": "yp7z8bfivbna" 62 | }, 63 | "source": [ 64 | "### Load & Query Table\n", 65 | "First, we need to download the netflow data (21,526,138 records) from AWS. If you do not wish to download the full 2.5G file, the first 100,000 rows of data are pre-downloaded at `data/small-chunk2.csv`; simply skip the cell below and change the file path when prompted two cells from now." 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 2, 71 | "metadata": { 72 | "colab": {}, 73 | "colab_type": "code", 74 | "id": "2dAt6DfG37KH" 75 | }, 76 | "outputs": [ 77 | { 78 | "name": "stdout", 79 | "output_type": "stream", 80 | "text": [ 81 | "--2020-01-20 22:14:17-- https://blazingsql-colab.s3.amazonaws.com/netflow_data/nf-chunk2.csv\n", 82 | "Resolving blazingsql-colab.s3.amazonaws.com (blazingsql-colab.s3.amazonaws.com)... 52.216.112.139\n", 83 | "Connecting to blazingsql-colab.s3.amazonaws.com (blazingsql-colab.s3.amazonaws.com)|52.216.112.139|:443... connected.\n", 84 | "HTTP request sent, awaiting response... 200 OK\n", 85 | "Length: 2725056295 (2.5G) [text/csv]\n", 86 | "Saving to: ‘data/nf-chunk2.csv’\n", 87 | "\n", 88 | "nf-chunk2.csv 100%[===================>] 2.54G 51.8MB/s in 49s \n", 89 | "\n", 90 | "2020-01-20 22:15:06 (53.2 MB/s) - ‘data/nf-chunk2.csv’ saved [2725056295/2725056295]\n", 91 | "\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "# save nf-chunk2 to data folder, may take a few minutes to download\n", 97 | "!wget -P data/ https://blazingsql-colab.s3.amazonaws.com/netflow_data/nf-chunk2.csv " 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": { 103 | "colab_type": "text", 104 | "id": "OTEaAsp2_zmf" 105 | }, 106 | "source": [ 107 | "## BlazingSQL \n", 108 | "Data in hand, we can test the performance of BlazingSQL on this dataset. \n", 109 | "\n", 110 | "To use pre-downloaded data, change the file path to `data/small-chunk2.csv`."
111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 3, 116 | "metadata": { 117 | "colab": { 118 | "base_uri": "https://localhost:8080/", 119 | "height": 52 120 | }, 121 | "colab_type": "code", 122 | "id": "rirBsYQU3NH5", 123 | "outputId": "51ced2b1-b930-4173-bbfa-09672e751d3f" 124 | }, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "'/home/winston/bsql-demos/data/nf-chunk2.csv'" 130 | ] 131 | }, 132 | "execution_count": 3, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "import os\n", 139 | "# determine current working directory \n", 140 | "cwd = os.getcwd()\n", 141 | "# complete path to data\n", 142 | "path = cwd + '/data/nf-chunk2.csv'\n", 143 | "# what's the path?\n", 144 | "path" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 4, 150 | "metadata": { 151 | "colab": { 152 | "base_uri": "https://localhost:8080/", 153 | "height": 52 154 | }, 155 | "colab_type": "code", 156 | "id": "zCzLEFfB3N4k", 157 | "outputId": "10ff9097-2736-423e-969d-de75983fbdda" 158 | }, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "CPU times: user 9.9 ms, sys: 13.1 ms, total: 23 ms\n", 165 | "Wall time: 1.14 s\n" 166 | ] 167 | }, 168 | { 169 | "data": { 170 | "text/plain": [ 171 | "" 172 | ] 173 | }, 174 | "execution_count": 4, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "%%time\n", 181 | "# create BlazingSQL table directly from the CSV file\n", 182 | "bc.create_table('netflow', path, header=0)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 5, 188 | "metadata": { 189 | "colab": { 190 | "base_uri": "https://localhost:8080/", 191 | "height": 295 192 | }, 193 | "colab_type": "code", 194 | "id": "umBG2Tp0wbQx", 195 | "outputId": "0975395e-7f5b-4244-afa3-45c8658ce61c" 196 | }, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "CPU times: user 5.07 s, sys: 2.61 s, total: 7.67 s\n", 203 | "Wall time: 10.4 s\n" 204 | ] 205 | } 206 | ], 207 | "source": [ 208 | "%%time\n", 209 | "# define the query\n", 210 | "query = '''\n", 211 | " SELECT\n", 212 | " a.firstSeenSrcIp as source,\n", 213 | " a.firstSeenDestIp as destination,\n", 214 | " count(a.firstSeenDestPort) as targetPorts,\n", 215 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n", 216 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n", 217 | " SUM(a.durationSeconds) as durationSeconds,\n", 218 | " MIN(parsedDate) as firstFlowDate,\n", 219 | " MAX(parsedDate) as lastFlowDate,\n", 220 | " COUNT(*) as attemptCount\n", 221 | " FROM \n", 222 | " netflow a\n", 223 | " GROUP BY\n", 224 | " a.firstSeenSrcIp,\n", 225 | " a.firstSeenDestIp\n", 226 | " '''\n", 227 | "\n", 228 | "# query the table (returns cuDF DataFrame)\n", 229 | "gdf = bc.sql(query)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 6, 235 | "metadata": { 236 | "colab": {}, 237 | "colab_type": "code", 238 | "id": "48_W2v8q_zmq", 239 | "outputId": "db0394f1-e082-49b0-c477-e3bba8d3d0f4" 240 | }, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 400 | " source destination targetPorts bytesOut bytesIn \\\n", 401 | "0 172.30.2.60 10.0.0.9 82 34839 47716 \n", 402 | "1 172.10.1.162 10.0.0.11 87 39628 53983 \n", 403 | "2 10.1.0.76 172.10.1.82 1 633 392 \n", 404 | "3 172.30.1.56 172.0.0.1 25 3330 3240 \n", 405 | "4 172.30.1.10 10.0.0.12 69 31042 43044 \n", 406 | "5 172.10.1.89 10.0.0.5 112 51222 70260 \n", 407 | "6 172.10.1.234 10.0.0.5 104 47287 64750 \n", 408 | "7 172.30.2.125 10.0.0.9 69 30701 41558 \n", 409 | "8 172.30.1.85 10.0.0.8 84 37828 52864 \n", 410 | "9 10.0.0.9 172.30.1.124 1 632 391 \n", 411 | "\n", 412 | " durationSeconds firstFlowDate lastFlowDate attemptCount \n", 413 | "0 134 2013-04-03 06:48:47 2013-04-03 12:12:37 82 \n", 414 | "1 24 2013-04-03 06:50:13 2013-04-03 14:58:35 87 \n", 415 | "2 0 2013-04-03 09:55:05 2013-04-03 09:55:05 1 \n", 416 | "3 67 2013-04-03 01:59:09 2013-04-03 22:05:39 25 \n", 417 | "4 25 2013-04-03 06:48:01 2013-04-03 12:11:40 69 \n", 418 | "5 24 2013-04-03 06:48:24 2013-04-03 15:17:39 112 \n", 419 | "6 18 2013-04-03 06:53:55 2013-04-03 15:11:07 104 \n", 420 | "7 341 2013-04-03 06:50:50 2013-04-03 12:12:37 69 \n", 421 | "8 3 2013-04-03 06:48:21 2013-04-03 12:06:53 84 \n", 422 | "9 0 2013-04-03 10:36:04 2013-04-03 10:36:04 1 " 423 | ] 424 | }, 425 | "execution_count": 6, 426 | "metadata": {}, 427 | "output_type": "execute_result" 428 | } 429 | ], 430 | "source": [ 431 | "# how's it look?\n", 432 | "gdf.head(10)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": { 438 | "colab_type": "text", 439 | "id": "6PXbjW1hTxrD" 440 | }, 441 | "source": [ 442 | "## Apache Spark\n", 443 | "The cell below installs Apache Spark ([PySpark](https://spark.apache.org/docs/latest/api/python/index.html))." 444 | ] 445 | }, 446 | { 447 | "cell_type": "code", 448 | "execution_count": 7, 449 | "metadata": { 450 | "colab": {}, 451 | "colab_type": "code", 452 | "id": "pnEEvVEtT8xi" 453 | }, 454 | "outputs": [ 455 | { 456 | "name": "stdout", 457 | "output_type": "stream", 458 | "text": [ 459 | "Collecting pyspark\n", 460 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/87/21/f05c186f4ddb01d15d0ddc36ef4b7e3cedbeb6412274a41f26b55a650ee5/pyspark-2.4.4.tar.gz (215.7MB)\n", 461 | "\u001b[K |████████████████████████████████| 215.7MB 50kB/s s eta 0:00:01\n", 462 | "\u001b[?25hCollecting py4j==0.10.7\n", 463 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)\n", 464 | "\u001b[K |████████████████████████████████| 204kB 54.4MB/s eta 0:00:01\n", 465 | "\u001b[?25hBuilding wheels for collected packages: pyspark\n", 466 | " Building wheel for pyspark (setup.py) ... 
\u001b[?25ldone\n", 467 | "\u001b[?25h Created wheel for pyspark: filename=pyspark-2.4.4-py2.py3-none-any.whl size=216130387 sha256=14abaa33edbf681f432ee00d234718731961da639e5eec86c4784667d43b4f5d\n", 468 | " Stored in directory: /home/winston/.cache/pip/wheels/ab/09/4d/0d184230058e654eb1b04467dbc1292f00eaa186544604b471\n", 469 | "Successfully built pyspark\n", 470 | "Installing collected packages: py4j, pyspark\n", 471 | "Successfully installed py4j-0.10.7 pyspark-2.4.4\n" 472 | ] 473 | } 474 | ], 475 | "source": [ 476 | "# installs Spark (2.4.4 Jan 2020)\n", 477 | "!pip install pyspark" 478 | ] 479 | }, 480 | { 481 | "cell_type": "markdown", 482 | "metadata": { 483 | "colab_type": "text", 484 | "id": "W3-XmZkz_zmw" 485 | }, 486 | "source": [ 487 | "#### PyBlazing vs PySpark\n", 488 | "With everything installed we can launch a SparkSession and see how BlazingSQL stacks up." 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": 1, 494 | "metadata": { 495 | "colab": { 496 | "base_uri": "https://localhost:8080/", 497 | "height": 51 498 | }, 499 | "colab_type": "code", 500 | "id": "nioEt2MqT9B0", 501 | "outputId": "f75b9823-5dbd-45b1-9282-562d3d6ddaf0" 502 | }, 503 | "outputs": [ 504 | { 505 | "name": "stdout", 506 | "output_type": "stream", 507 | "text": [ 508 | "CPU times: user 321 ms, sys: 208 ms, total: 529 ms\n", 509 | "Wall time: 3.65 s\n" 510 | ] 511 | } 512 | ], 513 | "source": [ 514 | "%%time\n", 515 | "# copied this cell's snippet from another Google Colab by Luca Canali here: https://colab.research.google.com/github/LucaCanali/sparkMeasure/blob/master/examples/SparkMeasure_Jupyter_Colab_Example.ipynb\n", 516 | "\n", 517 | "from pyspark.sql import SparkSession\n", 518 | "\n", 519 | "# Create Spark Session\n", 520 | "# This example uses a local cluster, you can modify master to use YARN or K8S if available \n", 521 | "# This example downloads sparkMeasure 0.13 for scala 2_11 from maven central\n", 522 | "\n", 523 | "spark = SparkSession \\\n", 524 | " .builder \\\n", 525 | " .master(\"local[*]\") \\\n", 526 | " .appName(\"PySpark Netflow Benchmark code\") \\\n", 527 | " .config(\"spark.jars.packages\",\"ch.cern.sparkmeasure:spark-measure_2.11:0.13\") \\\n", 528 | " .getOrCreate()" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": { 534 | "colab_type": "text", 535 | "id": "G8XSppQiUdLY" 536 | }, 537 | "source": [ 538 | "### Load & Query Table" 539 | ] 540 | }, 541 | { 542 | "cell_type": "code", 543 | "execution_count": 2, 544 | "metadata": { 545 | "colab": { 546 | "base_uri": "https://localhost:8080/", 547 | "height": 51 548 | }, 549 | "colab_type": "code", 550 | "id": "ZSLuSYSOUDtf", 551 | "outputId": "2b93169b-63c5-4c46-da14-af87645bf51b" 552 | }, 553 | "outputs": [ 554 | { 555 | "name": "stdout", 556 | "output_type": "stream", 557 | "text": [ 558 | "CPU times: user 20.2 ms, sys: 11.3 ms, total: 31.5 ms\n", 559 | "Wall time: 2min 46s\n" 560 | ] 561 | } 562 | ], 563 | "source": [ 564 | "%%time\n", 565 | "# load CSV into Spark\n", 566 | "netflow_df = spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('data/nf-chunk2.csv')" 567 | ] 568 | }, 569 | { 570 | "cell_type": "code", 571 | "execution_count": 3, 572 | "metadata": { 573 | "colab": { 574 | "base_uri": "https://localhost:8080/", 575 | "height": 51 576 | }, 577 | "colab_type": "code", 578 | "id": "iT3BwLn8UDwE", 579 | "outputId": "4eeff800-489f-4230-adb9-f3a1c16ede66" 580 | }, 581 | "outputs": [ 582 | { 583 | "name": "stdout", 584 | 
"output_type": "stream", 585 | "text": [ 586 | "CPU times: user 1.72 ms, sys: 176 µs, total: 1.9 ms\n", 587 | "Wall time: 157 ms\n" 588 | ] 589 | } 590 | ], 591 | "source": [ 592 | "%%time\n", 593 | "# create table for querying\n", 594 | "netflow_df.createOrReplaceTempView('netflow')" 595 | ] 596 | }, 597 | { 598 | "cell_type": "code", 599 | "execution_count": 4, 600 | "metadata": { 601 | "colab": { 602 | "base_uri": "https://localhost:8080/", 603 | "height": 493 604 | }, 605 | "colab_type": "code", 606 | "id": "9SBhahA5UD2k", 607 | "outputId": "accc1938-6470-44df-ab7f-70058c755b2b" 608 | }, 609 | "outputs": [ 610 | { 611 | "name": "stdout", 612 | "output_type": "stream", 613 | "text": [ 614 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n", 615 | "| source| destination|targetPorts|bytesOut|bytesIn|durationSeconds| firstFlowDate| lastFlowDate|attemptCount|\n", 616 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n", 617 | "| 172.10.1.13|239.255.255.250| 15| 2975| 0| 6|2013-04-03 06:36:19|2013-04-03 06:36:27| 15|\n", 618 | "|172.30.1.204|239.255.255.250| 8| 1750| 0| 6|2013-04-03 06:36:13|2013-04-03 06:36:20| 8|\n", 619 | "| 172.30.2.86| 172.0.0.1| 1| 540| 0| 2|2013-04-03 06:36:09|2013-04-03 06:36:09| 1|\n", 620 | "|172.30.1.246| 172.0.0.1| 29| 2610| 2610| 0|2013-04-03 00:26:46|2013-04-03 23:06:00| 29|\n", 621 | "| 172.30.1.51|239.255.255.250| 16| 3850| 0| 18|2013-04-03 06:35:22|2013-04-03 06:44:08| 16|\n", 622 | "| 172.10.1.35| 172.0.0.1| 1| 270| 0| 0|2013-04-03 06:36:21|2013-04-03 06:36:21| 1|\n", 623 | "| 172.20.1.91|239.255.255.250| 19| 3675| 0| 6|2013-04-03 06:36:50|2013-04-03 06:36:59| 19|\n", 624 | "|172.20.1.249|239.255.255.250| 2| 700| 0| 6|2013-04-03 06:37:17|2013-04-03 06:37:23| 2|\n", 625 | "|172.10.1.232| 172.0.0.1| 30| 3060| 3060| 48|2013-04-03 01:31:31|2013-04-03 22:53:36| 30|\n", 626 | "|172.10.1.238|239.255.255.250| 2| 700| 0| 6|2013-04-03 06:36:44|2013-04-03 06:36:51| 2|\n", 627 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n", 628 | "only showing top 10 rows\n", 629 | "\n", 630 | "CPU times: user 4.39 ms, sys: 8.82 ms, total: 13.2 ms\n", 631 | "Wall time: 1min 9s\n" 632 | ] 633 | } 634 | ], 635 | "source": [ 636 | "%%time\n", 637 | "# define the same query run tested on blazingsql above\n", 638 | "query = '''\n", 639 | " SELECT\n", 640 | " a.firstSeenSrcIp as source,\n", 641 | " a.firstSeenDestIp as destination,\n", 642 | " count(a.firstSeenDestPort) as targetPorts,\n", 643 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n", 644 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n", 645 | " SUM(a.durationSeconds) as durationSeconds,\n", 646 | " MIN(parsedDate) as firstFlowDate,\n", 647 | " MAX(parsedDate) as lastFlowDate,\n", 648 | " COUNT(*) as attemptCount\n", 649 | " FROM\n", 650 | " netflow a\n", 651 | " GROUP BY\n", 652 | " a.firstSeenSrcIp,\n", 653 | " a.firstSeenDestIp\n", 654 | " '''\n", 655 | "\n", 656 | "# query with Spark\n", 657 | "edges_df = spark.sql(query)\n", 658 | "\n", 659 | "# set/display results\n", 660 | "edges_df.show(10)" 661 | ] 662 | } 663 | ], 664 | "metadata": { 665 | "accelerator": "GPU", 666 | "colab": { 667 | "collapsed_sections": [], 668 | "name": "vs_pyspark_netflow.ipynb", 669 | "provenance": [], 670 | "toc_visible": true 671 | }, 672 | "kernelspec": { 673 | "display_name": "Python 3", 674 | 
"language": "python", 675 | "name": "python3" 676 | }, 677 | "language_info": { 678 | "codemirror_mode": { 679 | "name": "ipython", 680 | "version": 3 681 | }, 682 | "file_extension": ".py", 683 | "mimetype": "text/x-python", 684 | "name": "python", 685 | "nbconvert_exporter": "python", 686 | "pygments_lexer": "ipython3", 687 | "version": "3.7.6" 688 | } 689 | }, 690 | "nbformat": 4, 691 | "nbformat_minor": 4 692 | } 693 | --------------------------------------------------------------------------------