├── .gitignore
├── LICENSE
├── README.md
├── blazingsql_demo.ipynb
├── colab_notebooks
├── blazingsql_demo.ipynb
├── federated_query_demo.ipynb
├── graphistry_netflow_demo.ipynb
└── vs_pyspark_netflow.ipynb
├── data
├── Music.csv
├── cancer_data_00.csv
├── cancer_data_01.parquet
├── cancer_data_02.csv
└── small-chunk2.csv
├── federated_query_demo.ipynb
├── graphistry_netflow_demo.ipynb
├── imgs
└── bsql_main.png
├── requirements.txt
├── sample_use_cases
├── csv_to_parquet.ipynb
└── python_scripts
│ └── csv_to_parquet.py
├── taxi_fare_prediction.ipynb
├── utils
├── blazing_conda_test.ipynb
└── env-check.py
└── vs_pyspark_netflow.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | *.log
3 | *.csv
4 | *.parquet
5 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BlazingSQL Demos
2 | Demo Python notebooks using BlazingSQL with the RAPIDS AI ecosystem.
3 |
4 | | Notebook Title | Description |Launch in Colab|
5 | |----------------|----------------|----------------|
6 | | Getting Started | How to set up and get started with BlazingSQL and the RAPIDS AI suite |[](https://colab.research.google.com/github/BlazingDB/bsql-demos/blob/master/colab_notebooks/blazingsql_demo.ipynb)|
7 | | Netflow | Query 65M rows of network security data (netflow) with BlazingSQL and then pass to Graphistry to visualize and interact with the data |[](https://colab.research.google.com/github/BlazingDB/bsql-demos/blob/master/colab_notebooks/graphistry_netflow_demo.ipynb)|
8 | | Taxi | Train a linear regression model with cuML on 55 million rows of public NYC Taxi Data loaded with BlazingSQL |[](https://colab.research.google.com/github/BlazingDB/bsql-demos/blob/master/taxi_fare_prediction.ipynb)|
9 | | BlazingSQL vs. Apache Spark | Analyze 20 million rows of net flow data. Compare BlazingSQL and Apache Spark timings for the same workload |[](https://colab.research.google.com/github/BlazingDB/bsql-demos/blob/master/colab_notebooks/vs_pyspark_netflow.ipynb)|
10 | | Federated Query | In a single query, join an Apache Parquet file, a CSV file, and a GPU DataFrame (GDF) in GPU memory. |[](https://colab.research.google.com/github/BlazingDB/bsql-demos/blob/master/colab_notebooks/federated_query_demo.ipynb)|
--------------------------------------------------------------------------------
/blazingsql_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "8AdUt3HiUrc3"
8 | },
9 | "source": [
10 | "# Getting Started with BlazingSQL\n",
11 | "\n",
12 | "[BlazingSQL](https://github.com/BlazingDB/blazingsql) provides an open-source SQL interface to ETL massive datasets directly into GPU memory and the [RAPIDS.ai](https://github.com/rapidsai) Ecosystem. \n",
13 | "\n",
14 | "In this notebook, we will cover how to query cuDF (GPU) DataFrames with BlazingSQL. \n",
15 | "\n",
16 | "To learn more about the GPU DataFrame and how it enables end-to-end workloads on RAPIDS, check out our [blog post](https://blog.blazingdb.com/blazingsql-part-1-the-gpu-dataframe-gdf-and-cudf-in-rapids-ai-96ec15102240).\n",
17 | "\n",
18 | "## Imports"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "import cudf\n",
28 | "from blazingsql import BlazingContext"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {
34 | "colab_type": "text",
35 | "id": "aMwNKxePSwOp"
36 | },
37 | "source": [
38 | "## Connect to BlazingSQL - Create BlazingContext\n",
39 | "You can think of the BlazingContext much like a SparkContext; this is where information such as FileSystems you have registered and Tables you have created will be stored."
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 2,
45 | "metadata": {
46 | "colab": {
47 | "base_uri": "https://localhost:8080/",
48 | "height": 35
49 | },
50 | "colab_type": "code",
51 | "id": "ZR_vWwtMcvvY",
52 | "outputId": "c78cc40a-f7d8-4ac5-c255-d99edd03b785"
53 | },
54 | "outputs": [
55 | {
56 | "name": "stdout",
57 | "output_type": "stream",
58 | "text": [
59 | "BlazingContext ready\n"
60 | ]
61 | }
62 | ],
63 | "source": [
64 | "bc = BlazingContext()"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {
70 | "colab_type": "text",
71 | "id": "N2bqpDEnZyQf"
72 | },
73 | "source": [
74 | "## cuDF -> BSQL\n",
75 |         "In the next few cells, we'll generate a cuDF DataFrame and create a BlazingSQL table from it. "
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 3,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "data": {
85 | "text/html": [
86 | "
\n",
87 | "\n",
100 | "
\n",
101 | " \n",
102 | " \n",
103 | " | \n",
104 | " id | \n",
105 | " rank | \n",
106 | " score | \n",
107 | "
\n",
108 | " \n",
109 | " \n",
110 | " \n",
111 | " 0 | \n",
112 | " 1 | \n",
113 | " 1 | \n",
114 | " a | \n",
115 | "
\n",
116 | " \n",
117 | " 1 | \n",
118 | " 7 | \n",
119 | " 3 | \n",
120 | " b | \n",
121 | "
\n",
122 | " \n",
123 | " 2 | \n",
124 | " 4 | \n",
125 | " 4 | \n",
126 | " c | \n",
127 | "
\n",
128 | " \n",
129 | " 3 | \n",
130 | " 2 | \n",
131 | " 3 | \n",
132 | " d | \n",
133 | "
\n",
134 | " \n",
135 | " 4 | \n",
136 | " 9 | \n",
137 | " 5 | \n",
138 | " e | \n",
139 | "
\n",
140 | " \n",
141 | "
\n",
142 | "
"
143 | ],
144 | "text/plain": [
145 | " id rank score\n",
146 | "0 1 1 a\n",
147 | "1 7 3 b\n",
148 | "2 4 4 c\n",
149 | "3 2 3 d\n",
150 | "4 9 5 e"
151 | ]
152 | },
153 | "execution_count": 3,
154 | "metadata": {},
155 | "output_type": "execute_result"
156 | }
157 | ],
158 | "source": [
159 | "# generate cuDF DataFrame\n",
160 | "df = cudf.DataFrame()\n",
161 | "\n",
162 | "# add id & value columns\n",
163 | "df['id'] = [1, 7, 4, 2, 9]\n",
164 | "df['rank'] = [1, 3, 4, 3, 5]\n",
165 | "df['score'] = ['a', 'b', 'c', 'd', 'e']\n",
166 | "\n",
167 | "# how's it look?\n",
168 | "df"
169 | ]
170 | },
171 | {
172 | "cell_type": "markdown",
173 | "metadata": {
174 | "colab_type": "text",
175 | "id": "HJFz-mqZTJ5Z"
176 | },
177 | "source": [
178 | "#### Create a Table\n",
179 | "Now we can easily create a table with BlazingContext's `.create_table()` method. "
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 4,
185 | "metadata": {},
186 | "outputs": [
187 | {
188 | "data": {
189 | "text/plain": [
190 | ""
191 | ]
192 | },
193 | "execution_count": 4,
194 | "metadata": {},
195 | "output_type": "execute_result"
196 | }
197 | ],
198 | "source": [
199 | "# BlazingSQL table from DataFrame\n",
200 | "bc.create_table('table_a', df)"
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {
206 | "colab_type": "text",
207 | "id": "98HJFrt5TRa0"
208 | },
209 | "source": [
210 | "## Query a Table\n",
211 |         "We can now execute SQL queries with `.sql()`, which processes data on GPU and returns results as cuDF DataFrames!"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 5,
217 | "metadata": {},
218 | "outputs": [
219 | {
220 | "data": {
221 | "text/html": [
222 | "\n",
223 | "\n",
236 | "
\n",
237 | " \n",
238 | " \n",
239 | " | \n",
240 | " id | \n",
241 | " rank | \n",
242 | " score | \n",
243 | "
\n",
244 | " \n",
245 | " \n",
246 | " \n",
247 | " 0 | \n",
248 | " 1 | \n",
249 | " 1 | \n",
250 | " a | \n",
251 | "
\n",
252 | " \n",
253 | " 1 | \n",
254 | " 7 | \n",
255 | " 3 | \n",
256 | " b | \n",
257 | "
\n",
258 | " \n",
259 | "
\n",
260 | "
"
261 | ],
262 | "text/plain": [
263 | " id rank score\n",
264 | "0 1 1 a\n",
265 | "1 7 3 b"
266 | ]
267 | },
268 | "execution_count": 5,
269 | "metadata": {},
270 | "output_type": "execute_result"
271 | }
272 | ],
273 | "source": [
274 | "# query everything from the first 2 instances \n",
275 | "bc.sql('select * from table_a LIMIT 2')"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 6,
281 | "metadata": {},
282 | "outputs": [
283 | {
284 | "data": {
285 | "text/html": [
286 | "\n",
287 | "\n",
300 | "
\n",
301 | " \n",
302 | " \n",
303 | " | \n",
304 | " count(*) | \n",
305 | "
\n",
306 | " \n",
307 | " \n",
308 | " \n",
309 | " 0 | \n",
310 | " 5 | \n",
311 | "
\n",
312 | " \n",
313 | "
\n",
314 | "
"
315 | ],
316 | "text/plain": [
317 | " count(*)\n",
318 | "0 5"
319 | ]
320 | },
321 | "execution_count": 6,
322 | "metadata": {},
323 | "output_type": "execute_result"
324 | }
325 | ],
326 | "source": [
327 | "# query table - how many instances are there?\n",
328 | "bc.sql('select count(*) from table_a')"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": 7,
334 | "metadata": {
335 | "colab": {
336 | "base_uri": "https://localhost:8080/",
337 | "height": 1000
338 | },
339 | "colab_type": "code",
340 | "id": "14GwxmLsTV_p",
341 | "outputId": "144b7601-5363-49f8-d5af-13e80917672c"
342 | },
343 | "outputs": [
344 | {
345 | "data": {
346 | "text/html": [
347 | "\n",
348 | "\n",
361 | "
\n",
362 | " \n",
363 | " \n",
364 | " | \n",
365 | " id | \n",
366 | " rank | \n",
367 | " score | \n",
368 | "
\n",
369 | " \n",
370 | " \n",
371 | " \n",
372 | " 0 | \n",
373 | " 7 | \n",
374 | " 3 | \n",
375 | " b | \n",
376 | "
\n",
377 | " \n",
378 | " 1 | \n",
379 | " 9 | \n",
380 | " 5 | \n",
381 | " e | \n",
382 | "
\n",
383 | " \n",
384 | "
\n",
385 | "
"
386 | ],
387 | "text/plain": [
388 | " id rank score\n",
389 | "0 7 3 b\n",
390 | "1 9 5 e"
391 | ]
392 | },
393 | "execution_count": 7,
394 | "metadata": {},
395 | "output_type": "execute_result"
396 | }
397 | ],
398 | "source": [
399 |         "# query events with an id of at least 7\n",
400 | "bc.sql('SELECT * FROM table_a WHERE id >= 7')"
401 | ]
402 | },
403 | {
404 | "cell_type": "markdown",
405 | "metadata": {
406 | "colab_type": "text",
407 | "id": "wygAeTIFTm2X"
408 | },
409 | "source": [
410 | "# You're Ready to Rock\n",
411 |         "And... that's it! You are now live with BlazingSQL.\n",
412 | "\n",
413 | "\n",
414 | "Check out our [docs](https://docs.blazingdb.com) to get fancy or to learn more about how BlazingSQL works with the rest of [RAPIDS AI](https://rapids.ai/)."
415 | ]
416 | }
417 | ],
418 | "metadata": {
419 | "accelerator": "GPU",
420 | "colab": {
421 | "collapsed_sections": [],
422 | "name": "blazingsql_demo.ipynb",
423 | "provenance": []
424 | },
425 | "kernelspec": {
426 | "display_name": "winston@blazingdb.com",
427 | "language": "python",
428 | "name": "condaenv-winston_blazingdb.com"
429 | },
430 | "language_info": {
431 | "codemirror_mode": {
432 | "name": "ipython",
433 | "version": 3
434 | },
435 | "file_extension": ".py",
436 | "mimetype": "text/x-python",
437 | "name": "python",
438 | "nbconvert_exporter": "python",
439 | "pygments_lexer": "ipython3",
440 | "version": "3.7.3"
441 | }
442 | },
443 | "nbformat": 4,
444 | "nbformat_minor": 4
445 | }
446 |
--------------------------------------------------------------------------------
/colab_notebooks/blazingsql_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "8AdUt3HiUrc3"
8 | },
9 | "source": [
10 | "# Getting Started with BlazingSQL\n",
11 | "\n",
12 | "In this notebook, we will cover: \n",
13 | "- How to set up [BlazingSQL](https://blazingsql.com) and the [RAPIDS AI](https://rapids.ai/) suite.\n",
14 | "- How to read and query csv files with cuDF and BlazingSQL.\n",
15 | ""
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "colab_type": "text",
22 | "id": "_h26epJpUeZP"
23 | },
24 | "source": [
25 | "## Setup\n",
26 | "### Environment Sanity Check \n",
27 | "\n",
28 | "RAPIDS packages (BlazingSQL included) require Pascal+ architecture to run. For Colab, this translates to a T4 GPU instance. \n",
29 | "\n",
30 | "The cell below will let you know what type of GPU you've been allocated, and how to proceed."
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 1,
36 | "metadata": {
37 | "colab": {
38 | "base_uri": "https://localhost:8080/",
39 | "height": 322
40 | },
41 | "colab_type": "code",
42 | "id": "_lf6yKBoRYGy",
43 | "outputId": "8e9f7e7e-b89f-49bd-fd3c-c435ffb55c9c"
44 | },
45 | "outputs": [
46 | {
47 | "name": "stdout",
48 | "output_type": "stream",
49 | "text": [
50 | "\n",
51 | "\n",
52 | "***********************************\n",
53 | "GPU = b'Tesla T4'\n",
54 | "Woo! You got the right kind of GPU!\n",
55 | "***********************************\n",
56 | "\n",
57 | "\n"
58 | ]
59 | }
60 | ],
61 | "source": [
62 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/colab_env.py\n",
63 | "!python colab_env.py "
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {
69 | "colab_type": "text",
70 | "id": "xM8xTlqeRi-g"
71 | },
72 | "source": [
73 | "## Installs \n",
74 | "The cell below pulls our Google Colab install script from the `bsql-demos` repo then runs it. The script first installs miniconda, then uses miniconda to install BlazingSQL and RAPIDS AI. This takes a few minutes to run. "
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {
81 | "colab": {},
82 | "colab_type": "code",
83 | "id": "gfWF_lG1HqV7"
84 | },
85 | "outputs": [],
86 | "source": [
87 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/bsql-colab.sh \n",
88 | "!bash bsql-colab.sh\n",
89 | "\n",
90 | "import sys, os, time\n",
91 | "sys.path.append('/usr/local/lib/python3.6/site-packages/')\n",
92 | "os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n",
93 | "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'\n",
94 | "\n",
95 | "import subprocess\n",
96 | "subprocess.Popen(['blazingsql-orchestrator', '9100', '8889', '127.0.0.1', '8890'],stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)\n",
97 | "subprocess.Popen(['java', '-jar', '/usr/local/lib/blazingsql-algebra.jar', '-p', '8890'])\n",
98 | "\n",
99 | "import pyblazing.apiv2.context as cont\n",
100 | "cont.runRal()\n",
101 | "time.sleep(1) "
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {
107 | "colab_type": "text",
108 | "id": "aMwNKxePSwOp"
109 | },
110 | "source": [
111 | "## Import packages and create Blazing Context\n",
112 | "You can think of the BlazingContext much like a Spark Context (i.e. where information such as FileSystems you have registered and Tables you have created will be stored). If you have issues running this cell, restart runtime and try running it again.\n"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 1,
118 | "metadata": {
119 | "colab": {
120 | "base_uri": "https://localhost:8080/",
121 | "height": 35
122 | },
123 | "colab_type": "code",
124 | "id": "ZR_vWwtMcvvY",
125 | "outputId": "c78cc40a-f7d8-4ac5-c255-d99edd03b785"
126 | },
127 | "outputs": [
128 | {
129 | "name": "stdout",
130 | "output_type": "stream",
131 | "text": [
132 | "BlazingContext ready\n"
133 | ]
134 | }
135 | ],
136 | "source": [
137 | "import cudf\n",
138 | "from blazingsql import BlazingContext\n",
139 | "# start up BlazingSQL\n",
140 | "bc = BlazingContext()"
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "metadata": {
146 | "colab_type": "text",
147 | "id": "N2bqpDEnZyQf"
148 | },
149 | "source": [
150 | "## Read CSV\n",
151 |         "First we need to download a CSV file. Then we use cuDF to read the CSV file, which gives us a GPU DataFrame (GDF). To learn more about the GDF and how it enables end-to-end workloads on RAPIDS, check out our [blog post](https://blog.blazingdb.com/blazingsql-part-1-the-gpu-dataframe-gdf-and-cudf-in-rapids-ai-96ec15102240)."
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": 2,
157 | "metadata": {
158 | "colab": {
159 | "base_uri": "https://localhost:8080/",
160 | "height": 204
161 | },
162 | "colab_type": "code",
163 | "id": "iqRDacOBOg44",
164 | "outputId": "dccb35e0-c284-498b-80b7-8cfc84a7a6a7"
165 | },
166 | "outputs": [
167 | {
168 | "name": "stdout",
169 | "output_type": "stream",
170 | "text": [
171 | "--2020-01-23 02:59:55-- https://s3.amazonaws.com/blazingsql-colab/Music.csv\n",
172 | "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.0.133\n",
173 | "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.0.133|:443... connected.\n",
174 | "HTTP request sent, awaiting response... 200 OK\n",
175 | "Length: 10473 (10K) [text/csv]\n",
176 | "Saving to: ‘Music.csv’\n",
177 | "\n",
178 | "Music.csv 100%[===================>] 10.23K --.-KB/s in 0s \n",
179 | "\n",
180 | "2020-01-23 02:59:55 (190 MB/s) - ‘Music.csv’ saved [10473/10473]\n",
181 | "\n"
182 | ]
183 | }
184 | ],
185 | "source": [
186 | "#Download the test CSV\n",
187 | "!wget 'https://s3.amazonaws.com/blazingsql-colab/Music.csv'"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 3,
193 | "metadata": {
194 | "colab": {},
195 | "colab_type": "code",
196 | "id": "HhRhj-ZvZygH"
197 | },
198 | "outputs": [
199 | {
200 | "data": {
201 | "text/html": [
202 | "\n",
203 | "\n",
216 | "
\n",
217 | " \n",
218 | " \n",
219 | " | \n",
220 | " ARTIST | \n",
221 | " RATING | \n",
222 | " YEAR | \n",
223 | " LOCATION | \n",
224 | " FESTIVAL_SET | \n",
225 | "
\n",
226 | " \n",
227 | " \n",
228 | " \n",
229 | " 0 | \n",
230 | " Arcade Fire | \n",
231 | " 10.0 | \n",
232 | " 2018.0 | \n",
233 | " Las Vegas | \n",
234 | " 1.0 | \n",
235 | "
\n",
236 | " \n",
237 | " 1 | \n",
238 | " Justice | \n",
239 | " 10.0 | \n",
240 | " 2018.0 | \n",
241 | " Las Vegas | \n",
242 | " 1.0 | \n",
243 | "
\n",
244 | " \n",
245 | " 2 | \n",
246 | " Florence and The Machine | \n",
247 | " 10.0 | \n",
248 | " 2018.0 | \n",
249 | " Las Vegas | \n",
250 | " 1.0 | \n",
251 | "
\n",
252 | " \n",
253 | " 3 | \n",
254 | " Odesza | \n",
255 | " 10.0 | \n",
256 | " 2018.0 | \n",
257 | " Indio | \n",
258 | " 1.0 | \n",
259 | "
\n",
260 | " \n",
261 | " 4 | \n",
262 | " Bon Iver | \n",
263 | " 10.0 | \n",
264 | " 2017.0 | \n",
265 | " Indio | \n",
266 | " 1.0 | \n",
267 | "
\n",
268 | " \n",
269 | "
\n",
270 | "
"
271 | ],
272 | "text/plain": [
273 | " ARTIST RATING YEAR LOCATION FESTIVAL_SET\n",
274 | "0 Arcade Fire 10.0 2018.0 Las Vegas 1.0\n",
275 | "1 Justice 10.0 2018.0 Las Vegas 1.0\n",
276 | "2 Florence and The Machine 10.0 2018.0 Las Vegas 1.0\n",
277 | "3 Odesza 10.0 2018.0 Indio 1.0\n",
278 | "4 Bon Iver 10.0 2017.0 Indio 1.0"
279 | ]
280 | },
281 | "execution_count": 3,
282 | "metadata": {},
283 | "output_type": "execute_result"
284 | }
285 | ],
286 | "source": [
287 | "# like pandas, cudf can simply read the csv\n",
288 | "gdf = cudf.read_csv('Music.csv')\n",
289 | "\n",
290 | "# let's see how it looks\n",
291 | "gdf.head()"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {
297 | "colab_type": "text",
298 | "id": "HJFz-mqZTJ5Z"
299 | },
300 | "source": [
301 | "## Create a Table\n",
302 | "Now we just need to create a table. "
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 4,
308 | "metadata": {
309 | "colab": {},
310 | "colab_type": "code",
311 | "id": "HJuvtJDYTMyb"
312 | },
313 | "outputs": [
314 | {
315 | "data": {
316 | "text/plain": [
317 | ""
318 | ]
319 | },
320 | "execution_count": 4,
321 | "metadata": {},
322 | "output_type": "execute_result"
323 | }
324 | ],
325 | "source": [
326 | "bc.create_table('music', gdf, header=0)"
327 | ]
328 | },
329 | {
330 | "cell_type": "markdown",
331 | "metadata": {
332 | "colab_type": "text",
333 | "id": "98HJFrt5TRa0"
334 | },
335 | "source": [
336 | "## Query a Table\n",
337 |         "That's it! Now when you write a SQL query, the data will get processed on the GPU with BlazingSQL, and the output will be a GPU DataFrame (GDF) inside RAPIDS!"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": 5,
343 | "metadata": {
344 | "colab": {
345 | "base_uri": "https://localhost:8080/",
346 | "height": 1000
347 | },
348 | "colab_type": "code",
349 | "id": "14GwxmLsTV_p",
350 | "outputId": "144b7601-5363-49f8-d5af-13e80917672c"
351 | },
352 | "outputs": [
353 | {
354 | "data": {
355 | "text/html": [
356 | "\n",
357 | "\n",
370 | "
\n",
371 | " \n",
372 | " \n",
373 | " | \n",
374 | " ARTIST | \n",
375 | " RATING | \n",
376 | " LOCATION | \n",
377 | "
\n",
378 | " \n",
379 | " \n",
380 | " \n",
381 | " 0 | \n",
382 | " Arcade Fire | \n",
383 | " 10.0 | \n",
384 | " Las Vegas | \n",
385 | "
\n",
386 | " \n",
387 | " 1 | \n",
388 | " Justice | \n",
389 | " 10.0 | \n",
390 | " Las Vegas | \n",
391 | "
\n",
392 | " \n",
393 | " 2 | \n",
394 | " Florence and The Machine | \n",
395 | " 10.0 | \n",
396 | " Las Vegas | \n",
397 | "
\n",
398 | " \n",
399 | " 3 | \n",
400 | " Odesza | \n",
401 | " 10.0 | \n",
402 | " Indio | \n",
403 | "
\n",
404 | " \n",
405 | " 4 | \n",
406 | " Bon Iver | \n",
407 | " 10.0 | \n",
408 | " Indio | \n",
409 | "
\n",
410 | " \n",
411 | " 5 | \n",
412 | " LA Philharmonic + Sigur Ros | \n",
413 | " 10.0 | \n",
414 | " LA | \n",
415 | "
\n",
416 | " \n",
417 | " 6 | \n",
418 | " Sigur Ros | \n",
419 | " 10.0 | \n",
420 | " Malmo | \n",
421 | "
\n",
422 | " \n",
423 | " 7 | \n",
424 | " Arcade Fire | \n",
425 | " 10.0 | \n",
426 | " Indio | \n",
427 | "
\n",
428 | " \n",
429 | " 8 | \n",
430 | " Escort | \n",
431 | " 9.0 | \n",
432 | " San Francisco | \n",
433 | "
\n",
434 | " \n",
435 | " 9 | \n",
436 | " Phoenix | \n",
437 | " 9.0 | \n",
438 | " Berkeley | \n",
439 | "
\n",
440 | " \n",
441 | "
\n",
442 | "
"
443 | ],
444 | "text/plain": [
445 | " ARTIST RATING LOCATION\n",
446 | "0 Arcade Fire 10.0 Las Vegas\n",
447 | "1 Justice 10.0 Las Vegas\n",
448 | "2 Florence and The Machine 10.0 Las Vegas\n",
449 | "3 Odesza 10.0 Indio\n",
450 | "4 Bon Iver 10.0 Indio\n",
451 | "5 LA Philharmonic + Sigur Ros 10.0 LA\n",
452 | "6 Sigur Ros 10.0 Malmo\n",
453 | "7 Arcade Fire 10.0 Indio\n",
454 | "8 Escort 9.0 San Francisco\n",
455 | "9 Phoenix 9.0 Berkeley"
456 | ]
457 | },
458 | "execution_count": 5,
459 | "metadata": {},
460 | "output_type": "execute_result"
461 | }
462 | ],
463 | "source": [
464 | "# query 10 events with a rating of at least 7\n",
465 | "gdf = bc.sql('select ARTIST, RATING, LOCATION from music where RATING >= 7 limit 10')\n",
466 | "\n",
467 | "# display GDF (just like pandas)\n",
468 | "gdf"
469 | ]
470 | },
471 | {
472 | "cell_type": "markdown",
473 | "metadata": {
474 | "colab_type": "text",
475 | "id": "wygAeTIFTm2X"
476 | },
477 | "source": [
478 | "# You're Ready to Rock\n",
479 | "And... that's it! You are now live with BlazingSQL.\n",
480 | "\n",
481 | "\n",
482 | "Check out our [docs](https://docs.blazingdb.com) to get fancy or to learn more about how BlazingSQL works with the rest of [RAPIDS AI](https://rapids.ai/)."
483 | ]
484 | }
485 | ],
486 | "metadata": {
487 | "accelerator": "GPU",
488 | "colab": {
489 | "collapsed_sections": [],
490 | "name": "blazingsql_demo.ipynb",
491 | "provenance": []
492 | },
493 | "kernelspec": {
494 | "display_name": "Python 3",
495 | "language": "python",
496 | "name": "python3"
497 | },
498 | "language_info": {
499 | "codemirror_mode": {
500 | "name": "ipython",
501 | "version": 3
502 | },
503 | "file_extension": ".py",
504 | "mimetype": "text/x-python",
505 | "name": "python",
506 | "nbconvert_exporter": "python",
507 | "pygments_lexer": "ipython3",
508 | "version": "3.6.7"
509 | }
510 | },
511 | "nbformat": 4,
512 | "nbformat_minor": 4
513 | }
514 |
--------------------------------------------------------------------------------
/colab_notebooks/graphistry_netflow_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "kJyD4oSbugE0"
8 | },
9 | "source": [
10 | "# Graphistry Netflow Demo\n",
11 | "\n",
12 | "In this example we are taking millions of rows of netflow (network traffic flow) data in order to search for anomalous activity within a network.\n",
13 | "\n",
14 | "In this notebook, we will: \n",
15 | "- Set up [BlazingSQL](https://blazingsql.com) and the [RAPIDS AI](https://rapids.ai/) suite.\n",
16 | "- Query 20M rows of network security data (netflow) with BlazingSQL and then pass to Graphistry to visualize and interact with the data.\n",
17 | ""
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "## Setup\n",
25 | "### Environment Sanity Check \n",
26 | "\n",
27 | "RAPIDS packages (BlazingSQL included) require Pascal+ architecture to run. For Colab, this translates to a T4 GPU instance. \n",
28 | "\n",
29 | "The cell below will let you know what type of GPU you've been allocated, and how to proceed."
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 1,
35 | "metadata": {
36 | "colab": {
37 | "base_uri": "https://localhost:8080/",
38 | "height": 312
39 | },
40 | "colab_type": "code",
41 | "id": "zxhxwrfI7aoT",
42 | "outputId": "0880eafa-a0b1-4f39-d3dc-bab9d4e8b127"
43 | },
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | "\n",
50 | "\n",
51 | "***********************************\n",
52 | "GPU = b'Tesla T4'\n",
53 | "Woo! You got the right kind of GPU!\n",
54 | "***********************************\n",
55 | "\n",
56 | "\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/colab_env.py\n",
62 | "!python colab_env.py "
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "## Installs \n",
70 | "The cell below pulls our Google Colab install script from the `bsql-demos` repo then runs it. The script first installs miniconda, then uses miniconda to install BlazingSQL and RAPIDS AI. This takes a few minutes to run. "
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 3,
76 | "metadata": {
77 | "colab": {
78 | "base_uri": "https://localhost:8080/",
79 | "height": 35
80 | },
81 | "colab_type": "code",
82 | "id": "a7RprJxtZZtQ",
83 | "outputId": "5ed256e4-93ee-4295-914d-c5c75c9d6059"
84 | },
85 | "outputs": [],
86 | "source": [
87 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/bsql-colab.sh \n",
88 | "!bash bsql-colab.sh\n",
89 | "\n",
90 | "import sys, os, time\n",
91 | "sys.path.append('/usr/local/lib/python3.6/site-packages/')\n",
92 | "os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n",
93 | "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'\n",
94 | "\n",
95 | "import subprocess\n",
96 | "subprocess.Popen(['blazingsql-orchestrator', '9100', '8889', '127.0.0.1', '8890'],stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)\n",
97 | "subprocess.Popen(['java', '-jar', '/usr/local/lib/blazingsql-algebra.jar', '-p', '8890'])\n",
98 | "\n",
99 | "import pyblazing.apiv2.context as cont\n",
100 | "cont.runRal()\n",
101 | "time.sleep(1) \n",
102 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/colab_env.py\n",
103 | "!python colab_env.py "
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {
109 | "colab_type": "text",
110 | "id": "4guM6G87ul8e"
111 | },
112 | "source": [
113 | "## Download CSV\n",
114 | "\n",
115 | "The cell below will download the data for this demo from AWS and store it locally as `nf-chunk2.csv`. "
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {
122 | "colab": {
123 | "base_uri": "https://localhost:8080/",
124 | "height": 208
125 | },
126 | "colab_type": "code",
127 | "id": "F6teFkVGufUf",
128 | "outputId": "42fedd97-8baf-4d1a-ea41-95602cd8cb11"
129 | },
130 | "outputs": [
131 | {
132 | "name": "stdout",
133 | "output_type": "stream",
134 | "text": [
135 | "--2019-08-23 21:43:50-- https://blazingsql-colab.s3.amazonaws.com/netflow_data/nf-chunk2.csv\n",
136 | "Resolving blazingsql-colab.s3.amazonaws.com (blazingsql-colab.s3.amazonaws.com)... 52.216.137.76\n",
137 | "Connecting to blazingsql-colab.s3.amazonaws.com (blazingsql-colab.s3.amazonaws.com)|52.216.137.76|:443... connected.\n",
138 | "HTTP request sent, awaiting response... 200 OK\n",
139 | "Length: 2725056295 (2.5G) [text/csv]\n",
140 | "Saving to: ‘nf-chunk2.csv’\n",
141 | "\n",
142 | "nf-chunk2.csv 100%[===================>] 2.54G 49.2MB/s in 56s \n",
143 | "\n",
144 | "2019-08-23 21:44:46 (46.2 MB/s) - ‘nf-chunk2.csv’ saved [2725056295/2725056295]\n",
145 | "\n"
146 | ]
147 | }
148 | ],
149 | "source": [
150 | "!wget https://blazingsql-colab.s3.amazonaws.com/netflow_data/nf-chunk2.csv"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "## Blazing Context\n",
158 | "Here we are importing cuDF and BlazingContext. You can think of the BlazingContext much like a Spark Context (i.e. where information such as FileSystems you have registered and Tables you have created will be stored). If you have issues running this cell, restart runtime and try running it again."
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 1,
164 | "metadata": {
165 | "colab": {
166 | "base_uri": "https://localhost:8080/",
167 | "height": 69
168 | },
169 | "colab_type": "code",
170 | "id": "pqQ8lqL8vb-8",
171 | "outputId": "4e5ebc46-6319-4d3a-851c-7d6a2ac2825d"
172 | },
173 | "outputs": [
174 | {
175 | "name": "stdout",
176 | "output_type": "stream",
177 | "text": [
178 | "BlazingContext ready\n"
179 | ]
180 | }
181 | ],
182 | "source": [
183 | "from blazingsql import BlazingContext\n",
184 | "import cudf\n",
185 | "# start up BlazingSQL\n",
186 | "bc = BlazingContext()"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {
192 | "colab_type": "text",
193 | "id": "yp7z8bfivbna"
194 | },
195 | "source": [
196 | "### Load & Query Tables\n",
197 | "\n",
198 | "In the cell below, we are first loading the CSV file into a GPU DataFrame (gdf), and then creating tables so that we can run SQL queries on those GDFs. \n",
199 | "\n",
200 | "Note: when you create a table off of a GDF there is no copy, it is merely registering the schema."
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 2,
206 | "metadata": {
207 | "colab": {},
208 | "colab_type": "code",
209 | "id": "lU-2wlwQntnq"
210 | },
211 | "outputs": [
212 | {
213 | "data": {
214 | "text/plain": [
215 | ""
216 | ]
217 | },
218 | "execution_count": 2,
219 | "metadata": {},
220 | "output_type": "execute_result"
221 | }
222 | ],
223 | "source": [
224 | "# Load CSVs into GPU DataFrames (gdf)\n",
225 | "netflow_gdf = cudf.read_csv('nf-chunk2.csv')\n",
226 | "\n",
227 | "# Create BlazingSQL Tables - There is no copy in this process\n",
228 | "bc.create_table('netflow', netflow_gdf)"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {
234 | "colab_type": "text",
235 | "id": "cgivbut9df-R"
236 | },
237 | "source": [
238 | "#### Query\n",
239 | "With the table made, we can simply run a SQL query.\n",
240 | "\n",
241 | "We are going to run some joins and aggregations in order to condense these millions of rows into thousands of rows that represent nodes and edges."
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 3,
247 | "metadata": {
248 | "colab": {
249 | "base_uri": "https://localhost:8080/",
250 | "height": 277
251 | },
252 | "colab_type": "code",
253 | "id": "umBG2Tp0wbQx",
254 | "outputId": "b89e3666-f85a-40e9-e7c4-cda9a80b7fe5"
255 | },
256 | "outputs": [
257 | {
258 | "name": "stdout",
259 | "output_type": "stream",
260 | "text": [
261 | "CPU times: user 32.3 ms, sys: 453 µs, total: 32.8 ms\n",
262 | "Wall time: 438 ms\n"
263 | ]
264 | }
265 | ],
266 | "source": [
267 | "%%time\n",
268 | "# make a query\n",
269 | "query = '''\n",
270 | " SELECT\n",
271 | " a.firstSeenSrcIp as source,\n",
272 | " a.firstSeenDestIp as destination,\n",
273 | " count(a.firstSeenDestPort) as targetPorts,\n",
274 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n",
275 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n",
276 | " SUM(a.durationSeconds) as durationSeconds,\n",
277 | " MIN(parsedDate) as firstFlowDate,\n",
278 | " MAX(parsedDate) as lastFlowDate,\n",
279 | " COUNT(*) as attemptCount\n",
280 | " FROM\n",
281 | " netflow a\n",
282 | " GROUP BY\n",
283 | " a.firstSeenSrcIp,\n",
284 | " a.firstSeenDestIp\n",
285 | " '''\n",
286 | "\n",
287 | "# query the table\n",
288 | "gdf = bc.sql(query)"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 4,
294 | "metadata": {},
295 | "outputs": [
296 | {
297 | "data": {
298 | "text/html": [
299 | "\n",
300 | "\n",
313 | "
\n",
314 | " \n",
315 | " \n",
316 | " | \n",
317 | " source | \n",
318 | " destination | \n",
319 | " targetPorts | \n",
320 | " bytesOut | \n",
321 | " bytesIn | \n",
322 | " durationSeconds | \n",
323 | " firstFlowDate | \n",
324 | " lastFlowDate | \n",
325 | " attemptCount | \n",
326 | "
\n",
327 | " \n",
328 | " \n",
329 | " \n",
330 | " 0 | \n",
331 | " 172.10.1.226 | \n",
332 | " 239.255.255.250 | \n",
333 | " 3 | \n",
334 | " 875 | \n",
335 | " 0 | \n",
336 | " 6 | \n",
337 | " 2013-04-03 06:36:44 | \n",
338 | " 2013-04-03 06:36:51 | \n",
339 | " 3 | \n",
340 | "
\n",
341 | " \n",
342 | " 1 | \n",
343 | " 172.30.1.200 | \n",
344 | " 239.255.255.250 | \n",
345 | " 9 | \n",
346 | " 2275 | \n",
347 | " 0 | \n",
348 | " 12 | \n",
349 | " 2013-04-03 06:35:52 | \n",
350 | " 2013-04-03 06:43:26 | \n",
351 | " 9 | \n",
352 | "
\n",
353 | " \n",
354 | " 2 | \n",
355 | " 172.30.1.225 | \n",
356 | " 172.0.0.1 | \n",
357 | " 1 | \n",
358 | " 90 | \n",
359 | " 90 | \n",
360 | " 0 | \n",
361 | " 2013-04-03 06:36:14 | \n",
362 | " 2013-04-03 06:36:14 | \n",
363 | " 1 | \n",
364 | "
\n",
365 | " \n",
366 | " 3 | \n",
367 | " 172.30.1.46 | \n",
368 | " 239.255.255.250 | \n",
369 | " 17 | \n",
370 | " 4025 | \n",
371 | " 0 | \n",
372 | " 18 | \n",
373 | " 2013-04-03 06:35:22 | \n",
374 | " 2013-04-03 06:44:08 | \n",
375 | " 17 | \n",
376 | "
\n",
377 | " \n",
378 | " 4 | \n",
379 | " 172.20.2.71 | \n",
380 | " 239.255.255.250 | \n",
381 | " 3 | \n",
382 | " 875 | \n",
383 | " 0 | \n",
384 | " 6 | \n",
385 | " 2013-04-03 06:37:11 | \n",
386 | " 2013-04-03 06:37:18 | \n",
387 | " 3 | \n",
388 | "
\n",
389 | " \n",
390 | " 5 | \n",
391 | " 172.10.1.233 | \n",
392 | " 172.0.0.1 | \n",
393 | " 1 | \n",
394 | " 180 | \n",
395 | " 180 | \n",
396 | " 0 | \n",
397 | " 2013-04-03 06:36:45 | \n",
398 | " 2013-04-03 06:36:45 | \n",
399 | " 1 | \n",
400 | "
\n",
401 | " \n",
402 | " 6 | \n",
403 | " 172.30.1.102 | \n",
404 | " 10.0.0.10 | \n",
405 | " 1 | \n",
406 | " 454 | \n",
407 | " 633 | \n",
408 | " 0 | \n",
409 | " 2013-04-03 06:48:05 | \n",
410 | " 2013-04-03 06:48:05 | \n",
411 | " 1 | \n",
412 | "
\n",
413 | " \n",
414 | " 7 | \n",
415 | " 172.20.1.39 | \n",
416 | " 239.255.255.250 | \n",
417 | " 1 | \n",
418 | " 525 | \n",
419 | " 0 | \n",
420 | " 6 | \n",
421 | " 2013-04-03 06:36:59 | \n",
422 | " 2013-04-03 06:36:59 | \n",
423 | " 1 | \n",
424 | "
\n",
425 | " \n",
426 | " 8 | \n",
427 | " 172.10.1.96 | \n",
428 | " 172.0.0.1 | \n",
429 | " 1 | \n",
430 | " 180 | \n",
431 | " 0 | \n",
432 | " 0 | \n",
433 | " 2013-04-03 06:36:21 | \n",
434 | " 2013-04-03 06:36:21 | \n",
435 | " 1 | \n",
436 | "
\n",
437 | " \n",
438 | " 9 | \n",
439 | " 172.20.1.2 | \n",
440 | " 239.255.255.250 | \n",
441 | " 19 | \n",
442 | " 3675 | \n",
443 | " 0 | \n",
444 | " 6 | \n",
445 | " 2013-04-03 06:36:50 | \n",
446 | " 2013-04-03 06:36:59 | \n",
447 | " 19 | \n",
448 | "
\n",
449 | " \n",
450 | "
\n",
451 | "
"
452 | ],
453 | "text/plain": [
454 | " source destination targetPorts bytesOut bytesIn \\\n",
455 | "0 172.10.1.226 239.255.255.250 3 875 0 \n",
456 | "1 172.30.1.200 239.255.255.250 9 2275 0 \n",
457 | "2 172.30.1.225 172.0.0.1 1 90 90 \n",
458 | "3 172.30.1.46 239.255.255.250 17 4025 0 \n",
459 | "4 172.20.2.71 239.255.255.250 3 875 0 \n",
460 | "5 172.10.1.233 172.0.0.1 1 180 180 \n",
461 | "6 172.30.1.102 10.0.0.10 1 454 633 \n",
462 | "7 172.20.1.39 239.255.255.250 1 525 0 \n",
463 | "8 172.10.1.96 172.0.0.1 1 180 0 \n",
464 | "9 172.20.1.2 239.255.255.250 19 3675 0 \n",
465 | "\n",
466 | " durationSeconds firstFlowDate lastFlowDate attemptCount \n",
467 | "0 6 2013-04-03 06:36:44 2013-04-03 06:36:51 3 \n",
468 | "1 12 2013-04-03 06:35:52 2013-04-03 06:43:26 9 \n",
469 | "2 0 2013-04-03 06:36:14 2013-04-03 06:36:14 1 \n",
470 | "3 18 2013-04-03 06:35:22 2013-04-03 06:44:08 17 \n",
471 | "4 6 2013-04-03 06:37:11 2013-04-03 06:37:18 3 \n",
472 | "5 0 2013-04-03 06:36:45 2013-04-03 06:36:45 1 \n",
473 | "6 0 2013-04-03 06:48:05 2013-04-03 06:48:05 1 \n",
474 | "7 6 2013-04-03 06:36:59 2013-04-03 06:36:59 1 \n",
475 | "8 0 2013-04-03 06:36:21 2013-04-03 06:36:21 1 \n",
476 | "9 6 2013-04-03 06:36:50 2013-04-03 06:36:59 19 "
477 | ]
478 | },
479 | "execution_count": 4,
480 | "metadata": {},
481 | "output_type": "execute_result"
482 | }
483 | ],
484 | "source": [
485 | "# how's the dataframe look?\n",
486 | "gdf.head(10)"
487 | ]
488 | }
489 | ],
490 | "metadata": {
491 | "file_extension": ".py",
492 | "kernelspec": {
493 | "display_name": "Python 3",
494 | "language": "python",
495 | "name": "python3"
496 | },
497 | "language_info": {
498 | "codemirror_mode": {
499 | "name": "ipython",
500 | "version": 3
501 | },
502 | "file_extension": ".py",
503 | "mimetype": "text/x-python",
504 | "name": "python",
505 | "nbconvert_exporter": "python",
506 | "pygments_lexer": "ipython3",
507 | "version": "3.6.7"
508 | },
509 | "mimetype": "text/x-python",
510 | "name": "python",
511 | "npconvert_exporter": "python",
512 | "pygments_lexer": "ipython3",
513 | "version": 3
514 | },
515 | "nbformat": 4,
516 | "nbformat_minor": 2
517 | }
518 |
--------------------------------------------------------------------------------
/colab_notebooks/vs_pyspark_netflow.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "d0hJ4z8rBOFC"
8 | },
9 | "source": [
10 | "# BlazingSQL vs. Apache Spark \n",
11 | "\n",
12 | "Below we have one of our popular workloads running with [BlazingSQL](https://blazingsql.com/) + [RAPIDS AI](https://rapids.ai) and then running the entire ETL phase again, only this time with Apache Spark + PySpark.\n",
13 | "\n",
14 | "In this notebook, we will cover: \n",
15 | "- How to set up BlazingSQL and the RAPIDS AI suite in Google Colab.\n",
16 | "- How to read and query csv files with cuDF and BlazingSQL.\n",
17 | "- How BlazingSQL compares against Apache Spark (analyzing over 20M records).\n",
18 | "\n",
19 | ""
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {
25 | "colab_type": "text",
26 | "id": "kJyD4oSbugE0"
27 | },
28 | "source": [
29 | "## Setup\n",
30 | "### Environment Sanity Check \n",
31 | "\n",
32 | "RAPIDS packages (BlazingSQL included) require Pascal+ architecture to run. For Colab, this translates to a T4 GPU instance. \n",
33 | "\n",
34 | "The cell below will let you know what type of GPU you've been allocated, and how to proceed."
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 14,
40 | "metadata": {
41 | "colab": {
42 | "base_uri": "https://localhost:8080/",
43 | "height": 35
44 | },
45 | "colab_type": "code",
46 | "id": "QzVzojZ7tc9a",
47 | "outputId": "1c412c49-59fd-482b-83dc-1764af8fda12"
48 | },
49 | "outputs": [
50 | {
51 | "name": "stdout",
52 | "output_type": "stream",
53 | "text": [
54 | "\n",
55 | "\n",
56 | "***********************************\n",
57 | "GPU = b'Tesla T4'\n",
58 | "Woo! You got the right kind of GPU!\n",
59 | "***********************************\n",
60 | "\n",
61 | "\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/colab_env.py\n",
67 | "!python colab_env.py "
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {
73 | "colab": {},
74 | "colab_type": "code",
75 | "id": "btG1BbSA1nLu"
76 | },
77 | "source": [
78 | "## Installs \n",
79 | "The cell below pulls our Google Colab install script from the `bsql-demos` repo then runs it. The script first installs miniconda, then uses miniconda to install BlazingSQL and RAPIDS AI. This takes a few minutes to run. "
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/bsql-colab.sh \n",
89 | "!bash bsql-colab.sh\n",
90 | "\n",
91 | "import sys, os, time\n",
92 | "sys.path.append('/usr/local/lib/python3.6/site-packages/')\n",
93 | "os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n",
94 | "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'\n",
95 | "\n",
96 | "import subprocess\n",
97 | "subprocess.Popen(['blazingsql-orchestrator', '9100', '8889', '127.0.0.1', '8890'],stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)\n",
98 | "subprocess.Popen(['java', '-jar', '/usr/local/lib/blazingsql-algebra.jar', '-p', '8890'])\n",
99 | "\n",
100 | "import pyblazing.apiv2.context as cont\n",
101 | "cont.runRal()\n",
102 | "time.sleep(1) "
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {
108 | "colab_type": "text",
109 | "id": "0guvG6Ws_zmX"
110 | },
111 | "source": [
112 | "## Import packages and create Blazing Context\n",
113 | "You can think of the BlazingContext much like a Spark Context (i.e. where information such as FileSystems you have registered and Tables you have created will be stored). If you have issues running this cell, restart runtime and try running it again."
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 1,
119 | "metadata": {
120 | "colab": {
121 | "base_uri": "https://localhost:8080/",
122 | "height": 35
123 | },
124 | "colab_type": "code",
125 | "id": "ojm_V-WAtz0f",
126 | "outputId": "a46625f4-1494-4a13-eb13-2f38efd80ccf"
127 | },
128 | "outputs": [
129 | {
130 | "name": "stdout",
131 | "output_type": "stream",
132 | "text": [
133 | "BlazingContext ready\n"
134 | ]
135 | }
136 | ],
137 | "source": [
138 | "from blazingsql import BlazingContext\n",
139 | "import cudf\n",
140 | "# start up BlazingSQL\n",
141 | "bc = BlazingContext()"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "metadata": {
147 | "colab_type": "text",
148 | "id": "yp7z8bfivbna"
149 | },
150 | "source": [
151 | "### Load & Query Table\n",
152 | "First, we need to download the netflow data (20 million records) from AWS."
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {
159 | "colab": {},
160 | "colab_type": "code",
161 | "id": "2dAt6DfG37KH"
162 | },
163 | "outputs": [],
164 | "source": [
165 | "# takes a few minutes to download\n",
166 | "!wget https://blazingsql-colab.s3.amazonaws.com/netflow_data/nf-chunk2.csv"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {
172 | "colab_type": "text",
173 | "id": "OTEaAsp2_zmf"
174 | },
175 | "source": [
176 | "#### BlazingSQL + cuDF \n",
177 | "Data in hand, we can test the performance of cuDF and BlazingSQL on this dataset."
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 2,
183 | "metadata": {
184 | "colab": {
185 | "base_uri": "https://localhost:8080/",
186 | "height": 52
187 | },
188 | "colab_type": "code",
189 | "id": "rirBsYQU3NH5",
190 | "outputId": "51ced2b1-b930-4173-bbfa-09672e751d3f"
191 | },
192 | "outputs": [
193 | {
194 | "name": "stdout",
195 | "output_type": "stream",
196 | "text": [
197 | "CPU times: user 138 ms, sys: 142 ms, total: 280 ms\n",
198 | "Wall time: 304 ms\n"
199 | ]
200 | }
201 | ],
202 | "source": [
203 | "%%time\n",
204 | "# Load CSVs into GPU DataFrames (GDF)\n",
205 | "netflow_gdf = cudf.read_csv('nf-chunk2.csv')"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 3,
211 | "metadata": {
212 | "colab": {
213 | "base_uri": "https://localhost:8080/",
214 | "height": 52
215 | },
216 | "colab_type": "code",
217 | "id": "zCzLEFfB3N4k",
218 | "outputId": "10ff9097-2736-423e-969d-de75983fbdda"
219 | },
220 | "outputs": [
221 | {
222 | "name": "stdout",
223 | "output_type": "stream",
224 | "text": [
225 | "CPU times: user 27.5 ms, sys: 747 µs, total: 28.2 ms\n",
226 | "Wall time: 55.9 ms\n"
227 | ]
228 | },
229 | {
230 | "data": {
231 | "text/plain": [
232 | ""
233 | ]
234 | },
235 | "execution_count": 3,
236 | "metadata": {},
237 | "output_type": "execute_result"
238 | }
239 | ],
240 | "source": [
241 | "%%time\n",
242 | "# Create BlazingSQL table from GDF - There is no copy in this process\n",
243 | "bc.create_table('netflow', netflow_gdf)"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 4,
249 | "metadata": {
250 | "colab": {
251 | "base_uri": "https://localhost:8080/",
252 | "height": 295
253 | },
254 | "colab_type": "code",
255 | "id": "umBG2Tp0wbQx",
256 | "outputId": "0975395e-7f5b-4244-afa3-45c8658ce61c"
257 | },
258 | "outputs": [
259 | {
260 | "name": "stdout",
261 | "output_type": "stream",
262 | "text": [
263 | "CPU times: user 30.8 ms, sys: 0 ns, total: 30.8 ms\n",
264 | "Wall time: 429 ms\n"
265 | ]
266 | }
267 | ],
268 | "source": [
269 | "%%time\n",
270 | "# make a query\n",
271 | "query = '''\n",
272 | " SELECT\n",
273 | " a.firstSeenSrcIp as source,\n",
274 | " a.firstSeenDestIp as destination,\n",
275 | " count(a.firstSeenDestPort) as targetPorts,\n",
276 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n",
277 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n",
278 | " SUM(a.durationSeconds) as durationSeconds,\n",
279 | " MIN(parsedDate) as firstFlowDate,\n",
280 | " MAX(parsedDate) as lastFlowDate,\n",
281 | " COUNT(*) as attemptCount\n",
282 | " FROM\n",
283 | " netflow a\n",
284 | " GROUP BY\n",
285 | " a.firstSeenSrcIp,\n",
286 | " a.firstSeenDestIp\n",
287 | " '''\n",
288 | "\n",
289 | "# query the table\n",
290 | "gdf = bc.sql(query)"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 5,
296 | "metadata": {
297 | "colab": {},
298 | "colab_type": "code",
299 | "id": "48_W2v8q_zmq",
300 | "outputId": "db0394f1-e082-49b0-c477-e3bba8d3d0f4"
301 | },
302 | "outputs": [
303 | {
304 | "data": {
305 | "text/html": [
306 | "\n",
307 | "\n",
320 | "
\n",
321 | " \n",
322 | " \n",
323 | " | \n",
324 | " source | \n",
325 | " destination | \n",
326 | " targetPorts | \n",
327 | " bytesOut | \n",
328 | " bytesIn | \n",
329 | " durationSeconds | \n",
330 | " firstFlowDate | \n",
331 | " lastFlowDate | \n",
332 | " attemptCount | \n",
333 | "
\n",
334 | " \n",
335 | " \n",
336 | " \n",
337 | " 0 | \n",
338 | " 172.30.2.48 | \n",
339 | " 172.0.0.1 | \n",
340 | " 1 | \n",
341 | " 90 | \n",
342 | " 0 | \n",
343 | " 0 | \n",
344 | " 2013-04-03 06:36:09 | \n",
345 | " 2013-04-03 06:36:09 | \n",
346 | " 1 | \n",
347 | "
\n",
348 | " \n",
349 | " 1 | \n",
350 | " 172.10.2.81 | \n",
351 | " 239.255.255.250 | \n",
352 | " 14 | \n",
353 | " 2800 | \n",
354 | " 0 | \n",
355 | " 6 | \n",
356 | " 2013-04-03 06:36:41 | \n",
357 | " 2013-04-03 06:36:48 | \n",
358 | " 14 | \n",
359 | "
\n",
360 | " \n",
361 | " 2 | \n",
362 | " 172.30.2.58 | \n",
363 | " 172.0.0.1 | \n",
364 | " 1 | \n",
365 | " 90 | \n",
366 | " 0 | \n",
367 | " 0 | \n",
368 | " 2013-04-03 06:36:09 | \n",
369 | " 2013-04-03 06:36:09 | \n",
370 | " 1 | \n",
371 | "
\n",
372 | " \n",
373 | " 3 | \n",
374 | " 172.30.1.171 | \n",
375 | " 10.0.0.13 | \n",
376 | " 1 | \n",
377 | " 454 | \n",
378 | " 633 | \n",
379 | " 0 | \n",
380 | " 2013-04-03 06:48:02 | \n",
381 | " 2013-04-03 06:48:02 | \n",
382 | " 1 | \n",
383 | "
\n",
384 | " \n",
385 | " 4 | \n",
386 | " 172.30.1.17 | \n",
387 | " 10.0.0.7 | \n",
388 | " 1 | \n",
389 | " 453 | \n",
390 | " 632 | \n",
391 | " 0 | \n",
392 | " 2013-04-03 06:47:56 | \n",
393 | " 2013-04-03 06:47:56 | \n",
394 | " 1 | \n",
395 | "
\n",
396 | " \n",
397 | "
\n",
398 | "
"
399 | ],
400 | "text/plain": [
401 | " source destination targetPorts bytesOut bytesIn \\\n",
402 | "0 172.30.2.48 172.0.0.1 1 90 0 \n",
403 | "1 172.10.2.81 239.255.255.250 14 2800 0 \n",
404 | "2 172.30.2.58 172.0.0.1 1 90 0 \n",
405 | "3 172.30.1.171 10.0.0.13 1 454 633 \n",
406 | "4 172.30.1.17 10.0.0.7 1 453 632 \n",
407 | "\n",
408 | " durationSeconds firstFlowDate lastFlowDate attemptCount \n",
409 | "0 0 2013-04-03 06:36:09 2013-04-03 06:36:09 1 \n",
410 | "1 6 2013-04-03 06:36:41 2013-04-03 06:36:48 14 \n",
411 | "2 0 2013-04-03 06:36:09 2013-04-03 06:36:09 1 \n",
412 | "3 0 2013-04-03 06:48:02 2013-04-03 06:48:02 1 \n",
413 | "4 0 2013-04-03 06:47:56 2013-04-03 06:47:56 1 "
414 | ]
415 | },
416 | "execution_count": 5,
417 | "metadata": {},
418 | "output_type": "execute_result"
419 | }
420 | ],
421 | "source": [
422 | "# how's it look?\n",
423 | "gdf.head()"
424 | ]
425 | },
426 | {
427 | "cell_type": "markdown",
428 | "metadata": {
429 | "colab_type": "text",
430 | "id": "6PXbjW1hTxrD"
431 | },
432 | "source": [
433 | "## Apache Spark\n",
434 | "The cell below installs Apache Spark ([PySpark](https://spark.apache.org/docs/latest/api/python/index.html))."
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 4,
440 | "metadata": {
441 | "colab": {},
442 | "colab_type": "code",
443 | "id": "pnEEvVEtT8xi"
444 | },
445 | "outputs": [],
446 | "source": [
447 | "# Note: This installs Spark (version 2.4.1, as tested in Jan 2020)\n",
448 | "!pip install pyspark"
449 | ]
450 | },
451 | {
452 | "cell_type": "markdown",
453 | "metadata": {
454 | "colab_type": "text",
455 | "id": "W3-XmZkz_zmw"
456 | },
457 | "source": [
458 | "#### PyBlazing vs PySpark\n",
459 | "With everything installed we can launch a SparkSession and see how BlazingSQL stacks up."
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 6,
465 | "metadata": {
466 | "colab": {
467 | "base_uri": "https://localhost:8080/",
468 | "height": 51
469 | },
470 | "colab_type": "code",
471 | "id": "nioEt2MqT9B0",
472 | "outputId": "f75b9823-5dbd-45b1-9282-562d3d6ddaf0"
473 | },
474 | "outputs": [
475 | {
476 | "name": "stdout",
477 | "output_type": "stream",
478 | "text": [
479 | "CPU times: user 50.2 ms, sys: 12.9 ms, total: 63.1 ms\n",
480 | "Wall time: 3.88 s\n"
481 | ]
482 | }
483 | ],
484 | "source": [
485 | "%%time\n",
486 | "# I copied this cell's snippet from another Google Colab by Luca Canali here: https://colab.research.google.com/github/LucaCanali/sparkMeasure/blob/master/examples/SparkMeasure_Jupyter_Colab_Example.ipynb\n",
487 | "\n",
488 | "from pyspark.sql import SparkSession\n",
489 | "\n",
490 | "# Create Spark Session\n",
491 | "# This example uses a local cluster, you can modify master to use YARN or K8S if available \n",
492 | "# This example downloads sparkMeasure 0.13 for scala 2_11 from maven central\n",
493 | "\n",
494 | "spark = SparkSession \\\n",
495 | " .builder \\\n",
496 | " .master(\"local[*]\") \\\n",
497 | " .appName(\"PySpark Netflow Benchmark code\") \\\n",
498 | " .config(\"spark.jars.packages\",\"ch.cern.sparkmeasure:spark-measure_2.11:0.13\") \\\n",
499 | " .getOrCreate()"
500 | ]
501 | },
502 | {
503 | "cell_type": "markdown",
504 | "metadata": {
505 | "colab_type": "text",
506 | "id": "G8XSppQiUdLY"
507 | },
508 | "source": [
509 | "### Load & Query Table"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": 5,
515 | "metadata": {
516 | "colab": {
517 | "base_uri": "https://localhost:8080/",
518 | "height": 51
519 | },
520 | "colab_type": "code",
521 | "id": "ZSLuSYSOUDtf",
522 | "outputId": "2b93169b-63c5-4c46-da14-af87645bf51b"
523 | },
524 | "outputs": [
525 | {
526 | "name": "stdout",
527 | "output_type": "stream",
528 | "text": [
529 | "CPU times: user 2.73 ms, sys: 0 ns, total: 2.73 ms\n",
530 | "Wall time: 2.91 s\n"
531 | ]
532 | }
533 | ],
534 | "source": [
535 | "%%time\n",
536 | "# load CSV into Spark\n",
537 | "netflow_df = spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('nf-chunk2.csv')"
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": 6,
543 | "metadata": {
544 | "colab": {
545 | "base_uri": "https://localhost:8080/",
546 | "height": 51
547 | },
548 | "colab_type": "code",
549 | "id": "iT3BwLn8UDwE",
550 | "outputId": "4eeff800-489f-4230-adb9-f3a1c16ede66"
551 | },
552 | "outputs": [
553 | {
554 | "name": "stdout",
555 | "output_type": "stream",
556 | "text": [
557 | "CPU times: user 1.06 ms, sys: 611 µs, total: 1.67 ms\n",
558 | "Wall time: 120 ms\n"
559 | ]
560 | }
561 | ],
562 | "source": [
563 | "%%time\n",
564 | "# create table for querying\n",
565 | "netflow_df.createOrReplaceTempView('netflow')"
566 | ]
567 | },
568 | {
569 | "cell_type": "code",
570 | "execution_count": 7,
571 | "metadata": {
572 | "colab": {
573 | "base_uri": "https://localhost:8080/",
574 | "height": 493
575 | },
576 | "colab_type": "code",
577 | "id": "9SBhahA5UD2k",
578 | "outputId": "accc1938-6470-44df-ab7f-70058c755b2b"
579 | },
580 | "outputs": [
581 | {
582 | "name": "stdout",
583 | "output_type": "stream",
584 | "text": [
585 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n",
586 | "| source| destination|targetPorts|bytesOut|bytesIn|durationSeconds| firstFlowDate| lastFlowDate|attemptCount|\n",
587 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n",
588 | "| 172.10.1.13|239.255.255.250| 15| 2975| 0| 6|2013-04-03 06:36:19|2013-04-03 06:36:27| 15|\n",
589 | "|172.10.1.232| 172.0.0.1| 1| 180| 180| 0|2013-04-03 06:36:45|2013-04-03 06:36:45| 1|\n",
590 | "|172.10.1.238|239.255.255.250| 2| 700| 0| 6|2013-04-03 06:36:44|2013-04-03 06:36:51| 2|\n",
591 | "| 172.10.1.35| 172.0.0.1| 1| 270| 0| 0|2013-04-03 06:36:21|2013-04-03 06:36:21| 1|\n",
592 | "|172.10.2.137| 172.0.0.1| 1| 90| 90| 0|2013-04-03 06:36:42|2013-04-03 06:36:42| 1|\n",
593 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n",
594 | "only showing top 5 rows\n",
595 | "\n",
596 | "CPU times: user 1.5 ms, sys: 861 µs, total: 2.36 ms\n",
597 | "Wall time: 1.14 s\n"
598 | ]
599 | }
600 | ],
601 | "source": [
602 | "%%time\n",
603 | "# make a query\n",
604 | "query = '''\n",
605 | " SELECT\n",
606 | " a.firstSeenSrcIp as source,\n",
607 | " a.firstSeenDestIp as destination,\n",
608 | " count(a.firstSeenDestPort) as targetPorts,\n",
609 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n",
610 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n",
611 | " SUM(a.durationSeconds) as durationSeconds,\n",
612 | " MIN(parsedDate) as firstFlowDate,\n",
613 | " MAX(parsedDate) as lastFlowDate,\n",
614 | " COUNT(*) as attemptCount\n",
615 | " FROM\n",
616 | " netflow a\n",
617 | " GROUP BY\n",
618 | " a.firstSeenSrcIp,\n",
619 | " a.firstSeenDestIp\n",
620 | " '''\n",
621 | "\n",
622 | "# query with Spark\n",
623 | "edges_df = spark.sql(query)\n",
624 | "\n",
625 | "# set/display results\n",
626 | "edges_df.show(5)"
627 | ]
628 | }
629 | ],
630 | "metadata": {
631 | "accelerator": "GPU",
632 | "colab": {
633 | "collapsed_sections": [],
634 | "name": "vs_pyspark_netflow.ipynb",
635 | "provenance": [],
636 | "toc_visible": true
637 | },
638 | "kernelspec": {
639 | "display_name": "Python 3",
640 | "language": "python",
641 | "name": "python3"
642 | },
643 | "language_info": {
644 | "codemirror_mode": {
645 | "name": "ipython",
646 | "version": 3
647 | },
648 | "file_extension": ".py",
649 | "mimetype": "text/x-python",
650 | "name": "python",
651 | "nbconvert_exporter": "python",
652 | "pygments_lexer": "ipython3",
653 | "version": "3.6.7"
654 | }
655 | },
656 | "nbformat": 4,
657 | "nbformat_minor": 2
658 | }
659 |
--------------------------------------------------------------------------------
/data/Music.csv:
--------------------------------------------------------------------------------
1 | ARTIST,RATING,YEAR,LOCATION,FESTIVAL_SET
2 | Arcade Fire,10,2018,Las Vegas,1
3 | Justice,10,2018,Las Vegas,1
4 | Florence and The Machine,10,2018,Las Vegas,1
5 | Odesza,10,2018,Indio,1
6 | Bon Iver,10,2017,Indio,1
7 | LA Philharmonic + Sigur Ros,10,2017,LA,0
8 | Sigur Ros,10,2014,Malmo,0
9 | Arcade Fire,10,2014,Indio,1
10 | Escort,9,2018,San Francisco,0
11 | Phoenix,9,2018,Berkeley,0
12 | Jamie XX,9,2018,Golden Gate Park,1
13 | Beyonce,10,2018,Indio,1
14 | Soulwax,9,2018,Indio,1
15 | The XX,9,2017,Las Vegas,1
16 | Justice,9,2017,Indio,1
17 | Sigur Ros,9,2017,LA,0
18 | The XX,9,2017,London,0
19 | Porter Robinson and Madeon,9,2017,London,0
20 | Garden City Movement,9,2018,Tel Aviv,0
21 | ACDC,9,2015,Indio,1
22 | Porter Robinson,9,2015,Las Vegas,1
23 | Alt-J,9,2015,Barcelona,1
24 | Arcade Fire,9,2014,LA,0
25 | Phoenix,9,2013,Indio,1
26 | Chvrches,9,2013,Copenhagen,0
27 | Red Hot Chili Peppers,9,2006,Oakland,0
28 | Jungle,8,2018,Las Vegas,1
29 | Sylvan Esso,8,2018,Las Vegas,1
30 | Lake Street Dive,8,2018,San Francisco,0
31 | Elohim,8,2018,Golden Gate Park,1
32 | Tash Sultana,8,2018,Golden Gate Park,1
33 | David Byrne,8,2018,Indio,1
34 | Eminem,8,2018,Indio,1
35 | Tank and the Bangas,8,2018,Indio,1
36 | The Blaze,8,2018,Indio,1
37 | Jungle,8,2018,San Francisco,0
38 | Chance The Rapper,8,2017,Las Vegas,1
39 | Goldroom,8,2017,Las Vegas,1
40 | Mura Masa,8,2017,Las Vegas,1
41 | ZHU,8,2017,Las Vegas,1
42 | Goldroom,8,2017,San Francisco,1
43 | Phoenix,8,2017,Mountain View,1
44 | Hans Zimmer,8,2017,Indio,1
45 | Moderat,8,2017,Indio,1
46 | The XX,8,2017,Indio,1
47 | BORNS,8,2016,Indio,1
48 | Chvrches,8,2016,Indio,1
49 | Gallant,8,2016,Indio,1
50 | Matt & Kim,8,2016,Indio,1
51 | The Lumineers,8,2016,Las Vegas,1
52 | Flume,8,2016,Las Vegas,1
53 | Griz ,8,2016,San Francisco,1
54 | James Vincent McMorrow,8,2016,London,0
55 | Mura Masa,8,2016,San Francisco,0
56 | Alt-J,8,2015,Indio,1
57 | Jamie XX,8,2015,Indio,1
58 | ODESZA,8,2015,Indio,1
59 | Porter Robinson,8,2015,Indio,1
60 | Yelle,8,2015,Indio,1
61 | Sylvan Esso,8,2015,Indio,1
62 | ODESZA,8,2015,Indio,1
63 | Imagine Dragons,8,2015,LA,0
64 | Ben Howard,8,2015,Berkeley ,0
65 | Imagine Dragons,8,2015,Las Vegas,1
66 | Elton John,8,2015,San Francisco,1
67 | Garden City Movement,8,2015,Barcelona,1
68 | Jungle,8,2015,Barcelona,1
69 | Matt and Kim,8,2015,LA,0
70 | Daughter,8,2014,Indio,1
71 | Chromeo,8,2014,Indio,1
72 | Flume,8,2014,Indio,1
73 | Phantogram,8,2014,Monterey,1
74 | Major Lazer,8,2013,Indio,1
75 | The XX,8,2013,Indio,1
76 | Yeasayer,8,2013,Indio,1
77 | The Floor is Made of Lava,8,2013,Copenhagen,0
78 | Taylor Swift,8,2012,Claremont,0
79 | Elton John,8,2010,Ontario,0
80 | First Aid Kit,7,2018,Las Vegas,1
81 | Cut Copy,7,2018,Berkeley,0
82 | Rainbow Kitten Surprise,7,2018,Golden Gate Park,1
83 | LP,7,2018,Golden Gate Park,1
84 | Chvrches,7,2018,Golden Gate Park,1
85 | Bon Iver,7,2018,Golden Gate Park,1
86 | Bleachers,7,2018,Indio,1
87 | Lola Marsh,7,2018,Tel Aviv,0
88 | The Paz Band,7,2017,Tel Aviv,0
89 | The Revivalists (Acoustic),7,2017,Las Vegas,1
90 | Lorde,7,2017,Las Vegas,1
91 | Treehouse Dubstep,7,2017,Las Vegas,1
92 | Sofi Tukker (Tukker DJ set),7,2017,Las Vegas,1
93 | Tycho,7,2017,Las Vegas,1
94 | Pretty Lights,7,2017,Las Vegas,1
95 | Tokimonsta,7,2017,Las Vegas,0
96 | San Fermin,7,2017,San Francisco,1
97 | Franz Ferdinand,7,2017,Mountain View,1
98 | Ezra Furman,7,2017,Indio,1
99 | FKJ,7,2017,Indio,1
100 | GoldLink,7,2017,Indio,1
101 | Jack Garratt,7,2017,Indio,1
102 | Oh Wonder,7,2017,Indio,1
103 | Phantogram,7,2017,Indio,1
104 | Sam Gellaitry,7,2017,Indio,1
105 | Sigur Ros,7,2017,Oakland,0
106 | LA Philharmonic,7,2017,LA,0
107 | Despacio,7,2016,Indio,1
108 | Goldroom,7,2016,Indio,1
109 | LCD soundsystem,7,2016,Indio,1
110 | Lido,7,2016,Indio,1
111 | Lord Huron,7,2016,Indio,1
112 | Major Lazer,7,2016,Indio,1
113 | Rufus du sol ,7,2016,Indio,1
114 | Spacewench,7,2016,Las Vegas,1
115 | Big Grams,7,2016,San Francisco,1
116 | Rufus Du Sol,7,2016,San Francisco,1
117 | Yellow Claw,7,2015,Indio,1
118 | St. Lucia,7,2015,Indio,1
119 | Jamie XX,7,2015,Indio,1
120 | Klingande,7,2015,Las Vegas,1
121 | Major Lazer,7,2015,Las Vegas,1
122 | Jauz,7,2015,Las Vegas,1
123 | Walk the Moon,7,2015,Las Vegas,1
124 | Madeon,7,2015,Las Vegas,1
125 | Chvrches,7,2015,Oakland,1
126 | Death Cab for Cutie,7,2015,Oakland,1
127 | X Ambassadors,7,2015,Oakland,1
128 | Porter Robinson,7,2015,San Francisco,1
129 | James Bay,7,2015,San Francisco,1
130 | Sam Smith,7,2015,San Francisco,1
131 | ACollective,7,2015,Barcelona,1
132 | Chet faker,7,2015,Barcelona,1
133 | Sylvan Esso,7,2015,Barcelona,1
134 | Chvrches,7,2014,Indio,1
135 | Krewella,7,2014,Indio,1
136 | St Lucia,7,2014,Indio,1
137 | The Naked and Famous,7,2014,Monterey,1
138 | Future Islands,7,2014,Monterey,1
139 | Tokyo Police Club,7,2014,Monterey,1
140 | Macklemore,7,2014,San Francisco,1
141 | Watsky,7,2014,San Francisco,1
142 | The Kooks,7,2014,San Francisco,1
143 | Yeah Yeah Yeahs,7,2013,Indio,1
144 | Passion Pit,7,2013,Indio,1
145 | Purity Ring,7,2013,Indio,1
146 | Red Hot Chili Peppers,7,2013,Indio,1
147 | The Postal Service,7,2013,Indio,1
148 | Vampire weekend,7,2013,Indio,1
149 | Scavenger Hunt,7,2013,LA,0
150 | Ms MR,7,2013,Copenhagen,0
151 | The Fratellis,7,2016,London,0
152 | St . Lucia,7,2015,LA,0
153 | Anderson Paak,7,2016,London,0
154 | Poolside,6,2018,Las Vegas,1
155 | St. Vincent,6,2018,Las Vegas,1
156 | Superorganism,6,2018,Las Vegas,1
157 | Sofi Tukker,6,2018,Las Vegas,1
158 | Sir Sly,6,2018,Berkeley,0
159 | Carly Rae Jepsen,6,2018,Golden Gate Park,1
160 | Alt-j,6,2018,Indio,1
161 | Nile Rogers and CHIC,6,2018,Indio,1
162 | Sudan Archives,6,2018,Indio,1
163 | Petit Biscuit,6,2018,Indio,1
164 | Elohim,6,2018,Indio,1
165 | St. Vincent,6,2018,Indio,1
166 | Nessi Gomes,6,2018,Israel,0
167 | RAC (DJ Set),6,2017,Las Vegas,1
168 | Two Door Cinema Club,6,2017,Las Vegas,1
169 | Milky Chance,6,2017,Las Vegas,1
170 | Alt-J,6,2017,San Francisco,1
171 | RAC,6,2017,San Francisco,1
172 | SOHN,6,2017,San Francisco,1
173 | RAC,6,2017,San Francisco,1
174 | Joseph,6,2017,San Francisco,1
175 | James Vincent McMorrow,6,2017,San Francisco,1
176 | Young the Giant,6,2017,San Francisco,1
177 | Lorde,6,2017,San Francisco,1
178 | Andre McMahon in the Wilderness,6,2017,Mountain View,1
179 | Joseph,6,2017,Indio,1
180 | Nao,6,2017,Indio,1
181 | Porter & Madeon,6,2017,Indio,1
182 | Two Door Cinema Club,6,2017,Indio,1
183 | Tycho,6,2017,Indio,1
184 | Of Monsters and Men,6,2017,Indio,1
185 | Flume,6,2017,Indio,1
186 | Lapsley,6,2017,Indio,1
187 | Jimmy Eat World,6,2012,Las Vegas,1
188 | Keys N Krates,6,2012,Las Vegas,1
189 | Leon Bridges,6,2012,Las Vegas,1
190 | Oh Wonder,6,2012,Las Vegas,1
191 | The Wombats,6,2012,San Francisco,1
192 | Oh Wonder,6,2012,San Francisco,1
193 | The War on Drugs,6,2012,Indio,1
194 | Andre McMahon in the Wilderness,6,2012,Indio,1
195 | Phox,6,2012,Indio,1
196 | Metric,6,2012,Las Vegas,1
197 | Bastille,6,2012,Oakland,1
198 | Halsey,6,2012,Oakland,1
199 | George Ezra,6,2012,San Francisco,1
200 | Mumford and Sons,6,2012,San Francisco,1
201 | Benjamin Booker,6,2012,San Francisco,1
202 | Mac Demarco,6,2012,Barcelona,1
203 | Bastille,6,2012,Indio,1
204 | Ellie Goulding,6,2012,Indio,1
205 | STRFKR,6,2012,Indio,1
206 | The National,6,2012,Monterey,1
207 | Blind Pilot,6,2012,Monterey,1
208 | Beck,6,2012,Monterey,1
209 | Flume,6,2012,San Francisco,1
210 | Lykke Li,6,2012,San Francisco,1
211 | Haim,6,2012,San Francisco,1
212 | Tycho,6,2012,San Francisco,1
213 | Earth Wind & Fire,6,2012,Claremont,0
214 | Of Monsters and Men,6,2012,Indio,1
215 | Japandroids,6,2012,Indio,1
216 | Lumineers,6,2012,Indio,1
217 | Chvrches,6,2012,LA,0
218 | Jack Johnson ,6,2012,Berkeley ,0
219 | Daughter,6,2012,Berkeley ,0
220 | Tom Misch,5,2018,Oakland,0
221 | Chvrches,5,2018,Las Vegas,1
222 | Two Feet,5,2018,Las Vegas,1
223 | Odesza,5,2018,Golden Gate Park,1
224 | Rezz,5,2018,Indio,1
225 | Jacob Banks,5,2017,Las Vegas,1
226 | Future Islands,5,2017,San Francisco,1
227 | Cold War Kids,5,2017,Mountain View,1
228 | Big Gigantic,5,2017,Indio,1
229 | Glass Animals,5,2017,Indio,1
230 | The Head and the Heart,5,2017,Indio,1
231 | What So Not,5,2017,Indio,1
232 | Calvin Harris,5,2017,Indio,1
233 | Halsey,5,2017,Indio,1
234 | Snails,5,2017,Indio,1
235 | The 1975,5,2017,Indio,1
236 | Mr. Carmack,5,2017,Las Vegas,1
237 | Halsey,5,2017,San Francisco,1
238 | MO,5,2017,Indio,1
239 | Tycho,5,2017,Indio,1
240 | Coasts,5,2011,Indio,1
241 | Alessia Cara,5,2011,Las Vegas,1
242 | Halsey,5,2011,Las Vegas,1
243 | Run the Jewels,5,2011,Las Vegas,1
244 | Silversun Pickups,5,2011,Oakland,1
245 | First Aid Kit,5,2011,San Francisco,1
246 | Broods,5,2011,San Francisco,1
247 | RL Grime,5,2011,San Francisco,1
248 | Belle and Sebastian,5,2011,Barcelona,1
249 | Run the Jewels,5,2011,Barcelona,1
250 | The Strokes,5,2011,Barcelona,1
251 | Haim,5,2011,Indio,1
252 | The Head and the Heart,5,2011,Indio,1
253 | MGMT,5,2011,Indio,1
254 | Empire of the Sun,5,2011,Indio,1
255 | Grouplove,5,2011,Indio,1
256 | The 1975,5,2011,Indio,1
257 | Mr Little Jeans,5,2011,Monterey,1
258 | Atmosphere,5,2011,San Francisco,1
259 | The Chainsmokers,5,2011,Claremont,0
260 | Jessie Ware,5,2011,Indio,1
261 | Van Halen,5,2011,Mountain View,
262 | Tycho,5,2011,London,0
263 | Foster The People,4,2018,Las Vegas,1
264 | Brasstracks,4,2018,Las Vegas,1
265 | Olivia O'Brien,4,2018,Golden Gate Park,1
266 | Slow Magic,4,2018,Indio,1
267 | Blink-182,4,2017,Las Vegas,1
268 | Classixx,4,2017,Las Vegas,1
269 | Local Natives,4,2017,Las Vegas,1
270 | The Japanese House,4,2017,San Francisco,1
271 | Above and Beyond,4,2017,San Francisco,1
272 | Milky Chance,4,2017,Mountain View,1
273 | Honne,4,2017,Indio,1
274 | RL Grime,4,2014,Indio,1
275 | James Bay,4,2014,Indio,1
276 | Ellie Goulding,4,2014,Indio,1
277 | Louis the child,4,2014,Indio,1
278 | Mr Carmack,4,2014,Indio,1
279 | ZHU,4,2014,Indio,1
280 | Gryffin,4,2014,Las Vegas,1
281 | ZHU,4,2014,Las Vegas,1
282 | Jauz,4,2014,San Francisco,1
283 | Chance The Rapper,4,2014,San Francisco,1
284 | George Ezra,4,2014,Indio,1
285 | Alabama Shakes,4,2014,Indio,1
286 | Kaskade,4,2014,Indio,1
287 | Madeon,4,2014,Indio,1
288 | Milky chance,4,2014,Indio,1
289 | Ryn Weaver,4,2014,Indio,1
290 | The weeknd,4,2014,Indio,1
291 | What So Not,4,2014,Indio,1
292 | Lindsey Stirling,4,2014,Las Vegas,1
293 | Glass Animals,4,2014,Las Vegas,1
294 | Odesza,4,2014,San Francisco,1
295 | Black keys,4,2014,Barcelona,1
296 | Lorde,4,2014,Indio,1
297 | Adrian Lux,4,2014,Indio,1
298 | Outkast,4,2014,Indio,1
299 | Alesso,4,2014,Indio,1
300 | Beach House,4,2014,Monterey,1
301 | Kanye West,4,2014,San Francisco,1
302 | Disclosure,4,2014,San Francisco,1
303 | Chromeo,4,2014,San Francisco,1
304 | RAC,4,2014,LA,0
305 | Passion Pit,4,2014,San Francisco,0
306 | Banners,4,2014,San Francisco,0
307 | 98 Degrees,4,2014,LA,1
308 | Broken Social Scene,3,2018,Golden Gate Park,1
309 | Portugal the Man,3,2018,Indio,1
310 | Thundercat,3,2017,San Francisco,1
311 | Arkells,3,2017,Indio,1
312 | Jack U,3,2017,Indio,1
313 | Third Eye Blind,3,2017,Las Vegas,1
314 | Years and Years,3,2017,San Francisco,1
315 | Zedd,3,2017,San Francisco,1
316 | Angus and Julia Stone,3,2017,Indio,1
317 | Clean Bandit,3,2017,Indio,1
318 | Ratatat,3,2017,Indio,1
319 | Kaskade,3,2017,Indio,1
320 | Peking Duk,3,2017,Las Vegas,1
321 | Foals ,3,2017,Oakland,1
322 | The Neighborhood,3,2017,Indio,1
323 | Tyler the Creator / Earl Sweatshirt,3,2017,Indio,1
324 | Childish Gambino,3,2017,Claremont,0
325 | Two Friends,3,2017,San Francisco,0
326 | Digitalism,3,2017,Copenhagen,0
327 | Deorro,2,2017,Las Vegas,1
328 | Bearson,2,2017,Las Vegas,1
329 | Whethan,2,2017,Mountain View,1
330 | Kungs,2,2017,Indio,1
331 | Thomas Jack,2,2017,Indio,1
332 | Vanic,2,2017,Indio,1
333 | Drake,2,2017,Indio,1
334 | Kygo,2,2017,Indio,1
335 | Health,2,2017,Barcelona,1
336 | Muse,2,2017,Indio,1
337 | Modest Mouse,2,2017,Indio,1
338 | The Knocks,2,2017,LA,0
339 | The Chainsmokers,2,2017,London,0
340 | Avicii,2,2017,Vegas,0
341 | 2 Chainz,1,2017,Las Vegas,1
342 | Nick cave and the bad seeds,1,2017,Indio,1
343 | ,,,,
344 | ,,,,
345 | ,,,,
346 | ,,,,
347 | ,,,", ",
348 |
--------------------------------------------------------------------------------
/data/cancer_data_00.csv:
--------------------------------------------------------------------------------
1 | 1,23,12,151
2 | 0,9,13,133
3 | 1,21,27,130
4 | 1,14,16,78
5 | 1,9,19,135
6 | 0,25,25,83
7 | 1,16,26,120
8 | 1,15,18,90
9 | 1,19,24,88
10 | 1,25,11,84
11 | 1,24,21,103
12 | 1,17,15,104
13 | 0,14,15,132
14 | 1,12,22,104
15 | 1,12,13,94
16 | 1,22,19,97
17 | 1,10,16,95
18 | 1,15,14,108
19 | 1,20,14,130
20 | 0,17,11,87
21 | 0,16,14,86
22 | 0,17,24,60
23 | 1,20,27,103
24 | 1,19,12,137
25 | 1,9,13,110
26 | 1,19,27,116
27 | 1,10,24,97
28 | 1,16,24,122
29 | 1,15,15,102
30 | 1,11,16,115
31 | 1,11,22,125
32 | 1,23,26,78
33 | 1,20,18,113
34 | 1,11,21,128
35 | 1,16,23,107
36 | 1,10,13,110
37 | 1,18,12,94
38 | 0,21,11,83
39 | 1,11,15,96
40 | 1,10,14,88
41 | 1,24,16,86
42 | 1,19,27,72
43 | 1,11,11,128
44 | 1,15,21,87
45 | 1,10,15,85
46 | 1,18,11,124
47 | 0,22,12,52
48 | 1,20,14,86
49 | 0,20,21,78
50 | 0,25,11,87
51 | 0,19,25,75
52 | 0,19,22,87
53 | 0,25,15,76
54 | 1,14,26,120
55 | 1,18,25,97
56 | 0,18,13,73
57 | 1,10,19,126
58 | 1,17,20,96
59 | 0,22,15,83
60 | 0,23,26,54
61 | 0,15,18,65
62 | 0,25,15,55
63 | 1,12,22,96
64 | 0,24,17,59
65 | 1,16,19,83
66 | 1,11,21,97
67 | 0,12,13,60
68 | 0,18,12,72
69 | 0,16,17,59
70 | 0,17,21,81
71 | 1,21,18,124
72 | 0,9,26,59
73 | 1,21,12,114
74 | 1,22,25,90
75 | 0,18,13,79
76 | 1,21,18,104
77 | 0,10,17,88
78 | 1,11,21,120
79 | 1,16,18,144
80 | 0,22,16,83
81 | 0,10,18,74
82 | 0,17,21,86
83 | 1,10,15,172
84 | 1,20,14,129
85 | 0,25,21,77
86 | 1,14,13,121
87 | 1,19,26,94
88 | 1,19,11,122
89 | 0,11,11,80
90 | 0,12,23,96
91 | 0,23,27,95
92 | 1,10,12,100
93 | 0,14,14,85
94 | 0,10,17,87
95 | 1,22,26,100
96 | 1,23,16,132
97 | 0,22,14,78
98 | 0,19,27,62
99 | 0,21,24,74
100 | 1,16,27,94
--------------------------------------------------------------------------------
/data/cancer_data_01.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BlazingDB/bsql-demos/ebee8a606a272f3e2ab7a38587a6092fe2018d93/data/cancer_data_01.parquet
--------------------------------------------------------------------------------
/data/cancer_data_02.csv:
--------------------------------------------------------------------------------
1 | 0.278,0.242,0.079
2 | 0.079,0.181,0.057
3 | 0.16,0.207,0.06
4 | 0.284,0.26,0.097
5 | 0.133,0.181,0.059
6 | 0.17,0.209,0.076
7 | 0.109,0.179,0.057
8 | 0.165,0.22,0.075
9 | 0.193,0.235,0.074
10 | 0.24,0.203,0.082
11 | 0.067,0.153,0.057
12 | 0.129,0.184,0.061
13 | 0.246,0.24,0.078
14 | 0.1,0.185,0.053
15 | 0.229,0.207,0.077
16 | 0.16,0.23,0.071
17 | 0.072,0.159,0.059
18 | 0.202,0.216,0.074
19 | 0.103,0.158,0.054
20 | 0.081,0.189,0.058
21 | 0.127,0.197,0.068
22 | 0.065,0.182,0.069
23 | 0.214,0.252,0.07
24 | 0.102,0.177,0.053
25 | 0.146,0.2,0.063
26 | 0.228,0.304,0.074
27 | 0.187,0.225,0.069
28 | 0.107,0.17,0.057
29 | 0.17,0.193,0.065
30 | 0.116,0.174,0.061
31 | 0.189,0.218,0.062
32 | 0.152,0.23,0.078
33 | 0.15,0.225,0.064
34 | 0.172,0.185,0.063
35 | 0.156,0.2,0.065
36 | 0.134,0.19,0.057
37 | 0.11,0.189,0.061
38 | 0.038,0.147,0.059
39 | 0.051,0.157,0.055
40 | 0.126,0.172,0.064
41 | 0.06,0.178,0.056
42 | 0.122,0.19,0.069
43 | 0.219,0.231,0.063
44 | 0.144,0.197,0.068
45 | 0.105,0.175,0.062
46 | 0.169,0.191,0.06
47 | 0.059,0.177,0.065
48 | 0.123,0.213,0.068
49 | 0.091,0.168,0.06
50 | 0.077,0.181,0.057
51 | 0.05,0.15,0.059
52 | 0.061,0.135,0.06
53 | 0.048,0.187,0.061
54 | 0.149,0.209,0.063
55 | 0.071,0.162,0.057
56 | 0.055,0.192,0.059
57 | 0.127,0.192,0.06
58 | 0.137,0.203,0.068
59 | 0.038,0.182,0.055
60 | 0.053,0.168,0.072
61 | 0.081,0.274,0.07
62 | 0.09,0.183,0.068
63 | 0.201,0.195,0.073
64 | 0.088,0.234,0.07
65 | 0.126,0.191,0.066
66 | 0.148,0.195,0.067
67 | 0.078,0.172,0.069
68 | 0.047,0.152,0.057
69 | 0.141,0.211,0.08
70 | 0.052,0.159,0.057
71 | 0.103,0.158,0.055
72 | 0.153,0.19,0.09
73 | 0.183,0.193,0.065
74 | 0.128,0.166,0.066
75 | 0.068,0.172,0.059
76 | 0.084,0.18,0.054
77 | 0.105,0.24,0.066
78 | 0.215,0.215,0.067
79 | 0.345,0.291,0.081
80 | 0.095,0.172,0.06
81 | 0.094,0.184,0.07
82 | 0.154,0.194,0.069
83 | 0.267,0.183,0.068
84 | 0.179,0.163,0.072
85 | 0.072,0.208,0.06
86 | 0.105,0.213,0.06
87 | 0.099,0.208,0.056
88 | 0.121,0.195,0.056
89 | 0.094,0.193,0.064
90 | 0.134,0.212,0.063
91 | 0.086,0.169,0.059
92 | 0.104,0.172,0.061
93 | 0.051,0.139,0.053
94 | 0.082,0.164,0.057
95 | 0.155,0.186,0.063
96 | 0.131,0.21,0.056
97 | 0.071,0.19,0.066
98 | 0.053,0.135,0.069
99 | 0.075,0.162,0.066
100 | 0.114,0.188,0.064
--------------------------------------------------------------------------------
/federated_query_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "8AdUt3HiUrc3"
8 | },
9 | "source": [
10 | "# Querying Multiple Data Formats \n",
11 | "In this notebook, we will cover: \n",
12 | "- How to create and then join BlazingSQL tables from CSV, Parquet, and GPU DataFrame (GDF) sources. \n",
13 | "\n",
14 | "## Imports"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import os\n",
24 | "import cudf\n",
25 | "from blazingsql import BlazingContext"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {
31 | "colab_type": "text",
32 | "id": "aMwNKxePSwOp"
33 | },
34 | "source": [
35 | "## Import packages and create BlazingContext\n",
36 | "You can think of the BlazingContext much like a SparkContext; this is where information such as FileSystems you have registered and Tables you have created will be stored. "
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "metadata": {
43 | "colab": {
44 | "base_uri": "https://localhost:8080/",
45 | "height": 35
46 | },
47 | "colab_type": "code",
48 | "id": "azZ7l2q7odYT",
49 | "outputId": "a5302d6e-307e-45c5-a682-c786cc999a40"
50 | },
51 | "outputs": [
52 | {
53 | "name": "stdout",
54 | "output_type": "stream",
55 | "text": [
56 | "BlazingContext ready\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "# start up BlazingSQL\n",
62 | "bc = BlazingContext()"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {
68 | "colab_type": "text",
69 | "id": "N2bqpDEnZyQf"
70 | },
71 | "source": [
72 | "### Create Table from CSV\n",
73 | "Here we create a BlazingSQL table directly from a comma-separated values (CSV) file."
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 3,
79 | "metadata": {
80 | "colab": {},
81 | "colab_type": "code",
82 | "id": "HhRhj-ZvZygH"
83 | },
84 | "outputs": [
85 | {
86 | "data": {
87 | "text/plain": [
88 | ""
89 | ]
90 | },
91 | "execution_count": 3,
92 | "metadata": {},
93 | "output_type": "execute_result"
94 | }
95 | ],
96 | "source": [
97 | "# define column names and types\n",
98 | "column_names = ['diagnosis_result', 'radius', 'texture', 'perimeter']\n",
99 | "column_types = ['float32', 'float32', 'float32', 'float32']\n",
100 | "\n",
101 | "# identify local directory path \n",
102 | "cwd = os.getcwd()\n",
103 | "# add path to data\n",
104 | "data_path = cwd + '/data/cancer_data_00.csv'\n",
105 | "\n",
106 | "# create table from CSV file\n",
107 | "bc.create_table('data_00', data_path, dtype=column_types, names=column_names)"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {
113 | "colab_type": "text",
114 | "id": "HJFz-mqZTJ5Z"
115 | },
116 | "source": [
117 | "### Create Table from Parquet\n",
118 | "Here we create a BlazingSQL table directly from an Apache Parquet file."
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 4,
124 | "metadata": {
125 | "colab": {},
126 | "colab_type": "code",
127 | "id": "HJuvtJDYTMyb"
128 | },
129 | "outputs": [
130 | {
131 | "data": {
132 | "text/plain": [
133 | ""
134 | ]
135 | },
136 | "execution_count": 4,
137 | "metadata": {},
138 | "output_type": "execute_result"
139 | }
140 | ],
141 | "source": [
142 | "# create table from Parquet file\n",
143 | "bc.create_table('data_01', cwd + '/data/cancer_data_01.parquet')"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {
149 | "colab_type": "text",
150 | "id": "98HJFrt5TRa0"
151 | },
152 | "source": [
153 | "### Create Table from GPU DataFrame\n",
154 | "Here we use cuDF to create a GPU DataFrame (GDF), then use BlazingSQL to create a table from that GDF.\n",
155 | "\n",
156 | "The GDF is the standard memory representation for the RAPIDS AI ecosystem."
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 5,
162 | "metadata": {
163 | "colab": {},
164 | "colab_type": "code",
165 | "id": "14GwxmLsTV_p",
166 | "scrolled": true
167 | },
168 | "outputs": [
169 | {
170 | "data": {
171 | "text/plain": [
172 | ""
173 | ]
174 | },
175 | "execution_count": 5,
176 | "metadata": {},
177 | "output_type": "execute_result"
178 | }
179 | ],
180 | "source": [
181 | "# define column names and types\n",
182 | "column_names = ['compactness', 'symmetry', 'fractal_dimension']\n",
183 | "column_types = ['float32', 'float32', 'float32', 'float32']\n",
184 | "\n",
185 | "# make GDF with cuDF (uses relative path)\n",
186 | "gdf_02 = cudf.read_csv('data/cancer_data_02.csv', dtype=column_types, names=column_names)\n",
187 | "\n",
188 | "# create BlazingSQL table from GDF\n",
189 | "bc.create_table('data_02', gdf_02)"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {
195 | "colab_type": "text",
196 | "id": "9DAZShZ2y-Nx"
197 | },
198 | "source": [
199 | "# Join Tables Together \n",
200 | "\n",
201 | "Now we can use BlazingSQL to join all three data formats in a single federated query. "
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": 6,
207 | "metadata": {
208 | "colab": {
209 | "base_uri": "https://localhost:8080/",
210 | "height": 1000
211 | },
212 | "colab_type": "code",
213 | "id": "HOYSFebvzGcX",
214 | "outputId": "ad133dfd-540e-4142-8f12-a4a70d803bb6",
215 | "scrolled": true
216 | },
217 | "outputs": [
218 | {
219 | "data": {
220 | "text/html": [
221 | "\n",
222 | "\n",
235 | "
\n",
236 | " \n",
237 | " \n",
238 | " | \n",
239 | " diagnosis_result | \n",
240 | " radius | \n",
241 | " texture | \n",
242 | " perimeter | \n",
243 | " area | \n",
244 | " smoothness | \n",
245 | " compactness | \n",
246 | " symmetry | \n",
247 | " fractal_dimension | \n",
248 | "
\n",
249 | " \n",
250 | " \n",
251 | " \n",
252 | " 0 | \n",
253 | " 1.0 | \n",
254 | " 11.0 | \n",
255 | " 21.0 | \n",
256 | " 120.0 | \n",
257 | " 1033.0 | \n",
258 | " 0.115 | \n",
259 | " 0.149000004 | \n",
260 | " 0.209000006 | \n",
261 | " 0.063000001 | \n",
262 | "
\n",
263 | " \n",
264 | " 1 | \n",
265 | " 0.0 | \n",
266 | " 17.0 | \n",
267 | " 21.0 | \n",
268 | " 86.0 | \n",
269 | " 563.0 | \n",
270 | " 0.082 | \n",
271 | " 0.059999999 | \n",
272 | " 0.178000003 | \n",
273 | " 0.056000002 | \n",
274 | "
\n",
275 | " \n",
276 | " 2 | \n",
277 | " 1.0 | \n",
278 | " 19.0 | \n",
279 | " 26.0 | \n",
280 | " 94.0 | \n",
281 | " 578.0 | \n",
282 | " 0.113 | \n",
283 | " 0.229000002 | \n",
284 | " 0.207000002 | \n",
285 | " 0.077 | \n",
286 | "
\n",
287 | " \n",
288 | " 3 | \n",
289 | " 1.0 | \n",
290 | " 19.0 | \n",
291 | " 11.0 | \n",
292 | " 122.0 | \n",
293 | " 1094.0 | \n",
294 | " 0.094 | \n",
295 | " 0.107000001 | \n",
296 | " 0.170000002 | \n",
297 | " 0.057 | \n",
298 | "
\n",
299 | " \n",
300 | " 4 | \n",
301 | " 0.0 | \n",
302 | " 10.0 | \n",
303 | " 17.0 | \n",
304 | " 87.0 | \n",
305 | " 566.0 | \n",
306 | " 0.098 | \n",
307 | " 0.081 | \n",
308 | " 0.18900001 | \n",
309 | " 0.058000002 | \n",
310 | "
\n",
311 | " \n",
312 | " 5 | \n",
313 | " 1.0 | \n",
314 | " 16.0 | \n",
315 | " 19.0 | \n",
316 | " 83.0 | \n",
317 | " 477.0 | \n",
318 | " 0.128 | \n",
319 | " 0.170000002 | \n",
320 | " 0.209000006 | \n",
321 | " 0.075999998 | \n",
322 | "
\n",
323 | " \n",
324 | " 6 | \n",
325 | " 0.0 | \n",
326 | " 22.0 | \n",
327 | " 16.0 | \n",
328 | " 83.0 | \n",
329 | " 477.0 | \n",
330 | " 0.128 | \n",
331 | " 0.170000002 | \n",
332 | " 0.209000006 | \n",
333 | " 0.075999998 | \n",
334 | "
\n",
335 | " \n",
336 | " 7 | \n",
337 | " 0.0 | \n",
338 | " 17.0 | \n",
339 | " 21.0 | \n",
340 | " 86.0 | \n",
341 | " 535.0 | \n",
342 | " 0.116 | \n",
343 | " 0.123000003 | \n",
344 | " 0.213000014 | \n",
345 | " 0.067999996 | \n",
346 | "
\n",
347 | " \n",
348 | " 8 | \n",
349 | " 0.0 | \n",
350 | " 10.0 | \n",
351 | " 17.0 | \n",
352 | " 87.0 | \n",
353 | " 545.0 | \n",
354 | " 0.104 | \n",
355 | " 0.143999994 | \n",
356 | " 0.196999997 | \n",
357 | " 0.067999996 | \n",
358 | "
\n",
359 | " \n",
360 | " 9 | \n",
361 | " 1.0 | \n",
362 | " 23.0 | \n",
363 | " 16.0 | \n",
364 | " 132.0 | \n",
365 | " 1123.0 | \n",
366 | " 0.097 | \n",
367 | " 0.246000007 | \n",
368 | " 0.24000001 | \n",
369 | " 0.078000002 | \n",
370 | "
\n",
371 | " \n",
372 | " 10 | \n",
373 | " 1.0 | \n",
374 | " 16.0 | \n",
375 | " 19.0 | \n",
376 | " 83.0 | \n",
377 | " 524.0 | \n",
378 | " 0.090 | \n",
379 | " 0.037999999 | \n",
380 | " 0.147 | \n",
381 | " 0.059 | \n",
382 | "
\n",
383 | " \n",
384 | " 11 | \n",
385 | " 1.0 | \n",
386 | " 21.0 | \n",
387 | " 18.0 | \n",
388 | " 124.0 | \n",
389 | " 1076.0 | \n",
390 | " 0.110 | \n",
391 | " 0.169 | \n",
392 | " 0.191 | \n",
393 | " 0.059999999 | \n",
394 | "
\n",
395 | " \n",
396 | " 12 | \n",
397 | " 0.0 | \n",
398 | " 22.0 | \n",
399 | " 16.0 | \n",
400 | " 83.0 | \n",
401 | " 524.0 | \n",
402 | " 0.090 | \n",
403 | " 0.037999999 | \n",
404 | " 0.147 | \n",
405 | " 0.059 | \n",
406 | "
\n",
407 | " \n",
408 | " 13 | \n",
409 | " 1.0 | \n",
410 | " 19.0 | \n",
411 | " 26.0 | \n",
412 | " 94.0 | \n",
413 | " 633.0 | \n",
414 | " 0.098 | \n",
415 | " 0.109999999 | \n",
416 | " 0.18900001 | \n",
417 | " 0.060999997 | \n",
418 | "
\n",
419 | " \n",
420 | " 14 | \n",
421 | " 1.0 | \n",
422 | " 16.0 | \n",
423 | " 19.0 | \n",
424 | " 83.0 | \n",
425 | " 527.0 | \n",
426 | " 0.081 | \n",
427 | " 0.037999999 | \n",
428 | " 0.147 | \n",
429 | " 0.059 | \n",
430 | "
\n",
431 | " \n",
432 | " 15 | \n",
433 | " 0.0 | \n",
434 | " 22.0 | \n",
435 | " 16.0 | \n",
436 | " 83.0 | \n",
437 | " 527.0 | \n",
438 | " 0.081 | \n",
439 | " 0.037999999 | \n",
440 | " 0.147 | \n",
441 | " 0.059 | \n",
442 | "
\n",
443 | " \n",
444 | " 16 | \n",
445 | " 1.0 | \n",
446 | " 10.0 | \n",
447 | " 12.0 | \n",
448 | " 100.0 | \n",
449 | " 706.0 | \n",
450 | " 0.104 | \n",
451 | " 0.155000001 | \n",
452 | " 0.186000004 | \n",
453 | " 0.063000001 | \n",
454 | "
\n",
455 | " \n",
456 | " 17 | \n",
457 | " 1.0 | \n",
458 | " 22.0 | \n",
459 | " 26.0 | \n",
460 | " 100.0 | \n",
461 | " 706.0 | \n",
462 | " 0.104 | \n",
463 | " 0.155000001 | \n",
464 | " 0.186000004 | \n",
465 | " 0.063000001 | \n",
466 | "
\n",
467 | " \n",
468 | " 18 | \n",
469 | " 0.0 | \n",
470 | " 10.0 | \n",
471 | " 17.0 | \n",
472 | " 87.0 | \n",
473 | " 561.0 | \n",
474 | " 0.088 | \n",
475 | " 0.077 | \n",
476 | " 0.181000009 | \n",
477 | " 0.057 | \n",
478 | "
\n",
479 | " \n",
480 | " 19 | \n",
481 | " 0.0 | \n",
482 | " 12.0 | \n",
483 | " 23.0 | \n",
484 | " 96.0 | \n",
485 | " 699.0 | \n",
486 | " 0.094 | \n",
487 | " 0.050999999 | \n",
488 | " 0.157000005 | \n",
489 | " 0.055 | \n",
490 | "
\n",
491 | " \n",
492 | " 20 | \n",
493 | " 0.0 | \n",
494 | " 18.0 | \n",
495 | " 12.0 | \n",
496 | " 72.0 | \n",
497 | " 371.0 | \n",
498 | " 0.123 | \n",
499 | " 0.122000001 | \n",
500 | " 0.189999998 | \n",
501 | " 0.068999998 | \n",
502 | "
\n",
503 | " \n",
504 | " 21 | \n",
505 | " 0.0 | \n",
506 | " 12.0 | \n",
507 | " 23.0 | \n",
508 | " 96.0 | \n",
509 | " 657.0 | \n",
510 | " 0.114 | \n",
511 | " 0.136999995 | \n",
512 | " 0.203000009 | \n",
513 | " 0.067999996 | \n",
514 | "
\n",
515 | " \n",
516 | " 22 | \n",
517 | " 0.0 | \n",
518 | " 12.0 | \n",
519 | " 23.0 | \n",
520 | " 96.0 | \n",
521 | " 646.0 | \n",
522 | " 0.105 | \n",
523 | " 0.201000005 | \n",
524 | " 0.194999993 | \n",
525 | " 0.072999999 | \n",
526 | "
\n",
527 | " \n",
528 | " 23 | \n",
529 | " 0.0 | \n",
530 | " 10.0 | \n",
531 | " 18.0 | \n",
532 | " 74.0 | \n",
533 | " 413.0 | \n",
534 | " 0.090 | \n",
535 | " 0.075000003 | \n",
536 | " 0.162 | \n",
537 | " 0.066 | \n",
538 | "
\n",
539 | " \n",
540 | " 24 | \n",
541 | " 1.0 | \n",
542 | " 11.0 | \n",
543 | " 21.0 | \n",
544 | " 97.0 | \n",
545 | " 713.0 | \n",
546 | " 0.091 | \n",
547 | " 0.071000002 | \n",
548 | " 0.162 | \n",
549 | " 0.057 | \n",
550 | "
\n",
551 | " \n",
552 | " 25 | \n",
553 | " 0.0 | \n",
554 | " 22.0 | \n",
555 | " 16.0 | \n",
556 | " 83.0 | \n",
557 | " 506.0 | \n",
558 | " 0.099 | \n",
559 | " null | \n",
560 | " null | \n",
561 | " null | \n",
562 | "
\n",
563 | " \n",
564 | " 26 | \n",
565 | " 0.0 | \n",
566 | " 14.0 | \n",
567 | " 14.0 | \n",
568 | " 85.0 | \n",
569 | " 532.0 | \n",
570 | " 0.097 | \n",
571 | " null | \n",
572 | " null | \n",
573 | " null | \n",
574 | "
\n",
575 | " \n",
576 | " 27 | \n",
577 | " 0.0 | \n",
578 | " 10.0 | \n",
579 | " 17.0 | \n",
580 | " 87.0 | \n",
581 | " 572.0 | \n",
582 | " 0.077 | \n",
583 | " null | \n",
584 | " null | \n",
585 | " null | \n",
586 | "
\n",
587 | " \n",
588 | " 28 | \n",
589 | " 1.0 | \n",
590 | " 16.0 | \n",
591 | " 19.0 | \n",
592 | " 83.0 | \n",
593 | " 477.0 | \n",
594 | " 0.128 | \n",
595 | " 0.170000002 | \n",
596 | " 0.193000004 | \n",
597 | " 0.064999998 | \n",
598 | "
\n",
599 | " \n",
600 | " 29 | \n",
601 | " 0.0 | \n",
602 | " 22.0 | \n",
603 | " 16.0 | \n",
604 | " 83.0 | \n",
605 | " 477.0 | \n",
606 | " 0.128 | \n",
607 | " 0.170000002 | \n",
608 | " 0.193000004 | \n",
609 | " 0.064999998 | \n",
610 | "
\n",
611 | " \n",
612 | " ... | \n",
613 | " ... | \n",
614 | " ... | \n",
615 | " ... | \n",
616 | " ... | \n",
617 | " ... | \n",
618 | " ... | \n",
619 | " ... | \n",
620 | " ... | \n",
621 | " ... | \n",
622 | "
\n",
623 | " \n",
624 | " 286 | \n",
625 | " 1.0 | \n",
626 | " 11.0 | \n",
627 | " 21.0 | \n",
628 | " 97.0 | \n",
629 | " 659.0 | \n",
630 | " 0.114 | \n",
631 | " 0.159999996 | \n",
632 | " 0.207000002 | \n",
633 | " 0.059999999 | \n",
634 | "
\n",
635 | " \n",
636 | " 287 | \n",
637 | " 0.0 | \n",
638 | " 12.0 | \n",
639 | " 13.0 | \n",
640 | " 60.0 | \n",
641 | " 274.0 | \n",
642 | " 0.102 | \n",
643 | " 0.064999998 | \n",
644 | " 0.182000011 | \n",
645 | " 0.068999998 | \n",
646 | "
\n",
647 | " \n",
648 | " 288 | \n",
649 | " 0.0 | \n",
650 | " 10.0 | \n",
651 | " 17.0 | \n",
652 | " 88.0 | \n",
653 | " 520.0 | \n",
654 | " 0.127 | \n",
655 | " 0.193000004 | \n",
656 | " 0.234999999 | \n",
657 | " 0.074000001 | \n",
658 | "
\n",
659 | " \n",
660 | " 289 | \n",
661 | " 0.0 | \n",
662 | " 17.0 | \n",
663 | " 21.0 | \n",
664 | " 86.0 | \n",
665 | " 520.0 | \n",
666 | " 0.108 | \n",
667 | " 0.127000004 | \n",
668 | " 0.196999997 | \n",
669 | " 0.067999996 | \n",
670 | "
\n",
671 | " \n",
672 | " 290 | \n",
673 | " 1.0 | \n",
674 | " 19.0 | \n",
675 | " 26.0 | \n",
676 | " 94.0 | \n",
677 | " 643.0 | \n",
678 | " 0.098 | \n",
679 | " 0.114 | \n",
680 | " 0.188000008 | \n",
681 | " 0.063999996 | \n",
682 | "
\n",
683 | " \n",
684 | " 291 | \n",
685 | " 0.0 | \n",
686 | " 23.0 | \n",
687 | " 27.0 | \n",
688 | " 95.0 | \n",
689 | " 685.0 | \n",
690 | " 0.099 | \n",
691 | " 0.071999997 | \n",
692 | " 0.159000009 | \n",
693 | " 0.059 | \n",
694 | "
\n",
695 | " \n",
696 | " 292 | \n",
697 | " 1.0 | \n",
698 | " 11.0 | \n",
699 | " 21.0 | \n",
700 | " 97.0 | \n",
701 | " 645.0 | \n",
702 | " 0.105 | \n",
703 | " 0.187000006 | \n",
704 | " 0.224999994 | \n",
705 | " 0.068999998 | \n",
706 | "
\n",
707 | " \n",
708 | " 293 | \n",
709 | " 0.0 | \n",
710 | " 16.0 | \n",
711 | " 17.0 | \n",
712 | " 59.0 | \n",
713 | " 261.0 | \n",
714 | " 0.077 | \n",
715 | " 0.088 | \n",
716 | " 0.233999997 | \n",
717 | " 0.07 | \n",
718 | "
\n",
719 | " \n",
720 | " 294 | \n",
721 | " 0.0 | \n",
722 | " 9.0 | \n",
723 | " 26.0 | \n",
724 | " 59.0 | \n",
725 | " 261.0 | \n",
726 | " 0.077 | \n",
727 | " 0.088 | \n",
728 | " 0.233999997 | \n",
729 | " 0.07 | \n",
730 | "
\n",
731 | " \n",
732 | " 295 | \n",
733 | " 1.0 | \n",
734 | " 21.0 | \n",
735 | " 18.0 | \n",
736 | " 104.0 | \n",
737 | " 783.0 | \n",
738 | " 0.084 | \n",
739 | " 0.100000001 | \n",
740 | " 0.185000002 | \n",
741 | " 0.052999999 | \n",
742 | "
\n",
743 | " \n",
744 | " 296 | \n",
745 | " 0.0 | \n",
746 | " 10.0 | \n",
747 | " 17.0 | \n",
748 | " 88.0 | \n",
749 | " 559.0 | \n",
750 | " 0.102 | \n",
751 | " 0.126000002 | \n",
752 | " 0.172000006 | \n",
753 | " 0.063999996 | \n",
754 | "
\n",
755 | " \n",
756 | " 297 | \n",
757 | " 1.0 | \n",
758 | " 14.0 | \n",
759 | " 13.0 | \n",
760 | " 121.0 | \n",
761 | " 1075.0 | \n",
762 | " 0.099 | \n",
763 | " null | \n",
764 | " null | \n",
765 | " null | \n",
766 | "
\n",
767 | " \n",
768 | " 298 | \n",
769 | " 1.0 | \n",
770 | " 19.0 | \n",
771 | " 26.0 | \n",
772 | " 94.0 | \n",
773 | " 648.0 | \n",
774 | " 0.094 | \n",
775 | " null | \n",
776 | " null | \n",
777 | " null | \n",
778 | "
\n",
779 | " \n",
780 | " 299 | \n",
781 | " 1.0 | \n",
782 | " 19.0 | \n",
783 | " 11.0 | \n",
784 | " 122.0 | \n",
785 | " 1076.0 | \n",
786 | " 0.090 | \n",
787 | " null | \n",
788 | " null | \n",
789 | " null | \n",
790 | "
\n",
791 | " \n",
792 | " 300 | \n",
793 | " 0.0 | \n",
794 | " 11.0 | \n",
795 | " 11.0 | \n",
796 | " 80.0 | \n",
797 | " 466.0 | \n",
798 | " 0.088 | \n",
799 | " null | \n",
800 | " null | \n",
801 | " null | \n",
802 | "
\n",
803 | " \n",
804 | " 301 | \n",
805 | " 0.0 | \n",
806 | " 12.0 | \n",
807 | " 23.0 | \n",
808 | " 96.0 | \n",
809 | " 652.0 | \n",
810 | " 0.113 | \n",
811 | " null | \n",
812 | " null | \n",
813 | " null | \n",
814 | "
\n",
815 | " \n",
816 | " 302 | \n",
817 | " 0.0 | \n",
818 | " 23.0 | \n",
819 | " 27.0 | \n",
820 | " 95.0 | \n",
821 | " 663.0 | \n",
822 | " 0.090 | \n",
823 | " null | \n",
824 | " null | \n",
825 | " null | \n",
826 | "
\n",
827 | " \n",
828 | " 303 | \n",
829 | " 0.0 | \n",
830 | " 10.0 | \n",
831 | " 17.0 | \n",
832 | " 87.0 | \n",
833 | " 555.0 | \n",
834 | " 0.102 | \n",
835 | " null | \n",
836 | " null | \n",
837 | " null | \n",
838 | "
\n",
839 | " \n",
840 | " 304 | \n",
841 | " 0.0 | \n",
842 | " 16.0 | \n",
843 | " 17.0 | \n",
844 | " 59.0 | \n",
845 | " 244.0 | \n",
846 | " 0.098 | \n",
847 | " null | \n",
848 | " null | \n",
849 | " null | \n",
850 | "
\n",
851 | " \n",
852 | " 305 | \n",
853 | " 0.0 | \n",
854 | " 9.0 | \n",
855 | " 26.0 | \n",
856 | " 59.0 | \n",
857 | " 244.0 | \n",
858 | " 0.098 | \n",
859 | " null | \n",
860 | " null | \n",
861 | " null | \n",
862 | "
\n",
863 | " \n",
864 | " 306 | \n",
865 | " 1.0 | \n",
866 | " 21.0 | \n",
867 | " 18.0 | \n",
868 | " 104.0 | \n",
869 | " 781.0 | \n",
870 | " 0.097 | \n",
871 | " null | \n",
872 | " null | \n",
873 | " null | \n",
874 | "
\n",
875 | " \n",
876 | " 307 | \n",
877 | " 1.0 | \n",
878 | " 11.0 | \n",
879 | " 21.0 | \n",
880 | " 120.0 | \n",
881 | " 1040.0 | \n",
882 | " 0.095 | \n",
883 | " null | \n",
884 | " null | \n",
885 | " null | \n",
886 | "
\n",
887 | " \n",
888 | " 308 | \n",
889 | " 1.0 | \n",
890 | " 16.0 | \n",
891 | " 19.0 | \n",
892 | " 83.0 | \n",
893 | " 506.0 | \n",
894 | " 0.099 | \n",
895 | " null | \n",
896 | " null | \n",
897 | " null | \n",
898 | "
\n",
899 | " \n",
900 | " 309 | \n",
901 | " 1.0 | \n",
902 | " 22.0 | \n",
903 | " 25.0 | \n",
904 | " 90.0 | \n",
905 | " 578.0 | \n",
906 | " 0.119 | \n",
907 | " null | \n",
908 | " null | \n",
909 | " null | \n",
910 | "
\n",
911 | " \n",
912 | " 310 | \n",
913 | " 1.0 | \n",
914 | " 11.0 | \n",
915 | " 21.0 | \n",
916 | " 97.0 | \n",
917 | " 659.0 | \n",
918 | " 0.114 | \n",
919 | " 0.159999996 | \n",
920 | " 0.230000004 | \n",
921 | " 0.071000002 | \n",
922 | "
\n",
923 | " \n",
924 | " 311 | \n",
925 | " 0.0 | \n",
926 | " 14.0 | \n",
927 | " 14.0 | \n",
928 | " 85.0 | \n",
929 | " 552.0 | \n",
930 | " 0.074 | \n",
931 | " 0.050999999 | \n",
932 | " 0.138999999 | \n",
933 | " 0.052999999 | \n",
934 | "
\n",
935 | " \n",
936 | " 312 | \n",
937 | " 0.0 | \n",
938 | " 25.0 | \n",
939 | " 21.0 | \n",
940 | " 77.0 | \n",
941 | " 443.0 | \n",
942 | " 0.097 | \n",
943 | " 0.071999997 | \n",
944 | " 0.208000004 | \n",
945 | " 0.059999999 | \n",
946 | "
\n",
947 | " \n",
948 | " 313 | \n",
949 | " 0.0 | \n",
950 | " 17.0 | \n",
951 | " 21.0 | \n",
952 | " 86.0 | \n",
953 | " 520.0 | \n",
954 | " 0.108 | \n",
955 | " 0.127000004 | \n",
956 | " 0.192000002 | \n",
957 | " 0.059999999 | \n",
958 | "
\n",
959 | " \n",
960 | " 314 | \n",
961 | " 0.0 | \n",
962 | " 23.0 | \n",
963 | " 27.0 | \n",
964 | " 95.0 | \n",
965 | " 685.0 | \n",
966 | " 0.099 | \n",
967 | " 0.071999997 | \n",
968 | " 0.208000004 | \n",
969 | " 0.059999999 | \n",
970 | "
\n",
971 | " \n",
972 | " 315 | \n",
973 | " 0.0 | \n",
974 | " 10.0 | \n",
975 | " 17.0 | \n",
976 | " 88.0 | \n",
977 | " 559.0 | \n",
978 | " 0.102 | \n",
979 | " 0.126000002 | \n",
980 | " 0.191 | \n",
981 | " 0.066 | \n",
982 | "
\n",
983 | " \n",
984 | "
\n",
985 | "
316 rows × 9 columns
\n",
986 | "
"
987 | ],
988 | "text/plain": [
989 | " diagnosis_result radius texture perimeter area smoothness \\\n",
990 | "0 1.0 11.0 21.0 120.0 1033.0 0.115 \n",
991 | "1 0.0 17.0 21.0 86.0 563.0 0.082 \n",
992 | "2 1.0 19.0 26.0 94.0 578.0 0.113 \n",
993 | "3 1.0 19.0 11.0 122.0 1094.0 0.094 \n",
994 | "4 0.0 10.0 17.0 87.0 566.0 0.098 \n",
995 | "5 1.0 16.0 19.0 83.0 477.0 0.128 \n",
996 | "6 0.0 22.0 16.0 83.0 477.0 0.128 \n",
997 | "7 0.0 17.0 21.0 86.0 535.0 0.116 \n",
998 | "8 0.0 10.0 17.0 87.0 545.0 0.104 \n",
999 | "9 1.0 23.0 16.0 132.0 1123.0 0.097 \n",
1000 | "10 1.0 16.0 19.0 83.0 524.0 0.090 \n",
1001 | "11 1.0 21.0 18.0 124.0 1076.0 0.110 \n",
1002 | "12 0.0 22.0 16.0 83.0 524.0 0.090 \n",
1003 | "13 1.0 19.0 26.0 94.0 633.0 0.098 \n",
1004 | "14 1.0 16.0 19.0 83.0 527.0 0.081 \n",
1005 | "15 0.0 22.0 16.0 83.0 527.0 0.081 \n",
1006 | "16 1.0 10.0 12.0 100.0 706.0 0.104 \n",
1007 | "17 1.0 22.0 26.0 100.0 706.0 0.104 \n",
1008 | "18 0.0 10.0 17.0 87.0 561.0 0.088 \n",
1009 | "19 0.0 12.0 23.0 96.0 699.0 0.094 \n",
1010 | "20 0.0 18.0 12.0 72.0 371.0 0.123 \n",
1011 | "21 0.0 12.0 23.0 96.0 657.0 0.114 \n",
1012 | "22 0.0 12.0 23.0 96.0 646.0 0.105 \n",
1013 | "23 0.0 10.0 18.0 74.0 413.0 0.090 \n",
1014 | "24 1.0 11.0 21.0 97.0 713.0 0.091 \n",
1015 | "25 0.0 22.0 16.0 83.0 506.0 0.099 \n",
1016 | "26 0.0 14.0 14.0 85.0 532.0 0.097 \n",
1017 | "27 0.0 10.0 17.0 87.0 572.0 0.077 \n",
1018 | "28 1.0 16.0 19.0 83.0 477.0 0.128 \n",
1019 | "29 0.0 22.0 16.0 83.0 477.0 0.128 \n",
1020 | ".. ... ... ... ... ... ... \n",
1021 | "286 1.0 11.0 21.0 97.0 659.0 0.114 \n",
1022 | "287 0.0 12.0 13.0 60.0 274.0 0.102 \n",
1023 | "288 0.0 10.0 17.0 88.0 520.0 0.127 \n",
1024 | "289 0.0 17.0 21.0 86.0 520.0 0.108 \n",
1025 | "290 1.0 19.0 26.0 94.0 643.0 0.098 \n",
1026 | "291 0.0 23.0 27.0 95.0 685.0 0.099 \n",
1027 | "292 1.0 11.0 21.0 97.0 645.0 0.105 \n",
1028 | "293 0.0 16.0 17.0 59.0 261.0 0.077 \n",
1029 | "294 0.0 9.0 26.0 59.0 261.0 0.077 \n",
1030 | "295 1.0 21.0 18.0 104.0 783.0 0.084 \n",
1031 | "296 0.0 10.0 17.0 88.0 559.0 0.102 \n",
1032 | "297 1.0 14.0 13.0 121.0 1075.0 0.099 \n",
1033 | "298 1.0 19.0 26.0 94.0 648.0 0.094 \n",
1034 | "299 1.0 19.0 11.0 122.0 1076.0 0.090 \n",
1035 | "300 0.0 11.0 11.0 80.0 466.0 0.088 \n",
1036 | "301 0.0 12.0 23.0 96.0 652.0 0.113 \n",
1037 | "302 0.0 23.0 27.0 95.0 663.0 0.090 \n",
1038 | "303 0.0 10.0 17.0 87.0 555.0 0.102 \n",
1039 | "304 0.0 16.0 17.0 59.0 244.0 0.098 \n",
1040 | "305 0.0 9.0 26.0 59.0 244.0 0.098 \n",
1041 | "306 1.0 21.0 18.0 104.0 781.0 0.097 \n",
1042 | "307 1.0 11.0 21.0 120.0 1040.0 0.095 \n",
1043 | "308 1.0 16.0 19.0 83.0 506.0 0.099 \n",
1044 | "309 1.0 22.0 25.0 90.0 578.0 0.119 \n",
1045 | "310 1.0 11.0 21.0 97.0 659.0 0.114 \n",
1046 | "311 0.0 14.0 14.0 85.0 552.0 0.074 \n",
1047 | "312 0.0 25.0 21.0 77.0 443.0 0.097 \n",
1048 | "313 0.0 17.0 21.0 86.0 520.0 0.108 \n",
1049 | "314 0.0 23.0 27.0 95.0 685.0 0.099 \n",
1050 | "315 0.0 10.0 17.0 88.0 559.0 0.102 \n",
1051 | "\n",
1052 | " compactness symmetry fractal_dimension \n",
1053 | "0 0.149000004 0.209000006 0.063000001 \n",
1054 | "1 0.059999999 0.178000003 0.056000002 \n",
1055 | "2 0.229000002 0.207000002 0.077 \n",
1056 | "3 0.107000001 0.170000002 0.057 \n",
1057 | "4 0.081 0.18900001 0.058000002 \n",
1058 | "5 0.170000002 0.209000006 0.075999998 \n",
1059 | "6 0.170000002 0.209000006 0.075999998 \n",
1060 | "7 0.123000003 0.213000014 0.067999996 \n",
1061 | "8 0.143999994 0.196999997 0.067999996 \n",
1062 | "9 0.246000007 0.24000001 0.078000002 \n",
1063 | "10 0.037999999 0.147 0.059 \n",
1064 | "11 0.169 0.191 0.059999999 \n",
1065 | "12 0.037999999 0.147 0.059 \n",
1066 | "13 0.109999999 0.18900001 0.060999997 \n",
1067 | "14 0.037999999 0.147 0.059 \n",
1068 | "15 0.037999999 0.147 0.059 \n",
1069 | "16 0.155000001 0.186000004 0.063000001 \n",
1070 | "17 0.155000001 0.186000004 0.063000001 \n",
1071 | "18 0.077 0.181000009 0.057 \n",
1072 | "19 0.050999999 0.157000005 0.055 \n",
1073 | "20 0.122000001 0.189999998 0.068999998 \n",
1074 | "21 0.136999995 0.203000009 0.067999996 \n",
1075 | "22 0.201000005 0.194999993 0.072999999 \n",
1076 | "23 0.075000003 0.162 0.066 \n",
1077 | "24 0.071000002 0.162 0.057 \n",
1078 | "25 null null null \n",
1079 | "26 null null null \n",
1080 | "27 null null null \n",
1081 | "28 0.170000002 0.193000004 0.064999998 \n",
1082 | "29 0.170000002 0.193000004 0.064999998 \n",
1083 | ".. ... ... ... \n",
1084 | "286 0.159999996 0.207000002 0.059999999 \n",
1085 | "287 0.064999998 0.182000011 0.068999998 \n",
1086 | "288 0.193000004 0.234999999 0.074000001 \n",
1087 | "289 0.127000004 0.196999997 0.067999996 \n",
1088 | "290 0.114 0.188000008 0.063999996 \n",
1089 | "291 0.071999997 0.159000009 0.059 \n",
1090 | "292 0.187000006 0.224999994 0.068999998 \n",
1091 | "293 0.088 0.233999997 0.07 \n",
1092 | "294 0.088 0.233999997 0.07 \n",
1093 | "295 0.100000001 0.185000002 0.052999999 \n",
1094 | "296 0.126000002 0.172000006 0.063999996 \n",
1095 | "297 null null null \n",
1096 | "298 null null null \n",
1097 | "299 null null null \n",
1098 | "300 null null null \n",
1099 | "301 null null null \n",
1100 | "302 null null null \n",
1101 | "303 null null null \n",
1102 | "304 null null null \n",
1103 | "305 null null null \n",
1104 | "306 null null null \n",
1105 | "307 null null null \n",
1106 | "308 null null null \n",
1107 | "309 null null null \n",
1108 | "310 0.159999996 0.230000004 0.071000002 \n",
1109 | "311 0.050999999 0.138999999 0.052999999 \n",
1110 | "312 0.071999997 0.208000004 0.059999999 \n",
1111 | "313 0.127000004 0.192000002 0.059999999 \n",
1112 | "314 0.071999997 0.208000004 0.059999999 \n",
1113 | "315 0.126000002 0.191 0.066 \n",
1114 | "\n",
1115 | "[316 rows x 9 columns]"
1116 | ]
1117 | },
1118 | "execution_count": 6,
1119 | "metadata": {},
1120 | "output_type": "execute_result"
1121 | }
1122 | ],
1123 | "source": [
1124 | "# grab everything from 00 & 02, area & smoothness from 01\n",
1125 | "query = '''\n",
1126 | " SELECT \n",
1127 | " a.*, \n",
1128 | " b.area, b.smoothness, \n",
1129 | " c.* \n",
1130 | " FROM \n",
1131 | " data_00 AS a\n",
1132 | " LEFT JOIN \n",
1133 | " data_01 AS b\n",
1134 | " ON (a.perimeter = b.perimeter)\n",
1135 | " LEFT JOIN \n",
1136 | " data_02 AS c\n",
1137 | " ON (b.compactness = c.compactness)\n",
1138 | " '''\n",
1139 | "\n",
1140 | "# join the tables together (type(gdf)==cudf.core.dataframe.DataFrame)\n",
1141 | "gdf = bc.sql(query)\n",
1142 | "\n",
1143 | "# display result\n",
1144 | "gdf"
1145 | ]
1146 | },
1147 | {
1148 | "cell_type": "markdown",
1149 | "metadata": {
1150 | "colab_type": "text",
1151 | "id": "wygAeTIFTm2X"
1152 | },
1153 | "source": [
1154 | "# You're Ready to Rock\n",
1155 | "And... that's it! You are now live with BlazingSQL.\n",
1156 | "\n",
1157 | "Check out our [docs](https://docs.blazingdb.com) to get fancy or to learn more about how BlazingSQL works with the rest of [RAPIDS AI](https://rapids.ai/)."
1158 | ]
1159 | }
1160 | ],
1161 | "metadata": {
1162 | "accelerator": "GPU",
1163 | "colab": {
1164 | "collapsed_sections": [
1165 | "McVBO7GHRDzz"
1166 | ],
1167 | "name": "BlazingSQL_Federated_Query_Demo.ipynb",
1168 | "provenance": [],
1169 | "toc_visible": true
1170 | },
1171 | "kernelspec": {
1172 | "display_name": "Python 3",
1173 | "language": "python",
1174 | "name": "python3"
1175 | },
1176 | "language_info": {
1177 | "codemirror_mode": {
1178 | "name": "ipython",
1179 | "version": 3
1180 | },
1181 | "file_extension": ".py",
1182 | "mimetype": "text/x-python",
1183 | "name": "python",
1184 | "nbconvert_exporter": "python",
1185 | "pygments_lexer": "ipython3",
1186 | "version": "3.7.3"
1187 | }
1188 | },
1189 | "nbformat": 4,
1190 | "nbformat_minor": 4
1191 | }
1192 |
--------------------------------------------------------------------------------
/graphistry_netflow_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "kJyD4oSbugE0"
8 | },
9 | "source": [
10 | "# Graphistry Netflow Demo\n",
11 | "\n",
12 | "In this example we are taking millions of rows of netflow (network traffic flow) data in order to search for anomalous activity within a network. We will query 70M+ rows of network security data (netflow) with BlazingSQL and pass it to Graphistry for visualization."
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "## Blazing Context\n",
20 | "Here we are importing cuDF and BlazingContext. You can think of the BlazingContext much like a Spark Context (i.e. where information such as FileSystems you have registered and Tables you have created will be stored). If you have issues running this cell, restart runtime and try running it again."
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 12,
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stdout",
30 | "output_type": "stream",
31 | "text": [
32 | "Already connected to the Orchestrator\n",
33 | "BlazingContext ready\n"
34 | ]
35 | }
36 | ],
37 | "source": [
38 | "from blazingsql import BlazingContext \n",
39 | "\n",
40 | "bc = BlazingContext()"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {
46 | "colab_type": "text",
47 | "id": "yp7z8bfivbna"
48 | },
49 | "source": [
50 | "### Create & Query Tables\n",
51 | "In this next cell we identify the full path to the data."
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 13,
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "data": {
61 | "text/plain": [
62 | "'/home/winston/bsql-demos/data/*_0.parquet'"
63 | ]
64 | },
65 | "execution_count": 13,
66 | "metadata": {},
67 | "output_type": "execute_result"
68 | }
69 | ],
70 | "source": [
71 | "# identify working directory path\n",
72 | "local_path = !pwd\n",
73 | "\n",
74 | "# make wildcard path to load all 4 parquet files into blazingsql\n",
75 | "path = str(local_path) + '/data/*_0.parquet'\n",
76 | "\n",
77 | "# what's the path? \n",
78 | "path"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "#### Create\n",
86 | "Here we use the path identified above to load all 4 parquet files into a single BlazingSQL table. This is done by using a wildcard (*) in the file path. \n",
87 | "\n",
88 | "Note: point path to `data/small-chunk2.csv` for pre-downloaded data."
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 31,
94 | "metadata": {
95 | "colab": {},
96 | "colab_type": "code",
97 | "id": "lU-2wlwQntnq"
98 | },
99 | "outputs": [
100 | {
101 | "name": "stdout",
102 | "output_type": "stream",
103 | "text": [
104 | "CPU times: user 4.16 ms, sys: 4.18 ms, total: 8.35 ms\n",
105 | "Wall time: 298 ms\n"
106 | ]
107 | },
108 | {
109 | "data": {
110 | "text/plain": [
111 | ""
112 | ]
113 | },
114 | "execution_count": 31,
115 | "metadata": {},
116 | "output_type": "execute_result"
117 | }
118 | ],
119 | "source": [
120 | "%%time\n",
121 | "# blazingsql table from gpu dataframe\n",
122 | "bc.create_table('netflow', path)"
123 | ]
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "metadata": {
128 | "colab_type": "text",
129 | "id": "cgivbut9df-R"
130 | },
131 | "source": [
132 | "#### Query\n",
133 | "With the table made, we can simply run a SQL query.\n",
134 | "\n",
135 | "We are going to run some joins and aggregations in order to condense these millions of rows into thousands of rows that represent nodes and edges."
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 32,
141 | "metadata": {
142 | "colab": {
143 | "base_uri": "https://localhost:8080/",
144 | "height": 277
145 | },
146 | "colab_type": "code",
147 | "id": "umBG2Tp0wbQx",
148 | "outputId": "b89e3666-f85a-40e9-e7c4-cda9a80b7fe5"
149 | },
150 | "outputs": [
151 | {
152 | "name": "stdout",
153 | "output_type": "stream",
154 | "text": [
155 | "CPU times: user 29.3 ms, sys: 41.9 ms, total: 71.3 ms\n",
156 | "Wall time: 4.51 s\n"
157 | ]
158 | }
159 | ],
160 | "source": [
161 | "%%time\n",
162 | "# what are we looking for \n",
163 | "query = '''\n",
164 | " SELECT\n",
165 | " a.firstSeenSrcIp as source,\n",
166 | " a.firstSeenDestIp as destination,\n",
167 | " count(a.firstSeenDestPort) as targetPorts,\n",
168 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n",
169 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n",
170 | " SUM(a.durationSeconds) as durationSeconds,\n",
171 | " MIN(parsedDate) as firstFlowDate,\n",
172 | " MAX(parsedDate) as lastFlowDate,\n",
173 | " COUNT(*) as attemptCount\n",
174 | " FROM\n",
175 | " netflow a\n",
176 | " GROUP BY\n",
177 | " a.firstSeenSrcIp,\n",
178 | " a.firstSeenDestIp\n",
179 | " '''\n",
180 | "\n",
181 | "# run sql query (returns cuDF DataFrame)\n",
182 | "gdf = bc.sql(query)\n",
183 | "\n",
184 | "# how do the results look?\n",
185 | "gdf.head(25)"
186 | ]
187 | }
188 | ],
189 | "metadata": {
190 | "file_extension": ".py",
191 | "kernelspec": {
192 | "display_name": "Python 3",
193 | "language": "python",
194 | "name": "python3"
195 | },
196 | "language_info": {
197 | "codemirror_mode": {
198 | "name": "ipython",
199 | "version": 3
200 | },
201 | "file_extension": ".py",
202 | "mimetype": "text/x-python",
203 | "name": "python",
204 | "nbconvert_exporter": "python",
205 | "pygments_lexer": "ipython3",
206 | "version": "3.7.3"
207 | },
208 | "mimetype": "text/x-python",
209 | "name": "python",
210 | "nbconvert_exporter": "python",
211 | "pygments_lexer": "ipython3",
212 | "version": 3
213 | },
214 | "nbformat": 4,
215 | "nbformat_minor": 4
216 | }
217 |
--------------------------------------------------------------------------------
/imgs/bsql_main.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BlazingDB/bsql-demos/ebee8a606a272f3e2ab7a38587a6092fe2018d93/imgs/bsql_main.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | blazingsql>=0.11
2 | cudf>=0.11
3 | cuml>=0.11
--------------------------------------------------------------------------------
/sample_use_cases/csv_to_parquet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# CSV to Parquet"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "In this demo we'll walk through querying a CSV file from an AWS S3 bucket and saving the results locally as a Parquet file."
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "Identify the Dask Client (`client`) of your local GPUs, and pass it to BlazingContext (`bc`) upon initialization to activate distributed query execution with BlazingSQL."
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 1,
27 | "metadata": {},
28 | "outputs": [
29 | {
30 | "name": "stdout",
31 | "output_type": "stream",
32 | "text": [
33 | "BlazingContext ready\n"
34 | ]
35 | }
36 | ],
37 | "source": [
38 | "from dask_cuda import LocalCUDACluster\n",
39 | "cluster = LocalCUDACluster()\n",
40 | "\n",
41 | "from dask.distributed import Client\n",
42 | "client = Client(cluster)\n",
43 | "\n",
44 | "from blazingsql import BlazingContext\n",
45 | "bc = BlazingContext(dask_client=client, network_interface='lo')"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "Register a public AWS S3 bucket and create a table (`taxi`) from it."
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 2,
58 | "metadata": {},
59 | "outputs": [
60 | {
61 | "data": {
62 | "text/plain": [
63 | ""
64 | ]
65 | },
66 | "execution_count": 2,
67 | "metadata": {},
68 | "output_type": "execute_result"
69 | }
70 | ],
71 | "source": [
72 | "bc.s3('blazingsql-colab', bucket_name='blazingsql-colab')\n",
73 | "\n",
74 | "col_names = ['key', 'fare', 'pickup_x', 'pickup_y', 'dropoff_x', 'dropoff_y', 'passenger_count']\n",
75 | "bc.create_table('taxi', 's3://blazingsql-colab/taxi_data/taxi_00.csv', names=col_names)"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "Tag the file path to the local directory where results will be saved as `data_dir`."
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 3,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "from os import getcwd\n",
92 | "data_dir = getcwd().replace('/sample_use_cases', '/data')"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "\n",
100 | "\n",
101 | "As BlazingSQL returns a distributed query's results as a dask_cudf.DataFrame, we can write those results directly with [.to_parquet()](https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.to_parquet)."
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 4,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "bc.sql('SELECT * FROM taxi').to_parquet(f'{data_dir}/yellow_cab')"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "Create a table from that newly written file, and run a simple query to see how it looks by `.compute()`ing to a cudf.DataFrame for display."
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 5,
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "data": {
127 | "text/html": [
128 | "\n",
129 | "\n",
142 | "
\n",
143 | " \n",
144 | " \n",
145 | " | \n",
146 | " key | \n",
147 | " fare | \n",
148 | " pickup_x | \n",
149 | " pickup_y | \n",
150 | " dropoff_x | \n",
151 | " dropoff_y | \n",
152 | " passenger_count | \n",
153 | " index | \n",
154 | "
\n",
155 | " \n",
156 | " \n",
157 | " \n",
158 | " 0 | \n",
159 | " 2012-02-02 22:30:19.0000002 | \n",
160 | " 8.9 | \n",
161 | " -73.988703 | \n",
162 | " 40.758803 | \n",
163 | " -73.986517 | \n",
164 | " 40.737205 | \n",
165 | " 1 | \n",
166 | " 0 | \n",
167 | "
\n",
168 | " \n",
169 | " 1 | \n",
170 | " 2014-09-20 07:19:24.0000001 | \n",
171 | " 4.0 | \n",
172 | " -73.990208 | \n",
173 | " 40.746703 | \n",
174 | " -73.994729 | \n",
175 | " 40.750512 | \n",
176 | " 1 | \n",
177 | " 1 | \n",
178 | "
\n",
179 | " \n",
180 | " 2 | \n",
181 | " 2013-02-23 07:18:05.0000001 | \n",
182 | " 5.5 | \n",
183 | " -74.016757 | \n",
184 | " 40.709438 | \n",
185 | " -74.009 | \n",
186 | " 40.719496 | \n",
187 | " 3 | \n",
188 | " 2 | \n",
189 | "
\n",
190 | " \n",
191 | " 3 | \n",
192 | " 2015-04-18 23:49:27.0000009 | \n",
193 | " 13.5 | \n",
194 | " -74.002708 | \n",
195 | " 40.733730 | \n",
196 | " -73.98609924 | \n",
197 | " 40.73477554 | \n",
198 | " 1 | \n",
199 | " 3 | \n",
200 | "
\n",
201 | " \n",
202 | " 4 | \n",
203 | " 2010-03-04 08:15:59.0000001 | \n",
204 | " 10.5 | \n",
205 | " -73.988356 | \n",
206 | " 40.737665 | \n",
207 | " -74.012459 | \n",
208 | " 40.713934 | \n",
209 | " 1 | \n",
210 | " 4 | \n",
211 | "
\n",
212 | " \n",
213 | " ... | \n",
214 | " ... | \n",
215 | " ... | \n",
216 | " ... | \n",
217 | " ... | \n",
218 | " ... | \n",
219 | " ... | \n",
220 | " ... | \n",
221 | " ... | \n",
222 | "
\n",
223 | " \n",
224 | " 4999995 | \n",
225 | " 2011-02-24 16:06:26.0000001 | \n",
226 | " 6.9 | \n",
227 | " -73.966542 | \n",
228 | " 40.804975 | \n",
229 | " -73.949043 | \n",
230 | " 40.804227 | \n",
231 | " 2 | \n",
232 | " 4999995 | \n",
233 | "
\n",
234 | " \n",
235 | " 4999996 | \n",
236 | " 2009-09-22 19:20:22.0000009 | \n",
237 | " 9.7 | \n",
238 | " -73.980055 | \n",
239 | " 40.752535 | \n",
240 | " -74.006443 | \n",
241 | " 40.739613 | \n",
242 | " 1 | \n",
243 | " 4999996 | \n",
244 | "
\n",
245 | " \n",
246 | " 4999997 | \n",
247 | " 2012-04-19 02:17:32.0000001 | \n",
248 | " 14.1 | \n",
249 | " -73.998508 | \n",
250 | " 40.745305 | \n",
251 | " -73.953184 | \n",
252 | " 40.799361 | \n",
253 | " 2 | \n",
254 | " 4999997 | \n",
255 | "
\n",
256 | " \n",
257 | " 4999998 | \n",
258 | " 2012-06-08 11:09:47.0000006 | \n",
259 | " 3.3 | \n",
260 | " -73.953630 | \n",
261 | " 40.778797 | \n",
262 | " -73.946068 | \n",
263 | " 40.775552 | \n",
264 | " 1 | \n",
265 | " 4999998 | \n",
266 | "
\n",
267 | " \n",
268 | " 4999999 | \n",
269 | " 2009-06-21 11:07:00.00000036 | \n",
270 | " 6.5 | \n",
271 | " -73.981578 | \n",
272 | " 40.772575 | \n",
273 | " -73.963333 | \n",
274 | " 40.762132 | \n",
275 | " 1 | \n",
276 | " 4999999 | \n",
277 | "
\n",
278 | " \n",
279 | "
\n",
280 | "
5000000 rows × 8 columns
\n",
281 | "
"
282 | ],
283 | "text/plain": [
284 | " key fare pickup_x pickup_y \\\n",
285 | "0 2012-02-02 22:30:19.0000002 8.9 -73.988703 40.758803 \n",
286 | "1 2014-09-20 07:19:24.0000001 4.0 -73.990208 40.746703 \n",
287 | "2 2013-02-23 07:18:05.0000001 5.5 -74.016757 40.709438 \n",
288 | "3 2015-04-18 23:49:27.0000009 13.5 -74.002708 40.733730 \n",
289 | "4 2010-03-04 08:15:59.0000001 10.5 -73.988356 40.737665 \n",
290 | "... ... ... ... ... \n",
291 | "4999995 2011-02-24 16:06:26.0000001 6.9 -73.966542 40.804975 \n",
292 | "4999996 2009-09-22 19:20:22.0000009 9.7 -73.980055 40.752535 \n",
293 | "4999997 2012-04-19 02:17:32.0000001 14.1 -73.998508 40.745305 \n",
294 | "4999998 2012-06-08 11:09:47.0000006 3.3 -73.953630 40.778797 \n",
295 | "4999999 2009-06-21 11:07:00.00000036 6.5 -73.981578 40.772575 \n",
296 | "\n",
297 | " dropoff_x dropoff_y passenger_count index \n",
298 | "0 -73.986517 40.737205 1 0 \n",
299 | "1 -73.994729 40.750512 1 1 \n",
300 | "2 -74.009 40.719496 3 2 \n",
301 | "3 -73.98609924 40.73477554 1 3 \n",
302 | "4 -74.012459 40.713934 1 4 \n",
303 | "... ... ... ... ... \n",
304 | "4999995 -73.949043 40.804227 2 4999995 \n",
305 | "4999996 -74.006443 40.739613 1 4999996 \n",
306 | "4999997 -73.953184 40.799361 2 4999997 \n",
307 | "4999998 -73.946068 40.775552 1 4999998 \n",
308 | "4999999 -73.963333 40.762132 1 4999999 \n",
309 | "\n",
310 | "[5000000 rows x 8 columns]"
311 | ]
312 | },
313 | "execution_count": 5,
314 | "metadata": {},
315 | "output_type": "execute_result"
316 | }
317 | ],
318 | "source": [
319 | "bc.create_table('parquet_taxi', f'{data_dir}/yellow_cab/part.0.parquet')\n",
320 | "\n",
321 | "bc.sql('select * from parquet_taxi').compute()"
322 | ]
323 | },
324 | {
325 | "cell_type": "markdown",
326 | "metadata": {},
327 | "source": [
328 | "You can find the Python script version of this Notebook at [/python_scripts/csv_to_parquet.py](python_scripts/csv_to_parquet.py)."
329 | ]
330 | }
331 | ],
332 | "metadata": {
333 | "kernelspec": {
334 | "display_name": "RAPIDS Nightly",
335 | "language": "python",
336 | "name": "python3"
337 | },
338 | "language_info": {
339 | "codemirror_mode": {
340 | "name": "ipython",
341 | "version": 3
342 | },
343 | "file_extension": ".py",
344 | "mimetype": "text/x-python",
345 | "name": "python",
346 | "nbconvert_exporter": "python",
347 | "pygments_lexer": "ipython3",
348 | "version": "3.7.6"
349 | }
350 | },
351 | "nbformat": 4,
352 | "nbformat_minor": 4
353 | }
354 |
--------------------------------------------------------------------------------
/sample_use_cases/python_scripts/csv_to_parquet.py:
--------------------------------------------------------------------------------
from dask.distributed import Client
from blazingsql import BlazingContext
from dask_cuda import LocalCUDACluster

# Convert the public taxi CSV data on S3 into local Parquet files.
#
# Steps:
#   1. spin up a local Dask-CUDA cluster (one worker per GPU) and hand its
#      client to BlazingContext so query execution is distributed
#   2. register the public S3 bucket holding the source CSV
#   3. create a table over the CSV, naming its columns explicitly
#   4. SELECT everything and write the result out as Parquet

# initialize BlazingContext with the Dask Client of local GPUs to distribute query execution
bc = BlazingContext(dask_client=Client(LocalCUDACluster()), network_interface='lo')

# register public AWS S3 bucket
bc.s3('blazingsql-colab', bucket_name='blazingsql-colab')

# create a table from that S3 bucket; the CSV has no header row, so column
# names are supplied here
col_names = ['key', 'fare', 'pickup_x', 'pickup_y', 'dropoff_x', 'dropoff_y', 'passenger_count']
bc.create_table('taxi', 's3://blazingsql-colab/taxi_data/taxi_00.csv', names=col_names)

# query the table & write results locally as parquet
# (plain string: the original used an f-string with no placeholders)
bc.sql('SELECT * FROM taxi').to_parquet('../../data/yellow_cab')
17 |
--------------------------------------------------------------------------------
/taxi_fare_prediction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "l4fOFMjbRvkZ"
8 | },
9 | "source": [
10 | "# BlazingSQL + cuML NYC Taxi Cab Fare Prediction\n",
11 | "\n",
12 | "This demo uses publicly available [NYC Taxi Cab Data](https://www.kaggle.com/c/new-york-city-taxi-fare-prediction) to predict the total fare of a taxi ride in New York City given the pickup and dropoff locations. \n",
13 | "\n",
14 | "In this notebook, we will cover: \n",
15 | "- How to read and query multiple CSV files with BlazingSQL.\n",
16 | "- How to implement a linear regression model with cuML.\n",
17 | "\n",
18 | "### Imports\n",
19 | "This next cell will import all packages you need to run this notebook end-to-end."
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "import os\n",
29 | "import urllib\n",
30 | "from cuml import LinearRegression\n",
31 | "from blazingsql import BlazingContext"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "metadata": {},
37 | "source": [
38 | "## Create BlazingContext\n",
39 | "You can think of the BlazingContext much like a Spark Context (i.e. where information such as FileSystems you have registered and Tables you have created will be stored). If you have issues running this cell, restart runtime and try running it again."
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 2,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "name": "stdout",
49 | "output_type": "stream",
50 | "text": [
51 | "BlazingContext ready\n"
52 | ]
53 | }
54 | ],
55 | "source": [
56 | "# connect to BlazingSQL\n",
57 | "bc = BlazingContext()"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {
63 | "colab_type": "text",
64 | "id": "Gt0TPBqif50q"
65 | },
66 | "source": [
67 | "### Download Data\n",
68 | "For this demo we will train our model with 25,000,000 rows of data from 5 CSV files (5M rows each).\n",
69 | "\n",
70 | "The cell below will check if you already have them, and, if you don't, will download them from AWS for you. "
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 3,
76 | "metadata": {},
77 | "outputs": [
78 | {
79 | "name": "stdout",
80 | "output_type": "stream",
81 | "text": [
82 | "Downloading https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_00.csv to data/taxi_00.csv\n",
83 | "Downloading https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_01.csv to data/taxi_01.csv\n",
84 | "Downloading https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_02.csv to data/taxi_02.csv\n",
85 | "Downloading https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_03.csv to data/taxi_03.csv\n",
86 | "Downloading https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_04.csv to data/taxi_04.csv\n",
87 | "CPU times: user 4.19 s, sys: 5.16 s, total: 9.36 s\n",
88 | "Wall time: 26.8 s\n"
89 | ]
90 | }
91 | ],
92 | "source": [
93 | "%%time\n",
94 | "# download taxi data\n",
95 | "base_url = 'https://blazingsql-colab.s3.amazonaws.com/taxi_data/'\n",
96 | "for i in range(0, 5):\n",
97 | " fn = 'taxi_0' + str(i) + '.csv'\n",
98 | " # check if we already have the file\n",
99 | " if not os.path.isfile('data/' + fn):\n",
100 |     "        # we don't have it; let us know we're downloading it now\n",
101 | " print(f'Downloading {base_url + fn} to data/{fn}')\n",
102 | " # download file\n",
103 | " urllib.request.urlretrieve(base_url + fn, 'data/' + fn)\n",
104 | " # we already have data\n",
105 | " else:\n",
106 | " # let us know\n",
107 | " print(f'data/{fn} already downloaded')"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {
113 | "colab_type": "text",
114 | "id": "PXtydYrimQGt"
115 | },
116 | "source": [
117 | "## Extract, transform, load\n",
118 |     "In order to train our Linear Regression model, we must first perform ETL to prepare our data.\n",
119 | "\n",
120 | "BlazingSQL currently requires the full file path to create tables, the cell below will identify that path for you."
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 4,
126 | "metadata": {},
127 | "outputs": [
128 | {
129 | "data": {
130 | "text/plain": [
131 | "'/home/jupyter-winston/bsql-demos/data/taxi_0*.csv'"
132 | ]
133 | },
134 | "execution_count": 4,
135 | "metadata": {},
136 | "output_type": "execute_result"
137 | }
138 | ],
139 | "source": [
140 | "# identify current working directory\n",
141 | "cwd = os.getcwd()\n",
142 | "# add path to data w/ wildcard (*) so BSQL can read all 5 files at once\n",
143 | "data_path = cwd + '/data/taxi_0*.csv'\n",
144 | "# how's it look?\n",
145 | "data_path"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "### ETL: Create Table \n",
153 | "In this next cell we will create a single BlazingSQL table from all 5 CSVs."
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 5,
159 | "metadata": {
160 | "colab": {},
161 | "colab_type": "code",
162 | "id": "Gr7CUSrsEBmW"
163 | },
164 | "outputs": [
165 | {
166 | "name": "stdout",
167 | "output_type": "stream",
168 | "text": [
169 | "CPU times: user 3.13 ms, sys: 2.44 ms, total: 5.57 ms\n",
170 | "Wall time: 4.66 ms\n"
171 | ]
172 | },
173 | {
174 | "data": {
175 | "text/plain": [
176 | ""
177 | ]
178 | },
179 | "execution_count": 5,
180 | "metadata": {},
181 | "output_type": "execute_result"
182 | }
183 | ],
184 | "source": [
185 | "%%time\n",
186 | "# tag column names and types\n",
187 | "col_names = ['key', 'fare_amount', 'pickup_longitude', 'pickup_latitude', \n",
188 | " 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']\n",
189 | "col_types = ['date64', 'float32', 'float32', 'float32',\n",
190 | " 'float32', 'float32', 'float32']\n",
191 | "\n",
192 | "# create a table from all 5 taxi files at once\n",
193 | "bc.create_table('train_taxi', data_path, names=col_names, dtype=col_types, header=0)"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {
199 | "colab_type": "text",
200 | "id": "XnzjqEFnmDC5"
201 | },
202 | "source": [
203 | "### ETL: Query Tables for Training Data"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 7,
209 | "metadata": {
210 | "colab": {
211 | "base_uri": "https://localhost:8080/",
212 | "height": 425
213 | },
214 | "colab_type": "code",
215 | "id": "_MDxz73ZMhhK",
216 | "outputId": "f2abeafc-0cdf-46b1-ddf5-a5cde3d37792"
217 | },
218 | "outputs": [
219 | {
220 | "data": {
221 | "text/html": [
222 | "\n",
223 | "\n",
236 | "
\n",
237 | " \n",
238 | " \n",
239 | " | \n",
240 | " hours | \n",
241 | " days | \n",
242 | " months | \n",
243 | " years | \n",
244 | " longitude_distance | \n",
245 | " latitude_distance | \n",
246 | " passenger_count | \n",
247 | "
\n",
248 | " \n",
249 | " \n",
250 | " \n",
251 | " 0 | \n",
252 | " 20.0 | \n",
253 | " 10.0 | \n",
254 | " 9.0 | \n",
255 | " 13.0 | \n",
256 | " 0.049057 | \n",
257 | " 0.003063 | \n",
258 | " 1.0 | \n",
259 | "
\n",
260 | " \n",
261 | " 1 | \n",
262 | " 20.0 | \n",
263 | " 22.0 | \n",
264 | " 11.0 | \n",
265 | " 9.0 | \n",
266 | " 0.003464 | \n",
267 | " 0.007088 | \n",
268 | " 1.0 | \n",
269 | "
\n",
270 | " \n",
271 | " 2 | \n",
272 | " 21.0 | \n",
273 | " 4.0 | \n",
274 | " 12.0 | \n",
275 | " 9.0 | \n",
276 | " 0.003151 | \n",
277 | " 0.007584 | \n",
278 | " 1.0 | \n",
279 | "
\n",
280 | " \n",
281 | " 3 | \n",
282 | " 22.0 | \n",
283 | " 6.0 | \n",
284 | " 5.0 | \n",
285 | " 15.0 | \n",
286 | " 0.007141 | \n",
287 | " 0.011543 | \n",
288 | " 1.0 | \n",
289 | "
\n",
290 | " \n",
291 | " 4 | \n",
292 | " 23.0 | \n",
293 | " 27.0 | \n",
294 | " 4.0 | \n",
295 | " 9.0 | \n",
296 | " -0.014870 | \n",
297 | " -0.033161 | \n",
298 | " 1.0 | \n",
299 | "
\n",
300 | " \n",
301 | "
\n",
302 | "
"
303 | ],
304 | "text/plain": [
305 | " hours days months years longitude_distance latitude_distance \\\n",
306 | "0 20.0 10.0 9.0 13.0 0.049057 0.003063 \n",
307 | "1 20.0 22.0 11.0 9.0 0.003464 0.007088 \n",
308 | "2 21.0 4.0 12.0 9.0 0.003151 0.007584 \n",
309 | "3 22.0 6.0 5.0 15.0 0.007141 0.011543 \n",
310 | "4 23.0 27.0 4.0 9.0 -0.014870 -0.033161 \n",
311 | "\n",
312 | " passenger_count \n",
313 | "0 1.0 \n",
314 | "1 1.0 \n",
315 | "2 1.0 \n",
316 | "3 1.0 \n",
317 | "4 1.0 "
318 | ]
319 | },
320 | "execution_count": 7,
321 | "metadata": {},
322 | "output_type": "execute_result"
323 | }
324 | ],
325 | "source": [
326 | "# extract time columns, long & lat, # riders (all floats)\n",
327 | "query = '''\n",
328 | " select \n",
329 | " cast(hour(key) as float) hours, \n",
330 | " cast(dayofmonth(key) as float) days, \n",
331 | " cast(month(key) as float) months, \n",
332 | " cast(year(key) - 2000 as float) years, \n",
333 | " dropoff_longitude - pickup_longitude as longitude_distance, \n",
334 | " dropoff_latitude - pickup_latitude as latitude_distance, \n",
335 | " passenger_count \n",
336 | " from \n",
337 | " train_taxi\n",
338 | " '''\n",
339 | "\n",
340 | "# run query on table (returns cuDF DataFrame)\n",
341 | "X_train = bc.sql(query)\n",
342 | "\n",
343 | "# fill any null values \n",
344 | "X_train['longitude_distance'] = X_train['longitude_distance'].fillna(0)\n",
345 | "X_train['latitude_distance'] = X_train['latitude_distance'].fillna(0)\n",
346 | "X_train['passenger_count'] = X_train['passenger_count'].fillna(0)\n",
347 | "\n",
348 | "# how's it look? \n",
349 | "X_train.head()"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": 8,
355 | "metadata": {},
356 | "outputs": [
357 | {
358 | "data": {
359 | "text/html": [
360 | "\n",
361 | "\n",
374 | "
\n",
375 | " \n",
376 | " \n",
377 | " | \n",
378 | " fare_amount | \n",
379 | "
\n",
380 | " \n",
381 | " \n",
382 | " \n",
383 | " 0 | \n",
384 | " 17.0 | \n",
385 | "
\n",
386 | " \n",
387 | " 1 | \n",
388 | " 3.3 | \n",
389 | "
\n",
390 | " \n",
391 | " 2 | \n",
392 | " 4.1 | \n",
393 | "
\n",
394 | " \n",
395 | " 3 | \n",
396 | " 6.0 | \n",
397 | "
\n",
398 | " \n",
399 | " 4 | \n",
400 | " 8.9 | \n",
401 | "
\n",
402 | " \n",
403 | "
\n",
404 | "
"
405 | ],
406 | "text/plain": [
407 | " fare_amount\n",
408 | "0 17.0\n",
409 | "1 3.3\n",
410 | "2 4.1\n",
411 | "3 6.0\n",
412 | "4 8.9"
413 | ]
414 | },
415 | "execution_count": 8,
416 | "metadata": {},
417 | "output_type": "execute_result"
418 | }
419 | ],
420 | "source": [
421 | "# query dependent variable y\n",
422 | "y_train = bc.sql('SELECT fare_amount FROM train_taxi')\n",
423 | "# how's it look?\n",
424 | "y_train.head()"
425 | ]
426 | },
427 | {
428 | "cell_type": "markdown",
429 | "metadata": {},
430 | "source": [
431 | "## Linear Regression\n",
432 | "To learn more about the cuML's LinearRegression model, check out [Beginner’s Guide to Linear Regression in Google Colab with cuML](https://medium.com/future-vision/beginners-guide-to-linear-regression-in-python-with-cuml-30e2709c761?source=friends_link&sk=1da35920b9e2ffea59d5cb3c998bfeae).\n",
433 | "\n",
434 | "### LR: Train Model"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 9,
440 | "metadata": {
441 | "colab": {
442 | "base_uri": "https://localhost:8080/",
443 | "height": 531
444 | },
445 | "colab_type": "code",
446 | "id": "tVUZvT9TB6Ii",
447 | "outputId": "d61c0249-47ee-40b8-a72f-9d62383f23dd"
448 | },
449 | "outputs": [
450 | {
451 | "name": "stdout",
452 | "output_type": "stream",
453 | "text": [
454 | "Coefficients:\n",
455 | "0 -0.027069\n",
456 | "1 0.003295\n",
457 | "2 0.107198\n",
458 | "3 0.636705\n",
459 | "4 0.000932\n",
460 | "5 -0.000494\n",
461 | "6 0.092028\n",
462 | "dtype: float32\n",
463 | "\n",
464 | "Y intercept:\n",
465 | "3.3608126640319824\n",
466 | "\n",
467 | "CPU times: user 892 ms, sys: 412 ms, total: 1.3 s\n",
468 | "Wall time: 2.25 s\n"
469 | ]
470 | }
471 | ],
472 | "source": [
473 | "%%time\n",
474 | "# call & create cuML model\n",
475 | "lr = LinearRegression(fit_intercept=True, normalize=False, algorithm=\"eig\")\n",
476 | "\n",
477 | "# train Linear Regression model \n",
478 | "reg = lr.fit(X_train, y_train)\n",
479 | "\n",
480 | "# display results\n",
481 | "print(f\"Coefficients:\\n{reg.coef_}\\n\")\n",
482 | "print(f\"Y intercept:\\n{reg.intercept_}\\n\")"
483 | ]
484 | },
485 | {
486 | "cell_type": "markdown",
487 | "metadata": {
488 | "colab_type": "text",
489 | "id": "pHtni9xcl-ht"
490 | },
491 | "source": [
492 | "### LR: Use Model to Predict Future Taxi Fares \n",
493 | "\n",
494 | "#### Download Test Data\n",
495 | "The cell below will check to see if you've already got the Test data, and, if you don't, will download it for you."
496 | ]
497 | },
498 | {
499 | "cell_type": "code",
500 | "execution_count": 10,
501 | "metadata": {},
502 | "outputs": [
503 | {
504 | "name": "stdout",
505 | "output_type": "stream",
506 | "text": [
507 | "--2020-01-23 04:49:37-- https://blazingsql-demos.s3-us-west-1.amazonaws.com/test.csv\n",
508 | "Resolving blazingsql-demos.s3-us-west-1.amazonaws.com (blazingsql-demos.s3-us-west-1.amazonaws.com)... 52.219.116.137\n",
509 | "Connecting to blazingsql-demos.s3-us-west-1.amazonaws.com (blazingsql-demos.s3-us-west-1.amazonaws.com)|52.219.116.137|:443... connected.\n",
510 | "HTTP request sent, awaiting response... 200 OK\n",
511 | "Length: 982916 (960K) [text/csv]\n",
512 | "Saving to: ‘data/test.csv’\n",
513 | "\n",
514 | "test.csv 100%[===================>] 959.88K 2.22MB/s in 0.4s \n",
515 | "\n",
516 | "2020-01-23 04:49:38 (2.22 MB/s) - ‘data/test.csv’ saved [982916/982916]\n",
517 | "\n",
518 | "CPU times: user 8.09 ms, sys: 26.9 ms, total: 35 ms\n",
519 | "Wall time: 902 ms\n"
520 | ]
521 | }
522 | ],
523 | "source": [
524 | "%%time\n",
525 | "# do we have Test taxi file?\n",
526 | "if not os.path.isfile('/data/test.csv'):\n",
527 | " !wget -P data https://blazingsql-demos.s3-us-west-1.amazonaws.com/test.csv\n",
528 | "else:\n",
529 | " print('test data already downloaded')"
530 | ]
531 | },
532 | {
533 | "cell_type": "code",
534 | "execution_count": 11,
535 | "metadata": {
536 | "colab": {},
537 | "colab_type": "code",
538 | "id": "yRM5PosNiuGh"
539 | },
540 | "outputs": [
541 | {
542 | "name": "stdout",
543 | "output_type": "stream",
544 | "text": [
545 | "CPU times: user 1.68 ms, sys: 5.19 ms, total: 6.87 ms\n",
546 | "Wall time: 5.42 ms\n"
547 | ]
548 | },
549 | {
550 | "data": {
551 | "text/plain": [
552 | ""
553 | ]
554 | },
555 | "execution_count": 11,
556 | "metadata": {},
557 | "output_type": "execute_result"
558 | }
559 | ],
560 | "source": [
561 | "%%time\n",
562 | "# set column names and types\n",
563 | "col_names = ['key', 'fare_amount', 'pickup_longitude', 'pickup_latitude', \n",
564 | " 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']\n",
565 | "col_types = ['date64', 'float32', 'float32', 'float32', 'float32', 'float32', 'float32']\n",
566 | "\n",
567 | "# tag path to test data\n",
568 | "test_path = cwd + '/data/test.csv'\n",
569 | "\n",
570 | "# create test table directly from CSV\n",
571 | "bc.create_table('test_taxi', test_path, names=col_names, dtype=col_types)"
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": 12,
577 | "metadata": {
578 | "colab": {},
579 | "colab_type": "code",
580 | "id": "g4I8AJ51dpW5"
581 | },
582 | "outputs": [
583 | {
584 | "name": "stdout",
585 | "output_type": "stream",
586 | "text": [
587 | "CPU times: user 61.8 ms, sys: 1.41 ms, total: 63.2 ms\n",
588 | "Wall time: 36.9 ms\n"
589 | ]
590 | },
591 | {
592 | "data": {
593 | "text/html": [
594 | "\n",
595 | "\n",
608 | "
\n",
609 | " \n",
610 | " \n",
611 | " | \n",
612 | " hours | \n",
613 | " days | \n",
614 | " months | \n",
615 | " years | \n",
616 | " longitude_distance | \n",
617 | " latitude_distance | \n",
618 | " passenger_count | \n",
619 | "
\n",
620 | " \n",
621 | " \n",
622 | " \n",
623 | " 0 | \n",
624 | " 13.0 | \n",
625 | " 27.0 | \n",
626 | " 1.0 | \n",
627 | " 15.0 | \n",
628 | " -0.008110 | \n",
629 | " -0.019970 | \n",
630 | " 1.0 | \n",
631 | "
\n",
632 | " \n",
633 | " 1 | \n",
634 | " 13.0 | \n",
635 | " 27.0 | \n",
636 | " 1.0 | \n",
637 | " 15.0 | \n",
638 | " -0.012024 | \n",
639 | " 0.019814 | \n",
640 | " 1.0 | \n",
641 | "
\n",
642 | " \n",
643 | " 2 | \n",
644 | " 11.0 | \n",
645 | " 8.0 | \n",
646 | " 10.0 | \n",
647 | " 11.0 | \n",
648 | " 0.002869 | \n",
649 | " -0.005119 | \n",
650 | " 1.0 | \n",
651 | "
\n",
652 | " \n",
653 | " 3 | \n",
654 | " 21.0 | \n",
655 | " 1.0 | \n",
656 | " 12.0 | \n",
657 | " 12.0 | \n",
658 | " -0.009277 | \n",
659 | " -0.016178 | \n",
660 | " 1.0 | \n",
661 | "
\n",
662 | " \n",
663 | " 4 | \n",
664 | " 21.0 | \n",
665 | " 1.0 | \n",
666 | " 12.0 | \n",
667 | " 12.0 | \n",
668 | " -0.022537 | \n",
669 | " -0.045345 | \n",
670 | " 1.0 | \n",
671 | "
\n",
672 | " \n",
673 | "
\n",
674 | "
"
675 | ],
676 | "text/plain": [
677 | " hours days months years longitude_distance latitude_distance \\\n",
678 | "0 13.0 27.0 1.0 15.0 -0.008110 -0.019970 \n",
679 | "1 13.0 27.0 1.0 15.0 -0.012024 0.019814 \n",
680 | "2 11.0 8.0 10.0 11.0 0.002869 -0.005119 \n",
681 | "3 21.0 1.0 12.0 12.0 -0.009277 -0.016178 \n",
682 | "4 21.0 1.0 12.0 12.0 -0.022537 -0.045345 \n",
683 | "\n",
684 | " passenger_count \n",
685 | "0 1.0 \n",
686 | "1 1.0 \n",
687 | "2 1.0 \n",
688 | "3 1.0 \n",
689 | "4 1.0 "
690 | ]
691 | },
692 | "execution_count": 12,
693 | "metadata": {},
694 | "output_type": "execute_result"
695 | }
696 | ],
697 | "source": [
698 | "%%time\n",
699 | "# extract time columns, long & lat, # riders (all floats)\n",
700 | "query = '''\n",
701 | " select \n",
702 | " cast(hour(key) as float) hours, \n",
703 | " cast(dayofmonth(key) as float) days, \n",
704 | " cast(month(key) as float) months, \n",
705 | " cast(year(key) - 2000 as float) years, \n",
706 | " dropoff_longitude - pickup_longitude as longitude_distance, \n",
707 | " dropoff_latitude - pickup_latitude as latitude_distance, \n",
708 | " passenger_count\n",
709 | " from \n",
710 | " test_taxi\n",
711 | " '''\n",
712 | "\n",
713 | "# run query on table (returns cuDF DataFrame)\n",
714 | "X_test = bc.sql(query)\n",
715 | "\n",
716 | "# fill null values \n",
717 | "X_test['longitude_distance'] = X_test['longitude_distance'].fillna(0)\n",
718 | "X_test['latitude_distance'] = X_test['latitude_distance'].fillna(0)\n",
719 | "X_test['passenger_count'] = X_test['passenger_count'].fillna(0)\n",
720 | "\n",
721 | "# how's it look? \n",
722 | "X_test.head()"
723 | ]
724 | },
725 | {
726 | "cell_type": "code",
727 | "execution_count": 13,
728 | "metadata": {
729 | "colab": {},
730 | "colab_type": "code",
731 | "id": "zCft6P5QkepN"
732 | },
733 | "outputs": [
734 | {
735 | "data": {
736 | "text/plain": [
737 | "0 12.847689\n",
738 | "1 12.847666\n",
739 | "2 11.257179\n",
740 | "3 11.814514\n",
741 | "4 11.814518\n",
742 | "5 11.814510\n",
743 | "6 11.223505\n",
744 | "7 11.223265\n",
745 | "8 11.223516\n",
746 | "9 12.234369\n",
747 | "10 12.234383\n",
748 | "11 12.234411\n",
749 | "12 9.695659\n",
750 | "13 9.695644\n",
751 | "14 11.467134\n",
752 | "15 11.467148\n",
753 | "16 11.460003\n",
754 | "17 11.460035\n",
755 | "18 11.460011\n",
756 | "19 11.460001\n",
757 | "20 13.480091\n",
758 | "21 12.704147\n",
759 | "22 12.704123\n",
760 | "23 12.704136\n",
761 | "24 12.704132\n",
762 | "25 12.704119\n",
763 | "26 12.704292\n",
764 | "27 12.704145\n",
765 | "28 12.704140\n",
766 | "29 12.704115\n",
767 | " ... \n",
768 | "9884 12.641771\n",
769 | "9885 12.641808\n",
770 | "9886 12.641790\n",
771 | "9887 12.641766\n",
772 | "9888 12.641785\n",
773 | "9889 12.641790\n",
774 | "9890 12.641781\n",
775 | "9891 12.641809\n",
776 | "9892 12.641788\n",
777 | "9893 12.641804\n",
778 | "9894 12.641783\n",
779 | "9895 12.641851\n",
780 | "9896 12.641764\n",
781 | "9897 13.446104\n",
782 | "9898 13.204254\n",
783 | "9899 14.129877\n",
784 | "9900 13.363419\n",
785 | "9901 13.627535\n",
786 | "9902 14.162102\n",
787 | "9903 13.824402\n",
788 | "9904 13.664045\n",
789 | "9905 13.252615\n",
790 | "9906 14.129101\n",
791 | "9907 13.444111\n",
792 | "9908 13.710255\n",
793 | "9909 13.707689\n",
794 | "9910 13.150122\n",
795 | "9911 13.413801\n",
796 | "9912 13.645849\n",
797 | "9913 13.251087\n",
798 | "Length: 9914, dtype: float32"
799 | ]
800 | },
801 | "execution_count": 13,
802 | "metadata": {},
803 | "output_type": "execute_result"
804 | }
805 | ],
806 | "source": [
807 | "# predict fares \n",
808 | "predictions = lr.predict(X_test)\n",
809 | "\n",
810 | "# display predictions\n",
811 | "predictions"
812 | ]
813 | },
814 | {
815 | "cell_type": "code",
816 | "execution_count": 14,
817 | "metadata": {
818 | "colab": {},
819 | "colab_type": "code",
820 | "id": "GdjUjJ42l2BI"
821 | },
822 | "outputs": [
823 | {
824 | "data": {
825 | "text/html": [
826 | "\n",
827 | "\n",
840 | "
\n",
841 | " \n",
842 | " \n",
843 | " | \n",
844 | " hours | \n",
845 | " days | \n",
846 | " months | \n",
847 | " years | \n",
848 | " longitude_distance | \n",
849 | " latitude_distance | \n",
850 | " passenger_count | \n",
851 | " predicted_fare | \n",
852 | "
\n",
853 | " \n",
854 | " \n",
855 | " \n",
856 | " 0 | \n",
857 | " 13.0 | \n",
858 | " 27.0 | \n",
859 | " 1.0 | \n",
860 | " 15.0 | \n",
861 | " -0.008110 | \n",
862 | " -0.019970 | \n",
863 | " 1.0 | \n",
864 | " 12.847689 | \n",
865 | "
\n",
866 | " \n",
867 | " 1 | \n",
868 | " 13.0 | \n",
869 | " 27.0 | \n",
870 | " 1.0 | \n",
871 | " 15.0 | \n",
872 | " -0.012024 | \n",
873 | " 0.019814 | \n",
874 | " 1.0 | \n",
875 | " 12.847666 | \n",
876 | "
\n",
877 | " \n",
878 | " 2 | \n",
879 | " 11.0 | \n",
880 | " 8.0 | \n",
881 | " 10.0 | \n",
882 | " 11.0 | \n",
883 | " 0.002869 | \n",
884 | " -0.005119 | \n",
885 | " 1.0 | \n",
886 | " 11.257179 | \n",
887 | "
\n",
888 | " \n",
889 | " 3 | \n",
890 | " 21.0 | \n",
891 | " 1.0 | \n",
892 | " 12.0 | \n",
893 | " 12.0 | \n",
894 | " -0.009277 | \n",
895 | " -0.016178 | \n",
896 | " 1.0 | \n",
897 | " 11.814514 | \n",
898 | "
\n",
899 | " \n",
900 | " 4 | \n",
901 | " 21.0 | \n",
902 | " 1.0 | \n",
903 | " 12.0 | \n",
904 | " 12.0 | \n",
905 | " -0.022537 | \n",
906 | " -0.045345 | \n",
907 | " 1.0 | \n",
908 | " 11.814518 | \n",
909 | "
\n",
910 | " \n",
911 | "
\n",
912 | "
"
913 | ],
914 | "text/plain": [
915 | " hours days months years longitude_distance latitude_distance \\\n",
916 | "0 13.0 27.0 1.0 15.0 -0.008110 -0.019970 \n",
917 | "1 13.0 27.0 1.0 15.0 -0.012024 0.019814 \n",
918 | "2 11.0 8.0 10.0 11.0 0.002869 -0.005119 \n",
919 | "3 21.0 1.0 12.0 12.0 -0.009277 -0.016178 \n",
920 | "4 21.0 1.0 12.0 12.0 -0.022537 -0.045345 \n",
921 | "\n",
922 | " passenger_count predicted_fare \n",
923 | "0 1.0 12.847689 \n",
924 | "1 1.0 12.847666 \n",
925 | "2 1.0 11.257179 \n",
926 | "3 1.0 11.814514 \n",
927 | "4 1.0 11.814518 "
928 | ]
929 | },
930 | "execution_count": 14,
931 | "metadata": {},
932 | "output_type": "execute_result"
933 | }
934 | ],
935 | "source": [
936 | "# add predictions to test dataframe\n",
937 | "X_test['predicted_fare'] = predictions\n",
938 | "\n",
939 | "# how's that look?\n",
940 | "X_test.head()"
941 | ]
942 | }
943 | ],
944 | "metadata": {
945 | "accelerator": "GPU",
946 | "colab": {
947 | "collapsed_sections": [],
948 | "name": "BlazingSQL_cuML_Taxi_Fare_Prediction.ipynb",
949 | "provenance": []
950 | },
951 | "kernelspec": {
952 | "display_name": "Python 3",
953 | "language": "python",
954 | "name": "python3"
955 | },
956 | "language_info": {
957 | "codemirror_mode": {
958 | "name": "ipython",
959 | "version": 3
960 | },
961 | "file_extension": ".py",
962 | "mimetype": "text/x-python",
963 | "name": "python",
964 | "nbconvert_exporter": "python",
965 | "pygments_lexer": "ipython3",
966 | "version": "3.6.7"
967 | }
968 | },
969 | "nbformat": 4,
970 | "nbformat_minor": 4
971 | }
972 |
--------------------------------------------------------------------------------
/utils/env-check.py:
--------------------------------------------------------------------------------
import sys, os

# Make Colab's system site-packages importable and point Numba at the CUDA
# toolkit's NVVM libraries (required by the legacy numbapro environment vars).
sys.path.append('/usr/local/lib/python3.6/site-packages/')
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'

import pynvml

# Query the name of GPU 0 on this instance via NVML.
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
device_name = pynvml.nvmlDeviceGetName(handle)

# Older pynvml releases return bytes, newer ones return str; normalize so the
# comparison below works on either version.
if isinstance(device_name, bytes):
    device_name = device_name.decode('utf-8')

if device_name != 'Tesla T4':
    # Colab assigned a non-T4 GPU (e.g. a K80); abort so the user re-requests.
    raise Exception("""
    Unfortunately Colab didn't give you a T4 GPU.

    Make sure you've configured Colab to request a GPU instance type.

    If you get a K80 GPU, try Runtime -> Reset all runtimes...
    """)
else:
    print('*********************************************')
    print('Woo! Your instance has the right kind of GPU!')
    print('*********************************************')
    print()
26 |
--------------------------------------------------------------------------------
/vs_pyspark_netflow.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "d0hJ4z8rBOFC"
8 | },
9 | "source": [
10 | "# BlazingSQL vs. Apache Spark \n",
11 | "\n",
12 | "Below we have one of our popular workloads running with [BlazingSQL](https://blazingsql.com/), and then with Apache Spark + PySpark.\n",
13 | "\n",
14 | "In this notebook, we will cover: \n",
15 | "- How to read and query csv files with BlazingSQL.\n",
16 | "- How BlazingSQL compares against Apache Spark (analyzing over 20M records)."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "colab_type": "text",
23 | "id": "0guvG6Ws_zmX"
24 | },
25 | "source": [
26 | "## Import packages and create Blazing Context\n",
27 | "You can think of the BlazingContext much like a Spark Context (i.e. information such as FileSystems you have registered and Tables you have created will be stored here). "
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 1,
33 | "metadata": {
34 | "colab": {
35 | "base_uri": "https://localhost:8080/",
36 | "height": 35
37 | },
38 | "colab_type": "code",
39 | "id": "ojm_V-WAtz0f",
40 | "outputId": "a46625f4-1494-4a13-eb13-2f38efd80ccf"
41 | },
42 | "outputs": [
43 | {
44 | "name": "stdout",
45 | "output_type": "stream",
46 | "text": [
47 | "BlazingContext ready\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "from blazingsql import BlazingContext\n",
53 | "# start up BlazingSQL\n",
54 | "bc = BlazingContext()"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {
60 | "colab_type": "text",
61 | "id": "yp7z8bfivbna"
62 | },
63 | "source": [
64 | "### Load & Query Table\n",
65 |     "First, we need to download the netflow data (21,526,138 records) from AWS. If you do not wish to download the full 2.5G file, the first 100,000 rows of data are pre-downloaded at `data/small-chunk2.csv`, simply skip the cell below and change the file path when prompted 2 cells from now."
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 2,
71 | "metadata": {
72 | "colab": {},
73 | "colab_type": "code",
74 | "id": "2dAt6DfG37KH"
75 | },
76 | "outputs": [
77 | {
78 | "name": "stdout",
79 | "output_type": "stream",
80 | "text": [
81 | "--2020-01-20 22:14:17-- https://blazingsql-colab.s3.amazonaws.com/netflow_data/nf-chunk2.csv\n",
82 | "Resolving blazingsql-colab.s3.amazonaws.com (blazingsql-colab.s3.amazonaws.com)... 52.216.112.139\n",
83 | "Connecting to blazingsql-colab.s3.amazonaws.com (blazingsql-colab.s3.amazonaws.com)|52.216.112.139|:443... connected.\n",
84 | "HTTP request sent, awaiting response... 200 OK\n",
85 | "Length: 2725056295 (2.5G) [text/csv]\n",
86 | "Saving to: ‘data/nf-chunk2.csv’\n",
87 | "\n",
88 | "nf-chunk2.csv 100%[===================>] 2.54G 51.8MB/s in 49s \n",
89 | "\n",
90 | "2020-01-20 22:15:06 (53.2 MB/s) - ‘data/nf-chunk2.csv’ saved [2725056295/2725056295]\n",
91 | "\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "# save nf-chunk2 to data folder, may take a few minutes to download\n",
97 | "!wget -P data/ https://blazingsql-colab.s3.amazonaws.com/netflow_data/nf-chunk2.csv "
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {
103 | "colab_type": "text",
104 | "id": "OTEaAsp2_zmf"
105 | },
106 | "source": [
107 | "## BlazingSQL \n",
108 |     "Data in hand, we can test the performance of BlazingSQL on this dataset. \n",
109 | "\n",
110 | "To use pre-downloaded data, change the file path to `data/small-chunk2.csv`."
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 3,
116 | "metadata": {
117 | "colab": {
118 | "base_uri": "https://localhost:8080/",
119 | "height": 52
120 | },
121 | "colab_type": "code",
122 | "id": "rirBsYQU3NH5",
123 | "outputId": "51ced2b1-b930-4173-bbfa-09672e751d3f"
124 | },
125 | "outputs": [
126 | {
127 | "data": {
128 | "text/plain": [
129 | "'/home/winston/bsql-demos/data/nf-chunk2.csv'"
130 | ]
131 | },
132 | "execution_count": 3,
133 | "metadata": {},
134 | "output_type": "execute_result"
135 | }
136 | ],
137 | "source": [
138 | "import os\n",
139 | "# determine current working directory \n",
140 | "cwd = os.getcwd()\n",
141 | "# complete path to data\n",
142 | "path = cwd + '/data/nf-chunk2.csv'\n",
143 | "# what's the path?\n",
144 | "path"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 4,
150 | "metadata": {
151 | "colab": {
152 | "base_uri": "https://localhost:8080/",
153 | "height": 52
154 | },
155 | "colab_type": "code",
156 | "id": "zCzLEFfB3N4k",
157 | "outputId": "10ff9097-2736-423e-969d-de75983fbdda"
158 | },
159 | "outputs": [
160 | {
161 | "name": "stdout",
162 | "output_type": "stream",
163 | "text": [
164 | "CPU times: user 9.9 ms, sys: 13.1 ms, total: 23 ms\n",
165 | "Wall time: 1.14 s\n"
166 | ]
167 | },
168 | {
169 | "data": {
170 | "text/plain": [
171 | ""
172 | ]
173 | },
174 | "execution_count": 4,
175 | "metadata": {},
176 | "output_type": "execute_result"
177 | }
178 | ],
179 | "source": [
180 | "%%time\n",
181 |     "# Create BlazingSQL table directly from the CSV file - There is no copy in this process\n",
182 |     "bc.create_table('netflow', path, header=0)"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 5,
188 | "metadata": {
189 | "colab": {
190 | "base_uri": "https://localhost:8080/",
191 | "height": 295
192 | },
193 | "colab_type": "code",
194 | "id": "umBG2Tp0wbQx",
195 | "outputId": "0975395e-7f5b-4244-afa3-45c8658ce61c"
196 | },
197 | "outputs": [
198 | {
199 | "name": "stdout",
200 | "output_type": "stream",
201 | "text": [
202 | "CPU times: user 5.07 s, sys: 2.61 s, total: 7.67 s\n",
203 | "Wall time: 10.4 s\n"
204 | ]
205 | }
206 | ],
207 | "source": [
208 | "%%time\n",
209 | "# define the query\n",
210 | "query = '''\n",
211 | " SELECT\n",
212 | " a.firstSeenSrcIp as source,\n",
213 | " a.firstSeenDestIp as destination,\n",
214 | " count(a.firstSeenDestPort) as targetPorts,\n",
215 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n",
216 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n",
217 | " SUM(a.durationSeconds) as durationSeconds,\n",
218 | " MIN(parsedDate) as firstFlowDate,\n",
219 | " MAX(parsedDate) as lastFlowDate,\n",
220 | " COUNT(*) as attemptCount\n",
221 | " FROM \n",
222 | " netflow a\n",
223 | " GROUP BY\n",
224 | " a.firstSeenSrcIp,\n",
225 | " a.firstSeenDestIp\n",
226 | " '''\n",
227 | "\n",
228 | "# query the table (returns cuDF DataFrame)\n",
229 | "gdf = bc.sql(query)"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": 6,
235 | "metadata": {
236 | "colab": {},
237 | "colab_type": "code",
238 | "id": "48_W2v8q_zmq",
239 | "outputId": "db0394f1-e082-49b0-c477-e3bba8d3d0f4"
240 | },
241 | "outputs": [
242 | {
243 | "data": {
244 | "text/html": [
245 | "\n",
246 | "\n",
259 | "
\n",
260 | " \n",
261 | " \n",
262 | " | \n",
263 | " source | \n",
264 | " destination | \n",
265 | " targetPorts | \n",
266 | " bytesOut | \n",
267 | " bytesIn | \n",
268 | " durationSeconds | \n",
269 | " firstFlowDate | \n",
270 | " lastFlowDate | \n",
271 | " attemptCount | \n",
272 | "
\n",
273 | " \n",
274 | " \n",
275 | " \n",
276 | " 0 | \n",
277 | " 172.30.2.60 | \n",
278 | " 10.0.0.9 | \n",
279 | " 82 | \n",
280 | " 34839 | \n",
281 | " 47716 | \n",
282 | " 134 | \n",
283 | " 2013-04-03 06:48:47 | \n",
284 | " 2013-04-03 12:12:37 | \n",
285 | " 82 | \n",
286 | "
\n",
287 | " \n",
288 | " 1 | \n",
289 | " 172.10.1.162 | \n",
290 | " 10.0.0.11 | \n",
291 | " 87 | \n",
292 | " 39628 | \n",
293 | " 53983 | \n",
294 | " 24 | \n",
295 | " 2013-04-03 06:50:13 | \n",
296 | " 2013-04-03 14:58:35 | \n",
297 | " 87 | \n",
298 | "
\n",
299 | " \n",
300 | " 2 | \n",
301 | " 10.1.0.76 | \n",
302 | " 172.10.1.82 | \n",
303 | " 1 | \n",
304 | " 633 | \n",
305 | " 392 | \n",
306 | " 0 | \n",
307 | " 2013-04-03 09:55:05 | \n",
308 | " 2013-04-03 09:55:05 | \n",
309 | " 1 | \n",
310 | "
\n",
311 | " \n",
312 | " 3 | \n",
313 | " 172.30.1.56 | \n",
314 | " 172.0.0.1 | \n",
315 | " 25 | \n",
316 | " 3330 | \n",
317 | " 3240 | \n",
318 | " 67 | \n",
319 | " 2013-04-03 01:59:09 | \n",
320 | " 2013-04-03 22:05:39 | \n",
321 | " 25 | \n",
322 | "
\n",
323 | " \n",
324 | " 4 | \n",
325 | " 172.30.1.10 | \n",
326 | " 10.0.0.12 | \n",
327 | " 69 | \n",
328 | " 31042 | \n",
329 | " 43044 | \n",
330 | " 25 | \n",
331 | " 2013-04-03 06:48:01 | \n",
332 | " 2013-04-03 12:11:40 | \n",
333 | " 69 | \n",
334 | "
\n",
335 | " \n",
336 | " 5 | \n",
337 | " 172.10.1.89 | \n",
338 | " 10.0.0.5 | \n",
339 | " 112 | \n",
340 | " 51222 | \n",
341 | " 70260 | \n",
342 | " 24 | \n",
343 | " 2013-04-03 06:48:24 | \n",
344 | " 2013-04-03 15:17:39 | \n",
345 | " 112 | \n",
346 | "
\n",
347 | " \n",
348 | " 6 | \n",
349 | " 172.10.1.234 | \n",
350 | " 10.0.0.5 | \n",
351 | " 104 | \n",
352 | " 47287 | \n",
353 | " 64750 | \n",
354 | " 18 | \n",
355 | " 2013-04-03 06:53:55 | \n",
356 | " 2013-04-03 15:11:07 | \n",
357 | " 104 | \n",
358 | "
\n",
359 | " \n",
360 | " 7 | \n",
361 | " 172.30.2.125 | \n",
362 | " 10.0.0.9 | \n",
363 | " 69 | \n",
364 | " 30701 | \n",
365 | " 41558 | \n",
366 | " 341 | \n",
367 | " 2013-04-03 06:50:50 | \n",
368 | " 2013-04-03 12:12:37 | \n",
369 | " 69 | \n",
370 | "
\n",
371 | " \n",
372 | " 8 | \n",
373 | " 172.30.1.85 | \n",
374 | " 10.0.0.8 | \n",
375 | " 84 | \n",
376 | " 37828 | \n",
377 | " 52864 | \n",
378 | " 3 | \n",
379 | " 2013-04-03 06:48:21 | \n",
380 | " 2013-04-03 12:06:53 | \n",
381 | " 84 | \n",
382 | "
\n",
383 | " \n",
384 | " 9 | \n",
385 | " 10.0.0.9 | \n",
386 | " 172.30.1.124 | \n",
387 | " 1 | \n",
388 | " 632 | \n",
389 | " 391 | \n",
390 | " 0 | \n",
391 | " 2013-04-03 10:36:04 | \n",
392 | " 2013-04-03 10:36:04 | \n",
393 | " 1 | \n",
394 | "
\n",
395 | " \n",
396 | "
\n",
397 | "
"
398 | ],
399 | "text/plain": [
400 | " source destination targetPorts bytesOut bytesIn \\\n",
401 | "0 172.30.2.60 10.0.0.9 82 34839 47716 \n",
402 | "1 172.10.1.162 10.0.0.11 87 39628 53983 \n",
403 | "2 10.1.0.76 172.10.1.82 1 633 392 \n",
404 | "3 172.30.1.56 172.0.0.1 25 3330 3240 \n",
405 | "4 172.30.1.10 10.0.0.12 69 31042 43044 \n",
406 | "5 172.10.1.89 10.0.0.5 112 51222 70260 \n",
407 | "6 172.10.1.234 10.0.0.5 104 47287 64750 \n",
408 | "7 172.30.2.125 10.0.0.9 69 30701 41558 \n",
409 | "8 172.30.1.85 10.0.0.8 84 37828 52864 \n",
410 | "9 10.0.0.9 172.30.1.124 1 632 391 \n",
411 | "\n",
412 | " durationSeconds firstFlowDate lastFlowDate attemptCount \n",
413 | "0 134 2013-04-03 06:48:47 2013-04-03 12:12:37 82 \n",
414 | "1 24 2013-04-03 06:50:13 2013-04-03 14:58:35 87 \n",
415 | "2 0 2013-04-03 09:55:05 2013-04-03 09:55:05 1 \n",
416 | "3 67 2013-04-03 01:59:09 2013-04-03 22:05:39 25 \n",
417 | "4 25 2013-04-03 06:48:01 2013-04-03 12:11:40 69 \n",
418 | "5 24 2013-04-03 06:48:24 2013-04-03 15:17:39 112 \n",
419 | "6 18 2013-04-03 06:53:55 2013-04-03 15:11:07 104 \n",
420 | "7 341 2013-04-03 06:50:50 2013-04-03 12:12:37 69 \n",
421 | "8 3 2013-04-03 06:48:21 2013-04-03 12:06:53 84 \n",
422 | "9 0 2013-04-03 10:36:04 2013-04-03 10:36:04 1 "
423 | ]
424 | },
425 | "execution_count": 6,
426 | "metadata": {},
427 | "output_type": "execute_result"
428 | }
429 | ],
430 | "source": [
431 | "# how's it look?\n",
432 | "gdf.head(10)"
433 | ]
434 | },
435 | {
436 | "cell_type": "markdown",
437 | "metadata": {
438 | "colab_type": "text",
439 | "id": "6PXbjW1hTxrD"
440 | },
441 | "source": [
442 | "## Apache Spark\n",
443 | "The cell below installs Apache Spark ([PySpark](https://spark.apache.org/docs/latest/api/python/index.html))."
444 | ]
445 | },
446 | {
447 | "cell_type": "code",
448 | "execution_count": 7,
449 | "metadata": {
450 | "colab": {},
451 | "colab_type": "code",
452 | "id": "pnEEvVEtT8xi"
453 | },
454 | "outputs": [
455 | {
456 | "name": "stdout",
457 | "output_type": "stream",
458 | "text": [
459 | "Collecting pyspark\n",
460 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/87/21/f05c186f4ddb01d15d0ddc36ef4b7e3cedbeb6412274a41f26b55a650ee5/pyspark-2.4.4.tar.gz (215.7MB)\n",
461 | "\u001b[K |████████████████████████████████| 215.7MB 50kB/s s eta 0:00:01\n",
462 | "\u001b[?25hCollecting py4j==0.10.7\n",
463 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)\n",
464 | "\u001b[K |████████████████████████████████| 204kB 54.4MB/s eta 0:00:01\n",
465 | "\u001b[?25hBuilding wheels for collected packages: pyspark\n",
466 | " Building wheel for pyspark (setup.py) ... \u001b[?25ldone\n",
467 | "\u001b[?25h Created wheel for pyspark: filename=pyspark-2.4.4-py2.py3-none-any.whl size=216130387 sha256=14abaa33edbf681f432ee00d234718731961da639e5eec86c4784667d43b4f5d\n",
468 | " Stored in directory: /home/winston/.cache/pip/wheels/ab/09/4d/0d184230058e654eb1b04467dbc1292f00eaa186544604b471\n",
469 | "Successfully built pyspark\n",
470 | "Installing collected packages: py4j, pyspark\n",
471 | "Successfully installed py4j-0.10.7 pyspark-2.4.4\n"
472 | ]
473 | }
474 | ],
475 | "source": [
476 | "# installs Spark (2.4.4 Jan 2020)\n",
477 | "!pip install pyspark"
478 | ]
479 | },
480 | {
481 | "cell_type": "markdown",
482 | "metadata": {
483 | "colab_type": "text",
484 | "id": "W3-XmZkz_zmw"
485 | },
486 | "source": [
487 | "#### PyBlazing vs PySpark\n",
488 | "With everything installed we can launch a SparkSession and see how BlazingSQL stacks up."
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": 1,
494 | "metadata": {
495 | "colab": {
496 | "base_uri": "https://localhost:8080/",
497 | "height": 51
498 | },
499 | "colab_type": "code",
500 | "id": "nioEt2MqT9B0",
501 | "outputId": "f75b9823-5dbd-45b1-9282-562d3d6ddaf0"
502 | },
503 | "outputs": [
504 | {
505 | "name": "stdout",
506 | "output_type": "stream",
507 | "text": [
508 | "CPU times: user 321 ms, sys: 208 ms, total: 529 ms\n",
509 | "Wall time: 3.65 s\n"
510 | ]
511 | }
512 | ],
513 | "source": [
514 | "%%time\n",
515 | "# copied this cell's snippet from another Google Colab by Luca Canali here: https://colab.research.google.com/github/LucaCanali/sparkMeasure/blob/master/examples/SparkMeasure_Jupyter_Colab_Example.ipynb\n",
516 | "\n",
517 | "from pyspark.sql import SparkSession\n",
518 | "\n",
519 | "# Create Spark Session\n",
520 | "# This example uses a local cluster, you can modify master to use YARN or K8S if available \n",
521 | "# This example downloads sparkMeasure 0.13 for scala 2_11 from maven central\n",
522 | "\n",
523 | "spark = SparkSession \\\n",
524 | " .builder \\\n",
525 | " .master(\"local[*]\") \\\n",
526 | " .appName(\"PySpark Netflow Benchmark code\") \\\n",
527 | " .config(\"spark.jars.packages\",\"ch.cern.sparkmeasure:spark-measure_2.11:0.13\") \\\n",
528 | " .getOrCreate()"
529 | ]
530 | },
531 | {
532 | "cell_type": "markdown",
533 | "metadata": {
534 | "colab_type": "text",
535 | "id": "G8XSppQiUdLY"
536 | },
537 | "source": [
538 | "### Load & Query Table"
539 | ]
540 | },
541 | {
542 | "cell_type": "code",
543 | "execution_count": 2,
544 | "metadata": {
545 | "colab": {
546 | "base_uri": "https://localhost:8080/",
547 | "height": 51
548 | },
549 | "colab_type": "code",
550 | "id": "ZSLuSYSOUDtf",
551 | "outputId": "2b93169b-63c5-4c46-da14-af87645bf51b"
552 | },
553 | "outputs": [
554 | {
555 | "name": "stdout",
556 | "output_type": "stream",
557 | "text": [
558 | "CPU times: user 20.2 ms, sys: 11.3 ms, total: 31.5 ms\n",
559 | "Wall time: 2min 46s\n"
560 | ]
561 | }
562 | ],
563 | "source": [
564 | "%%time\n",
565 | "# load CSV into Spark\n",
566 | "netflow_df = spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('data/nf-chunk2.csv')"
567 | ]
568 | },
569 | {
570 | "cell_type": "code",
571 | "execution_count": 3,
572 | "metadata": {
573 | "colab": {
574 | "base_uri": "https://localhost:8080/",
575 | "height": 51
576 | },
577 | "colab_type": "code",
578 | "id": "iT3BwLn8UDwE",
579 | "outputId": "4eeff800-489f-4230-adb9-f3a1c16ede66"
580 | },
581 | "outputs": [
582 | {
583 | "name": "stdout",
584 | "output_type": "stream",
585 | "text": [
586 | "CPU times: user 1.72 ms, sys: 176 µs, total: 1.9 ms\n",
587 | "Wall time: 157 ms\n"
588 | ]
589 | }
590 | ],
591 | "source": [
592 | "%%time\n",
593 | "# create table for querying\n",
594 | "netflow_df.createOrReplaceTempView('netflow')"
595 | ]
596 | },
597 | {
598 | "cell_type": "code",
599 | "execution_count": 4,
600 | "metadata": {
601 | "colab": {
602 | "base_uri": "https://localhost:8080/",
603 | "height": 493
604 | },
605 | "colab_type": "code",
606 | "id": "9SBhahA5UD2k",
607 | "outputId": "accc1938-6470-44df-ab7f-70058c755b2b"
608 | },
609 | "outputs": [
610 | {
611 | "name": "stdout",
612 | "output_type": "stream",
613 | "text": [
614 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n",
615 | "| source| destination|targetPorts|bytesOut|bytesIn|durationSeconds| firstFlowDate| lastFlowDate|attemptCount|\n",
616 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n",
617 | "| 172.10.1.13|239.255.255.250| 15| 2975| 0| 6|2013-04-03 06:36:19|2013-04-03 06:36:27| 15|\n",
618 | "|172.30.1.204|239.255.255.250| 8| 1750| 0| 6|2013-04-03 06:36:13|2013-04-03 06:36:20| 8|\n",
619 | "| 172.30.2.86| 172.0.0.1| 1| 540| 0| 2|2013-04-03 06:36:09|2013-04-03 06:36:09| 1|\n",
620 | "|172.30.1.246| 172.0.0.1| 29| 2610| 2610| 0|2013-04-03 00:26:46|2013-04-03 23:06:00| 29|\n",
621 | "| 172.30.1.51|239.255.255.250| 16| 3850| 0| 18|2013-04-03 06:35:22|2013-04-03 06:44:08| 16|\n",
622 | "| 172.10.1.35| 172.0.0.1| 1| 270| 0| 0|2013-04-03 06:36:21|2013-04-03 06:36:21| 1|\n",
623 | "| 172.20.1.91|239.255.255.250| 19| 3675| 0| 6|2013-04-03 06:36:50|2013-04-03 06:36:59| 19|\n",
624 | "|172.20.1.249|239.255.255.250| 2| 700| 0| 6|2013-04-03 06:37:17|2013-04-03 06:37:23| 2|\n",
625 | "|172.10.1.232| 172.0.0.1| 30| 3060| 3060| 48|2013-04-03 01:31:31|2013-04-03 22:53:36| 30|\n",
626 | "|172.10.1.238|239.255.255.250| 2| 700| 0| 6|2013-04-03 06:36:44|2013-04-03 06:36:51| 2|\n",
627 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n",
628 | "only showing top 10 rows\n",
629 | "\n",
630 | "CPU times: user 4.39 ms, sys: 8.82 ms, total: 13.2 ms\n",
631 | "Wall time: 1min 9s\n"
632 | ]
633 | }
634 | ],
635 | "source": [
636 | "%%time\n",
637 |     "# define the same query we tested on BlazingSQL above\n",
638 | "query = '''\n",
639 | " SELECT\n",
640 | " a.firstSeenSrcIp as source,\n",
641 | " a.firstSeenDestIp as destination,\n",
642 | " count(a.firstSeenDestPort) as targetPorts,\n",
643 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n",
644 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n",
645 | " SUM(a.durationSeconds) as durationSeconds,\n",
646 | " MIN(parsedDate) as firstFlowDate,\n",
647 | " MAX(parsedDate) as lastFlowDate,\n",
648 | " COUNT(*) as attemptCount\n",
649 | " FROM\n",
650 | " netflow a\n",
651 | " GROUP BY\n",
652 | " a.firstSeenSrcIp,\n",
653 | " a.firstSeenDestIp\n",
654 | " '''\n",
655 | "\n",
656 | "# query with Spark\n",
657 | "edges_df = spark.sql(query)\n",
658 | "\n",
659 |     "# display the first 10 result rows\n",
660 | "edges_df.show(10)"
661 | ]
662 | }
663 | ],
664 | "metadata": {
665 | "accelerator": "GPU",
666 | "colab": {
667 | "collapsed_sections": [],
668 | "name": "vs_pyspark_netflow.ipynb",
669 | "provenance": [],
670 | "toc_visible": true
671 | },
672 | "kernelspec": {
673 | "display_name": "Python 3",
674 | "language": "python",
675 | "name": "python3"
676 | },
677 | "language_info": {
678 | "codemirror_mode": {
679 | "name": "ipython",
680 | "version": 3
681 | },
682 | "file_extension": ".py",
683 | "mimetype": "text/x-python",
684 | "name": "python",
685 | "nbconvert_exporter": "python",
686 | "pygments_lexer": "ipython3",
687 | "version": "3.7.6"
688 | }
689 | },
690 | "nbformat": 4,
691 | "nbformat_minor": 4
692 | }
693 |
--------------------------------------------------------------------------------