├── .gitignore
├── LICENSE
├── README.md
├── blazingsql_demo.ipynb
├── colab_notebooks
├── blazingsql_demo.ipynb
├── federated_query_demo.ipynb
├── graphistry_netflow_demo.ipynb
└── vs_pyspark_netflow.ipynb
├── data
├── Music.csv
├── cancer_data_00.csv
├── cancer_data_01.parquet
├── cancer_data_02.csv
└── small-chunk2.csv
├── federated_query_demo.ipynb
├── graphistry_netflow_demo.ipynb
├── imgs
└── bsql_main.png
├── requirements.txt
├── sample_use_cases
├── csv_to_parquet.ipynb
└── python_scripts
│ └── csv_to_parquet.py
├── taxi_fare_prediction.ipynb
├── utils
├── blazing_conda_test.ipynb
└── env-check.py
└── vs_pyspark_netflow.ipynb
/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | *.log
3 | *.csv
4 | *.parquet
5 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # BlazingSQL Demos
2 | Demo Python notebooks using BlazingSQL with the RAPIDS AI ecosystem.
3 |
4 | | Notebook Title | Description |Launch in Colab|
5 | |----------------|----------------|----------------|
6 | | Getting Started | How to set up and get started with BlazingSQL and the RAPIDS AI suite |[](https://colab.research.google.com/github/BlazingDB/bsql-demos/blob/master/colab_notebooks/blazingsql_demo.ipynb)|
7 | | Netflow | Query 65M rows of network security data (netflow) with BlazingSQL and then pass to Graphistry to visualize and interact with the data |[](https://colab.research.google.com/github/BlazingDB/bsql-demos/blob/master/colab_notebooks/graphistry_netflow_demo.ipynb)|
8 | | Taxi | Train a linear regression model with cuML on 55 million rows of public NYC Taxi Data loaded with BlazingSQL |[](https://colab.research.google.com/github/BlazingDB/bsql-demos/blob/master/taxi_fare_prediction.ipynb)|
9 | | BlazingSQL vs. Apache Spark | Analyze 20 million rows of net flow data. Compare BlazingSQL and Apache Spark timings for the same workload |[](https://colab.research.google.com/github/BlazingDB/bsql-demos/blob/master/colab_notebooks/vs_pyspark_netflow.ipynb)|
10 | | Federated Query | In a single query, join an Apache Parquet file, a CSV file, and a GPU DataFrame (GDF) in GPU memory. |[](https://colab.research.google.com/github/BlazingDB/bsql-demos/blob/master/colab_notebooks/federated_query_demo.ipynb)|
--------------------------------------------------------------------------------
/blazingsql_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "8AdUt3HiUrc3"
8 | },
9 | "source": [
10 | "# Getting Started with BlazingSQL\n",
11 | "\n",
12 | "[BlazingSQL](https://github.com/BlazingDB/blazingsql) provides an open-source SQL interface to ETL massive datasets directly into GPU memory and the [RAPIDS.ai](https://github.com/rapidsai) Ecosystem. \n",
13 | "\n",
14 | "In this notebook, we will cover how to query cuDF (GPU) DataFrames with BlazingSQL. \n",
15 | "\n",
16 | "To learn more about the GPU DataFrame and how it enables end-to-end workloads on RAPIDS, check out our [blog post](https://blog.blazingdb.com/blazingsql-part-1-the-gpu-dataframe-gdf-and-cudf-in-rapids-ai-96ec15102240).\n",
17 | "\n",
18 | "## Imports"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 1,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "import cudf\n",
28 | "from blazingsql import BlazingContext"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {
34 | "colab_type": "text",
35 | "id": "aMwNKxePSwOp"
36 | },
37 | "source": [
38 | "## Connect to BlazingSQL - Create BlazingContext\n",
39 | "You can think of the BlazingContext much like a SparkContext; this is where information such as FileSystems you have registered and Tables you have created will be stored."
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 2,
45 | "metadata": {
46 | "colab": {
47 | "base_uri": "https://localhost:8080/",
48 | "height": 35
49 | },
50 | "colab_type": "code",
51 | "id": "ZR_vWwtMcvvY",
52 | "outputId": "c78cc40a-f7d8-4ac5-c255-d99edd03b785"
53 | },
54 | "outputs": [
55 | {
56 | "name": "stdout",
57 | "output_type": "stream",
58 | "text": [
59 | "BlazingContext ready\n"
60 | ]
61 | }
62 | ],
63 | "source": [
64 | "bc = BlazingContext()"
65 | ]
66 | },
67 | {
68 | "cell_type": "markdown",
69 | "metadata": {
70 | "colab_type": "text",
71 | "id": "N2bqpDEnZyQf"
72 | },
73 | "source": [
74 | "## cuDF -> BSQL\n",
75 |         "In the next few cells, we'll generate a cuDF DataFrame and create a BlazingSQL table from it. "
76 | ]
77 | },
78 | {
79 | "cell_type": "code",
80 | "execution_count": 3,
81 | "metadata": {},
82 | "outputs": [
83 | {
84 | "data": {
85 | "text/html": [
86 | "
\n",
87 | "\n",
100 | "
\n",
101 | " \n",
102 | " \n",
103 | " | \n",
104 | " id | \n",
105 | " rank | \n",
106 | " score | \n",
107 | "
\n",
108 | " \n",
109 | " \n",
110 | " \n",
111 | " 0 | \n",
112 | " 1 | \n",
113 | " 1 | \n",
114 | " a | \n",
115 | "
\n",
116 | " \n",
117 | " 1 | \n",
118 | " 7 | \n",
119 | " 3 | \n",
120 | " b | \n",
121 | "
\n",
122 | " \n",
123 | " 2 | \n",
124 | " 4 | \n",
125 | " 4 | \n",
126 | " c | \n",
127 | "
\n",
128 | " \n",
129 | " 3 | \n",
130 | " 2 | \n",
131 | " 3 | \n",
132 | " d | \n",
133 | "
\n",
134 | " \n",
135 | " 4 | \n",
136 | " 9 | \n",
137 | " 5 | \n",
138 | " e | \n",
139 | "
\n",
140 | " \n",
141 | "
\n",
142 | "
"
143 | ],
144 | "text/plain": [
145 | " id rank score\n",
146 | "0 1 1 a\n",
147 | "1 7 3 b\n",
148 | "2 4 4 c\n",
149 | "3 2 3 d\n",
150 | "4 9 5 e"
151 | ]
152 | },
153 | "execution_count": 3,
154 | "metadata": {},
155 | "output_type": "execute_result"
156 | }
157 | ],
158 | "source": [
159 | "# generate cuDF DataFrame\n",
160 | "df = cudf.DataFrame()\n",
161 | "\n",
162 | "# add id & value columns\n",
163 | "df['id'] = [1, 7, 4, 2, 9]\n",
164 | "df['rank'] = [1, 3, 4, 3, 5]\n",
165 | "df['score'] = ['a', 'b', 'c', 'd', 'e']\n",
166 | "\n",
167 | "# how's it look?\n",
168 | "df"
169 | ]
170 | },
171 | {
172 | "cell_type": "markdown",
173 | "metadata": {
174 | "colab_type": "text",
175 | "id": "HJFz-mqZTJ5Z"
176 | },
177 | "source": [
178 | "#### Create a Table\n",
179 | "Now we can easily create a table with BlazingContext's `.create_table()` method. "
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 4,
185 | "metadata": {},
186 | "outputs": [
187 | {
188 | "data": {
189 | "text/plain": [
190 | ""
191 | ]
192 | },
193 | "execution_count": 4,
194 | "metadata": {},
195 | "output_type": "execute_result"
196 | }
197 | ],
198 | "source": [
199 | "# BlazingSQL table from DataFrame\n",
200 | "bc.create_table('table_a', df)"
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "metadata": {
206 | "colab_type": "text",
207 | "id": "98HJFrt5TRa0"
208 | },
209 | "source": [
210 | "## Query a Table\n",
211 |         "We can now execute SQL queries with `.sql()`, which processes data on GPU and returns results as cuDF DataFrames!"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 5,
217 | "metadata": {},
218 | "outputs": [
219 | {
220 | "data": {
221 | "text/html": [
222 | "\n",
223 | "\n",
236 | "
\n",
237 | " \n",
238 | " \n",
239 | " | \n",
240 | " id | \n",
241 | " rank | \n",
242 | " score | \n",
243 | "
\n",
244 | " \n",
245 | " \n",
246 | " \n",
247 | " 0 | \n",
248 | " 1 | \n",
249 | " 1 | \n",
250 | " a | \n",
251 | "
\n",
252 | " \n",
253 | " 1 | \n",
254 | " 7 | \n",
255 | " 3 | \n",
256 | " b | \n",
257 | "
\n",
258 | " \n",
259 | "
\n",
260 | "
"
261 | ],
262 | "text/plain": [
263 | " id rank score\n",
264 | "0 1 1 a\n",
265 | "1 7 3 b"
266 | ]
267 | },
268 | "execution_count": 5,
269 | "metadata": {},
270 | "output_type": "execute_result"
271 | }
272 | ],
273 | "source": [
274 | "# query everything from the first 2 instances \n",
275 | "bc.sql('select * from table_a LIMIT 2')"
276 | ]
277 | },
278 | {
279 | "cell_type": "code",
280 | "execution_count": 6,
281 | "metadata": {},
282 | "outputs": [
283 | {
284 | "data": {
285 | "text/html": [
286 | "\n",
287 | "\n",
300 | "
\n",
301 | " \n",
302 | " \n",
303 | " | \n",
304 | " count(*) | \n",
305 | "
\n",
306 | " \n",
307 | " \n",
308 | " \n",
309 | " 0 | \n",
310 | " 5 | \n",
311 | "
\n",
312 | " \n",
313 | "
\n",
314 | "
"
315 | ],
316 | "text/plain": [
317 | " count(*)\n",
318 | "0 5"
319 | ]
320 | },
321 | "execution_count": 6,
322 | "metadata": {},
323 | "output_type": "execute_result"
324 | }
325 | ],
326 | "source": [
327 | "# query table - how many instances are there?\n",
328 | "bc.sql('select count(*) from table_a')"
329 | ]
330 | },
331 | {
332 | "cell_type": "code",
333 | "execution_count": 7,
334 | "metadata": {
335 | "colab": {
336 | "base_uri": "https://localhost:8080/",
337 | "height": 1000
338 | },
339 | "colab_type": "code",
340 | "id": "14GwxmLsTV_p",
341 | "outputId": "144b7601-5363-49f8-d5af-13e80917672c"
342 | },
343 | "outputs": [
344 | {
345 | "data": {
346 | "text/html": [
347 | "\n",
348 | "\n",
361 | "
\n",
362 | " \n",
363 | " \n",
364 | " | \n",
365 | " id | \n",
366 | " rank | \n",
367 | " score | \n",
368 | "
\n",
369 | " \n",
370 | " \n",
371 | " \n",
372 | " 0 | \n",
373 | " 7 | \n",
374 | " 3 | \n",
375 | " b | \n",
376 | "
\n",
377 | " \n",
378 | " 1 | \n",
379 | " 9 | \n",
380 | " 5 | \n",
381 | " e | \n",
382 | "
\n",
383 | " \n",
384 | "
\n",
385 | "
"
386 | ],
387 | "text/plain": [
388 | " id rank score\n",
389 | "0 7 3 b\n",
390 | "1 9 5 e"
391 | ]
392 | },
393 | "execution_count": 7,
394 | "metadata": {},
395 | "output_type": "execute_result"
396 | }
397 | ],
398 | "source": [
399 |         "# query events with an id of at least 7\n",
400 | "bc.sql('SELECT * FROM table_a WHERE id >= 7')"
401 | ]
402 | },
403 | {
404 | "cell_type": "markdown",
405 | "metadata": {
406 | "colab_type": "text",
407 | "id": "wygAeTIFTm2X"
408 | },
409 | "source": [
410 | "# You're Ready to Rock\n",
411 |         "And... that's it! You are now live with BlazingSQL.\n",
412 | "\n",
413 | "\n",
414 | "Check out our [docs](https://docs.blazingdb.com) to get fancy or to learn more about how BlazingSQL works with the rest of [RAPIDS AI](https://rapids.ai/)."
415 | ]
416 | }
417 | ],
418 | "metadata": {
419 | "accelerator": "GPU",
420 | "colab": {
421 | "collapsed_sections": [],
422 | "name": "blazingsql_demo.ipynb",
423 | "provenance": []
424 | },
425 | "kernelspec": {
426 | "display_name": "winston@blazingdb.com",
427 | "language": "python",
428 | "name": "condaenv-winston_blazingdb.com"
429 | },
430 | "language_info": {
431 | "codemirror_mode": {
432 | "name": "ipython",
433 | "version": 3
434 | },
435 | "file_extension": ".py",
436 | "mimetype": "text/x-python",
437 | "name": "python",
438 | "nbconvert_exporter": "python",
439 | "pygments_lexer": "ipython3",
440 | "version": "3.7.3"
441 | }
442 | },
443 | "nbformat": 4,
444 | "nbformat_minor": 4
445 | }
446 |
--------------------------------------------------------------------------------
/colab_notebooks/blazingsql_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "8AdUt3HiUrc3"
8 | },
9 | "source": [
10 | "# Getting Started with BlazingSQL\n",
11 | "\n",
12 | "In this notebook, we will cover: \n",
13 | "- How to set up [BlazingSQL](https://blazingsql.com) and the [RAPIDS AI](https://rapids.ai/) suite.\n",
14 | "- How to read and query csv files with cuDF and BlazingSQL.\n",
15 | ""
16 | ]
17 | },
18 | {
19 | "cell_type": "markdown",
20 | "metadata": {
21 | "colab_type": "text",
22 | "id": "_h26epJpUeZP"
23 | },
24 | "source": [
25 | "## Setup\n",
26 | "### Environment Sanity Check \n",
27 | "\n",
28 | "RAPIDS packages (BlazingSQL included) require Pascal+ architecture to run. For Colab, this translates to a T4 GPU instance. \n",
29 | "\n",
30 | "The cell below will let you know what type of GPU you've been allocated, and how to proceed."
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 1,
36 | "metadata": {
37 | "colab": {
38 | "base_uri": "https://localhost:8080/",
39 | "height": 322
40 | },
41 | "colab_type": "code",
42 | "id": "_lf6yKBoRYGy",
43 | "outputId": "8e9f7e7e-b89f-49bd-fd3c-c435ffb55c9c"
44 | },
45 | "outputs": [
46 | {
47 | "name": "stdout",
48 | "output_type": "stream",
49 | "text": [
50 | "\n",
51 | "\n",
52 | "***********************************\n",
53 | "GPU = b'Tesla T4'\n",
54 | "Woo! You got the right kind of GPU!\n",
55 | "***********************************\n",
56 | "\n",
57 | "\n"
58 | ]
59 | }
60 | ],
61 | "source": [
62 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/colab_env.py\n",
63 | "!python colab_env.py "
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {
69 | "colab_type": "text",
70 | "id": "xM8xTlqeRi-g"
71 | },
72 | "source": [
73 | "## Installs \n",
74 | "The cell below pulls our Google Colab install script from the `bsql-demos` repo then runs it. The script first installs miniconda, then uses miniconda to install BlazingSQL and RAPIDS AI. This takes a few minutes to run. "
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": null,
80 | "metadata": {
81 | "colab": {},
82 | "colab_type": "code",
83 | "id": "gfWF_lG1HqV7"
84 | },
85 | "outputs": [],
86 | "source": [
87 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/bsql-colab.sh \n",
88 | "!bash bsql-colab.sh\n",
89 | "\n",
90 | "import sys, os, time\n",
91 | "sys.path.append('/usr/local/lib/python3.6/site-packages/')\n",
92 | "os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n",
93 | "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'\n",
94 | "\n",
95 | "import subprocess\n",
96 | "subprocess.Popen(['blazingsql-orchestrator', '9100', '8889', '127.0.0.1', '8890'],stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)\n",
97 | "subprocess.Popen(['java', '-jar', '/usr/local/lib/blazingsql-algebra.jar', '-p', '8890'])\n",
98 | "\n",
99 | "import pyblazing.apiv2.context as cont\n",
100 | "cont.runRal()\n",
101 | "time.sleep(1) "
102 | ]
103 | },
104 | {
105 | "cell_type": "markdown",
106 | "metadata": {
107 | "colab_type": "text",
108 | "id": "aMwNKxePSwOp"
109 | },
110 | "source": [
111 | "## Import packages and create Blazing Context\n",
112 | "You can think of the BlazingContext much like a Spark Context (i.e. where information such as FileSystems you have registered and Tables you have created will be stored). If you have issues running this cell, restart runtime and try running it again.\n"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 1,
118 | "metadata": {
119 | "colab": {
120 | "base_uri": "https://localhost:8080/",
121 | "height": 35
122 | },
123 | "colab_type": "code",
124 | "id": "ZR_vWwtMcvvY",
125 | "outputId": "c78cc40a-f7d8-4ac5-c255-d99edd03b785"
126 | },
127 | "outputs": [
128 | {
129 | "name": "stdout",
130 | "output_type": "stream",
131 | "text": [
132 | "BlazingContext ready\n"
133 | ]
134 | }
135 | ],
136 | "source": [
137 | "import cudf\n",
138 | "from blazingsql import BlazingContext\n",
139 | "# start up BlazingSQL\n",
140 | "bc = BlazingContext()"
141 | ]
142 | },
143 | {
144 | "cell_type": "markdown",
145 | "metadata": {
146 | "colab_type": "text",
147 | "id": "N2bqpDEnZyQf"
148 | },
149 | "source": [
150 | "## Read CSV\n",
151 |         "First we need to download a CSV file. Then we use cuDF to read the CSV file, which gives us a GPU DataFrame (GDF). To learn more about the GDF and how it enables end-to-end workloads on RAPIDS, check out our [blog post](https://blog.blazingdb.com/blazingsql-part-1-the-gpu-dataframe-gdf-and-cudf-in-rapids-ai-96ec15102240)."
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": 2,
157 | "metadata": {
158 | "colab": {
159 | "base_uri": "https://localhost:8080/",
160 | "height": 204
161 | },
162 | "colab_type": "code",
163 | "id": "iqRDacOBOg44",
164 | "outputId": "dccb35e0-c284-498b-80b7-8cfc84a7a6a7"
165 | },
166 | "outputs": [
167 | {
168 | "name": "stdout",
169 | "output_type": "stream",
170 | "text": [
171 | "--2020-01-23 02:59:55-- https://s3.amazonaws.com/blazingsql-colab/Music.csv\n",
172 | "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.217.0.133\n",
173 | "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.217.0.133|:443... connected.\n",
174 | "HTTP request sent, awaiting response... 200 OK\n",
175 | "Length: 10473 (10K) [text/csv]\n",
176 | "Saving to: ‘Music.csv’\n",
177 | "\n",
178 | "Music.csv 100%[===================>] 10.23K --.-KB/s in 0s \n",
179 | "\n",
180 | "2020-01-23 02:59:55 (190 MB/s) - ‘Music.csv’ saved [10473/10473]\n",
181 | "\n"
182 | ]
183 | }
184 | ],
185 | "source": [
186 | "#Download the test CSV\n",
187 | "!wget 'https://s3.amazonaws.com/blazingsql-colab/Music.csv'"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 3,
193 | "metadata": {
194 | "colab": {},
195 | "colab_type": "code",
196 | "id": "HhRhj-ZvZygH"
197 | },
198 | "outputs": [
199 | {
200 | "data": {
201 | "text/html": [
202 | "\n",
203 | "\n",
216 | "
\n",
217 | " \n",
218 | " \n",
219 | " | \n",
220 | " ARTIST | \n",
221 | " RATING | \n",
222 | " YEAR | \n",
223 | " LOCATION | \n",
224 | " FESTIVAL_SET | \n",
225 | "
\n",
226 | " \n",
227 | " \n",
228 | " \n",
229 | " 0 | \n",
230 | " Arcade Fire | \n",
231 | " 10.0 | \n",
232 | " 2018.0 | \n",
233 | " Las Vegas | \n",
234 | " 1.0 | \n",
235 | "
\n",
236 | " \n",
237 | " 1 | \n",
238 | " Justice | \n",
239 | " 10.0 | \n",
240 | " 2018.0 | \n",
241 | " Las Vegas | \n",
242 | " 1.0 | \n",
243 | "
\n",
244 | " \n",
245 | " 2 | \n",
246 | " Florence and The Machine | \n",
247 | " 10.0 | \n",
248 | " 2018.0 | \n",
249 | " Las Vegas | \n",
250 | " 1.0 | \n",
251 | "
\n",
252 | " \n",
253 | " 3 | \n",
254 | " Odesza | \n",
255 | " 10.0 | \n",
256 | " 2018.0 | \n",
257 | " Indio | \n",
258 | " 1.0 | \n",
259 | "
\n",
260 | " \n",
261 | " 4 | \n",
262 | " Bon Iver | \n",
263 | " 10.0 | \n",
264 | " 2017.0 | \n",
265 | " Indio | \n",
266 | " 1.0 | \n",
267 | "
\n",
268 | " \n",
269 | "
\n",
270 | "
"
271 | ],
272 | "text/plain": [
273 | " ARTIST RATING YEAR LOCATION FESTIVAL_SET\n",
274 | "0 Arcade Fire 10.0 2018.0 Las Vegas 1.0\n",
275 | "1 Justice 10.0 2018.0 Las Vegas 1.0\n",
276 | "2 Florence and The Machine 10.0 2018.0 Las Vegas 1.0\n",
277 | "3 Odesza 10.0 2018.0 Indio 1.0\n",
278 | "4 Bon Iver 10.0 2017.0 Indio 1.0"
279 | ]
280 | },
281 | "execution_count": 3,
282 | "metadata": {},
283 | "output_type": "execute_result"
284 | }
285 | ],
286 | "source": [
287 | "# like pandas, cudf can simply read the csv\n",
288 | "gdf = cudf.read_csv('Music.csv')\n",
289 | "\n",
290 | "# let's see how it looks\n",
291 | "gdf.head()"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {
297 | "colab_type": "text",
298 | "id": "HJFz-mqZTJ5Z"
299 | },
300 | "source": [
301 | "## Create a Table\n",
302 | "Now we just need to create a table. "
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 4,
308 | "metadata": {
309 | "colab": {},
310 | "colab_type": "code",
311 | "id": "HJuvtJDYTMyb"
312 | },
313 | "outputs": [
314 | {
315 | "data": {
316 | "text/plain": [
317 | ""
318 | ]
319 | },
320 | "execution_count": 4,
321 | "metadata": {},
322 | "output_type": "execute_result"
323 | }
324 | ],
325 | "source": [
326 | "bc.create_table('music', gdf, header=0)"
327 | ]
328 | },
329 | {
330 | "cell_type": "markdown",
331 | "metadata": {
332 | "colab_type": "text",
333 | "id": "98HJFrt5TRa0"
334 | },
335 | "source": [
336 | "## Query a Table\n",
337 |         "That's it! Now when you write a SQL query, the data will get processed on the GPU with BlazingSQL, and the output will be a GPU DataFrame (GDF) inside RAPIDS!"
338 | ]
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": 5,
343 | "metadata": {
344 | "colab": {
345 | "base_uri": "https://localhost:8080/",
346 | "height": 1000
347 | },
348 | "colab_type": "code",
349 | "id": "14GwxmLsTV_p",
350 | "outputId": "144b7601-5363-49f8-d5af-13e80917672c"
351 | },
352 | "outputs": [
353 | {
354 | "data": {
355 | "text/html": [
356 | "\n",
357 | "\n",
370 | "
\n",
371 | " \n",
372 | " \n",
373 | " | \n",
374 | " ARTIST | \n",
375 | " RATING | \n",
376 | " LOCATION | \n",
377 | "
\n",
378 | " \n",
379 | " \n",
380 | " \n",
381 | " 0 | \n",
382 | " Arcade Fire | \n",
383 | " 10.0 | \n",
384 | " Las Vegas | \n",
385 | "
\n",
386 | " \n",
387 | " 1 | \n",
388 | " Justice | \n",
389 | " 10.0 | \n",
390 | " Las Vegas | \n",
391 | "
\n",
392 | " \n",
393 | " 2 | \n",
394 | " Florence and The Machine | \n",
395 | " 10.0 | \n",
396 | " Las Vegas | \n",
397 | "
\n",
398 | " \n",
399 | " 3 | \n",
400 | " Odesza | \n",
401 | " 10.0 | \n",
402 | " Indio | \n",
403 | "
\n",
404 | " \n",
405 | " 4 | \n",
406 | " Bon Iver | \n",
407 | " 10.0 | \n",
408 | " Indio | \n",
409 | "
\n",
410 | " \n",
411 | " 5 | \n",
412 | " LA Philharmonic + Sigur Ros | \n",
413 | " 10.0 | \n",
414 | " LA | \n",
415 | "
\n",
416 | " \n",
417 | " 6 | \n",
418 | " Sigur Ros | \n",
419 | " 10.0 | \n",
420 | " Malmo | \n",
421 | "
\n",
422 | " \n",
423 | " 7 | \n",
424 | " Arcade Fire | \n",
425 | " 10.0 | \n",
426 | " Indio | \n",
427 | "
\n",
428 | " \n",
429 | " 8 | \n",
430 | " Escort | \n",
431 | " 9.0 | \n",
432 | " San Francisco | \n",
433 | "
\n",
434 | " \n",
435 | " 9 | \n",
436 | " Phoenix | \n",
437 | " 9.0 | \n",
438 | " Berkeley | \n",
439 | "
\n",
440 | " \n",
441 | "
\n",
442 | "
"
443 | ],
444 | "text/plain": [
445 | " ARTIST RATING LOCATION\n",
446 | "0 Arcade Fire 10.0 Las Vegas\n",
447 | "1 Justice 10.0 Las Vegas\n",
448 | "2 Florence and The Machine 10.0 Las Vegas\n",
449 | "3 Odesza 10.0 Indio\n",
450 | "4 Bon Iver 10.0 Indio\n",
451 | "5 LA Philharmonic + Sigur Ros 10.0 LA\n",
452 | "6 Sigur Ros 10.0 Malmo\n",
453 | "7 Arcade Fire 10.0 Indio\n",
454 | "8 Escort 9.0 San Francisco\n",
455 | "9 Phoenix 9.0 Berkeley"
456 | ]
457 | },
458 | "execution_count": 5,
459 | "metadata": {},
460 | "output_type": "execute_result"
461 | }
462 | ],
463 | "source": [
464 | "# query 10 events with a rating of at least 7\n",
465 | "gdf = bc.sql('select ARTIST, RATING, LOCATION from music where RATING >= 7 limit 10')\n",
466 | "\n",
467 | "# display GDF (just like pandas)\n",
468 | "gdf"
469 | ]
470 | },
471 | {
472 | "cell_type": "markdown",
473 | "metadata": {
474 | "colab_type": "text",
475 | "id": "wygAeTIFTm2X"
476 | },
477 | "source": [
478 | "# You're Ready to Rock\n",
479 | "And... that's it! You are now live with BlazingSQL.\n",
480 | "\n",
481 | "\n",
482 | "Check out our [docs](https://docs.blazingdb.com) to get fancy or to learn more about how BlazingSQL works with the rest of [RAPIDS AI](https://rapids.ai/)."
483 | ]
484 | }
485 | ],
486 | "metadata": {
487 | "accelerator": "GPU",
488 | "colab": {
489 | "collapsed_sections": [],
490 | "name": "blazingsql_demo.ipynb",
491 | "provenance": []
492 | },
493 | "kernelspec": {
494 | "display_name": "Python 3",
495 | "language": "python",
496 | "name": "python3"
497 | },
498 | "language_info": {
499 | "codemirror_mode": {
500 | "name": "ipython",
501 | "version": 3
502 | },
503 | "file_extension": ".py",
504 | "mimetype": "text/x-python",
505 | "name": "python",
506 | "nbconvert_exporter": "python",
507 | "pygments_lexer": "ipython3",
508 | "version": "3.6.7"
509 | }
510 | },
511 | "nbformat": 4,
512 | "nbformat_minor": 4
513 | }
514 |
--------------------------------------------------------------------------------
/colab_notebooks/graphistry_netflow_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "kJyD4oSbugE0"
8 | },
9 | "source": [
10 | "# Graphistry Netflow Demo\n",
11 | "\n",
12 | "In this example we are taking millions of rows of netflow (network traffic flow) data in order to search for anomalous activity within a network.\n",
13 | "\n",
14 | "In this notebook, we will: \n",
15 | "- Set up [BlazingSQL](https://blazingsql.com) and the [RAPIDS AI](https://rapids.ai/) suite.\n",
16 | "- Query 20M rows of network security data (netflow) with BlazingSQL and then pass to Graphistry to visualize and interact with the data.\n",
17 | ""
18 | ]
19 | },
20 | {
21 | "cell_type": "markdown",
22 | "metadata": {},
23 | "source": [
24 | "## Setup\n",
25 | "### Environment Sanity Check \n",
26 | "\n",
27 | "RAPIDS packages (BlazingSQL included) require Pascal+ architecture to run. For Colab, this translates to a T4 GPU instance. \n",
28 | "\n",
29 | "The cell below will let you know what type of GPU you've been allocated, and how to proceed."
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 1,
35 | "metadata": {
36 | "colab": {
37 | "base_uri": "https://localhost:8080/",
38 | "height": 312
39 | },
40 | "colab_type": "code",
41 | "id": "zxhxwrfI7aoT",
42 | "outputId": "0880eafa-a0b1-4f39-d3dc-bab9d4e8b127"
43 | },
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | "\n",
50 | "\n",
51 | "***********************************\n",
52 | "GPU = b'Tesla T4'\n",
53 | "Woo! You got the right kind of GPU!\n",
54 | "***********************************\n",
55 | "\n",
56 | "\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/colab_env.py\n",
62 | "!python colab_env.py "
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "## Installs \n",
70 | "The cell below pulls our Google Colab install script from the `bsql-demos` repo then runs it. The script first installs miniconda, then uses miniconda to install BlazingSQL and RAPIDS AI. This takes a few minutes to run. "
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 3,
76 | "metadata": {
77 | "colab": {
78 | "base_uri": "https://localhost:8080/",
79 | "height": 35
80 | },
81 | "colab_type": "code",
82 | "id": "a7RprJxtZZtQ",
83 | "outputId": "5ed256e4-93ee-4295-914d-c5c75c9d6059"
84 | },
85 | "outputs": [],
86 | "source": [
87 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/bsql-colab.sh \n",
88 | "!bash bsql-colab.sh\n",
89 | "\n",
90 | "import sys, os, time\n",
91 | "sys.path.append('/usr/local/lib/python3.6/site-packages/')\n",
92 | "os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n",
93 | "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'\n",
94 | "\n",
95 | "import subprocess\n",
96 | "subprocess.Popen(['blazingsql-orchestrator', '9100', '8889', '127.0.0.1', '8890'],stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)\n",
97 | "subprocess.Popen(['java', '-jar', '/usr/local/lib/blazingsql-algebra.jar', '-p', '8890'])\n",
98 | "\n",
99 | "import pyblazing.apiv2.context as cont\n",
100 | "cont.runRal()\n",
101 | "time.sleep(1) \n",
102 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/colab_env.py\n",
103 | "!python colab_env.py "
104 | ]
105 | },
106 | {
107 | "cell_type": "markdown",
108 | "metadata": {
109 | "colab_type": "text",
110 | "id": "4guM6G87ul8e"
111 | },
112 | "source": [
113 | "## Download CSV\n",
114 | "\n",
115 | "The cell below will download the data for this demo from AWS and store it locally as `nf-chunk2.csv`. "
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {
122 | "colab": {
123 | "base_uri": "https://localhost:8080/",
124 | "height": 208
125 | },
126 | "colab_type": "code",
127 | "id": "F6teFkVGufUf",
128 | "outputId": "42fedd97-8baf-4d1a-ea41-95602cd8cb11"
129 | },
130 | "outputs": [
131 | {
132 | "name": "stdout",
133 | "output_type": "stream",
134 | "text": [
135 | "--2019-08-23 21:43:50-- https://blazingsql-colab.s3.amazonaws.com/netflow_data/nf-chunk2.csv\n",
136 | "Resolving blazingsql-colab.s3.amazonaws.com (blazingsql-colab.s3.amazonaws.com)... 52.216.137.76\n",
137 | "Connecting to blazingsql-colab.s3.amazonaws.com (blazingsql-colab.s3.amazonaws.com)|52.216.137.76|:443... connected.\n",
138 | "HTTP request sent, awaiting response... 200 OK\n",
139 | "Length: 2725056295 (2.5G) [text/csv]\n",
140 | "Saving to: ‘nf-chunk2.csv’\n",
141 | "\n",
142 | "nf-chunk2.csv 100%[===================>] 2.54G 49.2MB/s in 56s \n",
143 | "\n",
144 | "2019-08-23 21:44:46 (46.2 MB/s) - ‘nf-chunk2.csv’ saved [2725056295/2725056295]\n",
145 | "\n"
146 | ]
147 | }
148 | ],
149 | "source": [
150 | "!wget https://blazingsql-colab.s3.amazonaws.com/netflow_data/nf-chunk2.csv"
151 | ]
152 | },
153 | {
154 | "cell_type": "markdown",
155 | "metadata": {},
156 | "source": [
157 | "## Blazing Context\n",
158 | "Here we are importing cuDF and BlazingContext. You can think of the BlazingContext much like a Spark Context (i.e. where information such as FileSystems you have registered and Tables you have created will be stored). If you have issues running this cell, restart runtime and try running it again."
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": 1,
164 | "metadata": {
165 | "colab": {
166 | "base_uri": "https://localhost:8080/",
167 | "height": 69
168 | },
169 | "colab_type": "code",
170 | "id": "pqQ8lqL8vb-8",
171 | "outputId": "4e5ebc46-6319-4d3a-851c-7d6a2ac2825d"
172 | },
173 | "outputs": [
174 | {
175 | "name": "stdout",
176 | "output_type": "stream",
177 | "text": [
178 | "BlazingContext ready\n"
179 | ]
180 | }
181 | ],
182 | "source": [
183 | "from blazingsql import BlazingContext\n",
184 | "import cudf\n",
185 | "# start up BlazingSQL\n",
186 | "bc = BlazingContext()"
187 | ]
188 | },
189 | {
190 | "cell_type": "markdown",
191 | "metadata": {
192 | "colab_type": "text",
193 | "id": "yp7z8bfivbna"
194 | },
195 | "source": [
196 | "### Load & Query Tables\n",
197 | "\n",
198 | "In the cell below, we are first loading the CSV file into a GPU DataFrame (gdf), and then creating tables so that we can run SQL queries on those GDFs. \n",
199 | "\n",
200 | "Note: when you create a table off of a GDF there is no copy, it is merely registering the schema."
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 2,
206 | "metadata": {
207 | "colab": {},
208 | "colab_type": "code",
209 | "id": "lU-2wlwQntnq"
210 | },
211 | "outputs": [
212 | {
213 | "data": {
214 | "text/plain": [
215 | ""
216 | ]
217 | },
218 | "execution_count": 2,
219 | "metadata": {},
220 | "output_type": "execute_result"
221 | }
222 | ],
223 | "source": [
224 | "# Load CSVs into GPU DataFrames (gdf)\n",
225 | "netflow_gdf = cudf.read_csv('nf-chunk2.csv')\n",
226 | "\n",
227 | "# Create BlazingSQL Tables - There is no copy in this process\n",
228 | "bc.create_table('netflow', netflow_gdf)"
229 | ]
230 | },
231 | {
232 | "cell_type": "markdown",
233 | "metadata": {
234 | "colab_type": "text",
235 | "id": "cgivbut9df-R"
236 | },
237 | "source": [
238 | "#### Query\n",
239 | "With the table made, we can simply run a SQL query.\n",
240 | "\n",
241 | "We are going to run some joins and aggregations in order to condense these millions of rows into thousands of rows that represent nodes and edges."
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 3,
247 | "metadata": {
248 | "colab": {
249 | "base_uri": "https://localhost:8080/",
250 | "height": 277
251 | },
252 | "colab_type": "code",
253 | "id": "umBG2Tp0wbQx",
254 | "outputId": "b89e3666-f85a-40e9-e7c4-cda9a80b7fe5"
255 | },
256 | "outputs": [
257 | {
258 | "name": "stdout",
259 | "output_type": "stream",
260 | "text": [
261 | "CPU times: user 32.3 ms, sys: 453 µs, total: 32.8 ms\n",
262 | "Wall time: 438 ms\n"
263 | ]
264 | }
265 | ],
266 | "source": [
267 | "%%time\n",
268 | "# make a query\n",
269 | "query = '''\n",
270 | " SELECT\n",
271 | " a.firstSeenSrcIp as source,\n",
272 | " a.firstSeenDestIp as destination,\n",
273 | " count(a.firstSeenDestPort) as targetPorts,\n",
274 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n",
275 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n",
276 | " SUM(a.durationSeconds) as durationSeconds,\n",
277 | " MIN(parsedDate) as firstFlowDate,\n",
278 | " MAX(parsedDate) as lastFlowDate,\n",
279 | " COUNT(*) as attemptCount\n",
280 | " FROM\n",
281 | " netflow a\n",
282 | " GROUP BY\n",
283 | " a.firstSeenSrcIp,\n",
284 | " a.firstSeenDestIp\n",
285 | " '''\n",
286 | "\n",
287 | "# query the table\n",
288 | "gdf = bc.sql(query)"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 4,
294 | "metadata": {},
295 | "outputs": [
296 | {
297 | "data": {
298 | "text/html": [
299 | "\n",
300 | "\n",
313 | "
\n",
314 | " \n",
315 | " \n",
316 | " | \n",
317 | " source | \n",
318 | " destination | \n",
319 | " targetPorts | \n",
320 | " bytesOut | \n",
321 | " bytesIn | \n",
322 | " durationSeconds | \n",
323 | " firstFlowDate | \n",
324 | " lastFlowDate | \n",
325 | " attemptCount | \n",
326 | "
\n",
327 | " \n",
328 | " \n",
329 | " \n",
330 | " 0 | \n",
331 | " 172.10.1.226 | \n",
332 | " 239.255.255.250 | \n",
333 | " 3 | \n",
334 | " 875 | \n",
335 | " 0 | \n",
336 | " 6 | \n",
337 | " 2013-04-03 06:36:44 | \n",
338 | " 2013-04-03 06:36:51 | \n",
339 | " 3 | \n",
340 | "
\n",
341 | " \n",
342 | " 1 | \n",
343 | " 172.30.1.200 | \n",
344 | " 239.255.255.250 | \n",
345 | " 9 | \n",
346 | " 2275 | \n",
347 | " 0 | \n",
348 | " 12 | \n",
349 | " 2013-04-03 06:35:52 | \n",
350 | " 2013-04-03 06:43:26 | \n",
351 | " 9 | \n",
352 | "
\n",
353 | " \n",
354 | " 2 | \n",
355 | " 172.30.1.225 | \n",
356 | " 172.0.0.1 | \n",
357 | " 1 | \n",
358 | " 90 | \n",
359 | " 90 | \n",
360 | " 0 | \n",
361 | " 2013-04-03 06:36:14 | \n",
362 | " 2013-04-03 06:36:14 | \n",
363 | " 1 | \n",
364 | "
\n",
365 | " \n",
366 | " 3 | \n",
367 | " 172.30.1.46 | \n",
368 | " 239.255.255.250 | \n",
369 | " 17 | \n",
370 | " 4025 | \n",
371 | " 0 | \n",
372 | " 18 | \n",
373 | " 2013-04-03 06:35:22 | \n",
374 | " 2013-04-03 06:44:08 | \n",
375 | " 17 | \n",
376 | "
\n",
377 | " \n",
378 | " 4 | \n",
379 | " 172.20.2.71 | \n",
380 | " 239.255.255.250 | \n",
381 | " 3 | \n",
382 | " 875 | \n",
383 | " 0 | \n",
384 | " 6 | \n",
385 | " 2013-04-03 06:37:11 | \n",
386 | " 2013-04-03 06:37:18 | \n",
387 | " 3 | \n",
388 | "
\n",
389 | " \n",
390 | " 5 | \n",
391 | " 172.10.1.233 | \n",
392 | " 172.0.0.1 | \n",
393 | " 1 | \n",
394 | " 180 | \n",
395 | " 180 | \n",
396 | " 0 | \n",
397 | " 2013-04-03 06:36:45 | \n",
398 | " 2013-04-03 06:36:45 | \n",
399 | " 1 | \n",
400 | "
\n",
401 | " \n",
402 | " 6 | \n",
403 | " 172.30.1.102 | \n",
404 | " 10.0.0.10 | \n",
405 | " 1 | \n",
406 | " 454 | \n",
407 | " 633 | \n",
408 | " 0 | \n",
409 | " 2013-04-03 06:48:05 | \n",
410 | " 2013-04-03 06:48:05 | \n",
411 | " 1 | \n",
412 | "
\n",
413 | " \n",
414 | " 7 | \n",
415 | " 172.20.1.39 | \n",
416 | " 239.255.255.250 | \n",
417 | " 1 | \n",
418 | " 525 | \n",
419 | " 0 | \n",
420 | " 6 | \n",
421 | " 2013-04-03 06:36:59 | \n",
422 | " 2013-04-03 06:36:59 | \n",
423 | " 1 | \n",
424 | "
\n",
425 | " \n",
426 | " 8 | \n",
427 | " 172.10.1.96 | \n",
428 | " 172.0.0.1 | \n",
429 | " 1 | \n",
430 | " 180 | \n",
431 | " 0 | \n",
432 | " 0 | \n",
433 | " 2013-04-03 06:36:21 | \n",
434 | " 2013-04-03 06:36:21 | \n",
435 | " 1 | \n",
436 | "
\n",
437 | " \n",
438 | " 9 | \n",
439 | " 172.20.1.2 | \n",
440 | " 239.255.255.250 | \n",
441 | " 19 | \n",
442 | " 3675 | \n",
443 | " 0 | \n",
444 | " 6 | \n",
445 | " 2013-04-03 06:36:50 | \n",
446 | " 2013-04-03 06:36:59 | \n",
447 | " 19 | \n",
448 | "
\n",
449 | " \n",
450 | "
\n",
451 | "
"
452 | ],
453 | "text/plain": [
454 | " source destination targetPorts bytesOut bytesIn \\\n",
455 | "0 172.10.1.226 239.255.255.250 3 875 0 \n",
456 | "1 172.30.1.200 239.255.255.250 9 2275 0 \n",
457 | "2 172.30.1.225 172.0.0.1 1 90 90 \n",
458 | "3 172.30.1.46 239.255.255.250 17 4025 0 \n",
459 | "4 172.20.2.71 239.255.255.250 3 875 0 \n",
460 | "5 172.10.1.233 172.0.0.1 1 180 180 \n",
461 | "6 172.30.1.102 10.0.0.10 1 454 633 \n",
462 | "7 172.20.1.39 239.255.255.250 1 525 0 \n",
463 | "8 172.10.1.96 172.0.0.1 1 180 0 \n",
464 | "9 172.20.1.2 239.255.255.250 19 3675 0 \n",
465 | "\n",
466 | " durationSeconds firstFlowDate lastFlowDate attemptCount \n",
467 | "0 6 2013-04-03 06:36:44 2013-04-03 06:36:51 3 \n",
468 | "1 12 2013-04-03 06:35:52 2013-04-03 06:43:26 9 \n",
469 | "2 0 2013-04-03 06:36:14 2013-04-03 06:36:14 1 \n",
470 | "3 18 2013-04-03 06:35:22 2013-04-03 06:44:08 17 \n",
471 | "4 6 2013-04-03 06:37:11 2013-04-03 06:37:18 3 \n",
472 | "5 0 2013-04-03 06:36:45 2013-04-03 06:36:45 1 \n",
473 | "6 0 2013-04-03 06:48:05 2013-04-03 06:48:05 1 \n",
474 | "7 6 2013-04-03 06:36:59 2013-04-03 06:36:59 1 \n",
475 | "8 0 2013-04-03 06:36:21 2013-04-03 06:36:21 1 \n",
476 | "9 6 2013-04-03 06:36:50 2013-04-03 06:36:59 19 "
477 | ]
478 | },
479 | "execution_count": 4,
480 | "metadata": {},
481 | "output_type": "execute_result"
482 | }
483 | ],
484 | "source": [
485 | "# how's the dataframe look?\n",
486 | "gdf.head(10)"
487 | ]
488 | }
489 | ],
490 | "metadata": {
491 | "file_extension": ".py",
492 | "kernelspec": {
493 | "display_name": "Python 3",
494 | "language": "python",
495 | "name": "python3"
496 | },
497 | "language_info": {
498 | "codemirror_mode": {
499 | "name": "ipython",
500 | "version": 3
501 | },
502 | "file_extension": ".py",
503 | "mimetype": "text/x-python",
504 | "name": "python",
505 | "nbconvert_exporter": "python",
506 | "pygments_lexer": "ipython3",
507 | "version": "3.6.7"
508 | },
509 | "mimetype": "text/x-python",
510 | "name": "python",
511 | "npconvert_exporter": "python",
512 | "pygments_lexer": "ipython3",
513 | "version": 3
514 | },
515 | "nbformat": 4,
516 | "nbformat_minor": 2
517 | }
518 |
--------------------------------------------------------------------------------
/colab_notebooks/vs_pyspark_netflow.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "d0hJ4z8rBOFC"
8 | },
9 | "source": [
10 | "# BlazingSQL vs. Apache Spark \n",
11 | "\n",
12 | "Below we have one of our popular workloads running with [BlazingSQL](https://blazingsql.com/) + [RAPIDS AI](https://rapids.ai) and then running the entire ETL phase again, only this time with Apache Spark + PySpark.\n",
13 | "\n",
14 | "In this notebook, we will cover: \n",
15 | "- How to set up BlazingSQL and the RAPIDS AI suite in Google Colab.\n",
16 | "- How to read and query csv files with cuDF and BlazingSQL.\n",
17 | "- How BlazingSQL compares against Apache Spark (analyzing over 20M records).\n",
18 | "\n",
19 | ""
20 | ]
21 | },
22 | {
23 | "cell_type": "markdown",
24 | "metadata": {
25 | "colab_type": "text",
26 | "id": "kJyD4oSbugE0"
27 | },
28 | "source": [
29 | "## Setup\n",
30 | "### Environment Sanity Check \n",
31 | "\n",
32 | "RAPIDS packages (BlazingSQL included) require Pascal+ architecture to run. For Colab, this translates to a T4 GPU instance. \n",
33 | "\n",
34 | "The cell below will let you know what type of GPU you've been allocated, and how to proceed."
35 | ]
36 | },
37 | {
38 | "cell_type": "code",
39 | "execution_count": 14,
40 | "metadata": {
41 | "colab": {
42 | "base_uri": "https://localhost:8080/",
43 | "height": 35
44 | },
45 | "colab_type": "code",
46 | "id": "QzVzojZ7tc9a",
47 | "outputId": "1c412c49-59fd-482b-83dc-1764af8fda12"
48 | },
49 | "outputs": [
50 | {
51 | "name": "stdout",
52 | "output_type": "stream",
53 | "text": [
54 | "\n",
55 | "\n",
56 | "***********************************\n",
57 | "GPU = b'Tesla T4'\n",
58 | "Woo! You got the right kind of GPU!\n",
59 | "***********************************\n",
60 | "\n",
61 | "\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/colab_env.py\n",
67 | "!python colab_env.py "
68 | ]
69 | },
70 | {
71 | "cell_type": "markdown",
72 | "metadata": {
73 | "colab": {},
74 | "colab_type": "code",
75 | "id": "btG1BbSA1nLu"
76 | },
77 | "source": [
78 | "## Installs \n",
79 | "The cell below pulls our Google Colab install script from the `bsql-demos` repo then runs it. The script first installs miniconda, then uses miniconda to install BlazingSQL and RAPIDS AI. This takes a few minutes to run. "
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {},
86 | "outputs": [],
87 | "source": [
88 | "!wget https://github.com/BlazingDB/bsql-demos/raw/master/utils/bsql-colab.sh \n",
89 | "!bash bsql-colab.sh\n",
90 | "\n",
91 | "import sys, os, time\n",
92 | "sys.path.append('/usr/local/lib/python3.6/site-packages/')\n",
93 | "os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'\n",
94 | "os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'\n",
95 | "\n",
96 | "import subprocess\n",
97 | "subprocess.Popen(['blazingsql-orchestrator', '9100', '8889', '127.0.0.1', '8890'],stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)\n",
98 | "subprocess.Popen(['java', '-jar', '/usr/local/lib/blazingsql-algebra.jar', '-p', '8890'])\n",
99 | "\n",
100 | "import pyblazing.apiv2.context as cont\n",
101 | "cont.runRal()\n",
102 | "time.sleep(1) "
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {
108 | "colab_type": "text",
109 | "id": "0guvG6Ws_zmX"
110 | },
111 | "source": [
112 | "## Import packages and create Blazing Context\n",
113 | "You can think of the BlazingContext much like a Spark Context (i.e. where information such as FileSystems you have registered and Tables you have created will be stored). If you have issues running this cell, restart runtime and try running it again."
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 1,
119 | "metadata": {
120 | "colab": {
121 | "base_uri": "https://localhost:8080/",
122 | "height": 35
123 | },
124 | "colab_type": "code",
125 | "id": "ojm_V-WAtz0f",
126 | "outputId": "a46625f4-1494-4a13-eb13-2f38efd80ccf"
127 | },
128 | "outputs": [
129 | {
130 | "name": "stdout",
131 | "output_type": "stream",
132 | "text": [
133 | "BlazingContext ready\n"
134 | ]
135 | }
136 | ],
137 | "source": [
138 | "from blazingsql import BlazingContext\n",
139 | "import cudf\n",
140 | "# start up BlazingSQL\n",
141 | "bc = BlazingContext()"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "metadata": {
147 | "colab_type": "text",
148 | "id": "yp7z8bfivbna"
149 | },
150 | "source": [
151 | "### Load & Query Table\n",
152 | "First, we need to download the netflow data (20 million records) from AWS."
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": null,
158 | "metadata": {
159 | "colab": {},
160 | "colab_type": "code",
161 | "id": "2dAt6DfG37KH"
162 | },
163 | "outputs": [],
164 | "source": [
165 | "# takes a few minutes to download\n",
166 | "!wget https://blazingsql-colab.s3.amazonaws.com/netflow_data/nf-chunk2.csv"
167 | ]
168 | },
169 | {
170 | "cell_type": "markdown",
171 | "metadata": {
172 | "colab_type": "text",
173 | "id": "OTEaAsp2_zmf"
174 | },
175 | "source": [
176 | "#### BlazingSQL + cuDF \n",
177 | "Data in hand, we can test the performance of cuDF and BlazingSQL on this dataset."
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 2,
183 | "metadata": {
184 | "colab": {
185 | "base_uri": "https://localhost:8080/",
186 | "height": 52
187 | },
188 | "colab_type": "code",
189 | "id": "rirBsYQU3NH5",
190 | "outputId": "51ced2b1-b930-4173-bbfa-09672e751d3f"
191 | },
192 | "outputs": [
193 | {
194 | "name": "stdout",
195 | "output_type": "stream",
196 | "text": [
197 | "CPU times: user 138 ms, sys: 142 ms, total: 280 ms\n",
198 | "Wall time: 304 ms\n"
199 | ]
200 | }
201 | ],
202 | "source": [
203 | "%%time\n",
204 | "# Load CSVs into GPU DataFrames (GDF)\n",
205 | "netflow_gdf = cudf.read_csv('nf-chunk2.csv')"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 3,
211 | "metadata": {
212 | "colab": {
213 | "base_uri": "https://localhost:8080/",
214 | "height": 52
215 | },
216 | "colab_type": "code",
217 | "id": "zCzLEFfB3N4k",
218 | "outputId": "10ff9097-2736-423e-969d-de75983fbdda"
219 | },
220 | "outputs": [
221 | {
222 | "name": "stdout",
223 | "output_type": "stream",
224 | "text": [
225 | "CPU times: user 27.5 ms, sys: 747 µs, total: 28.2 ms\n",
226 | "Wall time: 55.9 ms\n"
227 | ]
228 | },
229 | {
230 | "data": {
231 | "text/plain": [
232 | ""
233 | ]
234 | },
235 | "execution_count": 3,
236 | "metadata": {},
237 | "output_type": "execute_result"
238 | }
239 | ],
240 | "source": [
241 | "%%time\n",
242 | "# Create BlazingSQL table from GDF - There is no copy in this process\n",
243 | "bc.create_table('netflow', netflow_gdf)"
244 | ]
245 | },
246 | {
247 | "cell_type": "code",
248 | "execution_count": 4,
249 | "metadata": {
250 | "colab": {
251 | "base_uri": "https://localhost:8080/",
252 | "height": 295
253 | },
254 | "colab_type": "code",
255 | "id": "umBG2Tp0wbQx",
256 | "outputId": "0975395e-7f5b-4244-afa3-45c8658ce61c"
257 | },
258 | "outputs": [
259 | {
260 | "name": "stdout",
261 | "output_type": "stream",
262 | "text": [
263 | "CPU times: user 30.8 ms, sys: 0 ns, total: 30.8 ms\n",
264 | "Wall time: 429 ms\n"
265 | ]
266 | }
267 | ],
268 | "source": [
269 | "%%time\n",
270 | "# make a query\n",
271 | "query = '''\n",
272 | " SELECT\n",
273 | " a.firstSeenSrcIp as source,\n",
274 | " a.firstSeenDestIp as destination,\n",
275 | " count(a.firstSeenDestPort) as targetPorts,\n",
276 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n",
277 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n",
278 | " SUM(a.durationSeconds) as durationSeconds,\n",
279 | " MIN(parsedDate) as firstFlowDate,\n",
280 | " MAX(parsedDate) as lastFlowDate,\n",
281 | " COUNT(*) as attemptCount\n",
282 | " FROM\n",
283 | " netflow a\n",
284 | " GROUP BY\n",
285 | " a.firstSeenSrcIp,\n",
286 | " a.firstSeenDestIp\n",
287 | " '''\n",
288 | "\n",
289 | "# query the table\n",
290 | "gdf = bc.sql(query)"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 5,
296 | "metadata": {
297 | "colab": {},
298 | "colab_type": "code",
299 | "id": "48_W2v8q_zmq",
300 | "outputId": "db0394f1-e082-49b0-c477-e3bba8d3d0f4"
301 | },
302 | "outputs": [
303 | {
304 | "data": {
305 | "text/html": [
306 | "\n",
307 | "\n",
320 | "
\n",
321 | " \n",
322 | " \n",
323 | " | \n",
324 | " source | \n",
325 | " destination | \n",
326 | " targetPorts | \n",
327 | " bytesOut | \n",
328 | " bytesIn | \n",
329 | " durationSeconds | \n",
330 | " firstFlowDate | \n",
331 | " lastFlowDate | \n",
332 | " attemptCount | \n",
333 | "
\n",
334 | " \n",
335 | " \n",
336 | " \n",
337 | " 0 | \n",
338 | " 172.30.2.48 | \n",
339 | " 172.0.0.1 | \n",
340 | " 1 | \n",
341 | " 90 | \n",
342 | " 0 | \n",
343 | " 0 | \n",
344 | " 2013-04-03 06:36:09 | \n",
345 | " 2013-04-03 06:36:09 | \n",
346 | " 1 | \n",
347 | "
\n",
348 | " \n",
349 | " 1 | \n",
350 | " 172.10.2.81 | \n",
351 | " 239.255.255.250 | \n",
352 | " 14 | \n",
353 | " 2800 | \n",
354 | " 0 | \n",
355 | " 6 | \n",
356 | " 2013-04-03 06:36:41 | \n",
357 | " 2013-04-03 06:36:48 | \n",
358 | " 14 | \n",
359 | "
\n",
360 | " \n",
361 | " 2 | \n",
362 | " 172.30.2.58 | \n",
363 | " 172.0.0.1 | \n",
364 | " 1 | \n",
365 | " 90 | \n",
366 | " 0 | \n",
367 | " 0 | \n",
368 | " 2013-04-03 06:36:09 | \n",
369 | " 2013-04-03 06:36:09 | \n",
370 | " 1 | \n",
371 | "
\n",
372 | " \n",
373 | " 3 | \n",
374 | " 172.30.1.171 | \n",
375 | " 10.0.0.13 | \n",
376 | " 1 | \n",
377 | " 454 | \n",
378 | " 633 | \n",
379 | " 0 | \n",
380 | " 2013-04-03 06:48:02 | \n",
381 | " 2013-04-03 06:48:02 | \n",
382 | " 1 | \n",
383 | "
\n",
384 | " \n",
385 | " 4 | \n",
386 | " 172.30.1.17 | \n",
387 | " 10.0.0.7 | \n",
388 | " 1 | \n",
389 | " 453 | \n",
390 | " 632 | \n",
391 | " 0 | \n",
392 | " 2013-04-03 06:47:56 | \n",
393 | " 2013-04-03 06:47:56 | \n",
394 | " 1 | \n",
395 | "
\n",
396 | " \n",
397 | "
\n",
398 | "
"
399 | ],
400 | "text/plain": [
401 | " source destination targetPorts bytesOut bytesIn \\\n",
402 | "0 172.30.2.48 172.0.0.1 1 90 0 \n",
403 | "1 172.10.2.81 239.255.255.250 14 2800 0 \n",
404 | "2 172.30.2.58 172.0.0.1 1 90 0 \n",
405 | "3 172.30.1.171 10.0.0.13 1 454 633 \n",
406 | "4 172.30.1.17 10.0.0.7 1 453 632 \n",
407 | "\n",
408 | " durationSeconds firstFlowDate lastFlowDate attemptCount \n",
409 | "0 0 2013-04-03 06:36:09 2013-04-03 06:36:09 1 \n",
410 | "1 6 2013-04-03 06:36:41 2013-04-03 06:36:48 14 \n",
411 | "2 0 2013-04-03 06:36:09 2013-04-03 06:36:09 1 \n",
412 | "3 0 2013-04-03 06:48:02 2013-04-03 06:48:02 1 \n",
413 | "4 0 2013-04-03 06:47:56 2013-04-03 06:47:56 1 "
414 | ]
415 | },
416 | "execution_count": 5,
417 | "metadata": {},
418 | "output_type": "execute_result"
419 | }
420 | ],
421 | "source": [
422 | "# how's it look?\n",
423 | "gdf.head()"
424 | ]
425 | },
426 | {
427 | "cell_type": "markdown",
428 | "metadata": {
429 | "colab_type": "text",
430 | "id": "6PXbjW1hTxrD"
431 | },
432 | "source": [
433 | "## Apache Spark\n",
434 | "The cell below installs Apache Spark ([PySpark](https://spark.apache.org/docs/latest/api/python/index.html))."
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 4,
440 | "metadata": {
441 | "colab": {},
442 | "colab_type": "code",
443 | "id": "pnEEvVEtT8xi"
444 | },
445 | "outputs": [],
446 | "source": [
447 | "# Note: This installs Spark (version 2.4.1, as tested in Jan 2020)\n",
448 | "!pip install pyspark"
449 | ]
450 | },
451 | {
452 | "cell_type": "markdown",
453 | "metadata": {
454 | "colab_type": "text",
455 | "id": "W3-XmZkz_zmw"
456 | },
457 | "source": [
458 | "#### PyBlazing vs PySpark\n",
459 | "With everything installed we can launch a SparkSession and see how BlazingSQL stacks up."
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 6,
465 | "metadata": {
466 | "colab": {
467 | "base_uri": "https://localhost:8080/",
468 | "height": 51
469 | },
470 | "colab_type": "code",
471 | "id": "nioEt2MqT9B0",
472 | "outputId": "f75b9823-5dbd-45b1-9282-562d3d6ddaf0"
473 | },
474 | "outputs": [
475 | {
476 | "name": "stdout",
477 | "output_type": "stream",
478 | "text": [
479 | "CPU times: user 50.2 ms, sys: 12.9 ms, total: 63.1 ms\n",
480 | "Wall time: 3.88 s\n"
481 | ]
482 | }
483 | ],
484 | "source": [
485 | "%%time\n",
486 | "# I copied this cell's snippet from another Google Colab by Luca Canali here: https://colab.research.google.com/github/LucaCanali/sparkMeasure/blob/master/examples/SparkMeasure_Jupyter_Colab_Example.ipynb\n",
487 | "\n",
488 | "from pyspark.sql import SparkSession\n",
489 | "\n",
490 | "# Create Spark Session\n",
491 | "# This example uses a local cluster, you can modify master to use YARN or K8S if available \n",
492 | "# This example downloads sparkMeasure 0.13 for scala 2_11 from maven central\n",
493 | "\n",
494 | "spark = SparkSession \\\n",
495 | " .builder \\\n",
496 | " .master(\"local[*]\") \\\n",
497 | " .appName(\"PySpark Netflow Benchmark code\") \\\n",
498 | " .config(\"spark.jars.packages\",\"ch.cern.sparkmeasure:spark-measure_2.11:0.13\") \\\n",
499 | " .getOrCreate()"
500 | ]
501 | },
502 | {
503 | "cell_type": "markdown",
504 | "metadata": {
505 | "colab_type": "text",
506 | "id": "G8XSppQiUdLY"
507 | },
508 | "source": [
509 | "### Load & Query Table"
510 | ]
511 | },
512 | {
513 | "cell_type": "code",
514 | "execution_count": 5,
515 | "metadata": {
516 | "colab": {
517 | "base_uri": "https://localhost:8080/",
518 | "height": 51
519 | },
520 | "colab_type": "code",
521 | "id": "ZSLuSYSOUDtf",
522 | "outputId": "2b93169b-63c5-4c46-da14-af87645bf51b"
523 | },
524 | "outputs": [
525 | {
526 | "name": "stdout",
527 | "output_type": "stream",
528 | "text": [
529 | "CPU times: user 2.73 ms, sys: 0 ns, total: 2.73 ms\n",
530 | "Wall time: 2.91 s\n"
531 | ]
532 | }
533 | ],
534 | "source": [
535 | "%%time\n",
536 | "# load CSV into Spark\n",
537 | "netflow_df = spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('nf-chunk2.csv')"
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": 6,
543 | "metadata": {
544 | "colab": {
545 | "base_uri": "https://localhost:8080/",
546 | "height": 51
547 | },
548 | "colab_type": "code",
549 | "id": "iT3BwLn8UDwE",
550 | "outputId": "4eeff800-489f-4230-adb9-f3a1c16ede66"
551 | },
552 | "outputs": [
553 | {
554 | "name": "stdout",
555 | "output_type": "stream",
556 | "text": [
557 | "CPU times: user 1.06 ms, sys: 611 µs, total: 1.67 ms\n",
558 | "Wall time: 120 ms\n"
559 | ]
560 | }
561 | ],
562 | "source": [
563 | "%%time\n",
564 | "# create table for querying\n",
565 | "netflow_df.createOrReplaceTempView('netflow')"
566 | ]
567 | },
568 | {
569 | "cell_type": "code",
570 | "execution_count": 7,
571 | "metadata": {
572 | "colab": {
573 | "base_uri": "https://localhost:8080/",
574 | "height": 493
575 | },
576 | "colab_type": "code",
577 | "id": "9SBhahA5UD2k",
578 | "outputId": "accc1938-6470-44df-ab7f-70058c755b2b"
579 | },
580 | "outputs": [
581 | {
582 | "name": "stdout",
583 | "output_type": "stream",
584 | "text": [
585 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n",
586 | "| source| destination|targetPorts|bytesOut|bytesIn|durationSeconds| firstFlowDate| lastFlowDate|attemptCount|\n",
587 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n",
588 | "| 172.10.1.13|239.255.255.250| 15| 2975| 0| 6|2013-04-03 06:36:19|2013-04-03 06:36:27| 15|\n",
589 | "|172.10.1.232| 172.0.0.1| 1| 180| 180| 0|2013-04-03 06:36:45|2013-04-03 06:36:45| 1|\n",
590 | "|172.10.1.238|239.255.255.250| 2| 700| 0| 6|2013-04-03 06:36:44|2013-04-03 06:36:51| 2|\n",
591 | "| 172.10.1.35| 172.0.0.1| 1| 270| 0| 0|2013-04-03 06:36:21|2013-04-03 06:36:21| 1|\n",
592 | "|172.10.2.137| 172.0.0.1| 1| 90| 90| 0|2013-04-03 06:36:42|2013-04-03 06:36:42| 1|\n",
593 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n",
594 | "only showing top 5 rows\n",
595 | "\n",
596 | "CPU times: user 1.5 ms, sys: 861 µs, total: 2.36 ms\n",
597 | "Wall time: 1.14 s\n"
598 | ]
599 | }
600 | ],
601 | "source": [
602 | "%%time\n",
603 | "# make a query\n",
604 | "query = '''\n",
605 | " SELECT\n",
606 | " a.firstSeenSrcIp as source,\n",
607 | " a.firstSeenDestIp as destination,\n",
608 | " count(a.firstSeenDestPort) as targetPorts,\n",
609 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n",
610 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n",
611 | " SUM(a.durationSeconds) as durationSeconds,\n",
612 | " MIN(parsedDate) as firstFlowDate,\n",
613 | " MAX(parsedDate) as lastFlowDate,\n",
614 | " COUNT(*) as attemptCount\n",
615 | " FROM\n",
616 | " netflow a\n",
617 | " GROUP BY\n",
618 | " a.firstSeenSrcIp,\n",
619 | " a.firstSeenDestIp\n",
620 | " '''\n",
621 | "\n",
622 | "# query with Spark\n",
623 | "edges_df = spark.sql(query)\n",
624 | "\n",
625 | "# set/display results\n",
626 | "edges_df.show(5)"
627 | ]
628 | }
629 | ],
630 | "metadata": {
631 | "accelerator": "GPU",
632 | "colab": {
633 | "collapsed_sections": [],
634 | "name": "vs_pyspark_netflow.ipynb",
635 | "provenance": [],
636 | "toc_visible": true
637 | },
638 | "kernelspec": {
639 | "display_name": "Python 3",
640 | "language": "python",
641 | "name": "python3"
642 | },
643 | "language_info": {
644 | "codemirror_mode": {
645 | "name": "ipython",
646 | "version": 3
647 | },
648 | "file_extension": ".py",
649 | "mimetype": "text/x-python",
650 | "name": "python",
651 | "nbconvert_exporter": "python",
652 | "pygments_lexer": "ipython3",
653 | "version": "3.6.7"
654 | }
655 | },
656 | "nbformat": 4,
657 | "nbformat_minor": 2
658 | }
659 |
--------------------------------------------------------------------------------
/data/Music.csv:
--------------------------------------------------------------------------------
1 | ARTIST,RATING,YEAR,LOCATION,FESTIVAL_SET
2 | Arcade Fire,10,2018,Las Vegas,1
3 | Justice,10,2018,Las Vegas,1
4 | Florence and The Machine,10,2018,Las Vegas,1
5 | Odesza,10,2018,Indio,1
6 | Bon Iver,10,2017,Indio,1
7 | LA Philharmonic + Sigur Ros,10,2017,LA,0
8 | Sigur Ros,10,2014,Malmo,0
9 | Arcade Fire,10,2014,Indio,1
10 | Escort,9,2018,San Francisco,0
11 | Phoenix,9,2018,Berkeley,0
12 | Jamie XX,9,2018,Golden Gate Park,1
13 | Beyonce,10,2018,Indio,1
14 | Soulwax,9,2018,Indio,1
15 | The XX,9,2017,Las Vegas,1
16 | Justice,9,2017,Indio,1
17 | Sigur Ros,9,2017,LA,0
18 | The XX,9,2017,London,0
19 | Porter Robinson and Madeon,9,2017,London,0
20 | Garden City Movement,9,2018,Tel Aviv,0
21 | ACDC,9,2015,Indio,1
22 | Porter Robinson,9,2015,Las Vegas,1
23 | Alt-J,9,2015,Barcelona,1
24 | Arcade Fire,9,2014,LA,0
25 | Phoenix,9,2013,Indio,1
26 | Chvrches,9,2013,Copenhagen,0
27 | Red Hot Chili Peppers,9,2006,Oakland,0
28 | Jungle,8,2018,Las Vegas,1
29 | Sylvan Esso,8,2018,Las Vegas,1
30 | Lake Street Dive,8,2018,San Francisco,0
31 | Elohim,8,2018,Golden Gate Park,1
32 | Tash Sultana,8,2018,Golden Gate Park,1
33 | David Byrne,8,2018,Indio,1
34 | Eminem,8,2018,Indio,1
35 | Tank and the Bangas,8,2018,Indio,1
36 | The Blaze,8,2018,Indio,1
37 | Jungle,8,2018,San Francisco,0
38 | Chance The Rapper,8,2017,Las Vegas,1
39 | Goldroom,8,2017,Las Vegas,1
40 | Mura Masa,8,2017,Las Vegas,1
41 | ZHU,8,2017,Las Vegas,1
42 | Goldroom,8,2017,San Francisco,1
43 | Phoenix,8,2017,Mountain View,1
44 | Hans Zimmer,8,2017,Indio,1
45 | Moderat,8,2017,Indio,1
46 | The XX,8,2017,Indio,1
47 | BORNS,8,2016,Indio,1
48 | Chvrches,8,2016,Indio,1
49 | Gallant,8,2016,Indio,1
50 | Matt & Kim,8,2016,Indio,1
51 | The Lumineers,8,2016,Las Vegas,1
52 | Flume,8,2016,Las Vegas,1
53 | Griz ,8,2016,San Francisco,1
54 | James Vincent McMorrow,8,2016,London,0
55 | Mura Masa,8,2016,San Francisco,0
56 | Alt-J,8,2015,Indio,1
57 | Jamie XX,8,2015,Indio,1
58 | ODESZA,8,2015,Indio,1
59 | Porter Robinson,8,2015,Indio,1
60 | Yelle,8,2015,Indio,1
61 | Sylvan Esso,8,2015,Indio,1
62 | ODESZA,8,2015,Indio,1
63 | Imagine Dragons,8,2015,LA,0
64 | Ben Howard,8,2015,Berkeley ,0
65 | Imagine Dragons,8,2015,Las Vegas,1
66 | Elton John,8,2015,San Francisco,1
67 | Garden City Movement,8,2015,Barcelona,1
68 | Jungle,8,2015,Barcelona,1
69 | Matt and Kim,8,2015,LA,0
70 | Daughter,8,2014,Indio,1
71 | Chromeo,8,2014,Indio,1
72 | Flume,8,2014,Indio,1
73 | Phantogram,8,2014,Monterey,1
74 | Major Lazer,8,2013,Indio,1
75 | The XX,8,2013,Indio,1
76 | Yeasayer,8,2013,Indio,1
77 | The Floor is Made of Lava,8,2013,Copenhagen,0
78 | Taylor Swift,8,2012,Claremont,0
79 | Elton John,8,2010,Ontario,0
80 | First Aid Kit,7,2018,Las Vegas,1
81 | Cut Copy,7,2018,Berkeley,0
82 | Rainbow Kitten Surprise,7,2018,Golden Gate Park,1
83 | LP,7,2018,Golden Gate Park,1
84 | Chvrches,7,2018,Golden Gate Park,1
85 | Bon Iver,7,2018,Golden Gate Park,1
86 | Bleachers,7,2018,Indio,1
87 | Lola Marsh,7,2018,Tel Aviv,0
88 | The Paz Band,7,2017,Tel Aviv,0
89 | The Revivalists (Acoustic),7,2017,Las Vegas,1
90 | Lorde,7,2017,Las Vegas,1
91 | Treehouse Dubstep,7,2017,Las Vegas,1
92 | Sofi Tukker (Tukker DJ set),7,2017,Las Vegas,1
93 | Tycho,7,2017,Las Vegas,1
94 | Pretty Lights,7,2017,Las Vegas,1
95 | Tokimonsta,7,2017,Las Vegas,0
96 | San Fermin,7,2017,San Francisco,1
97 | Franz Ferdinand,7,2017,Mountain View,1
98 | Ezra Furman,7,2017,Indio,1
99 | FKJ,7,2017,Indio,1
100 | GoldLink,7,2017,Indio,1
101 | Jack Garratt,7,2017,Indio,1
102 | Oh Wonder,7,2017,Indio,1
103 | Phantogram,7,2017,Indio,1
104 | Sam Gellaitry,7,2017,Indio,1
105 | Sigur Ros,7,2017,Oakland,0
106 | LA Philharmonic,7,2017,LA,0
107 | Despacio,7,2016,Indio,1
108 | Goldroom,7,2016,Indio,1
109 | LCD soundsystem,7,2016,Indio,1
110 | Lido,7,2016,Indio,1
111 | Lord Huron,7,2016,Indio,1
112 | Major Lazer,7,2016,Indio,1
113 | Rufus du sol ,7,2016,Indio,1
114 | Spacewench,7,2016,Las Vegas,1
115 | Big Grams,7,2016,San Francisco,1
116 | Rufus Du Sol,7,2016,San Francisco,1
117 | Yellow Claw,7,2015,Indio,1
118 | St. Lucia,7,2015,Indio,1
119 | Jamie XX,7,2015,Indio,1
120 | Klingande,7,2015,Las Vegas,1
121 | Major Lazer,7,2015,Las Vegas,1
122 | Jauz,7,2015,Las Vegas,1
123 | Walk the Moon,7,2015,Las Vegas,1
124 | Madeon,7,2015,Las Vegas,1
125 | Chvrches,7,2015,Oakland,1
126 | Death Cab for Cutie,7,2015,Oakland,1
127 | X Ambassadors,7,2015,Oakland,1
128 | Porter Robinson,7,2015,San Francisco,1
129 | James Bay,7,2015,San Francisco,1
130 | Sam Smith,7,2015,San Francisco,1
131 | ACollective,7,2015,Barcelona,1
132 | Chet faker,7,2015,Barcelona,1
133 | Sylvan Esso,7,2015,Barcelona,1
134 | Chvrches,7,2014,Indio,1
135 | Krewella,7,2014,Indio,1
136 | St Lucia,7,2014,Indio,1
137 | The Naked and Famous,7,2014,Monterey,1
138 | Future Islands,7,2014,Monterey,1
139 | Tokyo Police Club,7,2014,Monterey,1
140 | Macklemore,7,2014,San Francisco,1
141 | Watsky,7,2014,San Francisco,1
142 | The Kooks,7,2014,San Francisco,1
143 | Yeah Yeah Yeahs,7,2013,Indio,1
144 | Passion Pit,7,2013,Indio,1
145 | Purity Ring,7,2013,Indio,1
146 | Red Hot Chili Peppers,7,2013,Indio,1
147 | The Postal Service,7,2013,Indio,1
148 | Vampire weekend,7,2013,Indio,1
149 | Scavenger Hunt,7,2013,LA,0
150 | Ms MR,7,2013,Copenhagen,0
151 | The Fratellis,7,2016,London,0
152 | St . Lucia,7,2015,LA,0
153 | Anderson Paak,7,2016,London,0
154 | Poolside,6,2018,Las Vegas,1
155 | St. Vincent,6,2018,Las Vegas,1
156 | Superorganism,6,2018,Las Vegas,1
157 | Sofi Tukker,6,2018,Las Vegas,1
158 | Sir Sly,6,2018,Berkeley,0
159 | Carly Rae Jepsen,6,2018,Golden Gate Park,1
160 | Alt-j,6,2018,Indio,1
161 | Nile Rogers and CHIC,6,2018,Indio,1
162 | Sudan Archives,6,2018,Indio,1
163 | Petit Biscuit,6,2018,Indio,1
164 | Elohim,6,2018,Indio,1
165 | St. Vincent,6,2018,Indio,1
166 | Nessi Gomes,6,2018,Israel,0
167 | RAC (DJ Set),6,2017,Las Vegas,1
168 | Two Door Cinema Club,6,2017,Las Vegas,1
169 | Milky Chance,6,2017,Las Vegas,1
170 | Alt-J,6,2017,San Francisco,1
171 | RAC,6,2017,San Francisco,1
172 | SOHN,6,2017,San Francisco,1
173 | RAC,6,2017,San Francisco,1
174 | Joseph,6,2017,San Francisco,1
175 | James Vincent McMorrow,6,2017,San Francisco,1
176 | Young the Giant,6,2017,San Francisco,1
177 | Lorde,6,2017,San Francisco,1
178 | Andre McMahon in the Wilderness,6,2017,Mountain View,1
179 | Joseph,6,2017,Indio,1
180 | Nao,6,2017,Indio,1
181 | Porter & Madeon,6,2017,Indio,1
182 | Two Door Cinema Club,6,2017,Indio,1
183 | Tycho,6,2017,Indio,1
184 | Of Monsters and Men,6,2017,Indio,1
185 | Flume,6,2017,Indio,1
186 | Lapsley,6,2017,Indio,1
187 | Jimmy Eat World,6,2012,Las Vegas,1
188 | Keys N Krates,6,2012,Las Vegas,1
189 | Leon Bridges,6,2012,Las Vegas,1
190 | Oh Wonder,6,2012,Las Vegas,1
191 | The Wombats,6,2012,San Francisco,1
192 | Oh Wonder,6,2012,San Francisco,1
193 | The War on Drugs,6,2012,Indio,1
194 | Andre McMahon in the Wilderness,6,2012,Indio,1
195 | Phox,6,2012,Indio,1
196 | Metric,6,2012,Las Vegas,1
197 | Bastille,6,2012,Oakland,1
198 | Halsey,6,2012,Oakland,1
199 | George Ezra,6,2012,San Francisco,1
200 | Mumford and Sons,6,2012,San Francisco,1
201 | Benjamin Booker,6,2012,San Francisco,1
202 | Mac Demarco,6,2012,Barcelona,1
203 | Bastille,6,2012,Indio,1
204 | Ellie Goulding,6,2012,Indio,1
205 | STRFKR,6,2012,Indio,1
206 | The National,6,2012,Monterey,1
207 | Blind Pilot,6,2012,Monterey,1
208 | Beck,6,2012,Monterey,1
209 | Flume,6,2012,San Francisco,1
210 | Lykke Li,6,2012,San Francisco,1
211 | Haim,6,2012,San Francisco,1
212 | Tycho,6,2012,San Francisco,1
213 | Earth Wind & Fire,6,2012,Claremont,0
214 | Of Monsters and Men,6,2012,Indio,1
215 | Japandroids,6,2012,Indio,1
216 | Lumineers,6,2012,Indio,1
217 | Chvrches,6,2012,LA,0
218 | Jack Johnson ,6,2012,Berkeley ,0
219 | Daughter,6,2012,Berkeley ,0
220 | Tom Misch,5,2018,Oakland,0
221 | Chvrches,5,2018,Las Vegas,1
222 | Two Feet,5,2018,Las Vegas,1
223 | Odesza,5,2018,Golden Gate Park,1
224 | Rezz,5,2018,Indio,1
225 | Jacob Banks,5,2017,Las Vegas,1
226 | Future Islands,5,2017,San Francisco,1
227 | Cold War Kids,5,2017,Mountain View,1
228 | Big Gigantic,5,2017,Indio,1
229 | Glass Animals,5,2017,Indio,1
230 | The Head and the Heart,5,2017,Indio,1
231 | What So Not,5,2017,Indio,1
232 | Calvin Harris,5,2017,Indio,1
233 | Halsey,5,2017,Indio,1
234 | Snails,5,2017,Indio,1
235 | The 1975,5,2017,Indio,1
236 | Mr. Carmack,5,2017,Las Vegas,1
237 | Halsey,5,2017,San Francisco,1
238 | MO,5,2017,Indio,1
239 | Tycho,5,2017,Indio,1
240 | Coasts,5,2011,Indio,1
241 | Alessia Cara,5,2011,Las Vegas,1
242 | Halsey,5,2011,Las Vegas,1
243 | Run the Jewels,5,2011,Las Vegas,1
244 | Silversun Pickups,5,2011,Oakland,1
245 | First Aid Kit,5,2011,San Francisco,1
246 | Broods,5,2011,San Francisco,1
247 | RL Grime,5,2011,San Francisco,1
248 | Belle and Sebastian,5,2011,Barcelona,1
249 | Run the Jewels,5,2011,Barcelona,1
250 | The Strokes,5,2011,Barcelona,1
251 | Haim,5,2011,Indio,1
252 | The Head and the Heart,5,2011,Indio,1
253 | MGMT,5,2011,Indio,1
254 | Empire of the Sun,5,2011,Indio,1
255 | Grouplove,5,2011,Indio,1
256 | The 1975,5,2011,Indio,1
257 | Mr Little Jeans,5,2011,Monterey,1
258 | Atmosphere,5,2011,San Francisco,1
259 | The Chainsmokers,5,2011,Claremont,0
260 | Jessie Ware,5,2011,Indio,1
261 | Van Halen,5,2011,Mountain View,
262 | Tycho,5,2011,London,0
263 | Foster The People,4,2018,Las Vegas,1
264 | Brasstracks,4,2018,Las Vegas,1
265 | Olivia O'Brien,4,2018,Golden Gate Park,1
266 | Slow Magic,4,2018,Indio,1
267 | Blink-182,4,2017,Las Vegas,1
268 | Classixx,4,2017,Las Vegas,1
269 | Local Natives,4,2017,Las Vegas,1
270 | The Japanese House,4,2017,San Francisco,1
271 | Above and Beyond,4,2017,San Francisco,1
272 | Milky Chance,4,2017,Mountain View,1
273 | Honne,4,2017,Indio,1
274 | RL Grime,4,2014,Indio,1
275 | James Bay,4,2014,Indio,1
276 | Ellie Goulding,4,2014,Indio,1
277 | Louis the child,4,2014,Indio,1
278 | Mr Carmack,4,2014,Indio,1
279 | ZHU,4,2014,Indio,1
280 | Gryffin,4,2014,Las Vegas,1
281 | ZHU,4,2014,Las Vegas,1
282 | Jauz,4,2014,San Francisco,1
283 | Chance The Rapper,4,2014,San Francisco,1
284 | George Ezra,4,2014,Indio,1
285 | Alabama Shakes,4,2014,Indio,1
286 | Kaskade,4,2014,Indio,1
287 | Madeon,4,2014,Indio,1
288 | Milky chance,4,2014,Indio,1
289 | Ryn Weaver,4,2014,Indio,1
290 | The weeknd,4,2014,Indio,1
291 | What So Not,4,2014,Indio,1
292 | Lindsey Stirling,4,2014,Las Vegas,1
293 | Glass Animals,4,2014,Las Vegas,1
294 | Odesza,4,2014,San Francisco,1
295 | Black keys,4,2014,Barcelona,1
296 | Lorde,4,2014,Indio,1
297 | Adrian Lux,4,2014,Indio,1
298 | Outkast,4,2014,Indio,1
299 | Alesso,4,2014,Indio,1
300 | Beach House,4,2014,Monterey,1
301 | Kanye West,4,2014,San Francisco,1
302 | Disclosure,4,2014,San Francisco,1
303 | Chromeo,4,2014,San Francisco,1
304 | RAC,4,2014,LA,0
305 | Passion Pit,4,2014,San Francisco,0
306 | Banners,4,2014,San Francisco,0
307 | 98 Degrees,4,2014,LA,1
308 | Broken Social Scene,3,2018,Golden Gate Park,1
309 | Portugal the Man,3,2018,Indio,1
310 | Thundercat,3,2017,San Francisco,1
311 | Arkells,3,2017,Indio,1
312 | Jack U,3,2017,Indio,1
313 | Third Eye Blind,3,2017,Las Vegas,1
314 | Years and Years,3,2017,San Francisco,1
315 | Zedd,3,2017,San Francisco,1
316 | Angus and Julia Stone,3,2017,Indio,1
317 | Clean Bandit,3,2017,Indio,1
318 | Ratatat,3,2017,Indio,1
319 | Kaskade,3,2017,Indio,1
320 | Peking Duk,3,2017,Las Vegas,1
321 | Foals ,3,2017,Oakland,1
322 | The Neighborhood,3,2017,Indio,1
323 | Tyler the Creator / Earl Sweatshirt,3,2017,Indio,1
324 | Childish Gambino,3,2017,Claremont,0
325 | Two Friends,3,2017,San Francisco,0
326 | Digitalism,3,2017,Copenhagen,0
327 | Deorro,2,2017,Las Vegas,1
328 | Bearson,2,2017,Las Vegas,1
329 | Whethan,2,2017,Mountain View,1
330 | Kungs,2,2017,Indio,1
331 | Thomas Jack,2,2017,Indio,1
332 | Vanic,2,2017,Indio,1
333 | Drake,2,2017,Indio,1
334 | Kygo,2,2017,Indio,1
335 | Health,2,2017,Barcelona,1
336 | Muse,2,2017,Indio,1
337 | Modest Mouse,2,2017,Indio,1
338 | The Knocks,2,2017,LA,0
339 | The Chainsmokers,2,2017,London,0
340 | Avicii,2,2017,Vegas,0
341 | 2 Chainz,1,2017,Las Vegas,1
342 | Nick cave and the bad seeds,1,2017,Indio,1
343 | ,,,,
344 | ,,,,
345 | ,,,,
346 | ,,,,
347 | ,,,", ",
348 |
--------------------------------------------------------------------------------
/data/cancer_data_00.csv:
--------------------------------------------------------------------------------
1 | 1,23,12,151
2 | 0,9,13,133
3 | 1,21,27,130
4 | 1,14,16,78
5 | 1,9,19,135
6 | 0,25,25,83
7 | 1,16,26,120
8 | 1,15,18,90
9 | 1,19,24,88
10 | 1,25,11,84
11 | 1,24,21,103
12 | 1,17,15,104
13 | 0,14,15,132
14 | 1,12,22,104
15 | 1,12,13,94
16 | 1,22,19,97
17 | 1,10,16,95
18 | 1,15,14,108
19 | 1,20,14,130
20 | 0,17,11,87
21 | 0,16,14,86
22 | 0,17,24,60
23 | 1,20,27,103
24 | 1,19,12,137
25 | 1,9,13,110
26 | 1,19,27,116
27 | 1,10,24,97
28 | 1,16,24,122
29 | 1,15,15,102
30 | 1,11,16,115
31 | 1,11,22,125
32 | 1,23,26,78
33 | 1,20,18,113
34 | 1,11,21,128
35 | 1,16,23,107
36 | 1,10,13,110
37 | 1,18,12,94
38 | 0,21,11,83
39 | 1,11,15,96
40 | 1,10,14,88
41 | 1,24,16,86
42 | 1,19,27,72
43 | 1,11,11,128
44 | 1,15,21,87
45 | 1,10,15,85
46 | 1,18,11,124
47 | 0,22,12,52
48 | 1,20,14,86
49 | 0,20,21,78
50 | 0,25,11,87
51 | 0,19,25,75
52 | 0,19,22,87
53 | 0,25,15,76
54 | 1,14,26,120
55 | 1,18,25,97
56 | 0,18,13,73
57 | 1,10,19,126
58 | 1,17,20,96
59 | 0,22,15,83
60 | 0,23,26,54
61 | 0,15,18,65
62 | 0,25,15,55
63 | 1,12,22,96
64 | 0,24,17,59
65 | 1,16,19,83
66 | 1,11,21,97
67 | 0,12,13,60
68 | 0,18,12,72
69 | 0,16,17,59
70 | 0,17,21,81
71 | 1,21,18,124
72 | 0,9,26,59
73 | 1,21,12,114
74 | 1,22,25,90
75 | 0,18,13,79
76 | 1,21,18,104
77 | 0,10,17,88
78 | 1,11,21,120
79 | 1,16,18,144
80 | 0,22,16,83
81 | 0,10,18,74
82 | 0,17,21,86
83 | 1,10,15,172
84 | 1,20,14,129
85 | 0,25,21,77
86 | 1,14,13,121
87 | 1,19,26,94
88 | 1,19,11,122
89 | 0,11,11,80
90 | 0,12,23,96
91 | 0,23,27,95
92 | 1,10,12,100
93 | 0,14,14,85
94 | 0,10,17,87
95 | 1,22,26,100
96 | 1,23,16,132
97 | 0,22,14,78
98 | 0,19,27,62
99 | 0,21,24,74
100 | 1,16,27,94
--------------------------------------------------------------------------------
/data/cancer_data_01.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BlazingDB/bsql-demos/ebee8a606a272f3e2ab7a38587a6092fe2018d93/data/cancer_data_01.parquet
--------------------------------------------------------------------------------
/data/cancer_data_02.csv:
--------------------------------------------------------------------------------
1 | 0.278,0.242,0.079
2 | 0.079,0.181,0.057
3 | 0.16,0.207,0.06
4 | 0.284,0.26,0.097
5 | 0.133,0.181,0.059
6 | 0.17,0.209,0.076
7 | 0.109,0.179,0.057
8 | 0.165,0.22,0.075
9 | 0.193,0.235,0.074
10 | 0.24,0.203,0.082
11 | 0.067,0.153,0.057
12 | 0.129,0.184,0.061
13 | 0.246,0.24,0.078
14 | 0.1,0.185,0.053
15 | 0.229,0.207,0.077
16 | 0.16,0.23,0.071
17 | 0.072,0.159,0.059
18 | 0.202,0.216,0.074
19 | 0.103,0.158,0.054
20 | 0.081,0.189,0.058
21 | 0.127,0.197,0.068
22 | 0.065,0.182,0.069
23 | 0.214,0.252,0.07
24 | 0.102,0.177,0.053
25 | 0.146,0.2,0.063
26 | 0.228,0.304,0.074
27 | 0.187,0.225,0.069
28 | 0.107,0.17,0.057
29 | 0.17,0.193,0.065
30 | 0.116,0.174,0.061
31 | 0.189,0.218,0.062
32 | 0.152,0.23,0.078
33 | 0.15,0.225,0.064
34 | 0.172,0.185,0.063
35 | 0.156,0.2,0.065
36 | 0.134,0.19,0.057
37 | 0.11,0.189,0.061
38 | 0.038,0.147,0.059
39 | 0.051,0.157,0.055
40 | 0.126,0.172,0.064
41 | 0.06,0.178,0.056
42 | 0.122,0.19,0.069
43 | 0.219,0.231,0.063
44 | 0.144,0.197,0.068
45 | 0.105,0.175,0.062
46 | 0.169,0.191,0.06
47 | 0.059,0.177,0.065
48 | 0.123,0.213,0.068
49 | 0.091,0.168,0.06
50 | 0.077,0.181,0.057
51 | 0.05,0.15,0.059
52 | 0.061,0.135,0.06
53 | 0.048,0.187,0.061
54 | 0.149,0.209,0.063
55 | 0.071,0.162,0.057
56 | 0.055,0.192,0.059
57 | 0.127,0.192,0.06
58 | 0.137,0.203,0.068
59 | 0.038,0.182,0.055
60 | 0.053,0.168,0.072
61 | 0.081,0.274,0.07
62 | 0.09,0.183,0.068
63 | 0.201,0.195,0.073
64 | 0.088,0.234,0.07
65 | 0.126,0.191,0.066
66 | 0.148,0.195,0.067
67 | 0.078,0.172,0.069
68 | 0.047,0.152,0.057
69 | 0.141,0.211,0.08
70 | 0.052,0.159,0.057
71 | 0.103,0.158,0.055
72 | 0.153,0.19,0.09
73 | 0.183,0.193,0.065
74 | 0.128,0.166,0.066
75 | 0.068,0.172,0.059
76 | 0.084,0.18,0.054
77 | 0.105,0.24,0.066
78 | 0.215,0.215,0.067
79 | 0.345,0.291,0.081
80 | 0.095,0.172,0.06
81 | 0.094,0.184,0.07
82 | 0.154,0.194,0.069
83 | 0.267,0.183,0.068
84 | 0.179,0.163,0.072
85 | 0.072,0.208,0.06
86 | 0.105,0.213,0.06
87 | 0.099,0.208,0.056
88 | 0.121,0.195,0.056
89 | 0.094,0.193,0.064
90 | 0.134,0.212,0.063
91 | 0.086,0.169,0.059
92 | 0.104,0.172,0.061
93 | 0.051,0.139,0.053
94 | 0.082,0.164,0.057
95 | 0.155,0.186,0.063
96 | 0.131,0.21,0.056
97 | 0.071,0.19,0.066
98 | 0.053,0.135,0.069
99 | 0.075,0.162,0.066
100 | 0.114,0.188,0.064
--------------------------------------------------------------------------------
/federated_query_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "8AdUt3HiUrc3"
8 | },
9 | "source": [
10 | "# Querying Multiple Data Formats \n",
11 | "In this notebook, we will cover: \n",
12 | "- How to create and then join BlazingSQL tables from CSV, Parquet, and GPU DataFrame (GDF) sources. \n",
13 | "\n",
14 | "## Imports"
15 | ]
16 | },
17 | {
18 | "cell_type": "code",
19 | "execution_count": 1,
20 | "metadata": {},
21 | "outputs": [],
22 | "source": [
23 | "import os\n",
24 | "import cudf\n",
25 | "from blazingsql import BlazingContext"
26 | ]
27 | },
28 | {
29 | "cell_type": "markdown",
30 | "metadata": {
31 | "colab_type": "text",
32 | "id": "aMwNKxePSwOp"
33 | },
34 | "source": [
35 | "## Import packages and create BlazingContext\n",
36 | "You can think of the BlazingContext much like a SparkContext; this is where information such as FileSystems you have registered and Tables you have created will be stored. "
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "metadata": {
43 | "colab": {
44 | "base_uri": "https://localhost:8080/",
45 | "height": 35
46 | },
47 | "colab_type": "code",
48 | "id": "azZ7l2q7odYT",
49 | "outputId": "a5302d6e-307e-45c5-a682-c786cc999a40"
50 | },
51 | "outputs": [
52 | {
53 | "name": "stdout",
54 | "output_type": "stream",
55 | "text": [
56 | "BlazingContext ready\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "# start up BlazingSQL\n",
62 | "bc = BlazingContext()"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {
68 | "colab_type": "text",
69 | "id": "N2bqpDEnZyQf"
70 | },
71 | "source": [
72 | "### Create Table from CSV\n",
73 | "Here we create a BlazingSQL table directly from a comma-separated values (CSV) file."
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 3,
79 | "metadata": {
80 | "colab": {},
81 | "colab_type": "code",
82 | "id": "HhRhj-ZvZygH"
83 | },
84 | "outputs": [
85 | {
86 | "data": {
87 | "text/plain": [
88 | ""
89 | ]
90 | },
91 | "execution_count": 3,
92 | "metadata": {},
93 | "output_type": "execute_result"
94 | }
95 | ],
96 | "source": [
97 | "# define column names and types\n",
98 | "column_names = ['diagnosis_result', 'radius', 'texture', 'perimeter']\n",
99 | "column_types = ['float32', 'float32', 'float32', 'float32']\n",
100 | "\n",
101 | "# identify local directory path \n",
102 | "cwd = os.getcwd()\n",
103 | "# add path to data\n",
104 | "data_path = cwd + '/data/cancer_data_00.csv'\n",
105 | "\n",
106 | "# create table from CSV file\n",
107 | "bc.create_table('data_00', data_path, dtype=column_types, names=column_names)"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {
113 | "colab_type": "text",
114 | "id": "HJFz-mqZTJ5Z"
115 | },
116 | "source": [
117 | "### Create Table from Parquet\n",
118 | "Here we create a BlazingSQL table directly from an Apache Parquet file."
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 4,
124 | "metadata": {
125 | "colab": {},
126 | "colab_type": "code",
127 | "id": "HJuvtJDYTMyb"
128 | },
129 | "outputs": [
130 | {
131 | "data": {
132 | "text/plain": [
133 | ""
134 | ]
135 | },
136 | "execution_count": 4,
137 | "metadata": {},
138 | "output_type": "execute_result"
139 | }
140 | ],
141 | "source": [
142 | "# create table from Parquet file\n",
143 | "bc.create_table('data_01', cwd + '/data/cancer_data_01.parquet')"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {
149 | "colab_type": "text",
150 | "id": "98HJFrt5TRa0"
151 | },
152 | "source": [
153 | "### Create Table from GPU DataFrame\n",
154 | "Here we use cuDF to create a GPU DataFrame (GDF), then use BlazingSQL to create a table from that GDF.\n",
155 | "\n",
156 | "The GDF is the standard memory representation for the RAPIDS AI ecosystem."
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 5,
162 | "metadata": {
163 | "colab": {},
164 | "colab_type": "code",
165 | "id": "14GwxmLsTV_p",
166 | "scrolled": true
167 | },
168 | "outputs": [
169 | {
170 | "data": {
171 | "text/plain": [
172 | ""
173 | ]
174 | },
175 | "execution_count": 5,
176 | "metadata": {},
177 | "output_type": "execute_result"
178 | }
179 | ],
180 | "source": [
181 | "# define column names and types\n",
182 | "column_names = ['compactness', 'symmetry', 'fractal_dimension']\n",
183 | "column_types = ['float32', 'float32', 'float32', 'float32']\n",
184 | "\n",
185 | "# make GDF with cuDF (uses relative path)\n",
186 | "gdf_02 = cudf.read_csv('data/cancer_data_02.csv', dtype=column_types, names=column_names)\n",
187 | "\n",
188 | "# create BlazingSQL table from GDF\n",
189 | "bc.create_table('data_02', gdf_02)"
190 | ]
191 | },
192 | {
193 | "cell_type": "markdown",
194 | "metadata": {
195 | "colab_type": "text",
196 | "id": "9DAZShZ2y-Nx"
197 | },
198 | "source": [
199 | "# Join Tables Together \n",
200 | "\n",
201 | "Now we can use BlazingSQL to join all three data formats in a single federated query. "
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": 6,
207 | "metadata": {
208 | "colab": {
209 | "base_uri": "https://localhost:8080/",
210 | "height": 1000
211 | },
212 | "colab_type": "code",
213 | "id": "HOYSFebvzGcX",
214 | "outputId": "ad133dfd-540e-4142-8f12-a4a70d803bb6",
215 | "scrolled": true
216 | },
217 | "outputs": [
218 | {
219 | "data": {
220 | "text/html": [
221 | "\n",
222 | "\n",
235 | "
\n",
236 | " \n",
237 | " \n",
238 | " | \n",
239 | " diagnosis_result | \n",
240 | " radius | \n",
241 | " texture | \n",
242 | " perimeter | \n",
243 | " area | \n",
244 | " smoothness | \n",
245 | " compactness | \n",
246 | " symmetry | \n",
247 | " fractal_dimension | \n",
248 | "
\n",
249 | " \n",
250 | " \n",
251 | " \n",
252 | " 0 | \n",
253 | " 1.0 | \n",
254 | " 11.0 | \n",
255 | " 21.0 | \n",
256 | " 120.0 | \n",
257 | " 1033.0 | \n",
258 | " 0.115 | \n",
259 | " 0.149000004 | \n",
260 | " 0.209000006 | \n",
261 | " 0.063000001 | \n",
262 | "
\n",
263 | " \n",
264 | " 1 | \n",
265 | " 0.0 | \n",
266 | " 17.0 | \n",
267 | " 21.0 | \n",
268 | " 86.0 | \n",
269 | " 563.0 | \n",
270 | " 0.082 | \n",
271 | " 0.059999999 | \n",
272 | " 0.178000003 | \n",
273 | " 0.056000002 | \n",
274 | "
\n",
275 | " \n",
276 | " 2 | \n",
277 | " 1.0 | \n",
278 | " 19.0 | \n",
279 | " 26.0 | \n",
280 | " 94.0 | \n",
281 | " 578.0 | \n",
282 | " 0.113 | \n",
283 | " 0.229000002 | \n",
284 | " 0.207000002 | \n",
285 | " 0.077 | \n",
286 | "
\n",
287 | " \n",
288 | " 3 | \n",
289 | " 1.0 | \n",
290 | " 19.0 | \n",
291 | " 11.0 | \n",
292 | " 122.0 | \n",
293 | " 1094.0 | \n",
294 | " 0.094 | \n",
295 | " 0.107000001 | \n",
296 | " 0.170000002 | \n",
297 | " 0.057 | \n",
298 | "
\n",
299 | " \n",
300 | " 4 | \n",
301 | " 0.0 | \n",
302 | " 10.0 | \n",
303 | " 17.0 | \n",
304 | " 87.0 | \n",
305 | " 566.0 | \n",
306 | " 0.098 | \n",
307 | " 0.081 | \n",
308 | " 0.18900001 | \n",
309 | " 0.058000002 | \n",
310 | "
\n",
311 | " \n",
312 | " 5 | \n",
313 | " 1.0 | \n",
314 | " 16.0 | \n",
315 | " 19.0 | \n",
316 | " 83.0 | \n",
317 | " 477.0 | \n",
318 | " 0.128 | \n",
319 | " 0.170000002 | \n",
320 | " 0.209000006 | \n",
321 | " 0.075999998 | \n",
322 | "
\n",
323 | " \n",
324 | " 6 | \n",
325 | " 0.0 | \n",
326 | " 22.0 | \n",
327 | " 16.0 | \n",
328 | " 83.0 | \n",
329 | " 477.0 | \n",
330 | " 0.128 | \n",
331 | " 0.170000002 | \n",
332 | " 0.209000006 | \n",
333 | " 0.075999998 | \n",
334 | "
\n",
335 | " \n",
336 | " 7 | \n",
337 | " 0.0 | \n",
338 | " 17.0 | \n",
339 | " 21.0 | \n",
340 | " 86.0 | \n",
341 | " 535.0 | \n",
342 | " 0.116 | \n",
343 | " 0.123000003 | \n",
344 | " 0.213000014 | \n",
345 | " 0.067999996 | \n",
346 | "
\n",
347 | " \n",
348 | " 8 | \n",
349 | " 0.0 | \n",
350 | " 10.0 | \n",
351 | " 17.0 | \n",
352 | " 87.0 | \n",
353 | " 545.0 | \n",
354 | " 0.104 | \n",
355 | " 0.143999994 | \n",
356 | " 0.196999997 | \n",
357 | " 0.067999996 | \n",
358 | "
\n",
359 | " \n",
360 | " 9 | \n",
361 | " 1.0 | \n",
362 | " 23.0 | \n",
363 | " 16.0 | \n",
364 | " 132.0 | \n",
365 | " 1123.0 | \n",
366 | " 0.097 | \n",
367 | " 0.246000007 | \n",
368 | " 0.24000001 | \n",
369 | " 0.078000002 | \n",
370 | "
\n",
371 | " \n",
372 | " 10 | \n",
373 | " 1.0 | \n",
374 | " 16.0 | \n",
375 | " 19.0 | \n",
376 | " 83.0 | \n",
377 | " 524.0 | \n",
378 | " 0.090 | \n",
379 | " 0.037999999 | \n",
380 | " 0.147 | \n",
381 | " 0.059 | \n",
382 | "
\n",
383 | " \n",
384 | " 11 | \n",
385 | " 1.0 | \n",
386 | " 21.0 | \n",
387 | " 18.0 | \n",
388 | " 124.0 | \n",
389 | " 1076.0 | \n",
390 | " 0.110 | \n",
391 | " 0.169 | \n",
392 | " 0.191 | \n",
393 | " 0.059999999 | \n",
394 | "
\n",
395 | " \n",
396 | " 12 | \n",
397 | " 0.0 | \n",
398 | " 22.0 | \n",
399 | " 16.0 | \n",
400 | " 83.0 | \n",
401 | " 524.0 | \n",
402 | " 0.090 | \n",
403 | " 0.037999999 | \n",
404 | " 0.147 | \n",
405 | " 0.059 | \n",
406 | "
\n",
407 | " \n",
408 | " 13 | \n",
409 | " 1.0 | \n",
410 | " 19.0 | \n",
411 | " 26.0 | \n",
412 | " 94.0 | \n",
413 | " 633.0 | \n",
414 | " 0.098 | \n",
415 | " 0.109999999 | \n",
416 | " 0.18900001 | \n",
417 | " 0.060999997 | \n",
418 | "
\n",
419 | " \n",
420 | " 14 | \n",
421 | " 1.0 | \n",
422 | " 16.0 | \n",
423 | " 19.0 | \n",
424 | " 83.0 | \n",
425 | " 527.0 | \n",
426 | " 0.081 | \n",
427 | " 0.037999999 | \n",
428 | " 0.147 | \n",
429 | " 0.059 | \n",
430 | "
\n",
431 | " \n",
432 | " 15 | \n",
433 | " 0.0 | \n",
434 | " 22.0 | \n",
435 | " 16.0 | \n",
436 | " 83.0 | \n",
437 | " 527.0 | \n",
438 | " 0.081 | \n",
439 | " 0.037999999 | \n",
440 | " 0.147 | \n",
441 | " 0.059 | \n",
442 | "
\n",
443 | " \n",
444 | " 16 | \n",
445 | " 1.0 | \n",
446 | " 10.0 | \n",
447 | " 12.0 | \n",
448 | " 100.0 | \n",
449 | " 706.0 | \n",
450 | " 0.104 | \n",
451 | " 0.155000001 | \n",
452 | " 0.186000004 | \n",
453 | " 0.063000001 | \n",
454 | "
\n",
455 | " \n",
456 | " 17 | \n",
457 | " 1.0 | \n",
458 | " 22.0 | \n",
459 | " 26.0 | \n",
460 | " 100.0 | \n",
461 | " 706.0 | \n",
462 | " 0.104 | \n",
463 | " 0.155000001 | \n",
464 | " 0.186000004 | \n",
465 | " 0.063000001 | \n",
466 | "
\n",
467 | " \n",
468 | " 18 | \n",
469 | " 0.0 | \n",
470 | " 10.0 | \n",
471 | " 17.0 | \n",
472 | " 87.0 | \n",
473 | " 561.0 | \n",
474 | " 0.088 | \n",
475 | " 0.077 | \n",
476 | " 0.181000009 | \n",
477 | " 0.057 | \n",
478 | "
\n",
479 | " \n",
480 | " 19 | \n",
481 | " 0.0 | \n",
482 | " 12.0 | \n",
483 | " 23.0 | \n",
484 | " 96.0 | \n",
485 | " 699.0 | \n",
486 | " 0.094 | \n",
487 | " 0.050999999 | \n",
488 | " 0.157000005 | \n",
489 | " 0.055 | \n",
490 | "
\n",
491 | " \n",
492 | " 20 | \n",
493 | " 0.0 | \n",
494 | " 18.0 | \n",
495 | " 12.0 | \n",
496 | " 72.0 | \n",
497 | " 371.0 | \n",
498 | " 0.123 | \n",
499 | " 0.122000001 | \n",
500 | " 0.189999998 | \n",
501 | " 0.068999998 | \n",
502 | "
\n",
503 | " \n",
504 | " 21 | \n",
505 | " 0.0 | \n",
506 | " 12.0 | \n",
507 | " 23.0 | \n",
508 | " 96.0 | \n",
509 | " 657.0 | \n",
510 | " 0.114 | \n",
511 | " 0.136999995 | \n",
512 | " 0.203000009 | \n",
513 | " 0.067999996 | \n",
514 | "
\n",
515 | " \n",
516 | " 22 | \n",
517 | " 0.0 | \n",
518 | " 12.0 | \n",
519 | " 23.0 | \n",
520 | " 96.0 | \n",
521 | " 646.0 | \n",
522 | " 0.105 | \n",
523 | " 0.201000005 | \n",
524 | " 0.194999993 | \n",
525 | " 0.072999999 | \n",
526 | "
\n",
527 | " \n",
528 | " 23 | \n",
529 | " 0.0 | \n",
530 | " 10.0 | \n",
531 | " 18.0 | \n",
532 | " 74.0 | \n",
533 | " 413.0 | \n",
534 | " 0.090 | \n",
535 | " 0.075000003 | \n",
536 | " 0.162 | \n",
537 | " 0.066 | \n",
538 | "
\n",
539 | " \n",
540 | " 24 | \n",
541 | " 1.0 | \n",
542 | " 11.0 | \n",
543 | " 21.0 | \n",
544 | " 97.0 | \n",
545 | " 713.0 | \n",
546 | " 0.091 | \n",
547 | " 0.071000002 | \n",
548 | " 0.162 | \n",
549 | " 0.057 | \n",
550 | "
\n",
551 | " \n",
552 | " 25 | \n",
553 | " 0.0 | \n",
554 | " 22.0 | \n",
555 | " 16.0 | \n",
556 | " 83.0 | \n",
557 | " 506.0 | \n",
558 | " 0.099 | \n",
559 | " null | \n",
560 | " null | \n",
561 | " null | \n",
562 | "
\n",
563 | " \n",
564 | " 26 | \n",
565 | " 0.0 | \n",
566 | " 14.0 | \n",
567 | " 14.0 | \n",
568 | " 85.0 | \n",
569 | " 532.0 | \n",
570 | " 0.097 | \n",
571 | " null | \n",
572 | " null | \n",
573 | " null | \n",
574 | "
\n",
575 | " \n",
576 | " 27 | \n",
577 | " 0.0 | \n",
578 | " 10.0 | \n",
579 | " 17.0 | \n",
580 | " 87.0 | \n",
581 | " 572.0 | \n",
582 | " 0.077 | \n",
583 | " null | \n",
584 | " null | \n",
585 | " null | \n",
586 | "
\n",
587 | " \n",
588 | " 28 | \n",
589 | " 1.0 | \n",
590 | " 16.0 | \n",
591 | " 19.0 | \n",
592 | " 83.0 | \n",
593 | " 477.0 | \n",
594 | " 0.128 | \n",
595 | " 0.170000002 | \n",
596 | " 0.193000004 | \n",
597 | " 0.064999998 | \n",
598 | "
\n",
599 | " \n",
600 | " 29 | \n",
601 | " 0.0 | \n",
602 | " 22.0 | \n",
603 | " 16.0 | \n",
604 | " 83.0 | \n",
605 | " 477.0 | \n",
606 | " 0.128 | \n",
607 | " 0.170000002 | \n",
608 | " 0.193000004 | \n",
609 | " 0.064999998 | \n",
610 | "
\n",
611 | " \n",
612 | " ... | \n",
613 | " ... | \n",
614 | " ... | \n",
615 | " ... | \n",
616 | " ... | \n",
617 | " ... | \n",
618 | " ... | \n",
619 | " ... | \n",
620 | " ... | \n",
621 | " ... | \n",
622 | "
\n",
623 | " \n",
624 | " 286 | \n",
625 | " 1.0 | \n",
626 | " 11.0 | \n",
627 | " 21.0 | \n",
628 | " 97.0 | \n",
629 | " 659.0 | \n",
630 | " 0.114 | \n",
631 | " 0.159999996 | \n",
632 | " 0.207000002 | \n",
633 | " 0.059999999 | \n",
634 | "
\n",
635 | " \n",
636 | " 287 | \n",
637 | " 0.0 | \n",
638 | " 12.0 | \n",
639 | " 13.0 | \n",
640 | " 60.0 | \n",
641 | " 274.0 | \n",
642 | " 0.102 | \n",
643 | " 0.064999998 | \n",
644 | " 0.182000011 | \n",
645 | " 0.068999998 | \n",
646 | "
\n",
647 | " \n",
648 | " 288 | \n",
649 | " 0.0 | \n",
650 | " 10.0 | \n",
651 | " 17.0 | \n",
652 | " 88.0 | \n",
653 | " 520.0 | \n",
654 | " 0.127 | \n",
655 | " 0.193000004 | \n",
656 | " 0.234999999 | \n",
657 | " 0.074000001 | \n",
658 | "
\n",
659 | " \n",
660 | " 289 | \n",
661 | " 0.0 | \n",
662 | " 17.0 | \n",
663 | " 21.0 | \n",
664 | " 86.0 | \n",
665 | " 520.0 | \n",
666 | " 0.108 | \n",
667 | " 0.127000004 | \n",
668 | " 0.196999997 | \n",
669 | " 0.067999996 | \n",
670 | "
\n",
671 | " \n",
672 | " 290 | \n",
673 | " 1.0 | \n",
674 | " 19.0 | \n",
675 | " 26.0 | \n",
676 | " 94.0 | \n",
677 | " 643.0 | \n",
678 | " 0.098 | \n",
679 | " 0.114 | \n",
680 | " 0.188000008 | \n",
681 | " 0.063999996 | \n",
682 | "
\n",
683 | " \n",
684 | " 291 | \n",
685 | " 0.0 | \n",
686 | " 23.0 | \n",
687 | " 27.0 | \n",
688 | " 95.0 | \n",
689 | " 685.0 | \n",
690 | " 0.099 | \n",
691 | " 0.071999997 | \n",
692 | " 0.159000009 | \n",
693 | " 0.059 | \n",
694 | "
\n",
695 | " \n",
696 | " 292 | \n",
697 | " 1.0 | \n",
698 | " 11.0 | \n",
699 | " 21.0 | \n",
700 | " 97.0 | \n",
701 | " 645.0 | \n",
702 | " 0.105 | \n",
703 | " 0.187000006 | \n",
704 | " 0.224999994 | \n",
705 | " 0.068999998 | \n",
706 | "
\n",
707 | " \n",
708 | " 293 | \n",
709 | " 0.0 | \n",
710 | " 16.0 | \n",
711 | " 17.0 | \n",
712 | " 59.0 | \n",
713 | " 261.0 | \n",
714 | " 0.077 | \n",
715 | " 0.088 | \n",
716 | " 0.233999997 | \n",
717 | " 0.07 | \n",
718 | "
\n",
719 | " \n",
720 | " 294 | \n",
721 | " 0.0 | \n",
722 | " 9.0 | \n",
723 | " 26.0 | \n",
724 | " 59.0 | \n",
725 | " 261.0 | \n",
726 | " 0.077 | \n",
727 | " 0.088 | \n",
728 | " 0.233999997 | \n",
729 | " 0.07 | \n",
730 | "
\n",
731 | " \n",
732 | " 295 | \n",
733 | " 1.0 | \n",
734 | " 21.0 | \n",
735 | " 18.0 | \n",
736 | " 104.0 | \n",
737 | " 783.0 | \n",
738 | " 0.084 | \n",
739 | " 0.100000001 | \n",
740 | " 0.185000002 | \n",
741 | " 0.052999999 | \n",
742 | "
\n",
743 | " \n",
744 | " 296 | \n",
745 | " 0.0 | \n",
746 | " 10.0 | \n",
747 | " 17.0 | \n",
748 | " 88.0 | \n",
749 | " 559.0 | \n",
750 | " 0.102 | \n",
751 | " 0.126000002 | \n",
752 | " 0.172000006 | \n",
753 | " 0.063999996 | \n",
754 | "
\n",
755 | " \n",
756 | " 297 | \n",
757 | " 1.0 | \n",
758 | " 14.0 | \n",
759 | " 13.0 | \n",
760 | " 121.0 | \n",
761 | " 1075.0 | \n",
762 | " 0.099 | \n",
763 | " null | \n",
764 | " null | \n",
765 | " null | \n",
766 | "
\n",
767 | " \n",
768 | " 298 | \n",
769 | " 1.0 | \n",
770 | " 19.0 | \n",
771 | " 26.0 | \n",
772 | " 94.0 | \n",
773 | " 648.0 | \n",
774 | " 0.094 | \n",
775 | " null | \n",
776 | " null | \n",
777 | " null | \n",
778 | "
\n",
779 | " \n",
780 | " 299 | \n",
781 | " 1.0 | \n",
782 | " 19.0 | \n",
783 | " 11.0 | \n",
784 | " 122.0 | \n",
785 | " 1076.0 | \n",
786 | " 0.090 | \n",
787 | " null | \n",
788 | " null | \n",
789 | " null | \n",
790 | "
\n",
791 | " \n",
792 | " 300 | \n",
793 | " 0.0 | \n",
794 | " 11.0 | \n",
795 | " 11.0 | \n",
796 | " 80.0 | \n",
797 | " 466.0 | \n",
798 | " 0.088 | \n",
799 | " null | \n",
800 | " null | \n",
801 | " null | \n",
802 | "
\n",
803 | " \n",
804 | " 301 | \n",
805 | " 0.0 | \n",
806 | " 12.0 | \n",
807 | " 23.0 | \n",
808 | " 96.0 | \n",
809 | " 652.0 | \n",
810 | " 0.113 | \n",
811 | " null | \n",
812 | " null | \n",
813 | " null | \n",
814 | "
\n",
815 | " \n",
816 | " 302 | \n",
817 | " 0.0 | \n",
818 | " 23.0 | \n",
819 | " 27.0 | \n",
820 | " 95.0 | \n",
821 | " 663.0 | \n",
822 | " 0.090 | \n",
823 | " null | \n",
824 | " null | \n",
825 | " null | \n",
826 | "
\n",
827 | " \n",
828 | " 303 | \n",
829 | " 0.0 | \n",
830 | " 10.0 | \n",
831 | " 17.0 | \n",
832 | " 87.0 | \n",
833 | " 555.0 | \n",
834 | " 0.102 | \n",
835 | " null | \n",
836 | " null | \n",
837 | " null | \n",
838 | "
\n",
839 | " \n",
840 | " 304 | \n",
841 | " 0.0 | \n",
842 | " 16.0 | \n",
843 | " 17.0 | \n",
844 | " 59.0 | \n",
845 | " 244.0 | \n",
846 | " 0.098 | \n",
847 | " null | \n",
848 | " null | \n",
849 | " null | \n",
850 | "
\n",
851 | " \n",
852 | " 305 | \n",
853 | " 0.0 | \n",
854 | " 9.0 | \n",
855 | " 26.0 | \n",
856 | " 59.0 | \n",
857 | " 244.0 | \n",
858 | " 0.098 | \n",
859 | " null | \n",
860 | " null | \n",
861 | " null | \n",
862 | "
\n",
863 | " \n",
864 | " 306 | \n",
865 | " 1.0 | \n",
866 | " 21.0 | \n",
867 | " 18.0 | \n",
868 | " 104.0 | \n",
869 | " 781.0 | \n",
870 | " 0.097 | \n",
871 | " null | \n",
872 | " null | \n",
873 | " null | \n",
874 | "
\n",
875 | " \n",
876 | " 307 | \n",
877 | " 1.0 | \n",
878 | " 11.0 | \n",
879 | " 21.0 | \n",
880 | " 120.0 | \n",
881 | " 1040.0 | \n",
882 | " 0.095 | \n",
883 | " null | \n",
884 | " null | \n",
885 | " null | \n",
886 | "
\n",
887 | " \n",
888 | " 308 | \n",
889 | " 1.0 | \n",
890 | " 16.0 | \n",
891 | " 19.0 | \n",
892 | " 83.0 | \n",
893 | " 506.0 | \n",
894 | " 0.099 | \n",
895 | " null | \n",
896 | " null | \n",
897 | " null | \n",
898 | "
\n",
899 | " \n",
900 | " 309 | \n",
901 | " 1.0 | \n",
902 | " 22.0 | \n",
903 | " 25.0 | \n",
904 | " 90.0 | \n",
905 | " 578.0 | \n",
906 | " 0.119 | \n",
907 | " null | \n",
908 | " null | \n",
909 | " null | \n",
910 | "
\n",
911 | " \n",
912 | " 310 | \n",
913 | " 1.0 | \n",
914 | " 11.0 | \n",
915 | " 21.0 | \n",
916 | " 97.0 | \n",
917 | " 659.0 | \n",
918 | " 0.114 | \n",
919 | " 0.159999996 | \n",
920 | " 0.230000004 | \n",
921 | " 0.071000002 | \n",
922 | "
\n",
923 | " \n",
924 | " 311 | \n",
925 | " 0.0 | \n",
926 | " 14.0 | \n",
927 | " 14.0 | \n",
928 | " 85.0 | \n",
929 | " 552.0 | \n",
930 | " 0.074 | \n",
931 | " 0.050999999 | \n",
932 | " 0.138999999 | \n",
933 | " 0.052999999 | \n",
934 | "
\n",
935 | " \n",
936 | " 312 | \n",
937 | " 0.0 | \n",
938 | " 25.0 | \n",
939 | " 21.0 | \n",
940 | " 77.0 | \n",
941 | " 443.0 | \n",
942 | " 0.097 | \n",
943 | " 0.071999997 | \n",
944 | " 0.208000004 | \n",
945 | " 0.059999999 | \n",
946 | "
\n",
947 | " \n",
948 | " 313 | \n",
949 | " 0.0 | \n",
950 | " 17.0 | \n",
951 | " 21.0 | \n",
952 | " 86.0 | \n",
953 | " 520.0 | \n",
954 | " 0.108 | \n",
955 | " 0.127000004 | \n",
956 | " 0.192000002 | \n",
957 | " 0.059999999 | \n",
958 | "
\n",
959 | " \n",
960 | " 314 | \n",
961 | " 0.0 | \n",
962 | " 23.0 | \n",
963 | " 27.0 | \n",
964 | " 95.0 | \n",
965 | " 685.0 | \n",
966 | " 0.099 | \n",
967 | " 0.071999997 | \n",
968 | " 0.208000004 | \n",
969 | " 0.059999999 | \n",
970 | "
\n",
971 | " \n",
972 | " 315 | \n",
973 | " 0.0 | \n",
974 | " 10.0 | \n",
975 | " 17.0 | \n",
976 | " 88.0 | \n",
977 | " 559.0 | \n",
978 | " 0.102 | \n",
979 | " 0.126000002 | \n",
980 | " 0.191 | \n",
981 | " 0.066 | \n",
982 | "
\n",
983 | " \n",
984 | "
\n",
985 | "
316 rows × 9 columns
\n",
986 | "
"
987 | ],
988 | "text/plain": [
989 | " diagnosis_result radius texture perimeter area smoothness \\\n",
990 | "0 1.0 11.0 21.0 120.0 1033.0 0.115 \n",
991 | "1 0.0 17.0 21.0 86.0 563.0 0.082 \n",
992 | "2 1.0 19.0 26.0 94.0 578.0 0.113 \n",
993 | "3 1.0 19.0 11.0 122.0 1094.0 0.094 \n",
994 | "4 0.0 10.0 17.0 87.0 566.0 0.098 \n",
995 | "5 1.0 16.0 19.0 83.0 477.0 0.128 \n",
996 | "6 0.0 22.0 16.0 83.0 477.0 0.128 \n",
997 | "7 0.0 17.0 21.0 86.0 535.0 0.116 \n",
998 | "8 0.0 10.0 17.0 87.0 545.0 0.104 \n",
999 | "9 1.0 23.0 16.0 132.0 1123.0 0.097 \n",
1000 | "10 1.0 16.0 19.0 83.0 524.0 0.090 \n",
1001 | "11 1.0 21.0 18.0 124.0 1076.0 0.110 \n",
1002 | "12 0.0 22.0 16.0 83.0 524.0 0.090 \n",
1003 | "13 1.0 19.0 26.0 94.0 633.0 0.098 \n",
1004 | "14 1.0 16.0 19.0 83.0 527.0 0.081 \n",
1005 | "15 0.0 22.0 16.0 83.0 527.0 0.081 \n",
1006 | "16 1.0 10.0 12.0 100.0 706.0 0.104 \n",
1007 | "17 1.0 22.0 26.0 100.0 706.0 0.104 \n",
1008 | "18 0.0 10.0 17.0 87.0 561.0 0.088 \n",
1009 | "19 0.0 12.0 23.0 96.0 699.0 0.094 \n",
1010 | "20 0.0 18.0 12.0 72.0 371.0 0.123 \n",
1011 | "21 0.0 12.0 23.0 96.0 657.0 0.114 \n",
1012 | "22 0.0 12.0 23.0 96.0 646.0 0.105 \n",
1013 | "23 0.0 10.0 18.0 74.0 413.0 0.090 \n",
1014 | "24 1.0 11.0 21.0 97.0 713.0 0.091 \n",
1015 | "25 0.0 22.0 16.0 83.0 506.0 0.099 \n",
1016 | "26 0.0 14.0 14.0 85.0 532.0 0.097 \n",
1017 | "27 0.0 10.0 17.0 87.0 572.0 0.077 \n",
1018 | "28 1.0 16.0 19.0 83.0 477.0 0.128 \n",
1019 | "29 0.0 22.0 16.0 83.0 477.0 0.128 \n",
1020 | ".. ... ... ... ... ... ... \n",
1021 | "286 1.0 11.0 21.0 97.0 659.0 0.114 \n",
1022 | "287 0.0 12.0 13.0 60.0 274.0 0.102 \n",
1023 | "288 0.0 10.0 17.0 88.0 520.0 0.127 \n",
1024 | "289 0.0 17.0 21.0 86.0 520.0 0.108 \n",
1025 | "290 1.0 19.0 26.0 94.0 643.0 0.098 \n",
1026 | "291 0.0 23.0 27.0 95.0 685.0 0.099 \n",
1027 | "292 1.0 11.0 21.0 97.0 645.0 0.105 \n",
1028 | "293 0.0 16.0 17.0 59.0 261.0 0.077 \n",
1029 | "294 0.0 9.0 26.0 59.0 261.0 0.077 \n",
1030 | "295 1.0 21.0 18.0 104.0 783.0 0.084 \n",
1031 | "296 0.0 10.0 17.0 88.0 559.0 0.102 \n",
1032 | "297 1.0 14.0 13.0 121.0 1075.0 0.099 \n",
1033 | "298 1.0 19.0 26.0 94.0 648.0 0.094 \n",
1034 | "299 1.0 19.0 11.0 122.0 1076.0 0.090 \n",
1035 | "300 0.0 11.0 11.0 80.0 466.0 0.088 \n",
1036 | "301 0.0 12.0 23.0 96.0 652.0 0.113 \n",
1037 | "302 0.0 23.0 27.0 95.0 663.0 0.090 \n",
1038 | "303 0.0 10.0 17.0 87.0 555.0 0.102 \n",
1039 | "304 0.0 16.0 17.0 59.0 244.0 0.098 \n",
1040 | "305 0.0 9.0 26.0 59.0 244.0 0.098 \n",
1041 | "306 1.0 21.0 18.0 104.0 781.0 0.097 \n",
1042 | "307 1.0 11.0 21.0 120.0 1040.0 0.095 \n",
1043 | "308 1.0 16.0 19.0 83.0 506.0 0.099 \n",
1044 | "309 1.0 22.0 25.0 90.0 578.0 0.119 \n",
1045 | "310 1.0 11.0 21.0 97.0 659.0 0.114 \n",
1046 | "311 0.0 14.0 14.0 85.0 552.0 0.074 \n",
1047 | "312 0.0 25.0 21.0 77.0 443.0 0.097 \n",
1048 | "313 0.0 17.0 21.0 86.0 520.0 0.108 \n",
1049 | "314 0.0 23.0 27.0 95.0 685.0 0.099 \n",
1050 | "315 0.0 10.0 17.0 88.0 559.0 0.102 \n",
1051 | "\n",
1052 | " compactness symmetry fractal_dimension \n",
1053 | "0 0.149000004 0.209000006 0.063000001 \n",
1054 | "1 0.059999999 0.178000003 0.056000002 \n",
1055 | "2 0.229000002 0.207000002 0.077 \n",
1056 | "3 0.107000001 0.170000002 0.057 \n",
1057 | "4 0.081 0.18900001 0.058000002 \n",
1058 | "5 0.170000002 0.209000006 0.075999998 \n",
1059 | "6 0.170000002 0.209000006 0.075999998 \n",
1060 | "7 0.123000003 0.213000014 0.067999996 \n",
1061 | "8 0.143999994 0.196999997 0.067999996 \n",
1062 | "9 0.246000007 0.24000001 0.078000002 \n",
1063 | "10 0.037999999 0.147 0.059 \n",
1064 | "11 0.169 0.191 0.059999999 \n",
1065 | "12 0.037999999 0.147 0.059 \n",
1066 | "13 0.109999999 0.18900001 0.060999997 \n",
1067 | "14 0.037999999 0.147 0.059 \n",
1068 | "15 0.037999999 0.147 0.059 \n",
1069 | "16 0.155000001 0.186000004 0.063000001 \n",
1070 | "17 0.155000001 0.186000004 0.063000001 \n",
1071 | "18 0.077 0.181000009 0.057 \n",
1072 | "19 0.050999999 0.157000005 0.055 \n",
1073 | "20 0.122000001 0.189999998 0.068999998 \n",
1074 | "21 0.136999995 0.203000009 0.067999996 \n",
1075 | "22 0.201000005 0.194999993 0.072999999 \n",
1076 | "23 0.075000003 0.162 0.066 \n",
1077 | "24 0.071000002 0.162 0.057 \n",
1078 | "25 null null null \n",
1079 | "26 null null null \n",
1080 | "27 null null null \n",
1081 | "28 0.170000002 0.193000004 0.064999998 \n",
1082 | "29 0.170000002 0.193000004 0.064999998 \n",
1083 | ".. ... ... ... \n",
1084 | "286 0.159999996 0.207000002 0.059999999 \n",
1085 | "287 0.064999998 0.182000011 0.068999998 \n",
1086 | "288 0.193000004 0.234999999 0.074000001 \n",
1087 | "289 0.127000004 0.196999997 0.067999996 \n",
1088 | "290 0.114 0.188000008 0.063999996 \n",
1089 | "291 0.071999997 0.159000009 0.059 \n",
1090 | "292 0.187000006 0.224999994 0.068999998 \n",
1091 | "293 0.088 0.233999997 0.07 \n",
1092 | "294 0.088 0.233999997 0.07 \n",
1093 | "295 0.100000001 0.185000002 0.052999999 \n",
1094 | "296 0.126000002 0.172000006 0.063999996 \n",
1095 | "297 null null null \n",
1096 | "298 null null null \n",
1097 | "299 null null null \n",
1098 | "300 null null null \n",
1099 | "301 null null null \n",
1100 | "302 null null null \n",
1101 | "303 null null null \n",
1102 | "304 null null null \n",
1103 | "305 null null null \n",
1104 | "306 null null null \n",
1105 | "307 null null null \n",
1106 | "308 null null null \n",
1107 | "309 null null null \n",
1108 | "310 0.159999996 0.230000004 0.071000002 \n",
1109 | "311 0.050999999 0.138999999 0.052999999 \n",
1110 | "312 0.071999997 0.208000004 0.059999999 \n",
1111 | "313 0.127000004 0.192000002 0.059999999 \n",
1112 | "314 0.071999997 0.208000004 0.059999999 \n",
1113 | "315 0.126000002 0.191 0.066 \n",
1114 | "\n",
1115 | "[316 rows x 9 columns]"
1116 | ]
1117 | },
1118 | "execution_count": 6,
1119 | "metadata": {},
1120 | "output_type": "execute_result"
1121 | }
1122 | ],
1123 | "source": [
1124 | "# grab everything from 00 & 02, area & smoothness from 01\n",
1125 | "query = '''\n",
1126 | " SELECT \n",
1127 | " a.*, \n",
1128 | " b.area, b.smoothness, \n",
1129 | " c.* \n",
1130 | " FROM \n",
1131 | " data_00 AS a\n",
1132 | " LEFT JOIN \n",
1133 | " data_01 AS b\n",
1134 | " ON (a.perimeter = b.perimeter)\n",
1135 | " LEFT JOIN \n",
1136 | " data_02 AS c\n",
1137 | " ON (b.compactness = c.compactness)\n",
1138 | " '''\n",
1139 | "\n",
1140 | "# join the tables together (type(gdf)==cudf.core.dataframe.DataFrame)\n",
1141 | "gdf = bc.sql(query)\n",
1142 | "\n",
1143 | "# display result\n",
1144 | "gdf"
1145 | ]
1146 | },
1147 | {
1148 | "cell_type": "markdown",
1149 | "metadata": {
1150 | "colab_type": "text",
1151 | "id": "wygAeTIFTm2X"
1152 | },
1153 | "source": [
1154 | "# You're Ready to Rock\n",
1155 | "And... that's it! You are now live with BlazingSQL.\n",
1156 | "\n",
1157 | "Check out our [docs](https://docs.blazingdb.com) to get fancy or to learn more about how BlazingSQL works with the rest of [RAPIDS AI](https://rapids.ai/)."
1158 | ]
1159 | }
1160 | ],
1161 | "metadata": {
1162 | "accelerator": "GPU",
1163 | "colab": {
1164 | "collapsed_sections": [
1165 | "McVBO7GHRDzz"
1166 | ],
1167 | "name": "BlazingSQL_Federated_Query_Demo.ipynb",
1168 | "provenance": [],
1169 | "toc_visible": true
1170 | },
1171 | "kernelspec": {
1172 | "display_name": "Python 3",
1173 | "language": "python",
1174 | "name": "python3"
1175 | },
1176 | "language_info": {
1177 | "codemirror_mode": {
1178 | "name": "ipython",
1179 | "version": 3
1180 | },
1181 | "file_extension": ".py",
1182 | "mimetype": "text/x-python",
1183 | "name": "python",
1184 | "nbconvert_exporter": "python",
1185 | "pygments_lexer": "ipython3",
1186 | "version": "3.7.3"
1187 | }
1188 | },
1189 | "nbformat": 4,
1190 | "nbformat_minor": 4
1191 | }
1192 |
--------------------------------------------------------------------------------
/graphistry_netflow_demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "kJyD4oSbugE0"
8 | },
9 | "source": [
10 | "# Graphistry Netflow Demo\n",
11 | "\n",
12 | "In this example we are taking millions of rows of netflow (network traffic flow) data in order to search for anomalous activity within a network. We will query 70M+ rows of network security data (netflow) with BlazingSQL and pass it to Graphistry for visualization."
13 | ]
14 | },
15 | {
16 | "cell_type": "markdown",
17 | "metadata": {},
18 | "source": [
19 | "## Blazing Context\n",
20 | "Here we are importing cuDF and BlazingContext. You can think of the BlazingContext much like a Spark Context (i.e. where information such as FileSystems you have registered and Tables you have created will be stored). If you have issues running this cell, restart runtime and try running it again."
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 12,
26 | "metadata": {},
27 | "outputs": [
28 | {
29 | "name": "stdout",
30 | "output_type": "stream",
31 | "text": [
32 | "Already connected to the Orchestrator\n",
33 | "BlazingContext ready\n"
34 | ]
35 | }
36 | ],
37 | "source": [
38 | "from blazingsql import BlazingContext \n",
39 | "\n",
40 | "bc = BlazingContext()"
41 | ]
42 | },
43 | {
44 | "cell_type": "markdown",
45 | "metadata": {
46 | "colab_type": "text",
47 | "id": "yp7z8bfivbna"
48 | },
49 | "source": [
50 | "### Create & Query Tables\n",
51 | "In this next cell we identify the full path to the data."
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 13,
57 | "metadata": {},
58 | "outputs": [
59 | {
60 | "data": {
61 | "text/plain": [
62 | "'/home/winston/bsql-demos/data/*_0.parquet'"
63 | ]
64 | },
65 | "execution_count": 13,
66 | "metadata": {},
67 | "output_type": "execute_result"
68 | }
69 | ],
70 | "source": [
71 | "# identify working directory path\n",
72 | "local_path = !pwd\n",
73 | "\n",
74 | "# make wildcard path to load all 4 parquet files into blazingsql\n",
75 | "path = str(local_path) + '/data/*_0.parquet'\n",
76 | "\n",
77 | "# what's the path? \n",
78 | "path"
79 | ]
80 | },
81 | {
82 | "cell_type": "markdown",
83 | "metadata": {},
84 | "source": [
85 | "#### Create\n",
86 | "Here we use the path identified above to load all 4 parquet files into a single BlazingSQL table. This is done by using a wildcard (*) in the file path. \n",
87 | "\n",
88 | "Note: point path to `data/small-chunk2.csv` for pre-downloaded data."
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 31,
94 | "metadata": {
95 | "colab": {},
96 | "colab_type": "code",
97 | "id": "lU-2wlwQntnq"
98 | },
99 | "outputs": [
100 | {
101 | "name": "stdout",
102 | "output_type": "stream",
103 | "text": [
104 | "CPU times: user 4.16 ms, sys: 4.18 ms, total: 8.35 ms\n",
105 | "Wall time: 298 ms\n"
106 | ]
107 | },
108 | {
109 | "data": {
110 | "text/plain": [
111 | ""
112 | ]
113 | },
114 | "execution_count": 31,
115 | "metadata": {},
116 | "output_type": "execute_result"
117 | }
118 | ],
119 | "source": [
120 | "%%time\n",
121 | "# blazingsql table from gpu dataframe\n",
122 | "bc.create_table('netflow', path)"
123 | ]
124 | },
125 | {
126 | "cell_type": "markdown",
127 | "metadata": {
128 | "colab_type": "text",
129 | "id": "cgivbut9df-R"
130 | },
131 | "source": [
132 | "#### Query\n",
133 | "With the table made, we can simply run a SQL query.\n",
134 | "\n",
135 | "We are going to run some joins and aggregations in order to condense these millions of rows into thousands of rows that represent nodes and edges."
136 | ]
137 | },
138 | {
139 | "cell_type": "code",
140 | "execution_count": 32,
141 | "metadata": {
142 | "colab": {
143 | "base_uri": "https://localhost:8080/",
144 | "height": 277
145 | },
146 | "colab_type": "code",
147 | "id": "umBG2Tp0wbQx",
148 | "outputId": "b89e3666-f85a-40e9-e7c4-cda9a80b7fe5"
149 | },
150 | "outputs": [
151 | {
152 | "name": "stdout",
153 | "output_type": "stream",
154 | "text": [
155 | "CPU times: user 29.3 ms, sys: 41.9 ms, total: 71.3 ms\n",
156 | "Wall time: 4.51 s\n"
157 | ]
158 | }
159 | ],
160 | "source": [
161 | "%%time\n",
162 | "# what are we looking for \n",
163 | "query = '''\n",
164 | " SELECT\n",
165 | " a.firstSeenSrcIp as source,\n",
166 | " a.firstSeenDestIp as destination,\n",
167 | " count(a.firstSeenDestPort) as targetPorts,\n",
168 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n",
169 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n",
170 | " SUM(a.durationSeconds) as durationSeconds,\n",
171 | " MIN(parsedDate) as firstFlowDate,\n",
172 | " MAX(parsedDate) as lastFlowDate,\n",
173 | " COUNT(*) as attemptCount\n",
174 | " FROM\n",
175 | " netflow a\n",
176 | " GROUP BY\n",
177 | " a.firstSeenSrcIp,\n",
178 | " a.firstSeenDestIp\n",
179 | " '''\n",
180 | "\n",
181 | "# run sql query (returns cuDF DataFrame)\n",
182 | "gdf = bc.sql(query)\n",
183 | "\n",
184 | "# how do the results look?\n",
185 | "gdf.head(25)"
186 | ]
187 | }
188 | ],
189 | "metadata": {
190 | "file_extension": ".py",
191 | "kernelspec": {
192 | "display_name": "Python 3",
193 | "language": "python",
194 | "name": "python3"
195 | },
196 | "language_info": {
197 | "codemirror_mode": {
198 | "name": "ipython",
199 | "version": 3
200 | },
201 | "file_extension": ".py",
202 | "mimetype": "text/x-python",
203 | "name": "python",
204 | "nbconvert_exporter": "python",
205 | "pygments_lexer": "ipython3",
206 | "version": "3.7.3"
207 | },
208 | "mimetype": "text/x-python",
209 | "name": "python",
210 | "nbconvert_exporter": "python",
211 | "pygments_lexer": "ipython3",
212 | "version": 3
213 | },
214 | "nbformat": 4,
215 | "nbformat_minor": 4
216 | }
217 |
--------------------------------------------------------------------------------
/imgs/bsql_main.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BlazingDB/bsql-demos/ebee8a606a272f3e2ab7a38587a6092fe2018d93/imgs/bsql_main.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | blazingsql>=0.11
2 | cudf>=0.11
3 | cuml>=0.11
--------------------------------------------------------------------------------
/sample_use_cases/csv_to_parquet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# CSV to Parquet"
8 | ]
9 | },
10 | {
11 | "cell_type": "markdown",
12 | "metadata": {},
13 | "source": [
14 | "In this demo we'll walk through querying a CSV file from an AWS S3 bucket and saving the results locally as a Parquet file."
15 | ]
16 | },
17 | {
18 | "cell_type": "markdown",
19 | "metadata": {},
20 | "source": [
21 | "Identify the Dask Client (`client`) of your local GPUs, and pass it to BlazingContext (`bc`) upon initialization to activate distributed query execution with BlazingSQL."
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 1,
27 | "metadata": {},
28 | "outputs": [
29 | {
30 | "name": "stdout",
31 | "output_type": "stream",
32 | "text": [
33 | "BlazingContext ready\n"
34 | ]
35 | }
36 | ],
37 | "source": [
38 | "from dask_cuda import LocalCUDACluster\n",
39 | "cluster = LocalCUDACluster()\n",
40 | "\n",
41 | "from dask.distributed import Client\n",
42 | "client = Client(cluster)\n",
43 | "\n",
44 | "from blazingsql import BlazingContext\n",
45 | "bc = BlazingContext(dask_client=client, network_interface='lo')"
46 | ]
47 | },
48 | {
49 | "cell_type": "markdown",
50 | "metadata": {},
51 | "source": [
52 | "Register a public AWS S3 bucket and create a table (`taxi`) from it."
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 2,
58 | "metadata": {},
59 | "outputs": [
60 | {
61 | "data": {
62 | "text/plain": [
63 | ""
64 | ]
65 | },
66 | "execution_count": 2,
67 | "metadata": {},
68 | "output_type": "execute_result"
69 | }
70 | ],
71 | "source": [
72 | "bc.s3('blazingsql-colab', bucket_name='blazingsql-colab')\n",
73 | "\n",
74 | "col_names = ['key', 'fare', 'pickup_x', 'pickup_y', 'dropoff_x', 'dropoff_y', 'passenger_count']\n",
75 | "bc.create_table('taxi', 's3://blazingsql-colab/taxi_data/taxi_00.csv', names=col_names)"
76 | ]
77 | },
78 | {
79 | "cell_type": "markdown",
80 | "metadata": {},
81 | "source": [
82 | "Tag the file path to the local directory where results will be saved as `data_dir`."
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 3,
88 | "metadata": {},
89 | "outputs": [],
90 | "source": [
91 | "from os import getcwd\n",
92 | "data_dir = getcwd().replace('/sample_use_cases', '/data')"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "\n",
100 | "\n",
101 | "As BlazingSQL returns a distributed query's results as a dask_cudf.DataFrame, we can write those results directly with [.to_parquet()](https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.to_parquet)."
102 | ]
103 | },
104 | {
105 | "cell_type": "code",
106 | "execution_count": 4,
107 | "metadata": {},
108 | "outputs": [],
109 | "source": [
110 | "bc.sql('SELECT * FROM taxi').to_parquet(f'{data_dir}/yellow_cab')"
111 | ]
112 | },
113 | {
114 | "cell_type": "markdown",
115 | "metadata": {},
116 | "source": [
117 | "Create a table from that newly written file, and run a simple query to see how it looks by `.compute()`ing to a cudf.DataFrame for display."
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 5,
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "data": {
127 | "text/html": [
128 | "\n",
129 | "\n",
142 | "
\n",
143 | " \n",
144 | " \n",
145 | " | \n",
146 | " key | \n",
147 | " fare | \n",
148 | " pickup_x | \n",
149 | " pickup_y | \n",
150 | " dropoff_x | \n",
151 | " dropoff_y | \n",
152 | " passenger_count | \n",
153 | " index | \n",
154 | "
\n",
155 | " \n",
156 | " \n",
157 | " \n",
158 | " 0 | \n",
159 | " 2012-02-02 22:30:19.0000002 | \n",
160 | " 8.9 | \n",
161 | " -73.988703 | \n",
162 | " 40.758803 | \n",
163 | " -73.986517 | \n",
164 | " 40.737205 | \n",
165 | " 1 | \n",
166 | " 0 | \n",
167 | "
\n",
168 | " \n",
169 | " 1 | \n",
170 | " 2014-09-20 07:19:24.0000001 | \n",
171 | " 4.0 | \n",
172 | " -73.990208 | \n",
173 | " 40.746703 | \n",
174 | " -73.994729 | \n",
175 | " 40.750512 | \n",
176 | " 1 | \n",
177 | " 1 | \n",
178 | "
\n",
179 | " \n",
180 | " 2 | \n",
181 | " 2013-02-23 07:18:05.0000001 | \n",
182 | " 5.5 | \n",
183 | " -74.016757 | \n",
184 | " 40.709438 | \n",
185 | " -74.009 | \n",
186 | " 40.719496 | \n",
187 | " 3 | \n",
188 | " 2 | \n",
189 | "
\n",
190 | " \n",
191 | " 3 | \n",
192 | " 2015-04-18 23:49:27.0000009 | \n",
193 | " 13.5 | \n",
194 | " -74.002708 | \n",
195 | " 40.733730 | \n",
196 | " -73.98609924 | \n",
197 | " 40.73477554 | \n",
198 | " 1 | \n",
199 | " 3 | \n",
200 | "
\n",
201 | " \n",
202 | " 4 | \n",
203 | " 2010-03-04 08:15:59.0000001 | \n",
204 | " 10.5 | \n",
205 | " -73.988356 | \n",
206 | " 40.737665 | \n",
207 | " -74.012459 | \n",
208 | " 40.713934 | \n",
209 | " 1 | \n",
210 | " 4 | \n",
211 | "
\n",
212 | " \n",
213 | " ... | \n",
214 | " ... | \n",
215 | " ... | \n",
216 | " ... | \n",
217 | " ... | \n",
218 | " ... | \n",
219 | " ... | \n",
220 | " ... | \n",
221 | " ... | \n",
222 | "
\n",
223 | " \n",
224 | " 4999995 | \n",
225 | " 2011-02-24 16:06:26.0000001 | \n",
226 | " 6.9 | \n",
227 | " -73.966542 | \n",
228 | " 40.804975 | \n",
229 | " -73.949043 | \n",
230 | " 40.804227 | \n",
231 | " 2 | \n",
232 | " 4999995 | \n",
233 | "
\n",
234 | " \n",
235 | " 4999996 | \n",
236 | " 2009-09-22 19:20:22.0000009 | \n",
237 | " 9.7 | \n",
238 | " -73.980055 | \n",
239 | " 40.752535 | \n",
240 | " -74.006443 | \n",
241 | " 40.739613 | \n",
242 | " 1 | \n",
243 | " 4999996 | \n",
244 | "
\n",
245 | " \n",
246 | " 4999997 | \n",
247 | " 2012-04-19 02:17:32.0000001 | \n",
248 | " 14.1 | \n",
249 | " -73.998508 | \n",
250 | " 40.745305 | \n",
251 | " -73.953184 | \n",
252 | " 40.799361 | \n",
253 | " 2 | \n",
254 | " 4999997 | \n",
255 | "
\n",
256 | " \n",
257 | " 4999998 | \n",
258 | " 2012-06-08 11:09:47.0000006 | \n",
259 | " 3.3 | \n",
260 | " -73.953630 | \n",
261 | " 40.778797 | \n",
262 | " -73.946068 | \n",
263 | " 40.775552 | \n",
264 | " 1 | \n",
265 | " 4999998 | \n",
266 | "
\n",
267 | " \n",
268 | " 4999999 | \n",
269 | " 2009-06-21 11:07:00.00000036 | \n",
270 | " 6.5 | \n",
271 | " -73.981578 | \n",
272 | " 40.772575 | \n",
273 | " -73.963333 | \n",
274 | " 40.762132 | \n",
275 | " 1 | \n",
276 | " 4999999 | \n",
277 | "
\n",
278 | " \n",
279 | "
\n",
280 | "
5000000 rows × 8 columns
\n",
281 | "
"
282 | ],
283 | "text/plain": [
284 | " key fare pickup_x pickup_y \\\n",
285 | "0 2012-02-02 22:30:19.0000002 8.9 -73.988703 40.758803 \n",
286 | "1 2014-09-20 07:19:24.0000001 4.0 -73.990208 40.746703 \n",
287 | "2 2013-02-23 07:18:05.0000001 5.5 -74.016757 40.709438 \n",
288 | "3 2015-04-18 23:49:27.0000009 13.5 -74.002708 40.733730 \n",
289 | "4 2010-03-04 08:15:59.0000001 10.5 -73.988356 40.737665 \n",
290 | "... ... ... ... ... \n",
291 | "4999995 2011-02-24 16:06:26.0000001 6.9 -73.966542 40.804975 \n",
292 | "4999996 2009-09-22 19:20:22.0000009 9.7 -73.980055 40.752535 \n",
293 | "4999997 2012-04-19 02:17:32.0000001 14.1 -73.998508 40.745305 \n",
294 | "4999998 2012-06-08 11:09:47.0000006 3.3 -73.953630 40.778797 \n",
295 | "4999999 2009-06-21 11:07:00.00000036 6.5 -73.981578 40.772575 \n",
296 | "\n",
297 | " dropoff_x dropoff_y passenger_count index \n",
298 | "0 -73.986517 40.737205 1 0 \n",
299 | "1 -73.994729 40.750512 1 1 \n",
300 | "2 -74.009 40.719496 3 2 \n",
301 | "3 -73.98609924 40.73477554 1 3 \n",
302 | "4 -74.012459 40.713934 1 4 \n",
303 | "... ... ... ... ... \n",
304 | "4999995 -73.949043 40.804227 2 4999995 \n",
305 | "4999996 -74.006443 40.739613 1 4999996 \n",
306 | "4999997 -73.953184 40.799361 2 4999997 \n",
307 | "4999998 -73.946068 40.775552 1 4999998 \n",
308 | "4999999 -73.963333 40.762132 1 4999999 \n",
309 | "\n",
310 | "[5000000 rows x 8 columns]"
311 | ]
312 | },
313 | "execution_count": 5,
314 | "metadata": {},
315 | "output_type": "execute_result"
316 | }
317 | ],
318 | "source": [
319 | "bc.create_table('parquet_taxi', f'{data_dir}/yellow_cab/part.0.parquet')\n",
320 | "\n",
321 | "bc.sql('select * from parquet_taxi').compute()"
322 | ]
323 | },
324 | {
325 | "cell_type": "markdown",
326 | "metadata": {},
327 | "source": [
328 | "You can find the Python script version of this Notebook at [/python_scripts/csv_to_parquet.py](python_scripts/csv_to_parquet.py)."
329 | ]
330 | }
331 | ],
332 | "metadata": {
333 | "kernelspec": {
334 | "display_name": "RAPIDS Nightly",
335 | "language": "python",
336 | "name": "python3"
337 | },
338 | "language_info": {
339 | "codemirror_mode": {
340 | "name": "ipython",
341 | "version": 3
342 | },
343 | "file_extension": ".py",
344 | "mimetype": "text/x-python",
345 | "name": "python",
346 | "nbconvert_exporter": "python",
347 | "pygments_lexer": "ipython3",
348 | "version": "3.7.6"
349 | }
350 | },
351 | "nbformat": 4,
352 | "nbformat_minor": 4
353 | }
354 |
--------------------------------------------------------------------------------
/sample_use_cases/python_scripts/csv_to_parquet.py:
--------------------------------------------------------------------------------
from dask.distributed import Client
from blazingsql import BlazingContext
from dask_cuda import LocalCUDACluster

# Convert the public taxi CSV data on S3 into local Parquet files.
#
# Steps:
#   1. spin up a local Dask-CUDA cluster (one worker per GPU) and hand its
#      client to BlazingContext so query execution is distributed
#   2. register the public S3 bucket holding the source CSV
#   3. create a table over the CSV, naming its columns explicitly
#   4. SELECT everything and write the result out as Parquet

# initialize BlazingContext with the Dask Client of local GPUs to distribute query execution
bc = BlazingContext(dask_client=Client(LocalCUDACluster()), network_interface='lo')

# register public AWS S3 bucket
bc.s3('blazingsql-colab', bucket_name='blazingsql-colab')

# create a table from that S3 bucket; the CSV has no header row, so column
# names are supplied here
col_names = ['key', 'fare', 'pickup_x', 'pickup_y', 'dropoff_x', 'dropoff_y', 'passenger_count']
bc.create_table('taxi', 's3://blazingsql-colab/taxi_data/taxi_00.csv', names=col_names)

# query the table & write results locally as parquet
# (plain string: the original used an f-string with no placeholders)
bc.sql('SELECT * FROM taxi').to_parquet('../../data/yellow_cab')
17 |
--------------------------------------------------------------------------------
/taxi_fare_prediction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "l4fOFMjbRvkZ"
8 | },
9 | "source": [
10 | "# BlazingSQL + cuML NYC Taxi Cab Fare Prediction\n",
11 | "\n",
12 | "This demo uses publicly available [NYC Taxi Cab Data](https://www.kaggle.com/c/new-york-city-taxi-fare-prediction) to predict the total fare of a taxi ride in New York City given the pickup and dropoff locations. \n",
13 | "\n",
14 | "In this notebook, we will cover: \n",
15 | "- How to read and query multiple CSV files with BlazingSQL.\n",
16 | "- How to implement a linear regression model with cuML.\n",
17 | "\n",
18 | "### Imports\n",
19 | "This next cell will import all packages you need to run this notebook end-to-end."
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 1,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "import os\n",
29 | "import urllib\n",
30 | "from cuml import LinearRegression\n",
31 | "from blazingsql import BlazingContext"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "metadata": {},
37 | "source": [
38 | "## Create BlazingContext\n",
39 | "You can think of the BlazingContext much like a Spark Context (i.e. where information such as FileSystems you have registered and Tables you have created will be stored). If you have issues running this cell, restart runtime and try running it again."
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 2,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "name": "stdout",
49 | "output_type": "stream",
50 | "text": [
51 | "BlazingContext ready\n"
52 | ]
53 | }
54 | ],
55 | "source": [
56 | "# connect to BlazingSQL\n",
57 | "bc = BlazingContext()"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {
63 | "colab_type": "text",
64 | "id": "Gt0TPBqif50q"
65 | },
66 | "source": [
67 | "### Download Data\n",
68 | "For this demo we will train our model with 25,000,000 rows of data from 5 CSV files (5M rows each).\n",
69 | "\n",
70 | "The cell below will check if you already have them, and, if you don't, will download them from AWS for you. "
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 3,
76 | "metadata": {},
77 | "outputs": [
78 | {
79 | "name": "stdout",
80 | "output_type": "stream",
81 | "text": [
82 | "Downloading https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_00.csv to data/taxi_00.csv\n",
83 | "Downloading https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_01.csv to data/taxi_01.csv\n",
84 | "Downloading https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_02.csv to data/taxi_02.csv\n",
85 | "Downloading https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_03.csv to data/taxi_03.csv\n",
86 | "Downloading https://blazingsql-colab.s3.amazonaws.com/taxi_data/taxi_04.csv to data/taxi_04.csv\n",
87 | "CPU times: user 4.19 s, sys: 5.16 s, total: 9.36 s\n",
88 | "Wall time: 26.8 s\n"
89 | ]
90 | }
91 | ],
92 | "source": [
93 | "%%time\n",
94 | "# download taxi data\n",
95 | "base_url = 'https://blazingsql-colab.s3.amazonaws.com/taxi_data/'\n",
96 | "for i in range(0, 5):\n",
97 | " fn = 'taxi_0' + str(i) + '.csv'\n",
98 | " # check if we already have the file\n",
99 | " if not os.path.isfile('data/' + fn):\n",
100 |     "        # we don't have it; let us know we're downloading it now\n",
101 | " print(f'Downloading {base_url + fn} to data/{fn}')\n",
102 | " # download file\n",
103 | " urllib.request.urlretrieve(base_url + fn, 'data/' + fn)\n",
104 | " # we already have data\n",
105 | " else:\n",
106 | " # let us know\n",
107 | " print(f'data/{fn} already downloaded')"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {
113 | "colab_type": "text",
114 | "id": "PXtydYrimQGt"
115 | },
116 | "source": [
117 | "## Extract, transform, load\n",
118 |     "In order to train our Linear Regression model, we must first perform ETL to prepare our data.\n",
119 | "\n",
120 | "BlazingSQL currently requires the full file path to create tables, the cell below will identify that path for you."
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 4,
126 | "metadata": {},
127 | "outputs": [
128 | {
129 | "data": {
130 | "text/plain": [
131 | "'/home/jupyter-winston/bsql-demos/data/taxi_0*.csv'"
132 | ]
133 | },
134 | "execution_count": 4,
135 | "metadata": {},
136 | "output_type": "execute_result"
137 | }
138 | ],
139 | "source": [
140 | "# identify current working directory\n",
141 | "cwd = os.getcwd()\n",
142 | "# add path to data w/ wildcard (*) so BSQL can read all 5 files at once\n",
143 | "data_path = cwd + '/data/taxi_0*.csv'\n",
144 | "# how's it look?\n",
145 | "data_path"
146 | ]
147 | },
148 | {
149 | "cell_type": "markdown",
150 | "metadata": {},
151 | "source": [
152 | "### ETL: Create Table \n",
153 | "In this next cell we will create a single BlazingSQL table from all 5 CSVs."
154 | ]
155 | },
156 | {
157 | "cell_type": "code",
158 | "execution_count": 5,
159 | "metadata": {
160 | "colab": {},
161 | "colab_type": "code",
162 | "id": "Gr7CUSrsEBmW"
163 | },
164 | "outputs": [
165 | {
166 | "name": "stdout",
167 | "output_type": "stream",
168 | "text": [
169 | "CPU times: user 3.13 ms, sys: 2.44 ms, total: 5.57 ms\n",
170 | "Wall time: 4.66 ms\n"
171 | ]
172 | },
173 | {
174 | "data": {
175 | "text/plain": [
176 | ""
177 | ]
178 | },
179 | "execution_count": 5,
180 | "metadata": {},
181 | "output_type": "execute_result"
182 | }
183 | ],
184 | "source": [
185 | "%%time\n",
186 | "# tag column names and types\n",
187 | "col_names = ['key', 'fare_amount', 'pickup_longitude', 'pickup_latitude', \n",
188 | " 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']\n",
189 | "col_types = ['date64', 'float32', 'float32', 'float32',\n",
190 | " 'float32', 'float32', 'float32']\n",
191 | "\n",
192 | "# create a table from all 5 taxi files at once\n",
193 | "bc.create_table('train_taxi', data_path, names=col_names, dtype=col_types, header=0)"
194 | ]
195 | },
196 | {
197 | "cell_type": "markdown",
198 | "metadata": {
199 | "colab_type": "text",
200 | "id": "XnzjqEFnmDC5"
201 | },
202 | "source": [
203 | "### ETL: Query Tables for Training Data"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 7,
209 | "metadata": {
210 | "colab": {
211 | "base_uri": "https://localhost:8080/",
212 | "height": 425
213 | },
214 | "colab_type": "code",
215 | "id": "_MDxz73ZMhhK",
216 | "outputId": "f2abeafc-0cdf-46b1-ddf5-a5cde3d37792"
217 | },
218 | "outputs": [
219 | {
220 | "data": {
221 | "text/html": [
222 | "\n",
223 | "\n",
236 | "
\n",
237 | " \n",
238 | " \n",
239 | " | \n",
240 | " hours | \n",
241 | " days | \n",
242 | " months | \n",
243 | " years | \n",
244 | " longitude_distance | \n",
245 | " latitude_distance | \n",
246 | " passenger_count | \n",
247 | "
\n",
248 | " \n",
249 | " \n",
250 | " \n",
251 | " 0 | \n",
252 | " 20.0 | \n",
253 | " 10.0 | \n",
254 | " 9.0 | \n",
255 | " 13.0 | \n",
256 | " 0.049057 | \n",
257 | " 0.003063 | \n",
258 | " 1.0 | \n",
259 | "
\n",
260 | " \n",
261 | " 1 | \n",
262 | " 20.0 | \n",
263 | " 22.0 | \n",
264 | " 11.0 | \n",
265 | " 9.0 | \n",
266 | " 0.003464 | \n",
267 | " 0.007088 | \n",
268 | " 1.0 | \n",
269 | "
\n",
270 | " \n",
271 | " 2 | \n",
272 | " 21.0 | \n",
273 | " 4.0 | \n",
274 | " 12.0 | \n",
275 | " 9.0 | \n",
276 | " 0.003151 | \n",
277 | " 0.007584 | \n",
278 | " 1.0 | \n",
279 | "
\n",
280 | " \n",
281 | " 3 | \n",
282 | " 22.0 | \n",
283 | " 6.0 | \n",
284 | " 5.0 | \n",
285 | " 15.0 | \n",
286 | " 0.007141 | \n",
287 | " 0.011543 | \n",
288 | " 1.0 | \n",
289 | "
\n",
290 | " \n",
291 | " 4 | \n",
292 | " 23.0 | \n",
293 | " 27.0 | \n",
294 | " 4.0 | \n",
295 | " 9.0 | \n",
296 | " -0.014870 | \n",
297 | " -0.033161 | \n",
298 | " 1.0 | \n",
299 | "
\n",
300 | " \n",
301 | "
\n",
302 | "
"
303 | ],
304 | "text/plain": [
305 | " hours days months years longitude_distance latitude_distance \\\n",
306 | "0 20.0 10.0 9.0 13.0 0.049057 0.003063 \n",
307 | "1 20.0 22.0 11.0 9.0 0.003464 0.007088 \n",
308 | "2 21.0 4.0 12.0 9.0 0.003151 0.007584 \n",
309 | "3 22.0 6.0 5.0 15.0 0.007141 0.011543 \n",
310 | "4 23.0 27.0 4.0 9.0 -0.014870 -0.033161 \n",
311 | "\n",
312 | " passenger_count \n",
313 | "0 1.0 \n",
314 | "1 1.0 \n",
315 | "2 1.0 \n",
316 | "3 1.0 \n",
317 | "4 1.0 "
318 | ]
319 | },
320 | "execution_count": 7,
321 | "metadata": {},
322 | "output_type": "execute_result"
323 | }
324 | ],
325 | "source": [
326 | "# extract time columns, long & lat, # riders (all floats)\n",
327 | "query = '''\n",
328 | " select \n",
329 | " cast(hour(key) as float) hours, \n",
330 | " cast(dayofmonth(key) as float) days, \n",
331 | " cast(month(key) as float) months, \n",
332 | " cast(year(key) - 2000 as float) years, \n",
333 | " dropoff_longitude - pickup_longitude as longitude_distance, \n",
334 | " dropoff_latitude - pickup_latitude as latitude_distance, \n",
335 | " passenger_count \n",
336 | " from \n",
337 | " train_taxi\n",
338 | " '''\n",
339 | "\n",
340 | "# run query on table (returns cuDF DataFrame)\n",
341 | "X_train = bc.sql(query)\n",
342 | "\n",
343 | "# fill any null values \n",
344 | "X_train['longitude_distance'] = X_train['longitude_distance'].fillna(0)\n",
345 | "X_train['latitude_distance'] = X_train['latitude_distance'].fillna(0)\n",
346 | "X_train['passenger_count'] = X_train['passenger_count'].fillna(0)\n",
347 | "\n",
348 | "# how's it look? \n",
349 | "X_train.head()"
350 | ]
351 | },
352 | {
353 | "cell_type": "code",
354 | "execution_count": 8,
355 | "metadata": {},
356 | "outputs": [
357 | {
358 | "data": {
359 | "text/html": [
360 | "\n",
361 | "\n",
374 | "
\n",
375 | " \n",
376 | " \n",
377 | " | \n",
378 | " fare_amount | \n",
379 | "
\n",
380 | " \n",
381 | " \n",
382 | " \n",
383 | " 0 | \n",
384 | " 17.0 | \n",
385 | "
\n",
386 | " \n",
387 | " 1 | \n",
388 | " 3.3 | \n",
389 | "
\n",
390 | " \n",
391 | " 2 | \n",
392 | " 4.1 | \n",
393 | "
\n",
394 | " \n",
395 | " 3 | \n",
396 | " 6.0 | \n",
397 | "
\n",
398 | " \n",
399 | " 4 | \n",
400 | " 8.9 | \n",
401 | "
\n",
402 | " \n",
403 | "
\n",
404 | "
"
405 | ],
406 | "text/plain": [
407 | " fare_amount\n",
408 | "0 17.0\n",
409 | "1 3.3\n",
410 | "2 4.1\n",
411 | "3 6.0\n",
412 | "4 8.9"
413 | ]
414 | },
415 | "execution_count": 8,
416 | "metadata": {},
417 | "output_type": "execute_result"
418 | }
419 | ],
420 | "source": [
421 | "# query dependent variable y\n",
422 | "y_train = bc.sql('SELECT fare_amount FROM train_taxi')\n",
423 | "# how's it look?\n",
424 | "y_train.head()"
425 | ]
426 | },
427 | {
428 | "cell_type": "markdown",
429 | "metadata": {},
430 | "source": [
431 | "## Linear Regression\n",
432 | "To learn more about the cuML's LinearRegression model, check out [Beginner’s Guide to Linear Regression in Google Colab with cuML](https://medium.com/future-vision/beginners-guide-to-linear-regression-in-python-with-cuml-30e2709c761?source=friends_link&sk=1da35920b9e2ffea59d5cb3c998bfeae).\n",
433 | "\n",
434 | "### LR: Train Model"
435 | ]
436 | },
437 | {
438 | "cell_type": "code",
439 | "execution_count": 9,
440 | "metadata": {
441 | "colab": {
442 | "base_uri": "https://localhost:8080/",
443 | "height": 531
444 | },
445 | "colab_type": "code",
446 | "id": "tVUZvT9TB6Ii",
447 | "outputId": "d61c0249-47ee-40b8-a72f-9d62383f23dd"
448 | },
449 | "outputs": [
450 | {
451 | "name": "stdout",
452 | "output_type": "stream",
453 | "text": [
454 | "Coefficients:\n",
455 | "0 -0.027069\n",
456 | "1 0.003295\n",
457 | "2 0.107198\n",
458 | "3 0.636705\n",
459 | "4 0.000932\n",
460 | "5 -0.000494\n",
461 | "6 0.092028\n",
462 | "dtype: float32\n",
463 | "\n",
464 | "Y intercept:\n",
465 | "3.3608126640319824\n",
466 | "\n",
467 | "CPU times: user 892 ms, sys: 412 ms, total: 1.3 s\n",
468 | "Wall time: 2.25 s\n"
469 | ]
470 | }
471 | ],
472 | "source": [
473 | "%%time\n",
474 | "# call & create cuML model\n",
475 | "lr = LinearRegression(fit_intercept=True, normalize=False, algorithm=\"eig\")\n",
476 | "\n",
477 | "# train Linear Regression model \n",
478 | "reg = lr.fit(X_train, y_train)\n",
479 | "\n",
480 | "# display results\n",
481 | "print(f\"Coefficients:\\n{reg.coef_}\\n\")\n",
482 | "print(f\"Y intercept:\\n{reg.intercept_}\\n\")"
483 | ]
484 | },
485 | {
486 | "cell_type": "markdown",
487 | "metadata": {
488 | "colab_type": "text",
489 | "id": "pHtni9xcl-ht"
490 | },
491 | "source": [
492 | "### LR: Use Model to Predict Future Taxi Fares \n",
493 | "\n",
494 | "#### Download Test Data\n",
495 | "The cell below will check to see if you've already got the Test data, and, if you don't, will download it for you."
496 | ]
497 | },
498 | {
499 | "cell_type": "code",
500 | "execution_count": 10,
501 | "metadata": {},
502 | "outputs": [
503 | {
504 | "name": "stdout",
505 | "output_type": "stream",
506 | "text": [
507 | "--2020-01-23 04:49:37-- https://blazingsql-demos.s3-us-west-1.amazonaws.com/test.csv\n",
508 | "Resolving blazingsql-demos.s3-us-west-1.amazonaws.com (blazingsql-demos.s3-us-west-1.amazonaws.com)... 52.219.116.137\n",
509 | "Connecting to blazingsql-demos.s3-us-west-1.amazonaws.com (blazingsql-demos.s3-us-west-1.amazonaws.com)|52.219.116.137|:443... connected.\n",
510 | "HTTP request sent, awaiting response... 200 OK\n",
511 | "Length: 982916 (960K) [text/csv]\n",
512 | "Saving to: ‘data/test.csv’\n",
513 | "\n",
514 | "test.csv 100%[===================>] 959.88K 2.22MB/s in 0.4s \n",
515 | "\n",
516 | "2020-01-23 04:49:38 (2.22 MB/s) - ‘data/test.csv’ saved [982916/982916]\n",
517 | "\n",
518 | "CPU times: user 8.09 ms, sys: 26.9 ms, total: 35 ms\n",
519 | "Wall time: 902 ms\n"
520 | ]
521 | }
522 | ],
523 | "source": [
524 | "%%time\n",
525 | "# do we have Test taxi file?\n",
526 | "if not os.path.isfile('/data/test.csv'):\n",
527 | " !wget -P data https://blazingsql-demos.s3-us-west-1.amazonaws.com/test.csv\n",
528 | "else:\n",
529 | " print('test data already downloaded')"
530 | ]
531 | },
532 | {
533 | "cell_type": "code",
534 | "execution_count": 11,
535 | "metadata": {
536 | "colab": {},
537 | "colab_type": "code",
538 | "id": "yRM5PosNiuGh"
539 | },
540 | "outputs": [
541 | {
542 | "name": "stdout",
543 | "output_type": "stream",
544 | "text": [
545 | "CPU times: user 1.68 ms, sys: 5.19 ms, total: 6.87 ms\n",
546 | "Wall time: 5.42 ms\n"
547 | ]
548 | },
549 | {
550 | "data": {
551 | "text/plain": [
552 | ""
553 | ]
554 | },
555 | "execution_count": 11,
556 | "metadata": {},
557 | "output_type": "execute_result"
558 | }
559 | ],
560 | "source": [
561 | "%%time\n",
562 | "# set column names and types\n",
563 | "col_names = ['key', 'fare_amount', 'pickup_longitude', 'pickup_latitude', \n",
564 | " 'dropoff_longitude', 'dropoff_latitude', 'passenger_count']\n",
565 | "col_types = ['date64', 'float32', 'float32', 'float32', 'float32', 'float32', 'float32']\n",
566 | "\n",
567 | "# tag path to test data\n",
568 | "test_path = cwd + '/data/test.csv'\n",
569 | "\n",
570 | "# create test table directly from CSV\n",
571 | "bc.create_table('test_taxi', test_path, names=col_names, dtype=col_types)"
572 | ]
573 | },
574 | {
575 | "cell_type": "code",
576 | "execution_count": 12,
577 | "metadata": {
578 | "colab": {},
579 | "colab_type": "code",
580 | "id": "g4I8AJ51dpW5"
581 | },
582 | "outputs": [
583 | {
584 | "name": "stdout",
585 | "output_type": "stream",
586 | "text": [
587 | "CPU times: user 61.8 ms, sys: 1.41 ms, total: 63.2 ms\n",
588 | "Wall time: 36.9 ms\n"
589 | ]
590 | },
591 | {
592 | "data": {
593 | "text/html": [
594 | "\n",
595 | "\n",
608 | "
\n",
609 | " \n",
610 | " \n",
611 | " | \n",
612 | " hours | \n",
613 | " days | \n",
614 | " months | \n",
615 | " years | \n",
616 | " longitude_distance | \n",
617 | " latitude_distance | \n",
618 | " passenger_count | \n",
619 | "
\n",
620 | " \n",
621 | " \n",
622 | " \n",
623 | " 0 | \n",
624 | " 13.0 | \n",
625 | " 27.0 | \n",
626 | " 1.0 | \n",
627 | " 15.0 | \n",
628 | " -0.008110 | \n",
629 | " -0.019970 | \n",
630 | " 1.0 | \n",
631 | "
\n",
632 | " \n",
633 | " 1 | \n",
634 | " 13.0 | \n",
635 | " 27.0 | \n",
636 | " 1.0 | \n",
637 | " 15.0 | \n",
638 | " -0.012024 | \n",
639 | " 0.019814 | \n",
640 | " 1.0 | \n",
641 | "
\n",
642 | " \n",
643 | " 2 | \n",
644 | " 11.0 | \n",
645 | " 8.0 | \n",
646 | " 10.0 | \n",
647 | " 11.0 | \n",
648 | " 0.002869 | \n",
649 | " -0.005119 | \n",
650 | " 1.0 | \n",
651 | "
\n",
652 | " \n",
653 | " 3 | \n",
654 | " 21.0 | \n",
655 | " 1.0 | \n",
656 | " 12.0 | \n",
657 | " 12.0 | \n",
658 | " -0.009277 | \n",
659 | " -0.016178 | \n",
660 | " 1.0 | \n",
661 | "
\n",
662 | " \n",
663 | " 4 | \n",
664 | " 21.0 | \n",
665 | " 1.0 | \n",
666 | " 12.0 | \n",
667 | " 12.0 | \n",
668 | " -0.022537 | \n",
669 | " -0.045345 | \n",
670 | " 1.0 | \n",
671 | "
\n",
672 | " \n",
673 | "
\n",
674 | "
"
675 | ],
676 | "text/plain": [
677 | " hours days months years longitude_distance latitude_distance \\\n",
678 | "0 13.0 27.0 1.0 15.0 -0.008110 -0.019970 \n",
679 | "1 13.0 27.0 1.0 15.0 -0.012024 0.019814 \n",
680 | "2 11.0 8.0 10.0 11.0 0.002869 -0.005119 \n",
681 | "3 21.0 1.0 12.0 12.0 -0.009277 -0.016178 \n",
682 | "4 21.0 1.0 12.0 12.0 -0.022537 -0.045345 \n",
683 | "\n",
684 | " passenger_count \n",
685 | "0 1.0 \n",
686 | "1 1.0 \n",
687 | "2 1.0 \n",
688 | "3 1.0 \n",
689 | "4 1.0 "
690 | ]
691 | },
692 | "execution_count": 12,
693 | "metadata": {},
694 | "output_type": "execute_result"
695 | }
696 | ],
697 | "source": [
698 | "%%time\n",
699 | "# extract time columns, long & lat, # riders (all floats)\n",
700 | "query = '''\n",
701 | " select \n",
702 | " cast(hour(key) as float) hours, \n",
703 | " cast(dayofmonth(key) as float) days, \n",
704 | " cast(month(key) as float) months, \n",
705 | " cast(year(key) - 2000 as float) years, \n",
706 | " dropoff_longitude - pickup_longitude as longitude_distance, \n",
707 | " dropoff_latitude - pickup_latitude as latitude_distance, \n",
708 | " passenger_count\n",
709 | " from \n",
710 | " test_taxi\n",
711 | " '''\n",
712 | "\n",
713 | "# run query on table (returns cuDF DataFrame)\n",
714 | "X_test = bc.sql(query)\n",
715 | "\n",
716 | "# fill null values \n",
717 | "X_test['longitude_distance'] = X_test['longitude_distance'].fillna(0)\n",
718 | "X_test['latitude_distance'] = X_test['latitude_distance'].fillna(0)\n",
719 | "X_test['passenger_count'] = X_test['passenger_count'].fillna(0)\n",
720 | "\n",
721 | "# how's it look? \n",
722 | "X_test.head()"
723 | ]
724 | },
725 | {
726 | "cell_type": "code",
727 | "execution_count": 13,
728 | "metadata": {
729 | "colab": {},
730 | "colab_type": "code",
731 | "id": "zCft6P5QkepN"
732 | },
733 | "outputs": [
734 | {
735 | "data": {
736 | "text/plain": [
737 | "0 12.847689\n",
738 | "1 12.847666\n",
739 | "2 11.257179\n",
740 | "3 11.814514\n",
741 | "4 11.814518\n",
742 | "5 11.814510\n",
743 | "6 11.223505\n",
744 | "7 11.223265\n",
745 | "8 11.223516\n",
746 | "9 12.234369\n",
747 | "10 12.234383\n",
748 | "11 12.234411\n",
749 | "12 9.695659\n",
750 | "13 9.695644\n",
751 | "14 11.467134\n",
752 | "15 11.467148\n",
753 | "16 11.460003\n",
754 | "17 11.460035\n",
755 | "18 11.460011\n",
756 | "19 11.460001\n",
757 | "20 13.480091\n",
758 | "21 12.704147\n",
759 | "22 12.704123\n",
760 | "23 12.704136\n",
761 | "24 12.704132\n",
762 | "25 12.704119\n",
763 | "26 12.704292\n",
764 | "27 12.704145\n",
765 | "28 12.704140\n",
766 | "29 12.704115\n",
767 | " ... \n",
768 | "9884 12.641771\n",
769 | "9885 12.641808\n",
770 | "9886 12.641790\n",
771 | "9887 12.641766\n",
772 | "9888 12.641785\n",
773 | "9889 12.641790\n",
774 | "9890 12.641781\n",
775 | "9891 12.641809\n",
776 | "9892 12.641788\n",
777 | "9893 12.641804\n",
778 | "9894 12.641783\n",
779 | "9895 12.641851\n",
780 | "9896 12.641764\n",
781 | "9897 13.446104\n",
782 | "9898 13.204254\n",
783 | "9899 14.129877\n",
784 | "9900 13.363419\n",
785 | "9901 13.627535\n",
786 | "9902 14.162102\n",
787 | "9903 13.824402\n",
788 | "9904 13.664045\n",
789 | "9905 13.252615\n",
790 | "9906 14.129101\n",
791 | "9907 13.444111\n",
792 | "9908 13.710255\n",
793 | "9909 13.707689\n",
794 | "9910 13.150122\n",
795 | "9911 13.413801\n",
796 | "9912 13.645849\n",
797 | "9913 13.251087\n",
798 | "Length: 9914, dtype: float32"
799 | ]
800 | },
801 | "execution_count": 13,
802 | "metadata": {},
803 | "output_type": "execute_result"
804 | }
805 | ],
806 | "source": [
807 | "# predict fares \n",
808 | "predictions = lr.predict(X_test)\n",
809 | "\n",
810 | "# display predictions\n",
811 | "predictions"
812 | ]
813 | },
814 | {
815 | "cell_type": "code",
816 | "execution_count": 14,
817 | "metadata": {
818 | "colab": {},
819 | "colab_type": "code",
820 | "id": "GdjUjJ42l2BI"
821 | },
822 | "outputs": [
823 | {
824 | "data": {
825 | "text/html": [
826 | "\n",
827 | "\n",
840 | "
\n",
841 | " \n",
842 | " \n",
843 | " | \n",
844 | " hours | \n",
845 | " days | \n",
846 | " months | \n",
847 | " years | \n",
848 | " longitude_distance | \n",
849 | " latitude_distance | \n",
850 | " passenger_count | \n",
851 | " predicted_fare | \n",
852 | "
\n",
853 | " \n",
854 | " \n",
855 | " \n",
856 | " 0 | \n",
857 | " 13.0 | \n",
858 | " 27.0 | \n",
859 | " 1.0 | \n",
860 | " 15.0 | \n",
861 | " -0.008110 | \n",
862 | " -0.019970 | \n",
863 | " 1.0 | \n",
864 | " 12.847689 | \n",
865 | "
\n",
866 | " \n",
867 | " 1 | \n",
868 | " 13.0 | \n",
869 | " 27.0 | \n",
870 | " 1.0 | \n",
871 | " 15.0 | \n",
872 | " -0.012024 | \n",
873 | " 0.019814 | \n",
874 | " 1.0 | \n",
875 | " 12.847666 | \n",
876 | "
\n",
877 | " \n",
878 | " 2 | \n",
879 | " 11.0 | \n",
880 | " 8.0 | \n",
881 | " 10.0 | \n",
882 | " 11.0 | \n",
883 | " 0.002869 | \n",
884 | " -0.005119 | \n",
885 | " 1.0 | \n",
886 | " 11.257179 | \n",
887 | "
\n",
888 | " \n",
889 | " 3 | \n",
890 | " 21.0 | \n",
891 | " 1.0 | \n",
892 | " 12.0 | \n",
893 | " 12.0 | \n",
894 | " -0.009277 | \n",
895 | " -0.016178 | \n",
896 | " 1.0 | \n",
897 | " 11.814514 | \n",
898 | "
\n",
899 | " \n",
900 | " 4 | \n",
901 | " 21.0 | \n",
902 | " 1.0 | \n",
903 | " 12.0 | \n",
904 | " 12.0 | \n",
905 | " -0.022537 | \n",
906 | " -0.045345 | \n",
907 | " 1.0 | \n",
908 | " 11.814518 | \n",
909 | "
\n",
910 | " \n",
911 | "
\n",
912 | "
"
913 | ],
914 | "text/plain": [
915 | " hours days months years longitude_distance latitude_distance \\\n",
916 | "0 13.0 27.0 1.0 15.0 -0.008110 -0.019970 \n",
917 | "1 13.0 27.0 1.0 15.0 -0.012024 0.019814 \n",
918 | "2 11.0 8.0 10.0 11.0 0.002869 -0.005119 \n",
919 | "3 21.0 1.0 12.0 12.0 -0.009277 -0.016178 \n",
920 | "4 21.0 1.0 12.0 12.0 -0.022537 -0.045345 \n",
921 | "\n",
922 | " passenger_count predicted_fare \n",
923 | "0 1.0 12.847689 \n",
924 | "1 1.0 12.847666 \n",
925 | "2 1.0 11.257179 \n",
926 | "3 1.0 11.814514 \n",
927 | "4 1.0 11.814518 "
928 | ]
929 | },
930 | "execution_count": 14,
931 | "metadata": {},
932 | "output_type": "execute_result"
933 | }
934 | ],
935 | "source": [
936 | "# add predictions to test dataframe\n",
937 | "X_test['predicted_fare'] = predictions\n",
938 | "\n",
939 | "# how's that look?\n",
940 | "X_test.head()"
941 | ]
942 | }
943 | ],
944 | "metadata": {
945 | "accelerator": "GPU",
946 | "colab": {
947 | "collapsed_sections": [],
948 | "name": "BlazingSQL_cuML_Taxi_Fare_Prediction.ipynb",
949 | "provenance": []
950 | },
951 | "kernelspec": {
952 | "display_name": "Python 3",
953 | "language": "python",
954 | "name": "python3"
955 | },
956 | "language_info": {
957 | "codemirror_mode": {
958 | "name": "ipython",
959 | "version": 3
960 | },
961 | "file_extension": ".py",
962 | "mimetype": "text/x-python",
963 | "name": "python",
964 | "nbconvert_exporter": "python",
965 | "pygments_lexer": "ipython3",
966 | "version": "3.6.7"
967 | }
968 | },
969 | "nbformat": 4,
970 | "nbformat_minor": 4
971 | }
972 |
--------------------------------------------------------------------------------
/utils/env-check.py:
--------------------------------------------------------------------------------
import sys, os

# Make Colab's system site-packages importable and point Numba at the CUDA
# toolkit's NVVM libraries (required by the legacy numbapro environment vars).
sys.path.append('/usr/local/lib/python3.6/site-packages/')
os.environ['NUMBAPRO_NVVM'] = '/usr/local/cuda/nvvm/lib64/libnvvm.so'
os.environ['NUMBAPRO_LIBDEVICE'] = '/usr/local/cuda/nvvm/libdevice/'

import pynvml

# Query the name of GPU 0 on this instance via NVML.
pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)
device_name = pynvml.nvmlDeviceGetName(handle)

# Older pynvml releases return bytes, newer ones return str; normalize so the
# comparison below works on either version.
if isinstance(device_name, bytes):
    device_name = device_name.decode('utf-8')

if device_name != 'Tesla T4':
    # Colab assigned a non-T4 GPU (e.g. a K80); abort so the user re-requests.
    raise Exception("""
    Unfortunately Colab didn't give you a T4 GPU.

    Make sure you've configured Colab to request a GPU instance type.

    If you get a K80 GPU, try Runtime -> Reset all runtimes...
    """)
else:
    print('*********************************************')
    print('Woo! Your instance has the right kind of GPU!')
    print('*********************************************')
    print()
26 |
--------------------------------------------------------------------------------
/vs_pyspark_netflow.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "d0hJ4z8rBOFC"
8 | },
9 | "source": [
10 | "# BlazingSQL vs. Apache Spark \n",
11 | "\n",
12 | "Below we have one of our popular workloads running with [BlazingSQL](https://blazingsql.com/), and then with Apache Spark + PySpark.\n",
13 | "\n",
14 | "In this notebook, we will cover: \n",
15 | "- How to read and query csv files with BlazingSQL.\n",
16 | "- How BlazingSQL compares against Apache Spark (analyzing over 20M records)."
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "colab_type": "text",
23 | "id": "0guvG6Ws_zmX"
24 | },
25 | "source": [
26 | "## Import packages and create Blazing Context\n",
27 | "You can think of the BlazingContext much like a Spark Context (i.e. information such as FileSystems you have registered and Tables you have created will be stored here). "
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 1,
33 | "metadata": {
34 | "colab": {
35 | "base_uri": "https://localhost:8080/",
36 | "height": 35
37 | },
38 | "colab_type": "code",
39 | "id": "ojm_V-WAtz0f",
40 | "outputId": "a46625f4-1494-4a13-eb13-2f38efd80ccf"
41 | },
42 | "outputs": [
43 | {
44 | "name": "stdout",
45 | "output_type": "stream",
46 | "text": [
47 | "BlazingContext ready\n"
48 | ]
49 | }
50 | ],
51 | "source": [
52 | "from blazingsql import BlazingContext\n",
53 | "# start up BlazingSQL\n",
54 | "bc = BlazingContext()"
55 | ]
56 | },
57 | {
58 | "cell_type": "markdown",
59 | "metadata": {
60 | "colab_type": "text",
61 | "id": "yp7z8bfivbna"
62 | },
63 | "source": [
64 | "### Load & Query Table\n",
65 |     "First, we need to download the netflow data (21,526,138 records) from AWS. If you do not wish to download the full 2.5G file, the first 100,000 rows of data are pre-downloaded at `data/small-chunk2.csv`, simply skip the cell below and change the file path when prompted 2 cells from now."
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 2,
71 | "metadata": {
72 | "colab": {},
73 | "colab_type": "code",
74 | "id": "2dAt6DfG37KH"
75 | },
76 | "outputs": [
77 | {
78 | "name": "stdout",
79 | "output_type": "stream",
80 | "text": [
81 | "--2020-01-20 22:14:17-- https://blazingsql-colab.s3.amazonaws.com/netflow_data/nf-chunk2.csv\n",
82 | "Resolving blazingsql-colab.s3.amazonaws.com (blazingsql-colab.s3.amazonaws.com)... 52.216.112.139\n",
83 | "Connecting to blazingsql-colab.s3.amazonaws.com (blazingsql-colab.s3.amazonaws.com)|52.216.112.139|:443... connected.\n",
84 | "HTTP request sent, awaiting response... 200 OK\n",
85 | "Length: 2725056295 (2.5G) [text/csv]\n",
86 | "Saving to: ‘data/nf-chunk2.csv’\n",
87 | "\n",
88 | "nf-chunk2.csv 100%[===================>] 2.54G 51.8MB/s in 49s \n",
89 | "\n",
90 | "2020-01-20 22:15:06 (53.2 MB/s) - ‘data/nf-chunk2.csv’ saved [2725056295/2725056295]\n",
91 | "\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "# save nf-chunk2 to data folder, may take a few minutes to download\n",
97 | "!wget -P data/ https://blazingsql-colab.s3.amazonaws.com/netflow_data/nf-chunk2.csv "
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {
103 | "colab_type": "text",
104 | "id": "OTEaAsp2_zmf"
105 | },
106 | "source": [
107 | "## BlazingSQL \n",
108 |     "Data in hand, we can test the performance of BlazingSQL on this dataset. \n",
109 | "\n",
110 | "To use pre-downloaded data, change the file path to `data/small-chunk2.csv`."
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 3,
116 | "metadata": {
117 | "colab": {
118 | "base_uri": "https://localhost:8080/",
119 | "height": 52
120 | },
121 | "colab_type": "code",
122 | "id": "rirBsYQU3NH5",
123 | "outputId": "51ced2b1-b930-4173-bbfa-09672e751d3f"
124 | },
125 | "outputs": [
126 | {
127 | "data": {
128 | "text/plain": [
129 | "'/home/winston/bsql-demos/data/nf-chunk2.csv'"
130 | ]
131 | },
132 | "execution_count": 3,
133 | "metadata": {},
134 | "output_type": "execute_result"
135 | }
136 | ],
137 | "source": [
138 | "import os\n",
139 | "# determine current working directory \n",
140 | "cwd = os.getcwd()\n",
141 | "# complete path to data\n",
142 | "path = cwd + '/data/nf-chunk2.csv'\n",
143 | "# what's the path?\n",
144 | "path"
145 | ]
146 | },
147 | {
148 | "cell_type": "code",
149 | "execution_count": 4,
150 | "metadata": {
151 | "colab": {
152 | "base_uri": "https://localhost:8080/",
153 | "height": 52
154 | },
155 | "colab_type": "code",
156 | "id": "zCzLEFfB3N4k",
157 | "outputId": "10ff9097-2736-423e-969d-de75983fbdda"
158 | },
159 | "outputs": [
160 | {
161 | "name": "stdout",
162 | "output_type": "stream",
163 | "text": [
164 | "CPU times: user 9.9 ms, sys: 13.1 ms, total: 23 ms\n",
165 | "Wall time: 1.14 s\n"
166 | ]
167 | },
168 | {
169 | "data": {
170 | "text/plain": [
171 | ""
172 | ]
173 | },
174 | "execution_count": 4,
175 | "metadata": {},
176 | "output_type": "execute_result"
177 | }
178 | ],
179 | "source": [
180 | "%%time\n",
181 |     "# Create BlazingSQL table directly from the CSV file - There is no copy in this process\n",
182 |     "bc.create_table('netflow', path, header=0)"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 5,
188 | "metadata": {
189 | "colab": {
190 | "base_uri": "https://localhost:8080/",
191 | "height": 295
192 | },
193 | "colab_type": "code",
194 | "id": "umBG2Tp0wbQx",
195 | "outputId": "0975395e-7f5b-4244-afa3-45c8658ce61c"
196 | },
197 | "outputs": [
198 | {
199 | "name": "stdout",
200 | "output_type": "stream",
201 | "text": [
202 | "CPU times: user 5.07 s, sys: 2.61 s, total: 7.67 s\n",
203 | "Wall time: 10.4 s\n"
204 | ]
205 | }
206 | ],
207 | "source": [
208 | "%%time\n",
209 | "# define the query\n",
210 | "query = '''\n",
211 | " SELECT\n",
212 | " a.firstSeenSrcIp as source,\n",
213 | " a.firstSeenDestIp as destination,\n",
214 | " count(a.firstSeenDestPort) as targetPorts,\n",
215 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n",
216 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n",
217 | " SUM(a.durationSeconds) as durationSeconds,\n",
218 | " MIN(parsedDate) as firstFlowDate,\n",
219 | " MAX(parsedDate) as lastFlowDate,\n",
220 | " COUNT(*) as attemptCount\n",
221 | " FROM \n",
222 | " netflow a\n",
223 | " GROUP BY\n",
224 | " a.firstSeenSrcIp,\n",
225 | " a.firstSeenDestIp\n",
226 | " '''\n",
227 | "\n",
228 | "# query the table (returns cuDF DataFrame)\n",
229 | "gdf = bc.sql(query)"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": 6,
235 | "metadata": {
236 | "colab": {},
237 | "colab_type": "code",
238 | "id": "48_W2v8q_zmq",
239 | "outputId": "db0394f1-e082-49b0-c477-e3bba8d3d0f4"
240 | },
241 | "outputs": [
242 | {
243 | "data": {
244 | "text/html": [
245 | "\n",
246 | "\n",
259 | "
\n",
260 | " \n",
261 | " \n",
262 | " | \n",
263 | " source | \n",
264 | " destination | \n",
265 | " targetPorts | \n",
266 | " bytesOut | \n",
267 | " bytesIn | \n",
268 | " durationSeconds | \n",
269 | " firstFlowDate | \n",
270 | " lastFlowDate | \n",
271 | " attemptCount | \n",
272 | "
\n",
273 | " \n",
274 | " \n",
275 | " \n",
276 | " 0 | \n",
277 | " 172.30.2.60 | \n",
278 | " 10.0.0.9 | \n",
279 | " 82 | \n",
280 | " 34839 | \n",
281 | " 47716 | \n",
282 | " 134 | \n",
283 | " 2013-04-03 06:48:47 | \n",
284 | " 2013-04-03 12:12:37 | \n",
285 | " 82 | \n",
286 | "
\n",
287 | " \n",
288 | " 1 | \n",
289 | " 172.10.1.162 | \n",
290 | " 10.0.0.11 | \n",
291 | " 87 | \n",
292 | " 39628 | \n",
293 | " 53983 | \n",
294 | " 24 | \n",
295 | " 2013-04-03 06:50:13 | \n",
296 | " 2013-04-03 14:58:35 | \n",
297 | " 87 | \n",
298 | "
\n",
299 | " \n",
300 | " 2 | \n",
301 | " 10.1.0.76 | \n",
302 | " 172.10.1.82 | \n",
303 | " 1 | \n",
304 | " 633 | \n",
305 | " 392 | \n",
306 | " 0 | \n",
307 | " 2013-04-03 09:55:05 | \n",
308 | " 2013-04-03 09:55:05 | \n",
309 | " 1 | \n",
310 | "
\n",
311 | " \n",
312 | " 3 | \n",
313 | " 172.30.1.56 | \n",
314 | " 172.0.0.1 | \n",
315 | " 25 | \n",
316 | " 3330 | \n",
317 | " 3240 | \n",
318 | " 67 | \n",
319 | " 2013-04-03 01:59:09 | \n",
320 | " 2013-04-03 22:05:39 | \n",
321 | " 25 | \n",
322 | "
\n",
323 | " \n",
324 | " 4 | \n",
325 | " 172.30.1.10 | \n",
326 | " 10.0.0.12 | \n",
327 | " 69 | \n",
328 | " 31042 | \n",
329 | " 43044 | \n",
330 | " 25 | \n",
331 | " 2013-04-03 06:48:01 | \n",
332 | " 2013-04-03 12:11:40 | \n",
333 | " 69 | \n",
334 | "
\n",
335 | " \n",
336 | " 5 | \n",
337 | " 172.10.1.89 | \n",
338 | " 10.0.0.5 | \n",
339 | " 112 | \n",
340 | " 51222 | \n",
341 | " 70260 | \n",
342 | " 24 | \n",
343 | " 2013-04-03 06:48:24 | \n",
344 | " 2013-04-03 15:17:39 | \n",
345 | " 112 | \n",
346 | "
\n",
347 | " \n",
348 | " 6 | \n",
349 | " 172.10.1.234 | \n",
350 | " 10.0.0.5 | \n",
351 | " 104 | \n",
352 | " 47287 | \n",
353 | " 64750 | \n",
354 | " 18 | \n",
355 | " 2013-04-03 06:53:55 | \n",
356 | " 2013-04-03 15:11:07 | \n",
357 | " 104 | \n",
358 | "
\n",
359 | " \n",
360 | " 7 | \n",
361 | " 172.30.2.125 | \n",
362 | " 10.0.0.9 | \n",
363 | " 69 | \n",
364 | " 30701 | \n",
365 | " 41558 | \n",
366 | " 341 | \n",
367 | " 2013-04-03 06:50:50 | \n",
368 | " 2013-04-03 12:12:37 | \n",
369 | " 69 | \n",
370 | "
\n",
371 | " \n",
372 | " 8 | \n",
373 | " 172.30.1.85 | \n",
374 | " 10.0.0.8 | \n",
375 | " 84 | \n",
376 | " 37828 | \n",
377 | " 52864 | \n",
378 | " 3 | \n",
379 | " 2013-04-03 06:48:21 | \n",
380 | " 2013-04-03 12:06:53 | \n",
381 | " 84 | \n",
382 | "
\n",
383 | " \n",
384 | " 9 | \n",
385 | " 10.0.0.9 | \n",
386 | " 172.30.1.124 | \n",
387 | " 1 | \n",
388 | " 632 | \n",
389 | " 391 | \n",
390 | " 0 | \n",
391 | " 2013-04-03 10:36:04 | \n",
392 | " 2013-04-03 10:36:04 | \n",
393 | " 1 | \n",
394 | "
\n",
395 | " \n",
396 | "
\n",
397 | "
"
398 | ],
399 | "text/plain": [
400 | " source destination targetPorts bytesOut bytesIn \\\n",
401 | "0 172.30.2.60 10.0.0.9 82 34839 47716 \n",
402 | "1 172.10.1.162 10.0.0.11 87 39628 53983 \n",
403 | "2 10.1.0.76 172.10.1.82 1 633 392 \n",
404 | "3 172.30.1.56 172.0.0.1 25 3330 3240 \n",
405 | "4 172.30.1.10 10.0.0.12 69 31042 43044 \n",
406 | "5 172.10.1.89 10.0.0.5 112 51222 70260 \n",
407 | "6 172.10.1.234 10.0.0.5 104 47287 64750 \n",
408 | "7 172.30.2.125 10.0.0.9 69 30701 41558 \n",
409 | "8 172.30.1.85 10.0.0.8 84 37828 52864 \n",
410 | "9 10.0.0.9 172.30.1.124 1 632 391 \n",
411 | "\n",
412 | " durationSeconds firstFlowDate lastFlowDate attemptCount \n",
413 | "0 134 2013-04-03 06:48:47 2013-04-03 12:12:37 82 \n",
414 | "1 24 2013-04-03 06:50:13 2013-04-03 14:58:35 87 \n",
415 | "2 0 2013-04-03 09:55:05 2013-04-03 09:55:05 1 \n",
416 | "3 67 2013-04-03 01:59:09 2013-04-03 22:05:39 25 \n",
417 | "4 25 2013-04-03 06:48:01 2013-04-03 12:11:40 69 \n",
418 | "5 24 2013-04-03 06:48:24 2013-04-03 15:17:39 112 \n",
419 | "6 18 2013-04-03 06:53:55 2013-04-03 15:11:07 104 \n",
420 | "7 341 2013-04-03 06:50:50 2013-04-03 12:12:37 69 \n",
421 | "8 3 2013-04-03 06:48:21 2013-04-03 12:06:53 84 \n",
422 | "9 0 2013-04-03 10:36:04 2013-04-03 10:36:04 1 "
423 | ]
424 | },
425 | "execution_count": 6,
426 | "metadata": {},
427 | "output_type": "execute_result"
428 | }
429 | ],
430 | "source": [
431 | "# how's it look?\n",
432 | "gdf.head(10)"
433 | ]
434 | },
435 | {
436 | "cell_type": "markdown",
437 | "metadata": {
438 | "colab_type": "text",
439 | "id": "6PXbjW1hTxrD"
440 | },
441 | "source": [
442 | "## Apache Spark\n",
443 | "The cell below installs Apache Spark ([PySpark](https://spark.apache.org/docs/latest/api/python/index.html))."
444 | ]
445 | },
446 | {
447 | "cell_type": "code",
448 | "execution_count": 7,
449 | "metadata": {
450 | "colab": {},
451 | "colab_type": "code",
452 | "id": "pnEEvVEtT8xi"
453 | },
454 | "outputs": [
455 | {
456 | "name": "stdout",
457 | "output_type": "stream",
458 | "text": [
459 | "Collecting pyspark\n",
460 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/87/21/f05c186f4ddb01d15d0ddc36ef4b7e3cedbeb6412274a41f26b55a650ee5/pyspark-2.4.4.tar.gz (215.7MB)\n",
461 | "\u001b[K |████████████████████████████████| 215.7MB 50kB/s s eta 0:00:01\n",
462 | "\u001b[?25hCollecting py4j==0.10.7\n",
463 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)\n",
464 | "\u001b[K |████████████████████████████████| 204kB 54.4MB/s eta 0:00:01\n",
465 | "\u001b[?25hBuilding wheels for collected packages: pyspark\n",
466 | " Building wheel for pyspark (setup.py) ... \u001b[?25ldone\n",
467 | "\u001b[?25h Created wheel for pyspark: filename=pyspark-2.4.4-py2.py3-none-any.whl size=216130387 sha256=14abaa33edbf681f432ee00d234718731961da639e5eec86c4784667d43b4f5d\n",
468 | " Stored in directory: /home/winston/.cache/pip/wheels/ab/09/4d/0d184230058e654eb1b04467dbc1292f00eaa186544604b471\n",
469 | "Successfully built pyspark\n",
470 | "Installing collected packages: py4j, pyspark\n",
471 | "Successfully installed py4j-0.10.7 pyspark-2.4.4\n"
472 | ]
473 | }
474 | ],
475 | "source": [
476 | "# installs Spark (2.4.4 Jan 2020)\n",
477 | "!pip install pyspark"
478 | ]
479 | },
480 | {
481 | "cell_type": "markdown",
482 | "metadata": {
483 | "colab_type": "text",
484 | "id": "W3-XmZkz_zmw"
485 | },
486 | "source": [
487 | "#### PyBlazing vs PySpark\n",
488 | "With everything installed we can launch a SparkSession and see how BlazingSQL stacks up."
489 | ]
490 | },
491 | {
492 | "cell_type": "code",
493 | "execution_count": 1,
494 | "metadata": {
495 | "colab": {
496 | "base_uri": "https://localhost:8080/",
497 | "height": 51
498 | },
499 | "colab_type": "code",
500 | "id": "nioEt2MqT9B0",
501 | "outputId": "f75b9823-5dbd-45b1-9282-562d3d6ddaf0"
502 | },
503 | "outputs": [
504 | {
505 | "name": "stdout",
506 | "output_type": "stream",
507 | "text": [
508 | "CPU times: user 321 ms, sys: 208 ms, total: 529 ms\n",
509 | "Wall time: 3.65 s\n"
510 | ]
511 | }
512 | ],
513 | "source": [
514 | "%%time\n",
515 | "# copied this cell's snippet from another Google Colab by Luca Canali here: https://colab.research.google.com/github/LucaCanali/sparkMeasure/blob/master/examples/SparkMeasure_Jupyter_Colab_Example.ipynb\n",
516 | "\n",
517 | "from pyspark.sql import SparkSession\n",
518 | "\n",
519 | "# Create Spark Session\n",
520 | "# This example uses a local cluster, you can modify master to use YARN or K8S if available \n",
521 | "# This example downloads sparkMeasure 0.13 for scala 2_11 from maven central\n",
522 | "\n",
523 | "spark = SparkSession \\\n",
524 | " .builder \\\n",
525 | " .master(\"local[*]\") \\\n",
526 | " .appName(\"PySpark Netflow Benchmark code\") \\\n",
527 | " .config(\"spark.jars.packages\",\"ch.cern.sparkmeasure:spark-measure_2.11:0.13\") \\\n",
528 | " .getOrCreate()"
529 | ]
530 | },
531 | {
532 | "cell_type": "markdown",
533 | "metadata": {
534 | "colab_type": "text",
535 | "id": "G8XSppQiUdLY"
536 | },
537 | "source": [
538 | "### Load & Query Table"
539 | ]
540 | },
541 | {
542 | "cell_type": "code",
543 | "execution_count": 2,
544 | "metadata": {
545 | "colab": {
546 | "base_uri": "https://localhost:8080/",
547 | "height": 51
548 | },
549 | "colab_type": "code",
550 | "id": "ZSLuSYSOUDtf",
551 | "outputId": "2b93169b-63c5-4c46-da14-af87645bf51b"
552 | },
553 | "outputs": [
554 | {
555 | "name": "stdout",
556 | "output_type": "stream",
557 | "text": [
558 | "CPU times: user 20.2 ms, sys: 11.3 ms, total: 31.5 ms\n",
559 | "Wall time: 2min 46s\n"
560 | ]
561 | }
562 | ],
563 | "source": [
564 | "%%time\n",
565 | "# load CSV into Spark\n",
566 | "netflow_df = spark.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('data/nf-chunk2.csv')"
567 | ]
568 | },
569 | {
570 | "cell_type": "code",
571 | "execution_count": 3,
572 | "metadata": {
573 | "colab": {
574 | "base_uri": "https://localhost:8080/",
575 | "height": 51
576 | },
577 | "colab_type": "code",
578 | "id": "iT3BwLn8UDwE",
579 | "outputId": "4eeff800-489f-4230-adb9-f3a1c16ede66"
580 | },
581 | "outputs": [
582 | {
583 | "name": "stdout",
584 | "output_type": "stream",
585 | "text": [
586 | "CPU times: user 1.72 ms, sys: 176 µs, total: 1.9 ms\n",
587 | "Wall time: 157 ms\n"
588 | ]
589 | }
590 | ],
591 | "source": [
592 | "%%time\n",
593 | "# create table for querying\n",
594 | "netflow_df.createOrReplaceTempView('netflow')"
595 | ]
596 | },
597 | {
598 | "cell_type": "code",
599 | "execution_count": 4,
600 | "metadata": {
601 | "colab": {
602 | "base_uri": "https://localhost:8080/",
603 | "height": 493
604 | },
605 | "colab_type": "code",
606 | "id": "9SBhahA5UD2k",
607 | "outputId": "accc1938-6470-44df-ab7f-70058c755b2b"
608 | },
609 | "outputs": [
610 | {
611 | "name": "stdout",
612 | "output_type": "stream",
613 | "text": [
614 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n",
615 | "| source| destination|targetPorts|bytesOut|bytesIn|durationSeconds| firstFlowDate| lastFlowDate|attemptCount|\n",
616 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n",
617 | "| 172.10.1.13|239.255.255.250| 15| 2975| 0| 6|2013-04-03 06:36:19|2013-04-03 06:36:27| 15|\n",
618 | "|172.30.1.204|239.255.255.250| 8| 1750| 0| 6|2013-04-03 06:36:13|2013-04-03 06:36:20| 8|\n",
619 | "| 172.30.2.86| 172.0.0.1| 1| 540| 0| 2|2013-04-03 06:36:09|2013-04-03 06:36:09| 1|\n",
620 | "|172.30.1.246| 172.0.0.1| 29| 2610| 2610| 0|2013-04-03 00:26:46|2013-04-03 23:06:00| 29|\n",
621 | "| 172.30.1.51|239.255.255.250| 16| 3850| 0| 18|2013-04-03 06:35:22|2013-04-03 06:44:08| 16|\n",
622 | "| 172.10.1.35| 172.0.0.1| 1| 270| 0| 0|2013-04-03 06:36:21|2013-04-03 06:36:21| 1|\n",
623 | "| 172.20.1.91|239.255.255.250| 19| 3675| 0| 6|2013-04-03 06:36:50|2013-04-03 06:36:59| 19|\n",
624 | "|172.20.1.249|239.255.255.250| 2| 700| 0| 6|2013-04-03 06:37:17|2013-04-03 06:37:23| 2|\n",
625 | "|172.10.1.232| 172.0.0.1| 30| 3060| 3060| 48|2013-04-03 01:31:31|2013-04-03 22:53:36| 30|\n",
626 | "|172.10.1.238|239.255.255.250| 2| 700| 0| 6|2013-04-03 06:36:44|2013-04-03 06:36:51| 2|\n",
627 | "+------------+---------------+-----------+--------+-------+---------------+-------------------+-------------------+------------+\n",
628 | "only showing top 10 rows\n",
629 | "\n",
630 | "CPU times: user 4.39 ms, sys: 8.82 ms, total: 13.2 ms\n",
631 | "Wall time: 1min 9s\n"
632 | ]
633 | }
634 | ],
635 | "source": [
636 | "%%time\n",
637 |     "# define the same query we tested on BlazingSQL above\n",
638 | "query = '''\n",
639 | " SELECT\n",
640 | " a.firstSeenSrcIp as source,\n",
641 | " a.firstSeenDestIp as destination,\n",
642 | " count(a.firstSeenDestPort) as targetPorts,\n",
643 | " SUM(a.firstSeenSrcTotalBytes) as bytesOut,\n",
644 | " SUM(a.firstSeenDestTotalBytes) as bytesIn,\n",
645 | " SUM(a.durationSeconds) as durationSeconds,\n",
646 | " MIN(parsedDate) as firstFlowDate,\n",
647 | " MAX(parsedDate) as lastFlowDate,\n",
648 | " COUNT(*) as attemptCount\n",
649 | " FROM\n",
650 | " netflow a\n",
651 | " GROUP BY\n",
652 | " a.firstSeenSrcIp,\n",
653 | " a.firstSeenDestIp\n",
654 | " '''\n",
655 | "\n",
656 | "# query with Spark\n",
657 | "edges_df = spark.sql(query)\n",
658 | "\n",
659 |     "# display the first 10 result rows\n",
660 | "edges_df.show(10)"
661 | ]
662 | }
663 | ],
664 | "metadata": {
665 | "accelerator": "GPU",
666 | "colab": {
667 | "collapsed_sections": [],
668 | "name": "vs_pyspark_netflow.ipynb",
669 | "provenance": [],
670 | "toc_visible": true
671 | },
672 | "kernelspec": {
673 | "display_name": "Python 3",
674 | "language": "python",
675 | "name": "python3"
676 | },
677 | "language_info": {
678 | "codemirror_mode": {
679 | "name": "ipython",
680 | "version": 3
681 | },
682 | "file_extension": ".py",
683 | "mimetype": "text/x-python",
684 | "name": "python",
685 | "nbconvert_exporter": "python",
686 | "pygments_lexer": "ipython3",
687 | "version": "3.7.6"
688 | }
689 | },
690 | "nbformat": 4,
691 | "nbformat_minor": 4
692 | }
693 |
--------------------------------------------------------------------------------