├── .gitmodules ├── README.md ├── accelerated_data_processing_examples ├── cudf_pandas_Introduction_to_Exploratory_Data_Analysis.ipynb ├── cudf_pandas_Introduction_to_Time_Series_Data_Analysis.ipynb ├── cudf_pandas_large_string.ipynb ├── cudf_pandas_single_gpu_large_dataset.ipynb ├── multi_gpu_polars_demo.ipynb ├── notebook_list.md ├── nxcg_wikipedia_e2e.ipynb └── polars_gpu_engine_demo.ipynb ├── benchmarks └── mortgage │ ├── README.md │ ├── images │ ├── mortgage_sl1.jpg │ └── mortgage_sl2.jpg │ ├── mortgage_e2e.ipynb │ └── utils │ ├── Data_Spec.json │ └── utils.py ├── blogs_notebooks ├── Managed_memory_demo_in_cudf_pandas.ipynb ├── README.md ├── blog_notebook_for_Stacking_Generalization_with_HPO_Maximize_Accuracy_in_15_Minutes_with_NVIDIA_cuml.ipynb └── blog_notebook_for_tips_for_data_scientists_to_get_started_with_GPU_acceleration.ipynb ├── documentation_notebooks └── README.md ├── event_notebooks ├── GTC_2021 │ └── credit_scorecard │ │ ├── README.md │ │ ├── cpu │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── __init__.py │ │ ├── data_utils.py │ │ ├── woesc_demo_vehicle_data.ipynb │ │ ├── woesc_utils.py │ │ └── xgbsc_demo_vehicle_data.ipynb │ │ └── gpu │ │ ├── Dockerfile │ │ ├── README.md │ │ ├── __init__.py │ │ ├── woesc_demo_vehicle_data_gpu.ipynb │ │ └── woesc_utils_gpu.py ├── JupyterCon_2020_RAPIDSViz │ ├── .gitignore │ ├── 00 Index and Introduction.ipynb │ ├── 01 Data Inspection and Validation.ipynb │ ├── 02 Exploratory Data Visualization.ipynb │ ├── 03 Data Analysis with Visual Analytics.ipynb │ ├── 04 Explanatory Data Visualization.ipynb │ ├── README.md │ ├── assets │ │ └── dash-style.css │ ├── data │ │ ├── README_data.md │ │ └── dash-style.css │ ├── environment.yml │ └── images │ │ ├── DataLanguage.jpg │ │ ├── DivvyBikesStation_ map.png │ │ ├── PlotlyDash-Dashboard.png │ │ ├── RAPIDS-header-graphic.png │ │ ├── RAPIDS-header.png │ │ ├── RAPIDSwow.gif │ │ ├── README_images.md │ │ ├── census-crop.jpg │ │ ├── census-demo.jpg │ │ ├── cuxfilter_02_dashboard_1.png │ 
│ ├── cuxfilter_02_dashboard_2.png │ │ ├── cuxfilter_02_dashboard_3.png │ │ ├── cuxfilter_02_dashboard_4.png │ │ ├── dashboard-sketch-ideas.jpg │ │ ├── notebook_04_dashboard_1.png │ │ └── plotly_dashboard_sketch.jpg ├── KDD_2020 │ ├── Presenters.md │ ├── README.md │ ├── img │ │ ├── cybert_workflow.png │ │ ├── microsoft_logo.png │ │ ├── njit_logo.png │ │ ├── nvidia_logo.jpg │ │ └── rapids_logo.png │ ├── kdd_initial_setup.sh │ └── notebooks │ │ ├── Lungs │ │ ├── __pycache__ │ │ │ └── rapids_scanpy_funcs.cpython-37.pyc │ │ ├── hlca_lung_gpu_analysis.ipynb │ │ └── rapids_scanpy_funcs.py │ │ ├── Taxi │ │ ├── NYCTax.ipynb │ │ ├── img │ │ │ └── ny_yellow_cab.jpg │ │ └── nyctaxi_data.py │ │ ├── cybert │ │ ├── cyBERT_training_inference.ipynb │ │ ├── models │ │ │ ├── apache_cased_example_labels.p │ │ │ ├── apache_label_map.txt │ │ │ └── apache_label_map_example.txt │ │ ├── resources │ │ │ ├── bert-base-cased-hash.txt │ │ │ ├── bert-base-cased-vocab.txt │ │ │ └── cybert_workflow.png │ │ └── training_data │ │ │ └── apache_sample_1k.csv │ │ ├── nvtabular │ │ └── rossmann-store-sales-example.ipynb │ │ └── parking │ │ ├── README.md │ │ ├── __patch │ │ └── cuspatial_init_patched.py │ │ └── codes │ │ ├── 1_rapids_seattleParking.ipynb │ │ ├── 2_rapids_seattleParking_graph.ipynb │ │ ├── 3_rapids_seattleParking_parkingNodes.ipynb │ │ ├── config │ │ └── GoogleMapsAPI.cred │ │ └── maps_rendered │ │ ├── map_as_crow_flies.html │ │ ├── map_walk_final.html │ │ └── map_walk_interim.html ├── README.md └── TMLS_2020 │ └── notebooks │ └── Taxi │ ├── Overview-Taxi.ipynb │ ├── img │ └── TMLS.png │ └── nyctaxi_data.py ├── getting_started_tutorials ├── 10min_to_cudf_colab.ipynb ├── README.md ├── accelerated_networkx_demo.ipynb ├── cudf_pandas_colab_demo.ipynb ├── cudf_pandas_demo.ipynb ├── cudf_pandas_opencellid_demo.ipynb ├── cudf_pandas_stocks_demo.ipynb ├── cuml_sklearn_colab_demo.ipynb ├── dask-sql-weather.ipynb ├── images │ └── ibis-cudf-pandas-comparison.png ├── opencellid_downloader.py └── 
rapids-pip-colab-template.ipynb └── team_contributions ├── README.md └── cuxfilter-tutorial ├── LICENSE ├── README.md ├── cuxfilter_tutorial.ipynb └── preprocess.py /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "team_contributions/Spatial-Analytics-Viz"] 2 | path = team_contributions/Spatial-Analytics-Viz 3 | url = https://github.com/exactlyallan/Spatial-Analytics-Viz 4 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # RAPIDS Showcase Notebooks 2 | A [RAPIDS](https://rapids.ai/) community focused showcase of notebooks, examples and demos. 3 | 4 | Some examples include: 5 | - A visualization tutorial at: showcase/event_notebooks/JupyterCon_2020_RAPIDSViz/ 6 | - A Plotly dashboard using cuDF, cuGraph, and cuSpatial at: showcase/team_contributions/Spatial-Analytics-Viz 7 | 8 | Have a look around! 9 | -------------------------------------------------------------------------------- /accelerated_data_processing_examples/notebook_list.md: -------------------------------------------------------------------------------- 1 | # List of Notebooks 2 | -------------------------------------------------------------------------------- /accelerated_data_processing_examples/nxcg_wikipedia_e2e.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "x7aMYrwpiDyz" 7 | }, 8 | "source": [ 9 | "# End-to-End Demo\n", 10 | "## Running Pagerank on Wikipedia With vs. Without `nx-cugraph`\n", 11 | "\n", 12 | "This notebook demonstrates a zero code change, end-to-end workflow using `cudf.pandas` and `nx-cugraph`.\n", 13 | "\n", 14 | "Please see the [System Requirements](https://docs.rapids.ai/api/cugraph/stable/nx_cugraph/installation/#system-requirements) in order to run this notebook." 
15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": { 21 | "colab": { 22 | "base_uri": "https://localhost:8080/" 23 | }, 24 | "id": "caioaKAHiDy3", 25 | "outputId": "9b4dfc51-8d75-4386-9296-5bdf3790c5bc" 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "# Uncomment these two lines to enable GPU acceleration\n", 30 | "# The rest of the code stays the same!\n", 31 | "\n", 32 | "%load_ext cudf.pandas\n", 33 | "%env NX_CUGRAPH_AUTOCONFIG=True" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": { 40 | "id": "fQsXbML0NDDh" 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "import pandas as pd\n", 45 | "import networkx as nx" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "id": "6FwLpFboiDy5" 52 | }, 53 | "source": [ 54 | "Downloading the data" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 5, 60 | "metadata": { 61 | "id": "VTZysiOqOr3H" 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "import gzip\n", 66 | "import shutil\n", 67 | "import urllib.request\n", 68 | "from pathlib import Path\n", 69 | "\n", 70 | "# Get the data\n", 71 | "def download_datafile(url, file_path):\n", 72 | " compressed_path = file_path + \".gz\"\n", 73 | "\n", 74 | " if not Path(file_path).exists():\n", 75 | " print(f\"File not found. Downloading from {url}...\")\n", 76 | " urllib.request.urlretrieve(url, compressed_path)\n", 77 | "\n", 78 | " print(f\"\\tDownloaded to {compressed_path}. Unzipping...\")\n", 79 | " with gzip.open(compressed_path, 'rb') as f_in, open(file_path, 'wb') as f_out:\n", 80 | " shutil.copyfileobj(f_in, f_out)\n", 81 | "\n", 82 | " print(\"Done.\")\n", 83 | " else:\n", 84 | " print(f\"File already exists at {file_path}. 
Skipping download\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": { 91 | "colab": { 92 | "base_uri": "https://localhost:8080/" 93 | }, 94 | "id": "71o3rsvRiDy6", 95 | "outputId": "5b76fd27-1fe8-4bfb-99a7-746ba040db69" 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "nodedata_url=\"https://data.rapids.ai/cugraph/benchmark/enwiki-20240620-nodeids.csv.gz\"\n", 100 | "nodedata_path = \"enwiki-20240620-nodeids.csv\"\n", 101 | "download_datafile(nodedata_url, nodedata_path)\n", 102 | "\n", 103 | "edgelist_url=\"https://data.rapids.ai/cugraph/benchmark/enwiki-20240620-edges.csv.gz\"\n", 104 | "edgelist_path = \"enwiki-20240620-edges.csv\"\n", 105 | "download_datafile(edgelist_url, edgelist_path)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": { 111 | "id": "7mHFaMtdO3_Z" 112 | }, 113 | "source": [ 114 | "The dataset used in this script falls under the Creative Common Attribution-ShareAlike 4.0 International (CC BY-SA 4.0) License, available at https://creativecommons.org/licenses/by-sa/4.0/legalcode.en" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": { 120 | "id": "1NsIwM02iDy7" 121 | }, 122 | "source": [ 123 | "Timed end-to-end code" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": { 130 | "colab": { 131 | "base_uri": "https://localhost:8080/" 132 | }, 133 | "id": "qLM-hdWZiDy7", 134 | "outputId": "2cec80bf-fae5-44b8-f57c-f58b7b994e97" 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "%%time\n", 139 | "\n", 140 | "# Read the Wikipedia Connectivity data from `edgelist_path`\n", 141 | "edgelist_df = pd.read_csv(\n", 142 | " edgelist_path,\n", 143 | " sep=\" \",\n", 144 | " names=[\"src\", \"dst\"],\n", 145 | " dtype=\"int32\",\n", 146 | ")" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": { 153 | "colab": { 154 | "base_uri": "https://localhost:8080/" 155 | }, 
156 | "id": "8UIsAc1MmjIS", 157 | "outputId": "c99bc050-bed4-4e83-d7b0-c4cb9ca6189f" 158 | }, 159 | "outputs": [], 160 | "source": [ 161 | "%%time\n", 162 | "\n", 163 | "# Read the Wikipedia Page metadata from `nodedata_path`\n", 164 | "nodedata_df = pd.read_csv(\n", 165 | " nodedata_path,\n", 166 | " sep=\"\\t\",\n", 167 | " names=[\"nodeid\", \"title\"],\n", 168 | " dtype={\"nodeid\": \"int32\", \"title\": \"str\"},\n", 169 | ")" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": { 176 | "id": "IcqAvqCImmTr" 177 | }, 178 | "outputs": [], 179 | "source": [ 180 | "%%time\n", 181 | "\n", 182 | "# Create a NetworkX graph from the connectivity info\n", 183 | "G = nx.from_pandas_edgelist(\n", 184 | " edgelist_df,\n", 185 | " source=\"src\",\n", 186 | " target=\"dst\",\n", 187 | " create_using=nx.DiGraph,\n", 188 | ")" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": { 195 | "id": "ufhSlz1WmtQg" 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "%%time\n", 200 | "\n", 201 | "# Run pagerank on NetworkX\n", 202 | "nx_pr_vals = nx.pagerank(G)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": { 209 | "id": "fjFFGFISmu2V" 210 | }, 211 | "outputs": [], 212 | "source": [ 213 | "%%time\n", 214 | "\n", 215 | "# Create a DataFrame containing the results\n", 216 | "pagerank_df = pd.DataFrame({\n", 217 | " \"nodeid\": nx_pr_vals.keys(),\n", 218 | " \"pagerank\": nx_pr_vals.values()\n", 219 | "})" 220 | ] 221 | }, 222 | { 223 | "cell_type": "code", 224 | "execution_count": null, 225 | "metadata": { 226 | "id": "QB0fooRwmv8T" 227 | }, 228 | "outputs": [], 229 | "source": [ 230 | "%%time\n", 231 | "# Add NetworkX results to `nodedata` as new columns\n", 232 | "nodedata_df = nodedata_df.merge(pagerank_df, how=\"left\", on=\"nodeid\")\n", 233 | "\n", 234 | "# Here the top 25 pages based on pagerank value\n", 235 | 
"nodedata_df.sort_values(by=\"pagerank\", ascending=False).head(25)" 236 | ] 237 | } 238 | ], 239 | "metadata": { 240 | "accelerator": "GPU", 241 | "colab": { 242 | "gpuType": "A100", 243 | "machine_shape": "hm", 244 | "provenance": [] 245 | }, 246 | "kernelspec": { 247 | "display_name": "Python 3", 248 | "name": "python3" 249 | }, 250 | "language_info": { 251 | "codemirror_mode": { 252 | "name": "ipython", 253 | "version": 3 254 | }, 255 | "file_extension": ".py", 256 | "mimetype": "text/x-python", 257 | "name": "python", 258 | "nbconvert_exporter": "python", 259 | "pygments_lexer": "ipython3", 260 | "version": "3.11.9" 261 | } 262 | }, 263 | "nbformat": 4, 264 | "nbformat_minor": 0 265 | } 266 | -------------------------------------------------------------------------------- /benchmarks/mortgage/README.md: -------------------------------------------------------------------------------- 1 | # Mortgage Workflow 2 | 3 | ## The Dataset 4 | The dataset used with this workflow is derived from [Fannie Mae’s Single-Family Loan Performance Data](http://www.fanniemae.com/portal/funding-the-market/data/loan-performance-data.html) with all rights reserved by Fannie Mae. This processed dataset is redistributed with permission and consent from Fannie Mae. 5 | 6 | To acquire this dataset, please visit [RAPIDS Datasets Homepage](https://docs.rapids.ai/datasets/mortgage-data) 7 | 8 | ## Introduction 9 | The Mortgage workflow is composed of three core phases: 10 | 11 | 1. ETL - Extract, Transform, Load 12 | 2. Data Conversion 13 | 3. ML - Training 14 | 15 | ### ETL 16 | Data is: 17 | 1. Read in from storage 18 | 2. Transformed to emphasize key features 19 | 3. Loaded into volatile memory for conversion 20 | 21 | ### Data Conversion 22 | Features are: 23 | 1. Broken into (labels, data) pairs 24 | 2. Distributed across many workers 25 | 3. 
Converted into compressed sparse row (CSR) matrix format for XGBoost 26 | 27 | ### Machine Learning 28 | The CSR data is fed into a distributed training session with `xgboost.dask` 29 | 30 | 31 | ## Performance 32 | We regularly benchmark RAPIDS on this workload to measure our performance against not just Apache Spark on CPUs but past versions of RAPIDS. 33 | 34 | ![Slide 1](images/mortgage_sl1.jpg) 35 | 36 | ![Slide 2](images/mortgage_sl2.jpg) 37 | -------------------------------------------------------------------------------- /benchmarks/mortgage/images/mortgage_sl1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/benchmarks/mortgage/images/mortgage_sl1.jpg -------------------------------------------------------------------------------- /benchmarks/mortgage/images/mortgage_sl2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/benchmarks/mortgage/images/mortgage_sl2.jpg -------------------------------------------------------------------------------- /benchmarks/mortgage/utils/Data_Spec.json: -------------------------------------------------------------------------------- 1 | { 2 | "SpecInfo": 3 | [ 4 | { 5 | "Total_Mem" : 630e9, 6 | "Start_Year" : 2000, 7 | "End_Year" : 2016, 8 | "Part_Count" : [48, 96] 9 | }, 10 | 11 | { 12 | "Total_Mem" : 511e9, 13 | "Start_Year" : 2000, 14 | "End_Year" : 2016, 15 | "Part_Count" : [32, 64] 16 | }, 17 | 18 | { 19 | "Total_Mem" : 255e9, 20 | "Start_Year" : 2000, 21 | "End_Year" : 2016, 22 | "Part_Count" : [24, 48] 23 | }, 24 | 25 | { 26 | "Total_Mem" : 127e9, 27 | "Start_Year" : 2000, 28 | "End_Year" : 2007, 29 | "Part_Count" : [16, 24] 30 | }, 31 | 32 | { 33 | "Total_Mem" : 47e9, 34 | "Start_Year" : 2000, 35 | "End_Year" : 2004, 36 | "Part_Count" : [8, 
import json
import glob
import multiprocessing
import os
import tarfile
# NOTE: `import urllib` alone does not guarantee the `request` submodule is
# loaded; the explicit submodule import is required for urlretrieve below.
import urllib.request

# Global variables

# Links to mortgage data files (1GB-per-file and 2GB-per-file yearly splits)
MORTGAGE_YEARLY_1GB_SPLITS_URL = "https://data.rapids.ai/notebook-mortgage-data/mortgage_yearly/"
MORTGAGE_YEARLY_2GB_SPLITS_URL = "https://data.rapids.ai/notebook-mortgage-data/mortgage_yearly_2gb/"


def get_data(data_dir, start_year, end_year, use_1GB_splits):
    """
    Download and extract mortgage data to the specified ``data_dir``.

    Only years between ``start_year`` and ``end_year`` (inclusive) are
    downloaded; a year is skipped when its Q4 acquisition file already
    exists. ``data_dir`` is concatenated directly with file names, so it is
    expected to end with a path separator (kept from the original behavior).

    Parameters
    ----------
    data_dir : str
        Destination directory (with trailing separator).
    start_year, end_year : int
        Inclusive range of years to fetch.
    use_1GB_splits : bool
        True to fetch the 1GB-split archives, False for the 2GB splits.
    """
    if use_1GB_splits:
        data_url = MORTGAGE_YEARLY_1GB_SPLITS_URL
    else:
        data_url = MORTGAGE_YEARLY_2GB_SPLITS_URL
    for year in range(start_year, end_year + 1):
        if not os.path.isfile(data_dir + "acq/Acquisition_" + str(year) + "Q4.txt"):
            print(f"Downloading data for year {year}")
            filename = "mortgage_" + str(year)
            # Archive naming differs between the two split sizes.
            filename += "_1gb.tgz" if use_1GB_splits else "_2GB.tgz"
            urllib.request.urlretrieve(data_url + filename, data_dir + filename)
            print("Download complete")
            print("Decompressing and extracting data")

            # Context manager guarantees the archive is closed even if
            # extraction raises.
            with tarfile.open(data_dir + filename, mode="r:gz") as tar:
                tar.extractall(path=data_dir)
            print(f"Done extracting year {year}")

    if not os.path.isfile(data_dir + "names.csv"):
        urllib.request.urlretrieve(data_url + "names.csv", data_dir + "names.csv")


def _read_data_spec(filename=os.path.dirname(__file__) + "/Data_Spec.json"):
    """
    Read the Data_Spec json and return its "SpecInfo" list.

    Raises
    ------
    ValueError
        If the spec file has no top-level "SpecInfo" key.
    """
    with open(filename) as f:
        data_spec = json.load(f)

    try:
        spec_list = data_spec["SpecInfo"]
    except KeyError as err:
        # Include the offending path in the message and chain the KeyError.
        raise ValueError(f"SpecInfo missing in Data spec file: {filename}") from err
    return spec_list


def determine_dataset(total_mem, min_mem, part_count=None):
    """
    Determine params and dataset to use based on the Data spec sheet and
    available memory.

    Parameters
    ----------
    total_mem : number
        Total memory (bytes) available for the workload.
    min_mem : number
        Minimum per-GPU memory (bytes); >= 31.5e9 selects the 2GB splits.
    part_count : int or None
        Optional explicit partition count; when given it is matched against
        the spec sheet instead of ``total_mem``.

    Returns
    -------
    tuple
        ``(start_year, end_year, part_count, use_1GB_splits)``
    """
    start_year = None  # start year for etl processing
    end_year = None  # end year for etl processing (inclusive)

    # GPUs with >= ~32GB of memory can handle the larger 2GB file splits.
    use_1GB_splits = True
    if min_mem >= 31.5e9:
        use_1GB_splits = False

    spec_list = _read_data_spec()
    # Assumption: spec_list elements are ordered by memory requirement
    # in DESCENDING order.

    # TODO: Code duplication between the two branches. Consolidate into one.
    if part_count:
        part_count = int(part_count)
        for i, spec in enumerate(spec_list):
            spec_part_count = (
                spec["Part_Count"][1] if use_1GB_splits else spec["Part_Count"][0]
            )
            if part_count > spec_part_count:
                # The requested count exceeds this spec's capacity: fall
                # back to the previous (larger) spec when one exists.
                start_year = (
                    spec_list[i - 1]["Start_Year"] if i > 0 else spec["Start_Year"]
                )
                end_year = spec_list[i - 1]["End_Year"] if i > 0 else spec["End_Year"]
                break
        if not start_year:
            # Requested count fits even the smallest spec.
            start_year = spec_list[-1]["Start_Year"]
            end_year = spec_list[-1]["End_Year"]

    else:
        # Pick the first (largest) spec whose memory requirement is met.
        for spec in spec_list:
            spec_part_count = (
                spec["Part_Count"][1] if use_1GB_splits else spec["Part_Count"][0]
            )
            if total_mem >= spec["Total_Mem"]:
                start_year = spec["Start_Year"]
                end_year = spec["End_Year"]
                part_count = spec_part_count
                break

    return (start_year, end_year, part_count, use_1GB_splits)


def memory_info():
    """
    Return the total memory (bytes) of GPU 0.

    Assumes identical GPUs in a node. ``pynvml`` is imported lazily so the
    rest of this module remains importable on machines without NVML.
    """
    import pynvml

    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(0)
    gpu_mem = pynvml.nvmlDeviceGetMemoryInfo(handle).total
    pynvml.nvmlShutdown()
    return gpu_mem


def get_num_files(start_year, end_year, perf_dir):
    """
    Return the number of performance files whose names contain any year in
    the inclusive range ``start_year``..``end_year`` under ``perf_dir``.
    """
    count = 0
    for year in range(start_year, end_year + 1):
        count += len(glob.glob(perf_dir + f"/*{year}*"))
    return count


def get_cpu_cores():
    """Return the number of CPU cores on this machine."""
    return multiprocessing.cpu_count()
The dataset used for this notebook can be downloaded from Kaggle and consists of a \n", 18 | "- [train](https://www.kaggle.com/code/startupsci/titanic-data-science-solutions/input?select=train.csv) dataset\n", 19 | "- [test](https://www.kaggle.com/code/startupsci/titanic-data-science-solutions/input?select=test.csv) dataset\n", 20 | "\n", 21 | "You will need to accept the terms of the competition before you can download it. Once you do, please download both before continuing and put them into the same folder as you're running this notebook.\n" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "## Hello World: exploring cuDF and GPU Acceleration for pandas" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "%load_ext cudf.pandas loads the cuDF extension for Pandas, allowing the use of GPU-accelerated DataFrames." 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 18, 41 | "metadata": { 42 | "id": "d_hibjM0M53K", 43 | "outputId": "3d447dda-da99-4495-c2e9-64a3fdefd752" 44 | }, 45 | "outputs": [ 46 | { 47 | "name": "stdout", 48 | "output_type": "stream", 49 | "text": [ 50 | "The cudf.pandas extension is already loaded. 
To reload it, use:\n", 51 | " %reload_ext cudf.pandas\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "%load_ext cudf.pandas" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "Import libraries, read Titanic data, and concatenate data" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 19, 69 | "metadata": { 70 | "id": "eog51TtaB5Ij" 71 | }, 72 | "outputs": [], 73 | "source": [ 74 | "import pandas as pd\n", 75 | "import cupy as cp\n", 76 | "\n", 77 | "train = pd.read_csv('./train.csv')\n", 78 | "test = pd.read_csv('./test.csv')\n", 79 | "concat = pd.concat([train, test], axis = 0)" 80 | ] 81 | }, 82 | { 83 | "cell_type": "markdown", 84 | "metadata": {}, 85 | "source": [ 86 | "Scale up the dataset to demonstrate the advantage of GPU acceleration: the original Titanic dataset is too small, so we replicate it to simulate a dataset with 1 million rows" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 20, 92 | "metadata": { 93 | "_cell_guid": "e7319668-86fe-8adc-438d-0eef3fd0a982", 94 | "_uuid": "13f38775c12ad6f914254a08f0d1ef948a2bd453", 95 | "id": "0HwIvMVJM53L", 96 | "outputId": "de15a613-e8ce-43b8-c57c-424b81b9bc06" 97 | }, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "(1000000, 12)\n", 104 | "(1000000, 11)\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "target_rows = 1_000_000\n", 110 | "repeats = -(-target_rows // len(train)) # Ceiling division\n", 111 | "train_df = pd.concat([train] * repeats, ignore_index=True).head(target_rows)\n", 112 | "print(train_df.shape) # (1000000, 2)\n", 113 | "\n", 114 | "repeats = -(-target_rows // len(test)) # Ceiling division\n", 115 | "test_df = pd.concat([test] * repeats, ignore_index=True).head(target_rows)\n", 116 | "print(test_df.shape) # (1000000, 2)\n", 117 | "\n", 118 | "combine = [train_df, test_df]" 119 | ] 120 | }, 121 | { 122 | "cell_type": "markdown", 123 | "metadata": {}, 
124 | "source": [ 125 | "The cudf.pandas extension allows the execution of familiar pandas operations such as filtering, grouping, and merging, on GPUs without requiring a code change and/or rewrites." 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 21, 131 | "metadata": { 132 | "id": "QWLwdRUWBuGn" 133 | }, 134 | "outputs": [], 135 | "source": [ 136 | "filtered_df = train_df[(train_df['Age'] > 30) & (train_df['Fare'] > 50)]\n", 137 | "grouped_df = train_df.groupby('Embarked')[['Fare', 'Age']].mean()\n", 138 | "additional_info = pd.DataFrame({\n", 139 | "\t'PassengerId': [1, 2, 3],\n", 140 | "\t'VIP_Status': ['No', 'Yes', 'No']\n", 141 | "})\n", 142 | "merged_df = train_df.merge(additional_info, on='PassengerId', how='left')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "## Tracking Performance: CPU and GPU Runtime Metrics" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "The %%cudf.pandas.profile magic command profiles the calls executed on CPU and GPU and the time taken to execute them. The profiling output reveals that certain operations reverted to CPU execution, thereby indicating areas where GPU acceleration was not effectively utilized. \n" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 22, 162 | "metadata": { 163 | "_cell_guid": "0964832a-a4be-2d6f-a89e-63526389cee9", 164 | "_uuid": "97a845528ce9f76e85055a4bb9e97c27091f6aa1", 165 | "id": "OXGjZiFdM53N", 166 | "outputId": "08f7f964-4ac8-4d38-ffa3-57e0bb511492" 167 | }, 168 | "outputs": [ 169 | { 170 | "data": { 171 | "text/html": [ 172 | "
\n", 173 | "\n", 186 | "\n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | " \n", 203 | " \n", 204 | " \n", 205 | " \n", 206 | " \n", 207 | " \n", 208 | " \n", 209 | " \n", 210 | " \n", 211 | "
PclassSurvived
010.629592
120.472810
230.242378
\n", 212 | "
" 213 | ], 214 | "text/plain": [ 215 | " Pclass Survived\n", 216 | "0 1 0.629592\n", 217 | "1 2 0.472810\n", 218 | "2 3 0.242378" 219 | ] 220 | }, 221 | "execution_count": 22, 222 | "metadata": {}, 223 | "output_type": "execute_result" 224 | }, 225 | { 226 | "data": { 227 | "text/html": [ 228 | "
                                                                                                           \n",
229 |        "                                         Total time elapsed: 0.092 seconds                                 \n",
230 |        "                                       5 GPU function calls in 0.014 seconds                               \n",
231 |        "                                       0 CPU function calls in 0.000 seconds                               \n",
232 |        "                                                                                                           \n",
233 |        "                                                       Stats                                               \n",
234 |        "                                                                                                           \n",
235 |        "┏━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n",
236 |        "┃ Function               GPU ncalls  GPU cumtime  GPU percall  CPU ncalls  CPU cumtime  CPU percall ┃\n",
237 |        "┡━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n",
238 |        "│ DataFrame.__getitem__ │ 1          │ 0.001       │ 0.001       │ 0          │ 0.000       │ 0.000       │\n",
239 |        "│ DataFrame.groupby     │ 1          │ 0.000       │ 0.000       │ 0          │ 0.000       │ 0.000       │\n",
240 |        "│ GroupBy.mean          │ 1          │ 0.005       │ 0.005       │ 0          │ 0.000       │ 0.000       │\n",
241 |        "│ DataFrame.sort_values │ 1          │ 0.002       │ 0.002       │ 0          │ 0.000       │ 0.000       │\n",
242 |        "│ DataFrame.__repr__    │ 1          │ 0.005       │ 0.005       │ 0          │ 0.000       │ 0.000       │\n",
243 |        "└───────────────────────┴────────────┴─────────────┴─────────────┴────────────┴─────────────┴─────────────┘\n",
244 |        "
\n" 245 | ], 246 | "text/plain": [ 247 | "\u001b[3m \u001b[0m\n", 248 | "\u001b[3m Total time elapsed: 0.092 seconds \u001b[0m\n", 249 | "\u001b[3m 5 GPU function calls in 0.014 seconds \u001b[0m\n", 250 | "\u001b[3m 0 CPU function calls in 0.000 seconds \u001b[0m\n", 251 | "\u001b[3m \u001b[0m\n", 252 | "\u001b[3m Stats \u001b[0m\n", 253 | "\u001b[3m \u001b[0m\n", 254 | "┏━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n", 255 | "┃\u001b[1m \u001b[0m\u001b[1mFunction \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mGPU ncalls\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mGPU cumtime\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mGPU percall\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mCPU ncalls\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mCPU cumtime\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mCPU percall\u001b[0m\u001b[1m \u001b[0m┃\n", 256 | "┡━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n", 257 | "│ DataFrame.__getitem__ │ 1 │ 0.001 │ 0.001 │ 0 │ 0.000 │ 0.000 │\n", 258 | "│ DataFrame.groupby │ 1 │ 0.000 │ 0.000 │ 0 │ 0.000 │ 0.000 │\n", 259 | "│ GroupBy.mean │ 1 │ 0.005 │ 0.005 │ 0 │ 0.000 │ 0.000 │\n", 260 | "│ DataFrame.sort_values │ 1 │ 0.002 │ 0.002 │ 0 │ 0.000 │ 0.000 │\n", 261 | "│ DataFrame.__repr__ │ 1 │ 0.005 │ 0.005 │ 0 │ 0.000 │ 0.000 │\n", 262 | "└───────────────────────┴────────────┴─────────────┴─────────────┴────────────┴─────────────┴─────────────┘\n" 263 | ] 264 | }, 265 | "metadata": {}, 266 | "output_type": "display_data" 267 | } 268 | ], 269 | "source": [ 270 | "%%cudf.pandas.profile\n", 271 | "train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "We can use Python’s magic commands %%time 
and %%timeit to time code on either the CPU or the GPU, enabling you to benchmark specific code blocks by measuring their execution time and processor type.
| "text": [ 328 | "36.8 ms ± 372 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | "%%timeit\n", 334 | "\n", 335 | "for dataset in combine:\n", 336 | " dataset['Title'] = dataset.Name.str.extract(' ([A-Za-z]+)\\\\.', expand=False)\n", 337 | "\n", 338 | "pd.crosstab(train_df['Title'], train_df['Sex'])" 339 | ] 340 | }, 341 | { 342 | "cell_type": "markdown", 343 | "metadata": {}, 344 | "source": [ 345 | "## Verifying GPU Utilization" 346 | ] 347 | }, 348 | { 349 | "cell_type": "markdown", 350 | "metadata": {}, 351 | "source": [ 352 | "Replicate a cupy.ndarray" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 25, 358 | "metadata": { 359 | "_cell_guid": "9299523c-dcf1-fb00-e52f-e2fb860a3920", 360 | "_uuid": "24a0971daa4cbc3aa700bae42e68c17ce9f3a6e2", 361 | "id": "2v0-iydSM53P", 362 | "outputId": "0c9a7965-ccae-4884-d5eb-bc000ed7c8ff" 363 | }, 364 | "outputs": [ 365 | { 366 | "data": { 367 | "text/plain": [ 368 | "array([[0., 0., 0.],\n", 369 | " [0., 0., 0.]])" 370 | ] 371 | }, 372 | "execution_count": 25, 373 | "metadata": {}, 374 | "output_type": "execute_result" 375 | } 376 | ], 377 | "source": [ 378 | "guess_ages = cp.zeros((2,3))\n", 379 | "guess_ages" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": {}, 385 | "source": [ 386 | "Whether arrays are being processed on the CPU or GPU can be checked using the type command to differentiate between NumPy and CuPy arrays. If the output is np.array, the data is being processed on the CPU. If the output is cupy.ndarray, the data is being processed on the GPU. 
" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 26, 392 | "metadata": { 393 | "id": "7nihLFkTM53P", 394 | "outputId": "f04771d3-7231-416b-9787-5ae978fba0b1" 395 | }, 396 | "outputs": [ 397 | { 398 | "data": { 399 | "text/plain": [ 400 | "cupy.ndarray" 401 | ] 402 | }, 403 | "execution_count": 26, 404 | "metadata": {}, 405 | "output_type": "execute_result" 406 | } 407 | ], 408 | "source": [ 409 | "type(guess_ages)" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "Using the print command can confirm whether the GPU is being utilized and ensure that a cuDF DataFrame is being processed. The output specifies whether the fast path (cuDF) or slow path (pandas) is in use." 417 | ] 418 | }, 419 | { 420 | "cell_type": "code", 421 | "execution_count": 27, 422 | "metadata": {}, 423 | "outputs": [ 424 | { 425 | "name": "stdout", 426 | "output_type": "stream", 427 | "text": [ 428 | "\n" 429 | ] 430 | } 431 | ], 432 | "source": [ 433 | "print(pd)" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "Commands like df.info() can be used to inspect the structure of cuDF DataFrame and confirm that computations are GPU-accelerated." 
441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 28, 446 | "metadata": { 447 | "id": "57dq01RxM53M", 448 | "outputId": "ed303fb1-65ad-446b-c1b7-9a7f4891e79b" 449 | }, 450 | "outputs": [ 451 | { 452 | "name": "stdout", 453 | "output_type": "stream", 454 | "text": [ 455 | "\n", 456 | "RangeIndex: 1000000 entries, 0 to 999999\n", 457 | "Data columns (total 11 columns):\n", 458 | " # Column Non-Null Count Dtype\n", 459 | "--- ------ -------------- -----\n", 460 | " 0 PassengerId 1000000 non-null int64\n", 461 | " 1 Survived 1000000 non-null int64\n", 462 | " 2 Pclass 1000000 non-null int64\n", 463 | " 3 Name 1000000 non-null object\n", 464 | " 4 Sex 1000000 non-null object\n", 465 | " 5 Age 801349 non-null float64\n", 466 | " 6 SibSp 1000000 non-null int64\n", 467 | " 7 Parch 1000000 non-null int64\n", 468 | " 8 Fare 1000000 non-null float64\n", 469 | " 9 Embarked 997755 non-null object\n", 470 | " 10 Title 1000000 non-null object\n", 471 | "dtypes: float64(2), int64(5), object(4)\n", 472 | "memory usage: 102.7+ MB\n" 473 | ] 474 | } 475 | ], 476 | "source": [ 477 | "train_df.info()" 478 | ] 479 | } 480 | ], 481 | "metadata": { 482 | "_change_revision": 0, 483 | "_is_fork": false, 484 | "colab": { 485 | "provenance": [] 486 | }, 487 | "kernelspec": { 488 | "display_name": "Python 3 (ipykernel)", 489 | "language": "python", 490 | "name": "python3" 491 | }, 492 | "language_info": { 493 | "codemirror_mode": { 494 | "name": "ipython", 495 | "version": 3 496 | }, 497 | "file_extension": ".py", 498 | "mimetype": "text/x-python", 499 | "name": "python", 500 | "nbconvert_exporter": "python", 501 | "pygments_lexer": "ipython3", 502 | "version": "3.12.3" 503 | } 504 | }, 505 | "nbformat": 4, 506 | "nbformat_minor": 4 507 | } 508 | -------------------------------------------------------------------------------- /documentation_notebooks/README.md: -------------------------------------------------------------------------------- 1 | # RAPIDS Documentation 
Notebooks 2 | Coming Soon! -------------------------------------------------------------------------------- /event_notebooks/GTC_2021/credit_scorecard/README.md: -------------------------------------------------------------------------------- 1 | # Building Credit Risk Scorecards with RAPIDS 2 | 3 | This repo contains code referenced from the GTC 2021 talk, "Machine Learning in Retail Credit Risk: Algorithms, Infrastructure, and Alternative Data — Past, Present, and Future [S31327]" by Paul Edwards, Director, Data Science and Model Innovation at Scotiabank. 4 | 5 | `/cpu` contains a notebook and tools demonstrating how to build scorecards using weight-of-evidence logistic regression (WOELR) on CPU, using libraries like NumPy, Pandas, and Scikit-learn. 6 | 7 | `/gpu` contains a (work-in-progress) notebook and tools accelerating the above work on GPU, using libraries like CuPy, cuDF, and cuML. 8 | 9 | This work uses vehicle loan default prediction data from L&T Company, accessible through Kaggle: https://www.kaggle.com/sneharshinde/ltfs-av-data. 
10 | 11 | 12 | -------------------------------------------------------------------------------- /event_notebooks/GTC_2021/credit_scorecard/cpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rapidsai/rapidsai-core:0.18-cuda10.1-runtime-ubuntu18.04-py3.8 2 | RUN apt-get update && source activate rapids && conda install -c conda-forge mlxtend && conda install -c conda-forge python-graphviz 3 | -------------------------------------------------------------------------------- /event_notebooks/GTC_2021/credit_scorecard/cpu/README.md: -------------------------------------------------------------------------------- 1 | # Building Credit Risk Scorecards on CPU 2 | 3 | #### Code by: Stephen Denton, Scotiabank (stephen.denton@scotiabank.com) 4 | 5 | This folder contains a notebook and tools demonstrating how to build scorecards using weight-of-evidence logistic regression (WOELR) on CPU, using libraries like NumPy, Pandas, and Scikit-learn. 6 | 7 | `Dockerfile` contains a docker recipe that can be used to execute both the CPU and GPU code: 8 | 9 | ``` 10 | $ docker build . 
-t rapids_container 11 | $ docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 \ 12 | -v /path/to/host/data:/rapids/my_data \ 13 | rapids_container 14 | ``` 15 | 16 | You can then attach to the Jupyter server at `http://localhost:8888/lab?` 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /event_notebooks/GTC_2021/credit_scorecard/cpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/GTC_2021/credit_scorecard/cpu/__init__.py -------------------------------------------------------------------------------- /event_notebooks/GTC_2021/credit_scorecard/cpu/data_utils.py: -------------------------------------------------------------------------------- 1 | from woesc_utils import * 2 | 3 | 4 | def preprocess_vehicle_data(dataset, id_vars, targ_var): 5 | ## Sort data by unique client identifier 6 | dataset.sort_values(id_vars, inplace=True) 7 | dataset.reset_index(drop=True, inplace=True) 8 | 9 | ## Make the target variable the second column in the dataframe 10 | targets = dataset.pop(targ_var) 11 | dataset.insert(1, targ_var, targets) 12 | 13 | ## Replace periods in variable names with underscores 14 | new_cols = [sub.replace('.', '_') for sub in dataset.columns] 15 | dataset.rename( columns=dict(zip(dataset.columns, new_cols)), inplace=True) 16 | 17 | ## Specify variables that should be treated as categorical and convert them to character strings (non-numeric) 18 | cat_vars = [ 'branch_id', 'supplier_id', 'manufacturer_id', 'Current_pincode_ID', 'State_ID', 'Employee_code_ID' 19 | , 'Aadhar_flag', 'PAN_flag', 'VoterID_flag', 'Driving_flag', 'Passport_flag'] 20 | dataset[cat_vars] = dataset[cat_vars].fillna('') 21 | dataset[cat_vars] = dataset[cat_vars].applymap(str) 22 | 23 | ## Strategically add some missing data 24 | ## 
Note: There is no bureau data for more than half of the records 25 | no_bureau = (dataset.PERFORM_CNS_SCORE_DESCRIPTION == 'No Bureau History Available') 26 | dataset.loc[no_bureau, 'PERFORM_CNS_SCORE_DESCRIPTION'] = '' 27 | bureau_vars = [ 'PERFORM_CNS_SCORE', 'PRI_NO_OF_ACCTS', 'PRI_ACTIVE_ACCTS', 'PRI_OVERDUE_ACCTS' 28 | , 'PRI_CURRENT_BALANCE', 'PRI_SANCTIONED_AMOUNT', 'PRI_DISBURSED_AMOUNT', 'PRIMARY_INSTAL_AMT'] 29 | dataset.loc[no_bureau, bureau_vars] = np.nan 30 | 31 | ## The 'Credit Score' variable PERFORM_CNS_SCORE has some issues and could use some additional feature engineering. 32 | ## The values of 300, 738, and 825 are over-represented in the data (300 should be at the end of the distribution) 33 | ## The values 11,14-18 are clearly 'Not Scored' codes - setting to missing for demo 34 | # dataset.PERFORM_CNS_SCORE.value_counts() 35 | # dataset.PERFORM_CNS_SCORE_DESCRIPTION.value_counts().sort_index() 36 | # pd.crosstab(dataset.PERFORM_CNS_SCORE_DESCRIPTION, dataset.PERFORM_CNS_SCORE, margins=True) 37 | dataset.loc[dataset.PERFORM_CNS_SCORE < 20, 'PERFORM_CNS_SCORE'] = np.nan 38 | 39 | ## Make all date calculation relative to January 2019 when this dataset was created. 
40 | t_0 = pd.to_datetime('201901', format='%Y%m') 41 | dataset['DoB'] = pd.to_datetime(dataset['Date_of_Birth'], format='%d-%m-%y', errors='coerce') 42 | dataset['DoB'] = dataset['DoB'].mask( dataset['DoB'].dt.year > t_0.year 43 | , dataset['DoB'] - pd.offsets.DateOffset(years=100)) 44 | dataset['AgeInMonths'] = (t_0 - dataset.DoB).astype('timedelta64[M]') 45 | 46 | dataset['DaysSinceDisbursement'] = (t_0 - pd.to_datetime(dataset.DisbursalDate, format='%d-%m-%y') 47 | ).astype('timedelta64[D]') 48 | 49 | def timestr_to_mths(timestr): 50 | '''timestr formatted as 'Xyrs Ymon' ''' 51 | year = int(timestr.split()[0].split('y')[0]) 52 | mo = int(timestr.split()[1].split('m')[0]) 53 | num = year*12 + mo 54 | return(num) 55 | 56 | dataset['AcctAgeInMonths'] = dataset['AVERAGE_ACCT_AGE'].apply(lambda x: timestr_to_mths(x)) 57 | dataset['CreditHistLenInMonths'] = dataset["CREDIT_HISTORY_LENGTH"].apply(lambda x: timestr_to_mths(x)) 58 | 59 | dat = dataset.drop(columns=['Date_of_Birth', 'DoB', 'AVERAGE_ACCT_AGE', 'CREDIT_HISTORY_LENGTH' 60 | , 'MobileNo_Avl_Flag', 'DisbursalDate'] ) 61 | dat[targ_var] = dat[targ_var].astype(int) 62 | 63 | # ## Can drop records with no credit history - just to trim the data (justifiable in scenarios where 64 | # ## no_credit_bureau leads to an auto-decline or initiates a separate adjudication process) 65 | # dat = dat.loc[(~no_bureau | (dat.SEC_NO_OF_ACCTS != 0)), :] 66 | 67 | ## Drop some variables that are not good for scorecarding (sparse, high cardinality) 68 | ## The variable 'branch_id' is likely linked to geography and therefore demographics 69 | dat = dat.drop(columns=['supplier_id', 'Current_pincode_ID', 'Employee_code_ID', 'branch_id']) 70 | 71 | ## Give some variables shorter names 72 | dat.rename(columns={'PERFORM_CNS_SCORE_DESCRIPTION': 'PERF_CNS_SC_DESC' 73 | , 'DELINQUENT_ACCTS_IN_LAST_SIX_MONTHS': 'DELI_ACCTS_LAST_6_MTHS' 74 | , 'NEW_ACCTS_IN_LAST_SIX_MONTHS': 'NEW_ACCTS_LAST_6_MTHS'}, inplace=True) 75 | 76 | return dat 
-------------------------------------------------------------------------------- /event_notebooks/GTC_2021/credit_scorecard/gpu/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rapidsai/rapidsai-core:0.18-cuda10.1-runtime-ubuntu18.04-py3.8 2 | RUN apt-get update && source activate rapids && conda install -c conda-forge mlxtend && conda install -c conda-forge python-graphviz 3 | -------------------------------------------------------------------------------- /event_notebooks/GTC_2021/credit_scorecard/gpu/README.md: -------------------------------------------------------------------------------- 1 | # Building Credit Risk Scorecards on GPU 2 | 3 | #### Code by: Stephen Denton, Scotiabank (stephen.denton@scotiabank.com) 4 | 5 | This folder contains a (work-in-progress) notebook and tools accelerating the above work on GPU, using libraries like CuPy, cuDF, and cuML. As this is a work in progress, not all functions work on GPU yet. 6 | 7 | `Dockerfile` contains a docker recipe that can be used to execute both the CPU and GPU code: 8 | 9 | ``` 10 | $ docker build . 
-t rapids_container 11 | $ docker run --gpus all --rm -it -p 8888:8888 -p 8787:8787 -p 8786:8786 \ 12 | -v /path/to/host/data:/rapids/my_data \ 13 | rapids_container 14 | ``` 15 | 16 | You can then attach to the Jupyter server at `http://localhost:8888/lab?` 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | -------------------------------------------------------------------------------- /event_notebooks/GTC_2021/credit_scorecard/gpu/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/GTC_2021/credit_scorecard/gpu/__init__.py -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/.gitignore: -------------------------------------------------------------------------------- 1 | # Jupyter Notebook 2 | .ipynb_checkpoints 3 | 4 | # IPython 5 | profile_default/ 6 | ipython_config.py 7 | -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/00 Index and Introduction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Using RAPIDS and Jupyter to Accelerate Visualization Workflows" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "## Run this cell to play the walk through video: ##\n", 24 | "from IPython.display.IFrame import HTML\n", 25 | "HTML('')" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "## Introduction to RAPIDS\n", 33 | "Backed by NVIDIA, the **[RAPIDS](https://rapids.ai/index.html)** suite of open source software 
libraries gives you the ability to execute end-to-end data science and analytics pipelines entirely on GPUs.\n", 34 | "\n", 35 | "Some of the main libraries includes [**cuDF**](https://docs.rapids.ai/api/cudf/stable/), a pandas-like dataframe manipulation library; [**cuML**](https://docs.rapids.ai/api/cuml/stable/), a collection of machine learning libraries that provide GPU versions of algorithms available in scikit-learn; [**cuGraph**](https://docs.rapids.ai/api/cugraph/stable/), a NetworkX-like accelerated graph analytics library; and [**cuSpatial**](https://docs.rapids.ai/api/cuspatial/stable/), a library for common spatial and spatiotemporal operations.\n", 36 | "\n", 37 | "For more general information, check out the **[RAPIDS.ai home page](https://rapids.ai/index.html)**.\n", 38 | "\n", 39 | "For a detailed presentation about RAPIDS and the latest release notes, visit the **[RAPIDS overview documentation](https://docs.rapids.ai/overview)**." 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "## Introduction to RAPIDS Visualization\n", 54 | "The RAPIDS viz group's overall goal is to build open source libraries and collaborate with other open source projects. We hope to foster a greater adoption of GPUs in the python visualization ecosystem and beyond. Its not just for the sake of making things faster - we feel that when data scientist and analysts are able to interact with larger datasets in real time and with high fidelity, they will be able to ask better questions, more often, and get more accurate answers to today's complex problems.\n", 55 | "\n", 56 | "\n", 57 | "## RAPIDS Supported Viz Frameworks\n", 58 | "The below frameworks currently support RAPIDS - primarily through using cuDF as a data source: \n", 59 | "\n", 60 | "- **[hvplot](https://hvplot.holoviz.org/)**: wrapper API for easily visualizing data. 
\n", 61 | "- **[cuxfilter](https://github.com/rapidsai/cuxfilter)**: RAPIDS library for easily cross-filtering data. \n", 62 | "- **[Plotly Dash](https://plotly.com/dash/gpu-dask-acceleration/)**: framework for production ready visualization applications.\n", 63 | "- **[Datashader](https://datashader.org/)**: library for high fidelity server side data rendering.\n", 64 | "\n", 65 | "The RAPIDS visualization team is continually working to integrate with other open source projects - if you wish to help or have questions, reach out on our [Community Slack Channel (GOAI)](https://join.slack.com/t/rapids-goai/shared_invite/zt-h54mq1uv-KHeHDVCYs8xvZO5AB~ctTQ). \n", 66 | "\n", 67 | "### GPU Compute and/or GPU Render\n", 68 | "Generally RAPIDS works to accelerate visualization through faster compute - that is computing aggregations, filters, algorithms etc. quickly enough to be directly interacted with through a visualization. GPUs can also speed up visualization through faster data rendering (of which people more often associate GPUs). The architecture required to do one or both of these through web browsers can be complex, but is useful to understand when building advanced visualizations. 
Feel free to ask for details and future plans in our [Community Slack Channel (GOAI)](https://join.slack.com/t/rapids-goai/shared_invite/zt-h54mq1uv-KHeHDVCYs8xvZO5AB~ctTQ).\n" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "## Hardware and Software Requirements\n", 76 | "To run RAPIDS you will need to meet these general requirements:\n", 77 | "- NVIDIA Pascal™ or better GPU\n", 78 | "- Ubuntu 16.04+ or CentOS 7 OS (Windows support pending)\n", 79 | "- Recent CUDA & NVIDIA Drivers\n", 80 | "- Docker and/or Anaconda\n", 81 | "\n", 82 | "For the most up to date requirements and installation details, see the [RAPIDS Getting Started Page](https://rapids.ai/start.html).\n", 83 | "\n", 84 | "### Package Requirments\n", 85 | "Other packages are required in addition to a RAPIDS (0.16+) release installation. Everything is listed in the `environment.yml` and can be installed via [conda forge](https://conda-forge.org/). Using `conda`, first execute:\n", 86 | "```\n", 87 | "conda env create --name jupytercon_tutorial --file environment.yml\n", 88 | "```\n", 89 | "Then:\n", 90 | "```\n", 91 | "conda activate jupytercon_tutorial\n", 92 | "```\n", 93 | "\n" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "# Index of Notebooks\n", 101 | "\n", 102 | "- 00 **Index**: you are here (but are we anywhere..really?)\n", 103 | "- [01 **Data inspection and validation**](01%20Data%20Inspection%20and%20Validation.ipynb): dataset procurement as well as inspection with hvplot via bokeh charts.\n", 104 | "- [02 **Exploratory data visualization**](02%20Exploratory%20Data%20Visualization.ipynb): exploring preliminary patterns through cross-filtering with cuxfilter.\n", 105 | "- [03 **Data analysis with visual analytics**](03%20Data%20Analysis%20with%20Visual%20Analytics.ipynb): applying visual analytics with cuSpatial, cuGraph, hvplot via bokeh charts and datashader. 
\n", 106 | "- [04 **Explanatory data visualization**](04%20Explanatory%20Data%20Visualization.ipynb): presenting findings through a visualization application with Plotly Dash." 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "metadata": {}, 113 | "outputs": [], 114 | "source": [] 115 | } 116 | ], 117 | "metadata": { 118 | "kernelspec": { 119 | "display_name": "Python 3", 120 | "language": "python", 121 | "name": "python3" 122 | }, 123 | "language_info": { 124 | "codemirror_mode": { 125 | "name": "ipython", 126 | "version": 3 127 | }, 128 | "file_extension": ".py", 129 | "mimetype": "text/x-python", 130 | "name": "python", 131 | "nbconvert_exporter": "python", 132 | "pygments_lexer": "ipython3", 133 | "version": "3.7.8" 134 | } 135 | }, 136 | "nbformat": 4, 137 | "nbformat_minor": 4 138 | } 139 | -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/01 Data Inspection and Validation.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "# Data Inspection and Validation\n", 15 | "***Loading data, vetting its quality, and understanding its shape***\n" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "## Run this cell to show the next section's walkthrough video ##\n", 25 | "from IPython.display import HTML\n", 26 | "HTML('')\n" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Overview\n", 34 | "This intro notebook will use cuDF and hvplot (with bokeh charts) to load a public bike share dataset and get a general sense of what it contains, then run some cursory visualization to validate that the data 
is free of issues.\n", 35 | "\n", 36 | "### cuDF and hvplot\n", 37 | "- [cuDF](https://docs.rapids.ai/api/cudf/stable/), the core of RAPIDS, is a Python GPU DataFrame library (built on the Apache Arrow columnar memory format) for loading, joining, aggregating, filtering, and otherwise manipulating data in a pandas-like API.\n", 38 | "- [hvplot](https://hvplot.holoviz.org/) is a high-level plotting API for the PyData ecosystem built on [HoloViews](http://holoviews.org/)." 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Imports\n", 46 | "Let's first make sure the necessary imports are present to load." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "import cudf\n", 56 | "import hvplot.cudf\n", 57 | "import cupy\n", 58 | "import pandas as pd" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Data Size and GPU Speedups\n", 66 | "This tutorial's dataset size is about `2.1GB` unzipped and contains about `9 million rows`. While this will do for a tutorial, its still too small to get a sense of the speed up possible with GPU acceleration. We've created a larger `300 million row` [2010 Census Visualization](https://github.com/rapidsai/plotly-dash-rapids-census-demo) application available through the RAPIDS [GitHub page](https://github.com/rapidsai) as another demo. " 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "## Run this cell to show the next section's walkthrough video ##\n", 76 | "from IPython.display import HTML\n", 77 | "HTML('')\n" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "## Loading Data into cuDF\n", 85 | "We need to download and extract the sample data we will use for this tutorial. 
This notebook uses the Kaggle [Chicago Divvy Bicycle Sharing Data](https://www.kaggle.com/yingwurenjian/chicago-divvy-bicycle-sharing-data) dataset. Once the `data.csv` file is downloaded and unzipped, point the paths below at the location *(Make sure to set DATA_DIR to the path you saved that data file to)*:\n" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": {}, 92 | "outputs": [], 93 | "source": [ 94 | "from pathlib import Path\n", 95 | "\n", 96 | "DATA_DIR = Path(\"../data\")" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "# Download and Extract the dataset\n", 106 | "! wget -N -P {DATA_DIR} https://data.rapids.ai/viz-data/data.tar.xz\n", 107 | "! tar -xf {DATA_DIR}/data.tar.xz -C {DATA_DIR}" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "FILENAME = Path(\"data.csv\")" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "We now read the .csv file into the GPU cuDF Dataframe (which behaves similar to a Pandas dataframe). " 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "df = cudf.read_csv(DATA_DIR / FILENAME)" 133 | ] 134 | }, 135 | { 136 | "cell_type": "markdown", 137 | "metadata": {}, 138 | "source": [ 139 | "## Mapping out the Data Shape\n", 140 | "CuDF supports all the standard Pandas operations for a quick look at the data e.g. to see the total number of rows..." 
141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "len(df)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "markdown", 154 | "metadata": {}, 155 | "source": [ 156 | "Or to inspect the column headers and first few rows..." 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": null, 162 | "metadata": {}, 163 | "outputs": [], 164 | "source": [ 165 | "df.head()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "Or to see the full list of columns..." 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "df.columns" 182 | ] 183 | }, 184 | { 185 | "cell_type": "markdown", 186 | "metadata": {}, 187 | "source": [ 188 | "Or see how many trips were made by subscribers." 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": null, 194 | "metadata": {}, 195 | "outputs": [], 196 | "source": [ 197 | "df.groupby(\"usertype\").size()" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "## Improving Data Utility\n", 205 | "Now that we have a basic idea of how big our dataset is and what it contains, we want to start making the data more meaningful. This task can vary from removing unnecessary columns, mapping values to be more human readable, or formatting them to be understood by our tools. 
\n", 206 | "\n", 207 | "Having looked at the `df.head()` above, the first thing we might want is to re-load the data, parsing the start-stop time columns as more usable datetimes types: " 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "df = cudf.read_csv(DATA_DIR / FILENAME, parse_dates=('starttime', 'stoptime'))" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "One thing we will want to do is to look at trips by day of week. Now that we have real datetime columns, we can use `dt.weekday` to add a `weekday` column to our `cudf` Dataframe:" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "df[\"weekday\"] = df['starttime'].dt.weekday" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": {}, 239 | "outputs": [], 240 | "source": [ 241 | "## Run this cell to show the next section's walkthrough video ##\n", 242 | "from IPython.display import HTML\n", 243 | "HTML('')" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "## Inspecting Data Quality and Distribution\n", 251 | "Another important step is getting a sense of the quality of the dataset. As these datasets are often larger than is feasible to look through row by row, mapping out the distribution of values early on helps find issuse that can derail an analysis later.\n", 252 | "\n", 253 | "Some examples are gaps in data, unexpected or empty value types, infeasible values, or incorrect projections. 
" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "## Gender and Subsriber Columns\n", 261 | "We could do this in a numerical way, such as getting the totals from the 'gender' data column as a table:" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": null, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "mf_counts = df.groupby(\"gender\").size().rename(\"count\").reset_index()\n", 271 | "mf_counts" 272 | ] 273 | }, 274 | { 275 | "cell_type": "markdown", 276 | "metadata": {}, 277 | "source": [ 278 | "While technically functional as a table, taking values and visualizating them as bars help to intuitively show the scale of the difference faster (hvplot's API makes this very simple):" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "mf_counts.hvplot.bar(\"gender\",\"count\").opts(title=\"Total trips by gender\")" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "metadata": {}, 293 | "source": [ 294 | "### A Note on Preattentive Attributes\n", 295 | "This subconcious ability to quickly recognize patterns is due to our brain's natural ability to find [preattentive attributes](http://daydreamingnumbers.com/blog/preattentive-attributes-example/), such as height, orientation, or color. Imagine 100 values in a table and 100 in a bar chart and how quickly you would be albe to find the smallest and largest values in either." 
296 | ] 297 | }, 298 | { 299 | "cell_type": "markdown", 300 | "metadata": {}, 301 | "source": [ 302 | "### Try It out\n", 303 | "Now try using [hvplot's user guide](https://hvplot.holoviz.org/user_guide/Plotting.html) and our examples to create a hvplot that shows the distribution of `Subscriber` types:" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": null, 309 | "metadata": {}, 310 | "outputs": [], 311 | "source": [ 312 | "# code here" 313 | ] 314 | }, 315 | { 316 | "cell_type": "markdown", 317 | "metadata": {}, 318 | "source": [ 319 | "The above data columns maybe show some potentially useful disparities, but without supplimental data, it would be hard to have a follow up question.\n" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": null, 325 | "metadata": {}, 326 | "outputs": [], 327 | "source": [ 328 | "## Run this cell to show the next section's walkthrough video ##\n", 329 | "from IPython.display import HTML\n", 330 | "HTML('')" 331 | ] 332 | }, 333 | { 334 | "cell_type": "markdown", 335 | "metadata": {}, 336 | "source": [ 337 | "## Trip Starts\n", 338 | "Instead, another question we might want to ask is how many trip starts are there per day of the week? We can group the `cudf` Dataframe and call `hvplot.bar` directly the result:" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": null, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "day_counts = df.groupby(\"weekday\").size().rename(\"count\").reset_index()\n", 348 | "day_counts.hvplot.bar(\"weekday\", \"count\").opts(title=\"Trip starts, per Week Day\", yformatter=\"%0.0f\")" 349 | ] 350 | }, 351 | { 352 | "cell_type": "markdown", 353 | "metadata": {}, 354 | "source": [ 355 | "With 0-4 being a weekday, and 5-6 being a weekend, there is a clear drop off of ridership on the weekends. 
Lets note that!\n" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "## Trips by Duration\n", 363 | "Another quick look we can generate is to see the overall distribution of trip durations, this time using `hvplot.hist`:" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": { 370 | "scrolled": true 371 | }, 372 | "outputs": [], 373 | "source": [ 374 | "# We selected an arbitrary 50 for bin size, try and see patterns with other sizes\n", 375 | "df.hvplot.hist(y=\"tripduration\").opts(\n", 376 | " title=\"Trips Duration Histrogram\", yformatter=\"%0.0f\"\n", 377 | ")" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "metadata": {}, 383 | "source": [ 384 | "Clearly, most trips are less than 15 minuites long. \n", 385 | "\n", 386 | "`hvplot` also makes it simple to interrogate different dimensions. For example, we can add `groupby=\"month\"` to our call to `hvplot.hist`, and automatically get a slider to see a histogram specific to each month:" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "df.hvplot.hist(y=\"tripduration\", bins=50, groupby=\"month\").opts(\n", 396 | " title=\"Trips Duration Histrogram by Month\", yformatter=\"%0.0f\", width=400\n", 397 | ")" 398 | ] 399 | }, 400 | { 401 | "cell_type": "markdown", 402 | "metadata": {}, 403 | "source": [ 404 | "By scrubbing between the months we can start to see a pattern of slightly longer trip durations emerge during the summer months.\n", 405 | "\n" 406 | ] 407 | }, 408 | { 409 | "cell_type": "markdown", 410 | "metadata": {}, 411 | "source": [ 412 | "## Trips vs Temperatures\n", 413 | "Lets follow up on this by using `hvplot` to generate a KDE distributions using our `cudf` Dataframes for 9 million trips:" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | 
"metadata": { 420 | "scrolled": true 421 | }, 422 | "outputs": [], 423 | "source": [ 424 | "df.hvplot.kde(y=\"temperature\").opts(title=\"Distribution of trip temperatures\")" 425 | ] 426 | }, 427 | { 428 | "cell_type": "markdown", 429 | "metadata": {}, 430 | "source": [ 431 | "Clearly most trips occur around a temperature sweet spot of around 65-80 degrees.\n", 432 | "\n", 433 | "\n", 434 | "The `hvplot.heatmap` method can group in two dimensions and colormap according to aggregations on those groups. Here we see *average* trip duration by year and month: " 435 | ] 436 | }, 437 | { 438 | "cell_type": "code", 439 | "execution_count": null, 440 | "metadata": {}, 441 | "outputs": [], 442 | "source": [ 443 | "df.hvplot.heatmap(x='month', y='year', C='tripduration', \n", 444 | " reduce_function=cudf.DataFrame.mean , colorbar=True, cmap=\"Viridis\")" 445 | ] 446 | }, 447 | { 448 | "cell_type": "markdown", 449 | "metadata": {}, 450 | "source": [ 451 | "So what we saw hinted at with the trip duration slider is much more clearly shown in this literal heatmap *(ba-dom-tss)*. \n", 452 | "\n" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": null, 458 | "metadata": {}, 459 | "outputs": [], 460 | "source": [ 461 | "## Run this cell to show the next section's walkthrough video ##\n", 462 | "from IPython.display import HTML\n", 463 | "HTML('')\n" 464 | ] 465 | }, 466 | { 467 | "cell_type": "markdown", 468 | "metadata": {}, 469 | "source": [ 470 | "## Trip Geography\n", 471 | "Temperature and months aside, we might also want to bin the data geographically to check for anomalies. 
The `hvplot.hexbin` can show the counts for trip starts overlaid on a tile map:" 472 | ] 473 | }, 474 | { 475 | "cell_type": "code", 476 | "execution_count": null, 477 | "metadata": {}, 478 | "outputs": [], 479 | "source": [ 480 | "df.hvplot.hexbin(x='longitude_start', y='latitude_start', geo=True, tiles=\"OSM\").opts(width=600, height=600)" 481 | ] 482 | }, 483 | { 484 | "cell_type": "markdown", 485 | "metadata": {}, 486 | "source": [ 487 | "Interestingly there seems to be a strong concentration of trips in a core area that radiate outwards. Lets take note of that. \n", 488 | "\n", 489 | "The location of the data compared to a current system map also seems to show that everything is where it should be, without any extraneous data points or off map projections:\n", 490 | "\n", 491 | "" 492 | ] 493 | }, 494 | { 495 | "cell_type": "markdown", 496 | "metadata": {}, 497 | "source": [ 498 | "## Data Cleanup\n", 499 | "Based on our inspection, this dataset is uncommonly well formatted and of high quality. But a little cleanup and formatting aids will make some things simpler in future notebooks. \n", 500 | "\n", 501 | "One thing that is missing is a list of just station id's and their coordinates. Let's generate that and save it for later. First, let's group by all the unique \"from\" and \"to\" station id values, and take a representative from each group:" 502 | ] 503 | }, 504 | { 505 | "cell_type": "code", 506 | "execution_count": null, 507 | "metadata": {}, 508 | "outputs": [], 509 | "source": [ 510 | "from_ids = df.groupby(\"from_station_id\")\n", 511 | "to_ids = df.groupby(\"to_station_id\")" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "It's possible (but unlikely) that a particular station is only a sink or source for trips. 
For good measure, let's make sure the group keys are identical:" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "all(from_ids.size().index.values == to_ids.size().index.values)" 528 | ] 529 | }, 530 | { 531 | "cell_type": "markdown", 532 | "metadata": {}, 533 | "source": [ 534 | "Each group has items for a single station, which all have the same lat/lon. So let's make a new DataFrame by taking a representative from each group, then rename some columns:" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": null, 540 | "metadata": {}, 541 | "outputs": [], 542 | "source": [ 543 | "stations = from_ids.nth(1).to_pandas()\n", 544 | "stations.index.name = \"station_id\"\n", 545 | "stations.rename(columns={\"latitude_start\": \"lat\", \"longitude_start\": \"lon\"}, inplace=True)\n", 546 | "stations = stations.reset_index().filter([\"station_id\", \"lat\", \"lon\"])\n", 547 | "stations" 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": {}, 553 | "source": [ 554 | "Finally write the results to \"stations.csv\" in our data directory:" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": null, 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [ 563 | "stations.to_csv(DATA_DIR / \"stations.csv\", index=False)" 564 | ] 565 | }, 566 | { 567 | "cell_type": "code", 568 | "execution_count": null, 569 | "metadata": {}, 570 | "outputs": [], 571 | "source": [ 572 | "## Run this cell to show the next section's walkthrough video ##\n", 573 | "from IPython.display import HTML\n", 574 | "HTML('')\n" 575 | ] 576 | }, 577 | { 578 | "cell_type": "markdown", 579 | "metadata": {}, 580 | "source": [ 581 | "## Summary of the Data\n", 582 | "Overall this is an interesting and useful dataset. 
Our preliminary vetting found no issues with quality and already started to hint at areas to investigate:\n", 583 | "\n", 584 | "- Weekday vs Weekend trip counts\n", 585 | "- Bike trips vs weather correlation \n", 586 | "- Core vs Outward trip concentrations \n", 587 | "\n", 588 | "We will follow up with these findings in our next notebook." 589 | ] 590 | }, 591 | { 592 | "cell_type": "code", 593 | "execution_count": null, 594 | "metadata": {}, 595 | "outputs": [], 596 | "source": [] 597 | } 598 | ], 599 | "metadata": { 600 | "kernelspec": { 601 | "display_name": "Python 3", 602 | "language": "python", 603 | "name": "python3" 604 | }, 605 | "language_info": { 606 | "codemirror_mode": { 607 | "name": "ipython", 608 | "version": 3 609 | }, 610 | "file_extension": ".py", 611 | "mimetype": "text/x-python", 612 | "name": "python", 613 | "nbconvert_exporter": "python", 614 | "pygments_lexer": "ipython3", 615 | "version": "3.7.8" 616 | } 617 | }, 618 | "nbformat": 4, 619 | "nbformat_minor": 4 620 | } 621 | -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/README.md: -------------------------------------------------------------------------------- 1 | # Using RAPIDS and Jupyter to Accelerate Visualization Workflows 2 | 3 | Welcome to the repository for Accelerated Visualization Workflows, using RAPIDS. This was a tutorial originally presented for JupyterCon 2020. To start, make sure you have the required installs and hardware, then open the `00 Index and Introduction` notebook to get started. The series is composed of 5 notebooks, an open source bike share dataset, and embedded walkthrough videos. 4 | 5 | 6 | ## Install Requirements 7 | **Requirements can be met with the Anaconda install below. 
Make sure to update your cuda_toolkit version to match host cuda version, which currently supports cuda 10.1, 10.2, 11.0** 8 | ``` 9 | conda env create --name JC_RAPIDSViz --file environment.yml 10 | conda activate JC_RAPIDSViz 11 | ``` 12 | 13 | ## Hardware Requirements 14 | **NVIDIA GPU with at least 16GB of memory that also meets the prerequisites (here)[https://rapids.ai/start.html]. -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/assets/dash-style.css: -------------------------------------------------------------------------------- 1 | /* Table of contents 2 | –––––––––––––––––––––––––––––––––––––––––––––––––– 3 | - Plotly.js 4 | - Grid 5 | - Base Styles 6 | - Typography 7 | - Links 8 | - Buttons 9 | - Forms 10 | - Lists 11 | - Code 12 | - Tables 13 | - Spacing 14 | - Utilities 15 | - Clearing 16 | - Media Queries 17 | */ 18 | 19 | /* PLotly.js 20 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 21 | /* plotly.js's modebar's z-index is 1001 by default 22 | * https://github.com/plotly/plotly.js/blob/7e4d8ab164258f6bd48be56589dacd9bdd7fded2/src/css/_modebar.scss#L5 23 | * In case a dropdown is above the graph, the dropdown's options 24 | * will be rendered below the modebar 25 | * Increase the select option's z-index 26 | */ 27 | 28 | /* This was actually not quite right - 29 | dropdowns were overlapping each other (edited October 26) 30 | 31 | .Select { 32 | z-index: 1002; 33 | }*/ 34 | 35 | 36 | /* Custom 37 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 38 | html, body { 39 | height: auto; 40 | width: 100%; 41 | margin: 0; 42 | 43 | } 44 | 45 | /* Grid 46 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 47 | .container { 48 | position: relative; 49 | width: 100%; 50 | max-width: 960px; 51 | margin: 0 auto; 52 | padding: 0 20px; 53 | box-sizing: border-box; } 54 | .column, 55 | .columns { 56 | width: 100%; 57 | float: left; 58 | box-sizing: border-box; } 59 | 60 
| /* For devices larger than 400px */ 61 | @media (min-width: 400px) { 62 | .container { 63 | width: 85%; 64 | padding: 0; } 65 | } 66 | 67 | /* For devices larger than 550px */ 68 | @media (min-width: 550px) { 69 | .container { 70 | width: 80%; } 71 | .column, 72 | .columns { 73 | margin-left: 4%; } 74 | .column:first-child, 75 | .columns:first-child { 76 | margin-left: 0; } 77 | 78 | .one.column, 79 | .one.columns { width: 4.66666666667%; } 80 | .two.columns { width: 13.3333333333%; } 81 | .three.columns { width: 22%; } 82 | .four.columns { width: 30.6666666667%; } 83 | .five.columns { width: 39.3333333333%; } 84 | .six.columns { width: 48%; } 85 | .seven.columns { width: 56.6666666667%; } 86 | .eight.columns { width: 65.3333333333%; } 87 | .nine.columns { width: 74.0%; } 88 | .ten.columns { width: 82.6666666667%; } 89 | .eleven.columns { width: 91.3333333333%; } 90 | .twelve.columns { width: 100%; margin-left: 0; } 91 | 92 | .one-third.column { width: 30.6666666667%; } 93 | .two-thirds.column { width: 65.3333333333%; } 94 | 95 | .one-half.column { width: 48%; } 96 | 97 | /* Offsets */ 98 | .offset-by-one.column, 99 | .offset-by-one.columns { margin-left: 8.66666666667%; } 100 | .offset-by-two.column, 101 | .offset-by-two.columns { margin-left: 17.3333333333%; } 102 | .offset-by-three.column, 103 | .offset-by-three.columns { margin-left: 26%; } 104 | .offset-by-four.column, 105 | .offset-by-four.columns { margin-left: 34.6666666667%; } 106 | .offset-by-five.column, 107 | .offset-by-five.columns { margin-left: 43.3333333333%; } 108 | .offset-by-six.column, 109 | .offset-by-six.columns { margin-left: 52%; } 110 | .offset-by-seven.column, 111 | .offset-by-seven.columns { margin-left: 60.6666666667%; } 112 | .offset-by-eight.column, 113 | .offset-by-eight.columns { margin-left: 69.3333333333%; } 114 | .offset-by-nine.column, 115 | .offset-by-nine.columns { margin-left: 78.0%; } 116 | .offset-by-ten.column, 117 | .offset-by-ten.columns { margin-left: 86.6666666667%; } 
118 | .offset-by-eleven.column, 119 | .offset-by-eleven.columns { margin-left: 95.3333333333%; } 120 | 121 | .offset-by-one-third.column, 122 | .offset-by-one-third.columns { margin-left: 34.6666666667%; } 123 | .offset-by-two-thirds.column, 124 | .offset-by-two-thirds.columns { margin-left: 69.3333333333%; } 125 | 126 | .offset-by-one-half.column, 127 | .offset-by-one-half.columns { margin-left: 52%; } 128 | 129 | } 130 | 131 | 132 | /* Base Styles 133 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 134 | /* NOTE 135 | html is set to 62.5% so that all the REM measurements throughout Skeleton 136 | are based on 10px sizing. So basically 1.5rem = 15px :) */ 137 | html { 138 | font-size: 62.5%; } 139 | body { 140 | font-size: 1.5em; /* currently ems cause chrome bug misinterpreting rems on body element */ 141 | line-height: 1.6; 142 | font-weight: 400; 143 | font-family: "Open Sans", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; 144 | color: rgb(50, 50, 50); } 145 | 146 | 147 | /* Typography 148 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 149 | h1, h2, h3, h4, h5, h6 { 150 | margin-top: 0; 151 | margin-bottom: 0; 152 | font-weight: 300; } 153 | h1 { font-size: 4.5rem; line-height: 1.2; letter-spacing: -.1rem; margin-bottom: 2rem; } 154 | h2 { font-size: 3.6rem; line-height: 1.25; letter-spacing: -.1rem; margin-bottom: 1.8rem; margin-top: 1.8rem;} 155 | h3 { font-size: 3.0rem; line-height: 1.3; letter-spacing: -.1rem; margin-bottom: 1.5rem; margin-top: 1.5rem;} 156 | h4 { font-size: 2.6rem; line-height: 1.35; letter-spacing: -.08rem; margin-bottom: 1.2rem; margin-top: 1.2rem;} 157 | h5 { font-size: 2.2rem; line-height: 1.5; letter-spacing: -.05rem; margin-bottom: 0.6rem; margin-top: 0.6rem;} 158 | h6 { font-size: 2.0rem; line-height: 1.6; letter-spacing: 0; margin-bottom: 0.75rem; margin-top: 0.75rem;} 159 | 160 | p { 161 | margin-top: 0; } 162 | 163 | 164 | /* Blockquotes 165 | –––––––––––––––––––––––––––––––––––––––––––––––––– 
*/ 166 | blockquote { 167 | border-left: 4px lightgrey solid; 168 | padding-left: 1rem; 169 | margin-top: 2rem; 170 | margin-bottom: 2rem; 171 | margin-left: 0rem; 172 | } 173 | 174 | 175 | /* Links 176 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 177 | a { 178 | color: #1EAEDB; 179 | text-decoration: underline; 180 | cursor: pointer;} 181 | a:hover { 182 | color: #0FA0CE; } 183 | 184 | 185 | /* Buttons 186 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 187 | .button, 188 | button, 189 | input[type="submit"], 190 | input[type="reset"], 191 | input[type="button"] { 192 | display: inline-block; 193 | height: 38px; 194 | padding: 0 30px; 195 | color: #555; 196 | text-align: center; 197 | font-size: 11px; 198 | font-weight: 600; 199 | line-height: 38px; 200 | letter-spacing: .1rem; 201 | text-transform: uppercase; 202 | text-decoration: none; 203 | white-space: nowrap; 204 | background-color: transparent; 205 | border-radius: 4px; 206 | border: 1px solid #bbb; 207 | cursor: pointer; 208 | box-sizing: border-box; } 209 | .button:hover, 210 | button:hover, 211 | input[type="submit"]:hover, 212 | input[type="reset"]:hover, 213 | input[type="button"]:hover, 214 | .button:focus, 215 | button:focus, 216 | input[type="submit"]:focus, 217 | input[type="reset"]:focus, 218 | input[type="button"]:focus { 219 | color: #333; 220 | border-color: #888; 221 | outline: 0; } 222 | .button.button-primary, 223 | button.button-primary, 224 | input[type="submit"].button-primary, 225 | input[type="reset"].button-primary, 226 | input[type="button"].button-primary { 227 | color: #FFF; 228 | background-color: #33C3F0; 229 | border-color: #33C3F0; } 230 | .button.button-primary:hover, 231 | button.button-primary:hover, 232 | input[type="submit"].button-primary:hover, 233 | input[type="reset"].button-primary:hover, 234 | input[type="button"].button-primary:hover, 235 | .button.button-primary:focus, 236 | button.button-primary:focus, 237 | 
input[type="submit"].button-primary:focus, 238 | input[type="reset"].button-primary:focus, 239 | input[type="button"].button-primary:focus { 240 | color: #FFF; 241 | background-color: #1EAEDB; 242 | border-color: #1EAEDB; } 243 | 244 | 245 | /* Forms 246 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 247 | input[type="email"], 248 | input[type="number"], 249 | input[type="search"], 250 | input[type="text"], 251 | input[type="tel"], 252 | input[type="url"], 253 | input[type="password"], 254 | textarea, 255 | select { 256 | height: 38px; 257 | padding: 6px 10px; /* The 6px vertically centers text on FF, ignored by Webkit */ 258 | background-color: #fff; 259 | border: 1px solid #D1D1D1; 260 | border-radius: 4px; 261 | box-shadow: none; 262 | box-sizing: border-box; 263 | font-family: inherit; 264 | font-size: inherit; /*https://stackoverflow.com/questions/6080413/why-doesnt-input-inherit-the-font-from-body*/} 265 | /* Removes awkward default styles on some inputs for iOS */ 266 | input[type="email"], 267 | input[type="number"], 268 | input[type="search"], 269 | input[type="text"], 270 | input[type="tel"], 271 | input[type="url"], 272 | input[type="password"], 273 | textarea { 274 | -webkit-appearance: none; 275 | -moz-appearance: none; 276 | appearance: none; } 277 | textarea { 278 | min-height: 65px; 279 | padding-top: 6px; 280 | padding-bottom: 6px; } 281 | input[type="email"]:focus, 282 | input[type="number"]:focus, 283 | input[type="search"]:focus, 284 | input[type="text"]:focus, 285 | input[type="tel"]:focus, 286 | input[type="url"]:focus, 287 | input[type="password"]:focus, 288 | textarea:focus, 289 | select:focus { 290 | border: 1px solid #33C3F0; 291 | outline: 0; } 292 | label, 293 | legend { 294 | display: block; 295 | margin-bottom: 0px; } 296 | fieldset { 297 | padding: 0; 298 | border-width: 0; } 299 | input[type="checkbox"], 300 | input[type="radio"] { 301 | display: inline; } 302 | label > .label-body { 303 | display: inline-block; 304 | 
margin-left: .5rem; 305 | font-weight: normal; } 306 | 307 | 308 | /* Lists 309 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 310 | ul { 311 | list-style: circle inside; } 312 | ol { 313 | list-style: decimal inside; } 314 | ol, ul { 315 | padding-left: 0; 316 | margin-top: 0; } 317 | ul ul, 318 | ul ol, 319 | ol ol, 320 | ol ul { 321 | margin: 1.5rem 0 1.5rem 3rem; 322 | font-size: 90%; } 323 | li { 324 | margin-bottom: 1rem; } 325 | 326 | 327 | /* Tables 328 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 329 | table { 330 | border-collapse: collapse; 331 | } 332 | th:not(.CalendarDay), 333 | td:not(.CalendarDay) { 334 | padding: 12px 15px; 335 | text-align: left; 336 | border-bottom: 1px solid #E1E1E1; } 337 | th:first-child:not(.CalendarDay), 338 | td:first-child:not(.CalendarDay) { 339 | padding-left: 0; } 340 | th:last-child:not(.CalendarDay), 341 | td:last-child:not(.CalendarDay) { 342 | padding-right: 0; } 343 | 344 | 345 | /* Spacing 346 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 347 | button, 348 | .button { 349 | margin-bottom: 0rem; } 350 | input, 351 | textarea, 352 | select, 353 | fieldset { 354 | margin-bottom: 0rem; } 355 | pre, 356 | dl, 357 | figure, 358 | table, 359 | form { 360 | margin-bottom: 0rem; } 361 | p, 362 | ul, 363 | ol { 364 | margin-bottom: 0.75rem; } 365 | 366 | /* Utilities 367 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 368 | .u-full-width { 369 | width: 100%; 370 | box-sizing: border-box; } 371 | .u-max-full-width { 372 | max-width: 100%; 373 | box-sizing: border-box; } 374 | .u-pull-right { 375 | float: right; } 376 | .u-pull-left { 377 | float: left; } 378 | 379 | 380 | /* Misc 381 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 382 | hr { 383 | margin-top: 3rem; 384 | margin-bottom: 3.5rem; 385 | border-width: 0; 386 | border-top: 1px solid #E1E1E1; } 387 | 388 | 389 | /* Clearing 390 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 391 | 392 | /* Self Clearing Goodness 
*/ 393 | .container:after, 394 | .row:after, 395 | .u-cf { 396 | content: ""; 397 | display: table; 398 | clear: both; } 399 | 400 | 401 | /* Media Queries 402 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 403 | /* 404 | Note: The best way to structure the use of media queries is to create the queries 405 | near the relevant code. For example, if you wanted to change the styles for buttons 406 | on small devices, paste the mobile query code up in the buttons section and style it 407 | there. 408 | */ 409 | 410 | 411 | /* Larger than mobile */ 412 | @media (min-width: 400px) {} 413 | 414 | /* Larger than phablet (also point when grid becomes active) */ 415 | @media (min-width: 550px) {} 416 | 417 | /* Larger than tablet */ 418 | @media (min-width: 750px) {} 419 | 420 | /* Larger than desktop */ 421 | @media (min-width: 1000px) {} 422 | 423 | /* Larger than Desktop HD */ 424 | @media (min-width: 1200px) {} -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/data/README_data.md: -------------------------------------------------------------------------------- 1 | # Data 2 | 3 | Any data files used in the worked-out examples or learner exercises should be in this folder. 4 | 5 | Data sources should be listed in this README. 6 | 7 | **Recommend** that data be shared under [CC0](https://creativecommons.org/share-your-work/public-domain/cc0) public-domain dedication or [CC-BY](https://creativecommons.org/licenses/by/2.0/). 
8 | -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/data/dash-style.css: -------------------------------------------------------------------------------- 1 | /* Table of contents 2 | –––––––––––––––––––––––––––––––––––––––––––––––––– 3 | - Plotly.js 4 | - Grid 5 | - Base Styles 6 | - Typography 7 | - Links 8 | - Buttons 9 | - Forms 10 | - Lists 11 | - Code 12 | - Tables 13 | - Spacing 14 | - Utilities 15 | - Clearing 16 | - Media Queries 17 | */ 18 | 19 | /* PLotly.js 20 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 21 | /* plotly.js's modebar's z-index is 1001 by default 22 | * https://github.com/plotly/plotly.js/blob/7e4d8ab164258f6bd48be56589dacd9bdd7fded2/src/css/_modebar.scss#L5 23 | * In case a dropdown is above the graph, the dropdown's options 24 | * will be rendered below the modebar 25 | * Increase the select option's z-index 26 | */ 27 | 28 | /* This was actually not quite right - 29 | dropdowns were overlapping each other (edited October 26) 30 | 31 | .Select { 32 | z-index: 1002; 33 | }*/ 34 | 35 | 36 | /* Custom 37 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 38 | body { 39 | background-color: red; 40 | } 41 | 42 | 43 | /* Grid 44 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 45 | .container { 46 | position: relative; 47 | width: 100%; 48 | max-width: 960px; 49 | margin: 0 auto; 50 | padding: 0 20px; 51 | box-sizing: border-box; } 52 | .column, 53 | .columns { 54 | width: 100%; 55 | float: left; 56 | box-sizing: border-box; } 57 | 58 | /* For devices larger than 400px */ 59 | @media (min-width: 400px) { 60 | .container { 61 | width: 85%; 62 | padding: 0; } 63 | } 64 | 65 | /* For devices larger than 550px */ 66 | @media (min-width: 550px) { 67 | .container { 68 | width: 80%; } 69 | .column, 70 | .columns { 71 | margin-left: 4%; } 72 | .column:first-child, 73 | .columns:first-child { 74 | margin-left: 0; } 75 | 76 | .one.column, 77 | .one.columns 
{ width: 4.66666666667%; } 78 | .two.columns { width: 13.3333333333%; } 79 | .three.columns { width: 22%; } 80 | .four.columns { width: 30.6666666667%; } 81 | .five.columns { width: 39.3333333333%; } 82 | .six.columns { width: 48%; } 83 | .seven.columns { width: 56.6666666667%; } 84 | .eight.columns { width: 65.3333333333%; } 85 | .nine.columns { width: 74.0%; } 86 | .ten.columns { width: 82.6666666667%; } 87 | .eleven.columns { width: 91.3333333333%; } 88 | .twelve.columns { width: 100%; margin-left: 0; } 89 | 90 | .one-third.column { width: 30.6666666667%; } 91 | .two-thirds.column { width: 65.3333333333%; } 92 | 93 | .one-half.column { width: 48%; } 94 | 95 | /* Offsets */ 96 | .offset-by-one.column, 97 | .offset-by-one.columns { margin-left: 8.66666666667%; } 98 | .offset-by-two.column, 99 | .offset-by-two.columns { margin-left: 17.3333333333%; } 100 | .offset-by-three.column, 101 | .offset-by-three.columns { margin-left: 26%; } 102 | .offset-by-four.column, 103 | .offset-by-four.columns { margin-left: 34.6666666667%; } 104 | .offset-by-five.column, 105 | .offset-by-five.columns { margin-left: 43.3333333333%; } 106 | .offset-by-six.column, 107 | .offset-by-six.columns { margin-left: 52%; } 108 | .offset-by-seven.column, 109 | .offset-by-seven.columns { margin-left: 60.6666666667%; } 110 | .offset-by-eight.column, 111 | .offset-by-eight.columns { margin-left: 69.3333333333%; } 112 | .offset-by-nine.column, 113 | .offset-by-nine.columns { margin-left: 78.0%; } 114 | .offset-by-ten.column, 115 | .offset-by-ten.columns { margin-left: 86.6666666667%; } 116 | .offset-by-eleven.column, 117 | .offset-by-eleven.columns { margin-left: 95.3333333333%; } 118 | 119 | .offset-by-one-third.column, 120 | .offset-by-one-third.columns { margin-left: 34.6666666667%; } 121 | .offset-by-two-thirds.column, 122 | .offset-by-two-thirds.columns { margin-left: 69.3333333333%; } 123 | 124 | .offset-by-one-half.column, 125 | .offset-by-one-half.columns { margin-left: 52%; } 126 | 127 | } 
128 | 129 | 130 | /* Base Styles 131 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 132 | /* NOTE 133 | html is set to 62.5% so that all the REM measurements throughout Skeleton 134 | are based on 10px sizing. So basically 1.5rem = 15px :) */ 135 | html { 136 | font-size: 62.5%; } 137 | body { 138 | font-size: 1.5em; /* currently ems cause chrome bug misinterpreting rems on body element */ 139 | line-height: 1.6; 140 | font-weight: 400; 141 | font-family: "Open Sans", "HelveticaNeue", "Helvetica Neue", Helvetica, Arial, sans-serif; 142 | color: rgb(50, 50, 50); } 143 | 144 | 145 | /* Typography 146 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 147 | h1, h2, h3, h4, h5, h6 { 148 | margin-top: 0; 149 | margin-bottom: 0; 150 | font-weight: 300; } 151 | h1 { font-size: 4.5rem; line-height: 1.2; letter-spacing: -.1rem; margin-bottom: 2rem; } 152 | h2 { font-size: 3.6rem; line-height: 1.25; letter-spacing: -.1rem; margin-bottom: 1.8rem; margin-top: 1.8rem;} 153 | h3 { font-size: 3.0rem; line-height: 1.3; letter-spacing: -.1rem; margin-bottom: 1.5rem; margin-top: 1.5rem;} 154 | h4 { font-size: 2.6rem; line-height: 1.35; letter-spacing: -.08rem; margin-bottom: 1.2rem; margin-top: 1.2rem;} 155 | h5 { font-size: 2.2rem; line-height: 1.5; letter-spacing: -.05rem; margin-bottom: 0.6rem; margin-top: 0.6rem;} 156 | h6 { font-size: 2.0rem; line-height: 1.6; letter-spacing: 0; margin-bottom: 0.75rem; margin-top: 0.75rem;} 157 | 158 | p { 159 | margin-top: 0; } 160 | 161 | 162 | /* Blockquotes 163 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 164 | blockquote { 165 | border-left: 4px lightgrey solid; 166 | padding-left: 1rem; 167 | margin-top: 2rem; 168 | margin-bottom: 2rem; 169 | margin-left: 0rem; 170 | } 171 | 172 | 173 | /* Links 174 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 175 | a { 176 | color: #1EAEDB; 177 | text-decoration: underline; 178 | cursor: pointer;} 179 | a:hover { 180 | color: #0FA0CE; } 181 | 182 | 183 | /* Buttons 184 
| –––––––––––––––––––––––––––––––––––––––––––––––––– */ 185 | .button, 186 | button, 187 | input[type="submit"], 188 | input[type="reset"], 189 | input[type="button"] { 190 | display: inline-block; 191 | height: 38px; 192 | padding: 0 30px; 193 | color: #555; 194 | text-align: center; 195 | font-size: 11px; 196 | font-weight: 600; 197 | line-height: 38px; 198 | letter-spacing: .1rem; 199 | text-transform: uppercase; 200 | text-decoration: none; 201 | white-space: nowrap; 202 | background-color: transparent; 203 | border-radius: 4px; 204 | border: 1px solid #bbb; 205 | cursor: pointer; 206 | box-sizing: border-box; } 207 | .button:hover, 208 | button:hover, 209 | input[type="submit"]:hover, 210 | input[type="reset"]:hover, 211 | input[type="button"]:hover, 212 | .button:focus, 213 | button:focus, 214 | input[type="submit"]:focus, 215 | input[type="reset"]:focus, 216 | input[type="button"]:focus { 217 | color: #333; 218 | border-color: #888; 219 | outline: 0; } 220 | .button.button-primary, 221 | button.button-primary, 222 | input[type="submit"].button-primary, 223 | input[type="reset"].button-primary, 224 | input[type="button"].button-primary { 225 | color: #FFF; 226 | background-color: #33C3F0; 227 | border-color: #33C3F0; } 228 | .button.button-primary:hover, 229 | button.button-primary:hover, 230 | input[type="submit"].button-primary:hover, 231 | input[type="reset"].button-primary:hover, 232 | input[type="button"].button-primary:hover, 233 | .button.button-primary:focus, 234 | button.button-primary:focus, 235 | input[type="submit"].button-primary:focus, 236 | input[type="reset"].button-primary:focus, 237 | input[type="button"].button-primary:focus { 238 | color: #FFF; 239 | background-color: #1EAEDB; 240 | border-color: #1EAEDB; } 241 | 242 | 243 | /* Forms 244 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 245 | input[type="email"], 246 | input[type="number"], 247 | input[type="search"], 248 | input[type="text"], 249 | input[type="tel"], 250 | 
input[type="url"], 251 | input[type="password"], 252 | textarea, 253 | select { 254 | height: 38px; 255 | padding: 6px 10px; /* The 6px vertically centers text on FF, ignored by Webkit */ 256 | background-color: #fff; 257 | border: 1px solid #D1D1D1; 258 | border-radius: 4px; 259 | box-shadow: none; 260 | box-sizing: border-box; 261 | font-family: inherit; 262 | font-size: inherit; /*https://stackoverflow.com/questions/6080413/why-doesnt-input-inherit-the-font-from-body*/} 263 | /* Removes awkward default styles on some inputs for iOS */ 264 | input[type="email"], 265 | input[type="number"], 266 | input[type="search"], 267 | input[type="text"], 268 | input[type="tel"], 269 | input[type="url"], 270 | input[type="password"], 271 | textarea { 272 | -webkit-appearance: none; 273 | -moz-appearance: none; 274 | appearance: none; } 275 | textarea { 276 | min-height: 65px; 277 | padding-top: 6px; 278 | padding-bottom: 6px; } 279 | input[type="email"]:focus, 280 | input[type="number"]:focus, 281 | input[type="search"]:focus, 282 | input[type="text"]:focus, 283 | input[type="tel"]:focus, 284 | input[type="url"]:focus, 285 | input[type="password"]:focus, 286 | textarea:focus, 287 | select:focus { 288 | border: 1px solid #33C3F0; 289 | outline: 0; } 290 | label, 291 | legend { 292 | display: block; 293 | margin-bottom: 0px; } 294 | fieldset { 295 | padding: 0; 296 | border-width: 0; } 297 | input[type="checkbox"], 298 | input[type="radio"] { 299 | display: inline; } 300 | label > .label-body { 301 | display: inline-block; 302 | margin-left: .5rem; 303 | font-weight: normal; } 304 | 305 | 306 | /* Lists 307 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 308 | ul { 309 | list-style: circle inside; } 310 | ol { 311 | list-style: decimal inside; } 312 | ol, ul { 313 | padding-left: 0; 314 | margin-top: 0; } 315 | ul ul, 316 | ul ol, 317 | ol ol, 318 | ol ul { 319 | margin: 1.5rem 0 1.5rem 3rem; 320 | font-size: 90%; } 321 | li { 322 | margin-bottom: 1rem; } 323 | 324 | 
325 | /* Tables 326 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 327 | table { 328 | border-collapse: collapse; 329 | } 330 | th:not(.CalendarDay), 331 | td:not(.CalendarDay) { 332 | padding: 12px 15px; 333 | text-align: left; 334 | border-bottom: 1px solid #E1E1E1; } 335 | th:first-child:not(.CalendarDay), 336 | td:first-child:not(.CalendarDay) { 337 | padding-left: 0; } 338 | th:last-child:not(.CalendarDay), 339 | td:last-child:not(.CalendarDay) { 340 | padding-right: 0; } 341 | 342 | 343 | /* Spacing 344 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 345 | button, 346 | .button { 347 | margin-bottom: 0rem; } 348 | input, 349 | textarea, 350 | select, 351 | fieldset { 352 | margin-bottom: 0rem; } 353 | pre, 354 | dl, 355 | figure, 356 | table, 357 | form { 358 | margin-bottom: 0rem; } 359 | p, 360 | ul, 361 | ol { 362 | margin-bottom: 0.75rem; } 363 | 364 | /* Utilities 365 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 366 | .u-full-width { 367 | width: 100%; 368 | box-sizing: border-box; } 369 | .u-max-full-width { 370 | max-width: 100%; 371 | box-sizing: border-box; } 372 | .u-pull-right { 373 | float: right; } 374 | .u-pull-left { 375 | float: left; } 376 | 377 | 378 | /* Misc 379 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 380 | hr { 381 | margin-top: 3rem; 382 | margin-bottom: 3.5rem; 383 | border-width: 0; 384 | border-top: 1px solid #E1E1E1; } 385 | 386 | 387 | /* Clearing 388 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 389 | 390 | /* Self Clearing Goodness */ 391 | .container:after, 392 | .row:after, 393 | .u-cf { 394 | content: ""; 395 | display: table; 396 | clear: both; } 397 | 398 | 399 | /* Media Queries 400 | –––––––––––––––––––––––––––––––––––––––––––––––––– */ 401 | /* 402 | Note: The best way to structure the use of media queries is to create the queries 403 | near the relevant code. 
For example, if you wanted to change the styles for buttons 404 | on small devices, paste the mobile query code up in the buttons section and style it 405 | there. 406 | */ 407 | 408 | 409 | /* Larger than mobile */ 410 | @media (min-width: 400px) {} 411 | 412 | /* Larger than phablet (also point when grid becomes active) */ 413 | @media (min-width: 550px) {} 414 | 415 | /* Larger than tablet */ 416 | @media (min-width: 750px) {} 417 | 418 | /* Larger than desktop */ 419 | @media (min-width: 1000px) {} 420 | 421 | /* Larger than Desktop HD */ 422 | @media (min-width: 1200px) {} -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/environment.yml: -------------------------------------------------------------------------------- 1 | # name: cudf_dev10.2 2 | channels: 3 | - rapidsai 4 | - rapidsai-nightly 5 | - nvidia 6 | - pyviz 7 | - conda-forge 8 | - plotly 9 | - anaconda 10 | dependencies: 11 | - cuxfilter>=0.16.0a201007 12 | - cuspatial=0.16 13 | - cugraph=0.16 14 | - cudatoolkit=10.2 15 | - python>=3.6,<3.8 16 | - plotly>=4.5 17 | - dash-core-components 18 | - dash-html-components 19 | - jupyter-dash 20 | - jupyterlab 21 | - jupyter-server-proxy>=1.5.0 22 | - holoviews 23 | - hvplot 24 | - geoviews 25 | - cartopy 26 | - networkx 27 | -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/images/DataLanguage.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/JupyterCon_2020_RAPIDSViz/images/DataLanguage.jpg -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/images/DivvyBikesStation_ map.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/JupyterCon_2020_RAPIDSViz/images/DivvyBikesStation_ map.png -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/images/PlotlyDash-Dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/JupyterCon_2020_RAPIDSViz/images/PlotlyDash-Dashboard.png -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/images/RAPIDS-header-graphic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/JupyterCon_2020_RAPIDSViz/images/RAPIDS-header-graphic.png -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/images/RAPIDS-header.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/JupyterCon_2020_RAPIDSViz/images/RAPIDS-header.png -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/images/RAPIDSwow.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/JupyterCon_2020_RAPIDSViz/images/RAPIDSwow.gif -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/images/README_images.md: 
-------------------------------------------------------------------------------- 1 | # Images 2 | 3 | Any images embedded in markdown cells within the Jupyter notebooks should go in this folder. 4 | 5 | List any image sources and authors in this README. 6 | 7 | **Recommend** that any images used in the materials be shared under a [CC-BY](https://creativecommons.org/licenses/by/2.0/) license. 8 | -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/images/census-crop.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/JupyterCon_2020_RAPIDSViz/images/census-crop.jpg -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/images/census-demo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/JupyterCon_2020_RAPIDSViz/images/census-demo.jpg -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/images/cuxfilter_02_dashboard_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/JupyterCon_2020_RAPIDSViz/images/cuxfilter_02_dashboard_1.png -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/images/cuxfilter_02_dashboard_2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/JupyterCon_2020_RAPIDSViz/images/cuxfilter_02_dashboard_2.png -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/images/cuxfilter_02_dashboard_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/JupyterCon_2020_RAPIDSViz/images/cuxfilter_02_dashboard_3.png -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/images/cuxfilter_02_dashboard_4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/JupyterCon_2020_RAPIDSViz/images/cuxfilter_02_dashboard_4.png -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/images/dashboard-sketch-ideas.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/JupyterCon_2020_RAPIDSViz/images/dashboard-sketch-ideas.jpg -------------------------------------------------------------------------------- /event_notebooks/JupyterCon_2020_RAPIDSViz/images/notebook_04_dashboard_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/JupyterCon_2020_RAPIDSViz/images/notebook_04_dashboard_1.png -------------------------------------------------------------------------------- 
/event_notebooks/JupyterCon_2020_RAPIDSViz/images/plotly_dashboard_sketch.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/JupyterCon_2020_RAPIDSViz/images/plotly_dashboard_sketch.jpg -------------------------------------------------------------------------------- /event_notebooks/KDD_2020/Presenters.md: -------------------------------------------------------------------------------- 1 | # Presenters 2 | 3 | 4 | ### Bartley Richardson - RAPIDS Data Science Lead, NVIDIA 5 | Bartley Richardson, PhD is a Senior Cybersecurity Data Scientist and AI Infrastructure Manager (RAPIDS) at NVIDIA. He leads a team that researches and applies GPU-accelerated ML and DL to help solve today’s information security and cybersecurity challenges. Previously, Bartley was a technical lead and performer on multiple DARPA research projects where he applied data science and ML/DL algorithms at scale to address large cybersecurity problems. His primary research areas are NLP and sequence-based methods for cyber network defense. Bartley holds a PhD in CS and CompE with a focus on un-structured logical query optimization. His BS is in Computer Engineering with a concentration in software design and AI. 6 | 7 | --- 8 | 9 | ### David Bader - Director, Institute for DS, NJIT 10 | 11 | David A. Bader is a Distinguished Professor in the Department of Computer Science at New Jersey Institute of Technology. Prior to this, he served as founding Professor and Chair of the School of Computational Science and Engineering, College of Computing, at Georgia Institute of Technology. He is a Fellow of the IEEE, AAAS, and SIAM, and advises the White House, most recently on the National Strategic Computing Initiative (NSCI). Dr. Bader is a leading expert in solving global grand challenges in science, engineering, computing, and data science. 
His interests are at the intersection of high-performance computing and real-world applications, including cybersecurity, massive-scale analytics, and computational genomics, and he has co-authored over 250 scholarly papers. Dr. Bader has served as a lead scientist in several DARPA programs including High Productivity Computing Systems (HPCS) with IBM, Ubiquitous High Performance Computing (UHPC) with NVIDIA, Anomaly Detection at Multiple Scales (ADAMS), Power Efficiency Revolution For Embedded Computing Technologies (PERFECT), Hierarchical Identify Verify Exploit (HIVE), and Software-Defined Hardware (SDH). He has also served as Director of the Sony-Toshiba-IBM Center of Competence for the Cell Broadband Engine Processor. Bader is a cofounder of the Graph500 List for benchmarking “Big Data” computing platforms. Bader is recognized as a “RockStar” of High-Performance Computing by InsideHPC and as HPCwire's People to Watch in 2012 and 2014. In April 2019, Bader was awarded an NVIDIA AI Lab (NVAIL) award, and in July 2019, Bader received a Facebook Research AI Hardware/Software Co-Design award. 12 | 13 | --- 14 | 15 | ### Brad Rees - RAPIDS cuGraph Lead, NVIDIA 16 | Brad Rees, PhD is a Senior Manager at NVIDIA and lead of the RAPIDS cuGraph team. Brad has been designing, implementing, and supporting a variety of advanced software and hardware systems within the defense and research communities for over 30 years,specializing in complex analytic systems, primarily using graph analytic techniques for social and cyber network analysis. Brad has a Ph.D. in Computer Science with a focus on graph analytics. 17 | 18 | --- 19 | 20 | ### Keith Kraus - RAPIDS cuDF Lead, NVIDIA 21 | Keith Kraus is a Senior Engineer of Applied Solutions Engineering at NVIDIA and leads the RAPIDS cuDF team. At NVIDIA, Keith's focus is on building GPU-accelerated solutions around data engineering, analytics, and visualization. 
Prior to working for NVIDIA, Keith did extensive data engineering, systems engineering, and data visualization work in the cybersecurity domain focused on building a GPU-accelerated big data solution for advanced threat detection and cyber-hunting capabilities. Keith graduated from Stevens Institute of Technology with a BEng in computer engineering and an MEng in networked information systems. 22 | 23 | --- 24 | 25 | ### Josh Patterson - Senior Director of RAPIDS, NVIDIA 26 | Joshua Patterson is the senior director of applied solutions engineering at NVIDIA. Previously, Josh worked with leading experts across the public and private sectors and academia to build a next-generation cyber defense platform. He was also a White House Presidential Innovation Fellow. His current passions are graph analytics, machine learning, and GPU data acceleration. Josh also loves storytelling with data and creating interactive data visualizations. He holds a BA in economics from the University of North Carolina at Chapel Hill and an MA in economics from the University of South Carolina’s Moore School of Business. 27 | 28 | --- 29 | 30 | ### Tom Drabas - Senior Data Scientist, Microsoft 31 | Tom Drabas is a Senior Data Scientist at Microsoft in the Azure Machine Learning group. His research interests include parallel computing, deep learning, and ML algorithms and their applications. During his time at Microsoft, Tom has published multiple books and authored a video series on data science, machine learning, and distributed computing in Spark. He has over 17 years of international experience working in the airline, telecommunication and technology industries. Tom holds a Ph.D. in the airline Operations Research field from the University of New South Wales. 
32 | 33 | --- 34 | 35 | ### Corey Nolet - Senior Data Scientist, NVIDIA 36 | Corey Nolet is a Senior Data Scientist and Engineer on the RAPIDS cuML team at NVIDIA, where he focuses on building and scaling machine learning algorithms to support extreme data loads at light-speed. Prior to working at NVIDIA, Corey spent over a decade building massive-scale exploratory data science & real-time analytics platforms for HPC environments in the defense industry. Corey holds Bs. & Ms. degrees in Computer Science. He is also working towards his PhD in the same discipline, focused on scaling machine learning algorithms for decentralized architectures. Corey has a passion for using data to make better sense of the world. -------------------------------------------------------------------------------- /event_notebooks/KDD_2020/README.md: -------------------------------------------------------------------------------- 1 | ![RAPIDS](img/rapids_logo.png) 2 | 3 |

4 |

5 |

6 | 7 |

8 | 9 | # Accelerating and Expanding End-to-End Data Science Workflows with DL/ML Interoperability Using RAPIDS 10 | 11 | ## KDD 2020 Tutorial 12 | The lines between data science (DS), machine learning (ML), deep learning (DL), and data mining continue to be blurred and removed. This is great as it ushers in vast amounts of capabilities, but it brings increased complexity and a vast number of tools/techniques. It’s not uncommon for DL engineers to use one set of tools for data extraction/cleaning and then pivot to another library for training their models. After training and inference, it’s common to then move data yet again by another set of tools for post-processing. The ​RAPIDS​ suite of open source libraries not only provides a method to execute and accelerate these tasks using GPUs with familiar APIs, but it also provides interoperability with the broader open source community and DL tools while removing unnecessary serializations that slow down workflows. GPUs provide massive parallelization that DL has leveraged for some time, and RAPIDS provides the missing pieces that extend this computing power to more traditional yet important DS and ML tasks (e.g., ETL, modeling). Complete pipelines can be built that encompass everything, including ETL, feature engineering, ML/DL modeling, inference, and visualization, all while removing typical serialization costs and affording seamless interoperability between libraries. All 13 | 14 | experiments using RAPIDS can effortlessly be scheduled, logged and reviewed using existing public cloud options. 15 | Join our engineers and data scientists as they walk through a collection of DS and ML/DL engineering problems that show how RAPIDS running on Azure ML can be used for end-to-end, entirely GPU pipelines. This tutorial includes specifics on how to use RAPIDS for feature engineering, interoperability with common ML/DL packages, and creating GPU native visualizations using ​cuxfilter​. 
The use cases presented here give attendees a hands-on approach to using RAPIDS components as part of a larger workflow, seamlessly integrating with other libraries (e.g., PyTorch) and visualization packages. 16 | 17 | ## Agenda: 18 | 1. Introduction (not hands-on) [20 min] 19 | 1. Speaker Introductions 20 | 2. Getting Connected to the VM Instances 21 | 3. Why RAPIDS, and How RAPIDS Connects to the Larger Ecosystem 22 | 2. Tutorial (hands-on) [2 hours 20 min] 23 | 1. [New York Taxi Data](./notebooks/Taxi/NYCTax.ipynb) 24 | 1. In troduction to RAPIDS via an analysis of the New York City taxi data set 25 | 2. Key Libraries: cuDF, cuML, cuGraph, cuXfilter​ 26 | 2. [Deep Learning for Tabular Data](nvtabular/rossmann-store-sales-example.ipynb) 27 | 1. Perform store sales prediction using tabular deep learning​ 28 | 2. Key Libraries: NVTabular, cuDF, TensorFlow​ 29 | 3. [Single-Cell RNA Sequencing Analysis](notebooks/Lungs/hlca_lung_gpu_analysis.ipynb) 30 | 1. Analyzing gene expression from a population of cells from a human lung​ 31 | 2. Key Libraries: scanpy, cuDF, cuML, cuGraph​ 32 | 4. [Where to Park](./notebooks/parking/codes/) 33 | 1. Analyzing Seattle Parking data and determining the best parking spot within a walkable distance from Space Needle​ 34 | 2. Key Libraries: cuSpatial, cuDF, cuGraph​ 35 | 5. CyBERT 36 | 1. Cyber Log Parsing using Neural Networks and Language Based Model​ 37 | 2. Key Libraries: CLX, cuDF, PyTorch​ 38 | 3. Conclusions (not hands-on) [15 min] 39 | 5. Future Improvements / Roadmap 40 | 6. Any Additional Questions 41 | 42 | ## Configuring Your Environment 43 | This tutorial requires RAPIDS and a modern GPU (Pascal architecture or newer). It makes use of multiple GPU packages and CUDA 10.2. In order to make this process as simple as possible, we've created detailed instructions to get a RAPIDS nightly container modified for the notebooks in this repo. Follow the steps outlined below. 
For the purposes of these instructions, we assume usage of Ubuntu 18.04, CUDA 10.2, and Python 3.7. 44 | 45 | #### 1. Pull the RAPIDS nightly container for your environment by visiting the [Getting Started](https://rapids.ai/start.html) page 46 | 47 | ``` 48 | docker pull rapidsai/rapidsai-nightly:cuda10.2-runtime-ubuntu18.04-py3.7 49 | ``` 50 | 51 | #### 2. Start the container 52 | 53 | ``` 54 | docker run --gpus all -it -p 8888:8888 -p 8787:8787 -p 8786:8786 --name kdd_rapids \ 55 | rapidsai/rapidsai-nightly:cuda10.2-runtime-ubuntu18.04-py3.7 56 | ``` 57 | 58 | You should now be at a prompt inside the container that looks like this: 59 | 60 | ``` 61 | (rapids) root@aa3f80497e9c:/rapids/notebooks# 62 | ``` 63 | 64 | If it does not, attach to the container you just created by running: 65 | 66 | ``` 67 | docker attach kdd_rapids 68 | ``` 69 | 70 | #### 3. Clone this Git repo 71 | 72 | ``` 73 | git clone https://github.com/rapidsai-community/notebooks-contrib.git 74 | ``` 75 | 76 | #### 4. Change directories to the KDD 2020 directory 77 | 78 | ``` 79 | cd /rapids/notebooks/notebooks-contrib/conference_notebooks/KDD_2020/ 80 | ``` 81 | 82 | #### 5. Run the KDD initial setup script 83 | 84 | ``` 85 | sh kdd_initial_setup 86 | ``` 87 | 88 | #### 6. Wait for the script to finish, then visit Jupyter Lab in your Web browser 89 | 90 | Once the script finishes, use your favorite Web browser and navigate to your Jupyter Lab instance. You'll need to know the IP address of the machine where your container is running. If this is your local machine, you can often use `127.0.0.1` or `localhost`. 91 | 92 | Jupyter Lab is running on port 8888. An example running on your local machine would be: 93 | 94 | ``` 95 | 127.0.0.1:8888 96 | ``` 97 | 98 | You should now see the KDD 2020 content in your Web browser. 
99 | 100 | ## Presenters 101 | 102 | [Presenters](Presenters.md) -------------------------------------------------------------------------------- /event_notebooks/KDD_2020/img/cybert_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/KDD_2020/img/cybert_workflow.png -------------------------------------------------------------------------------- /event_notebooks/KDD_2020/img/microsoft_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/KDD_2020/img/microsoft_logo.png -------------------------------------------------------------------------------- /event_notebooks/KDD_2020/img/njit_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/KDD_2020/img/njit_logo.png -------------------------------------------------------------------------------- /event_notebooks/KDD_2020/img/nvidia_logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/KDD_2020/img/nvidia_logo.jpg -------------------------------------------------------------------------------- /event_notebooks/KDD_2020/img/rapids_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/KDD_2020/img/rapids_logo.png -------------------------------------------------------------------------------- /event_notebooks/KDD_2020/kdd_initial_setup.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script sets up the RAPIDS nightly container for the KDD 2020 hands-on tutorial 3 | 4 | echo "****************************************************************" 5 | echo "** Stopping Jupyter Server **" 6 | echo "****************************************************************" 7 | kill $(ps aux | grep '[j]upyter' | awk '{print $2}') 8 | 9 | echo "" 10 | echo "****************************************************************" 11 | echo "** Installing cyBERT requirements **" 12 | echo "****************************************************************" 13 | pip install torch torchvision 14 | pip install transformers 15 | pip install requests 16 | pip install seqeval 17 | 18 | echo "" 19 | echo "****************************************************************" 20 | echo "** Installing NVT requirements **" 21 | echo "****************************************************************" 22 | pip install git+https://github.com/NVIDIA/NVTabular.git 23 | pip install tensorflow 24 | 25 | echo "" 26 | echo "****************************************************************" 27 | echo "** Intalling parking requirements **" 28 | echo "****************************************************************" 29 | pip install --upgrade ipython-autotime wget gmaps geopy 30 | 31 | echo "" 32 | echo "****************************************************************" 33 | echo "** Patching cuspatial **" 34 | echo "****************************************************************" 35 | ### replace __init__.py for cuspatial as it throws a variety of weird erorrs 36 | cp notebooks/parking/__patch/cuspatial_init_patched.py /opt/conda/envs/rapids/lib/python3.7/site-packages/cuspatial/__init__.py 37 | 38 | ### copy libgdal.so.27 to libgdal.so.26 39 | cp /opt/conda/envs/rapids/lib/libgdal.so.27 /opt/conda/envs/rapids/lib/libgdal.so.26 40 | 41 | echo "" 42 | echo 
"****************************************************************" 43 | echo "** Patching CUDA Version 10.2->10.1 **" 44 | echo "****************************************************************" 45 | ln -s /usr/local/cuda/lib64/libcudart.so.10.2 /usr/local/cuda/lib64/libcudart.so.10.1 46 | 47 | echo "" 48 | echo "****************************************************************" 49 | echo "** Modifying LD_LIBRARY_PATH **" 50 | echo "****************************************************************" 51 | export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64/" 52 | 53 | echo "" 54 | echo "****************************************************************" 55 | echo "** Reloacing Config (ldconfig) **" 56 | echo "****************************************************************" 57 | ldconfig 58 | 59 | echo "" 60 | echo "****************************************************************" 61 | echo "** Starting Jupyter Server **" 62 | echo "****************************************************************" 63 | /rapids/utils/start_jupyter.sh -------------------------------------------------------------------------------- /event_notebooks/KDD_2020/notebooks/Lungs/__pycache__/rapids_scanpy_funcs.cpython-37.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/KDD_2020/notebooks/Lungs/__pycache__/rapids_scanpy_funcs.cpython-37.pyc -------------------------------------------------------------------------------- /event_notebooks/KDD_2020/notebooks/Lungs/rapids_scanpy_funcs.py: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright (c) 2020, NVIDIA CORPORATION. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | 17 | import cuml 18 | import cupy as cp 19 | import cudf 20 | 21 | import numpy as np 22 | import scipy 23 | import math 24 | 25 | import dask.array as da 26 | 27 | from cuml.linear_model import LinearRegression 28 | 29 | 30 | def scale(normalized, max_value=10): 31 | mean = normalized.mean(axis=0) 32 | stddev = cp.sqrt(normalized.var(axis=0)) 33 | 34 | normalized -= mean 35 | normalized *= 1/stddev 36 | 37 | normalized[normalized>10] = 10 38 | 39 | return normalized 40 | 41 | 42 | def _regress_out_chunk(X, y): 43 | """ 44 | Performs a data_cunk.shape[1] number of local linear regressions, 45 | replacing the data in the original chunk w/ the regressed result. 
46 | """ 47 | output = [] 48 | lr = LinearRegression(fit_intercept=False) 49 | lr.fit(X, y, convert_dtype=True) 50 | return y.reshape(y.shape[0],) - lr.predict(X).reshape(y.shape[0]) 51 | 52 | 53 | 54 | def normalize_total(filtered_cells, target_sum): 55 | sums = np.array(target_sum / filtered_cells.sum(axis=1)).ravel() 56 | 57 | normalized = filtered_cells.multiply(sums[:, np.newaxis]) # Done on host for now 58 | normalized = cp.sparse.csr_matrix(normalized) 59 | 60 | return normalized 61 | 62 | 63 | def regress_out(normalized, n_counts, percent_mito, verbose=False): 64 | 65 | regressors = cp.ones((n_counts.shape[0]*3)).reshape((n_counts.shape[0], 3), order="F") 66 | 67 | regressors[:, 1] = n_counts 68 | regressors[:, 2] = percent_mito 69 | 70 | for i in range(normalized.shape[1]): 71 | if verbose and i % 500 == 0: 72 | print("Regressed %s out of %s" %(i, normalized.shape[1])) 73 | X = regressors 74 | y = normalized[:,i] 75 | _regress_out_chunk(X, y) 76 | 77 | return normalized 78 | 79 | 80 | def filter_cells(sparse_gpu_array, min_genes, max_genes, rows_per_batch=10000): 81 | n_batches = math.ceil(sparse_gpu_array.shape[0] / rows_per_batch) 82 | print("Running %d batches" % n_batches) 83 | filtered_list = [] 84 | for batch in range(n_batches): 85 | batch_size = rows_per_batch 86 | start_idx = batch * batch_size 87 | stop_idx = min(batch * batch_size + batch_size, sparse_gpu_array.shape[0]) 88 | arr_batch = sparse_gpu_array[start_idx:stop_idx] 89 | filtered_list.append(_filter_cells(arr_batch, 90 | min_genes=min_genes, 91 | max_genes=max_genes)) 92 | 93 | return scipy.sparse.vstack(filtered_list) 94 | 95 | 96 | def _filter_cells(sparse_gpu_array, min_genes, max_genes): 97 | degrees = cp.diff(sparse_gpu_array.indptr) 98 | query = ((min_genes <= degrees) & (degrees <= max_genes)).ravel() 99 | return sparse_gpu_array.get()[query.get()] 100 | 101 | 102 | def filter_genes(sparse_gpu_array, genes_idx, min_cells=0): 103 | thr = np.asarray(sparse_gpu_array.sum(axis=0) >= 
min_cells).ravel() 104 | filtered_genes = sparse_gpu_array[:,thr] 105 | genes_idx = genes_idx[np.where(thr)[0]] 106 | 107 | return filtered_genes, genes_idx.reset_index(drop=True) 108 | 109 | 110 | def select_groups(labels, groups_order_subset='all'): 111 | """Get subset of groups in adata.obs[key]. 112 | """ 113 | 114 | adata_obs_key = labels 115 | groups_order = labels.cat.categories 116 | groups_masks = cp.zeros( 117 | (len(labels.cat.categories), len(labels.cat.codes)), dtype=bool 118 | ) 119 | for iname, name in enumerate(labels.cat.categories): 120 | # if the name is not found, fallback to index retrieval 121 | if labels.cat.categories[iname] in labels.cat.codes: 122 | mask = labels.cat.categories[iname] == labels.cat.codes 123 | else: 124 | mask = iname == labels.cat.codes 125 | groups_masks[iname] = mask.values 126 | groups_ids = list(range(len(groups_order))) 127 | if groups_order_subset != 'all': 128 | groups_ids = [] 129 | for name in groups_order_subset: 130 | groups_ids.append( 131 | cp.where(cp.array(labels.cat.categories.to_array().astype("int32")) == int(name))[0][0] 132 | ) 133 | if len(groups_ids) == 0: 134 | # fallback to index retrieval 135 | groups_ids = cp.where( 136 | cp.in1d( 137 | cp.arange(len(labels.cat.categories)).astype(str), 138 | cp.array(groups_order_subset), 139 | ) 140 | )[0] 141 | 142 | groups_ids = [groups_id.item() for groups_id in groups_ids] 143 | groups_masks = groups_masks[groups_ids] 144 | groups_order_subset = labels.cat.categories[groups_ids].to_array().astype(int) 145 | else: 146 | groups_order_subset = groups_order.to_array() 147 | return groups_order_subset, groups_masks 148 | 149 | 150 | def rank_genes_groups( 151 | X, 152 | labels, # louvain results 153 | var_names, 154 | groupby = str, 155 | groups = None, 156 | reference = 'rest', 157 | n_genes = 100, 158 | key_added = None, 159 | layer = None, 160 | **kwds, 161 | ): 162 | 163 | #### Wherever we see "adata.obs[groupby], we should just replace w/ the groups" 164 | 
def rank_genes_groups(
    X,
    labels,  # louvain results
    var_names,
    groupby=None,  # unused; kept for API compatibility (was the `str` type object)
    groups=None,
    reference='rest',
    n_genes=100,
    key_added=None,  # unused; kept for API compatibility
    layer=None,  # unused; kept for API compatibility
    **kwds,
):
    """Rank genes characterizing each group via cuML logistic regression.

    Parameters
    ----------
    X : cupy array / device matrix, shape (n_cells, n_genes)
        Expression matrix.
    labels : cudf.Series (categorical dtype)
        Cluster assignment per cell.
    var_names : cudf.Index/Series
        Gene names, aligned with the columns of X.
    groups : 'all' or sequence of group names
        Groups to compare; a bare str/int (other than 'all') is rejected.
    reference : str
        'rest' compares each listed group against all others; otherwise the
        listed groups are compared only among themselves.
    n_genes : int
        Number of top-ranked genes to report per group (capped at X.shape[1]).
    **kwds
        Forwarded to cuml.linear_model.LogisticRegression.

    Returns
    -------
    (scores, names, original_reference) : np.recarray, np.recarray, str
    """
    import time

    start = time.time()

    # for clarity, rename variable
    if groups == 'all':
        groups_order = 'all'
    elif isinstance(groups, (str, int)):
        raise ValueError('Specify a sequence of groups')
    else:
        groups_order = list(groups)
        if isinstance(groups_order[0], int):
            groups_order = [str(n) for n in groups_order]
        if reference != 'rest' and reference not in set(groups_order):
            groups_order += [reference]
    if (
        reference != 'rest'
        and reference not in set(labels.cat.categories)
    ):
        cats = labels.cat.categories.tolist()
        raise ValueError(
            f'reference = {reference} needs to be one of groupby = {cats}.'
        )

    groups_order, groups_masks = select_groups(labels, groups_order)

    original_reference = reference

    n_vars = len(var_names)

    # make sure indices are not OoB in case there are fewer genes than n_genes
    n_genes_user = min(n_genes, X.shape[1])

    n_groups = groups_masks.shape[0]
    ns = cp.zeros(n_groups, dtype=int)
    for imask, mask in enumerate(groups_masks):
        ns[imask] = cp.where(mask)[0].size
    reference_indices = cp.arange(n_vars, dtype=int)

    rankings_gene_scores = []
    rankings_gene_names = []

    # Perform logistic regression.
    # If reference is not set, the listed groups are compared to the rest;
    # if reference is set, the listed groups are compared only to each other.
    from cuml.linear_model import LogisticRegression
    reference = groups_order[0]
    # BUGFIX: previously tested len(groups), which is wrong for groups='all'
    # (len of the string) and crashes for groups=None.
    if len(groups_order) == 1:
        raise ValueError('Cannot perform logistic regression on a single cluster.')
    grouping_mask = labels.astype('int').isin(cudf.Series(groups_order))
    grouping = labels.loc[grouping_mask]

    # Indexing with a series causes issues, possibly segfault
    X = X[grouping_mask.values, :]

    clf = LogisticRegression(**kwds)
    clf.fit(X.get(), grouping.to_array().astype('float32'))
    scores_all = cp.array(clf.coef_).T

    for igroup, group in enumerate(groups_order):
        if len(groups_order) <= 2:  # binary logistic regression: one coefficient row
            scores = scores_all[0]
        else:
            scores = scores_all[igroup]

        # top n_genes_user scores, sorted descending
        partition = cp.argpartition(scores, -n_genes_user)[-n_genes_user:]
        partial_indices = cp.argsort(scores[partition])[::-1]
        global_indices = reference_indices[partition][partial_indices]
        rankings_gene_scores.append(scores[global_indices].get())  # TODO: keep on device
        rankings_gene_names.append(var_names[global_indices].to_pandas())
        if len(groups_order) <= 2:
            break  # binary case: a single ranking covers both groups

    groups_order_save = [str(g) for g in groups_order]
    if len(groups_order) == 2:
        # binary case: report only the non-reference group
        groups_order_save = [g for g in groups_order if g != reference]

    print("Ranking took (GPU): " + str(time.time() - start))

    start = time.time()

    scores = np.rec.fromarrays(
        rankings_gene_scores,
        dtype=[(rn, 'float32') for rn in groups_order_save],
    )

    names = np.rec.fromarrays(
        rankings_gene_names,
        dtype=[(rn, 'U50') for rn in groups_order_save],
    )

    print("Preparing output np.rec.fromarrays took (CPU): " + str(time.time() - start))
    print("Note: This operation will be accelerated in a future version")

    return scores, names, original_reference
def download_nyctaxi_data(years, path):
    """Download NYC yellow-cab trip CSVs for the requested years.

    Files are fetched from the anaconda-public-data GCS bucket into
    ``<path>/nyctaxi/<year>/``; files that already exist locally are skipped.

    Parameters
    ----------
    years : iterable of str
        Subset of {"2014", "2015", "2016"}.
    path : str
        Base directory under which the ``nyctaxi`` tree is created.

    Raises
    ------
    ValueError
        If ``years`` contains anything outside the supported years.
    """
    taxi_years = ["2014", "2015", "2016"]

    if not set(years) <= set(taxi_years):
        print(years)
        print("years list not valid, please specify a sublist of")
        print(taxi_years)
        raise ValueError("{years} list is not valid".format(years=years))

    data_dir = os.path.abspath(os.path.join(path, "nyctaxi"))
    os.makedirs(data_dir, exist_ok=True)

    # One pass per (year, month) instead of the previous years x filenames
    # double loop that re-derived the year by splitting the filename.
    for year in years:
        # 2016 data is only published for January-June
        end = 7 if year == "2016" else 13
        year_dir = os.path.join(data_dir, year)
        os.makedirs(year_dir, exist_ok=True)
        for month in range(1, end):
            filename = "yellow_tripdata_{year}-{month:02d}.csv".format(year=year, month=month)
            local_path = os.path.join(year_dir, filename)
            if os.path.exists(local_path):
                # previously "- Downloading" was printed even for skipped files
                print("- File already exists locally")
                continue
            url = (
                "https://storage.googleapis.com/anaconda-public-data/nyc-taxi/csv/"
                "{year}/".format(year=year) + filename
            )
            print("- Downloading " + url)
            with open(local_path, 'wb') as file:
                with urllib.request.urlopen(url) as resp:
                    # content-length may be missing; avoid int(None) crash
                    header = resp.getheader('content-length')
                    length = int(header) if header else 0
                    blocksize = max(4096, length // 100) if length else 65536
                    with tqdm(total=length or None, file=sys.stdout) as pbar:
                        while True:
                            buff = resp.read(blocksize)
                            if not buff:
                                break
                            file.write(buff)
                            pbar.update(len(buff))

    print("-------------------")
    print("-Download complete-")
    print("-------------------")
/event_notebooks/KDD_2020/notebooks/cybert/models/apache_label_map_example.txt: -------------------------------------------------------------------------------- 1 | [PAD]request_methoderror_messagetime_receivedotherresponse_bytes_clferror_levelrequest_urlremote_hostrequest_header_refererrequest_header_user_agentX -------------------------------------------------------------------------------- /event_notebooks/KDD_2020/notebooks/cybert/resources/cybert_workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/KDD_2020/notebooks/cybert/resources/cybert_workflow.png -------------------------------------------------------------------------------- /event_notebooks/KDD_2020/notebooks/nvtabular/rossmann-store-sales-example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Rossmann Store Sales Prediction Example\n", 8 | "In this example, we'll illustrate how to use NVTabular to preprocess and load tabular data for training neural networks into TensorFlow. This usees a [dataset built by FastAI](https://github.com/fastai/fastai/blob/master/courses/dl1/lesson3-rossman.ipynb) for solving the [Kaggle Rossmann Store Sales competition](https://www.kaggle.com/c/rossmann-store-sales). To expedite this tutorial, we've already lightly preprocessed the data we'll be using. For a full version of this specific example, please visit the [NVTabular GitHub](https://github.com/NVIDIA/NVTabular)." 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "We won't go into full details of the Rossmann Kaggle competition, but below is a brief description of the task taken directly from Kaggle:\n", 16 | "\n", 17 | "
Rossmann operates over 3,000 drug stores in 7 European countries. Currently, Rossmann store managers are tasked with predicting their daily sales for up to six weeks in advance. Store sales are influenced by many factors, including promotions, competition, school and state holidays, seasonality, and locality. With thousands of individual managers predicting sales based on their unique circumstances, the accuracy of results can be quite varied.
" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "Essentially, our goal is to use features in this dataset to predict sales. There are a number of different ways to do this. You could employ a random forest classifier or even a Naive Bayes model. And in practice, it's good to try a variety of models and cross validate. However, for the purposes of this tutorial, we want to illustrate how easy it is to use data loaders and features built into NVTabular to create a deep learning model with tabular data.\n", 25 | "\n", 26 | "Here we go!" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "## Do we have a GPU?" 34 | ] 35 | }, 36 | { 37 | "cell_type": "markdown", 38 | "metadata": {}, 39 | "source": [ 40 | "It's always a good idea to check. Really hope we do, otherwise this is going to be a lightening fast tutorial." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "!nvidia-smi" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "And of course we have the necessary imports. We'll primarily be using NVTabular, cuDF, and TensorFlow (which we'll import a bit later)." 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "import nvtabular as nvt\n", 66 | "import os\n", 67 | "import glob\n", 68 | "import cudf\n", 69 | "import requests" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "## Preparing our Dataset\n", 77 | "Let's start by defining some of the _a priori_ information about our data, including its schema (what columns to use and what sorts of variables they represent), as well as the location of the files corresponding to some particular sampling from this schema. 
Note that throughout, I'll use UPPERCASE variables to represent this sort of a priori information that you might usually encode using commandline arguments or config files.\n", 78 | "\n", 79 | "For ease of this tutorial, we've already lightly preprocessed the input data and generated nice, clean CSV files for you. You know, just like in the real world." 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "DATA_DIR = os.environ.get('DATA_DIR', './data')" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "# create data directory if it doesn't exist\n", 98 | "if not os.path.exists(DATA_DIR):\n", 99 | " os.makedirs(DATA_DIR)\n", 100 | "\n", 101 | "# configure paths for models\n", 102 | "DATA_BASE_URL = \"https://data.rapids.ai/cyber/kdd2020/nvt/\"\n", 103 | "\n", 104 | "TRAIN_FILE = \"train.csv\"\n", 105 | "VALIDATE_FILE = \"valid.csv\"\n", 106 | "TEST_FILE = \"test.csv\"\n", 107 | "\n", 108 | "# download the training CSV if it doesn't exist\n", 109 | "if not os.path.exists(DATA_DIR + \"/\" + TRAIN_FILE):\n", 110 | " print(\">> '\" + TRAIN_FILE + \"' was not found, downloading now\")\n", 111 | " r = requests.get(DATA_BASE_URL + TRAIN_FILE)\n", 112 | " open(DATA_DIR + '/' + TRAIN_FILE, 'wb').write(r.content)\n", 113 | "else:\n", 114 | " print(\">> '\" + TRAIN_FILE + \"' was found at: \" + DATA_DIR + \"/\" + TRAIN_FILE)\n", 115 | " \n", 116 | "# download the validation CSV if it doesn't exist\n", 117 | "if not os.path.exists(DATA_DIR + \"/\" + VALIDATE_FILE):\n", 118 | " print(\">> '\" + VALIDATE_FILE + \"' was not found, downloading now\")\n", 119 | " r = requests.get(DATA_BASE_URL + VALIDATE_FILE)\n", 120 | " open(DATA_DIR + '/' + VALIDATE_FILE, 'wb').write(r.content)\n", 121 | "else:\n", 122 | " print(\">> '\" + VALIDATE_FILE + \"' was found at: \" + DATA_DIR + \"/\" + VALIDATE_FILE)\n", 123 
| " \n", 124 | "# download the test CSV if it doesn't exist\n", 125 | "if not os.path.exists(DATA_DIR + \"/\" + TEST_FILE):\n", 126 | " print(\">> '\" + TEST_FILE + \"' was not found, downloading now\")\n", 127 | " r = requests.get(DATA_BASE_URL + TEST_FILE)\n", 128 | " open(DATA_DIR + '/' + TEST_FILE, 'wb').write(r.content)\n", 129 | "else:\n", 130 | " print(\">> '\" + TEST_FILE + \"' was found at: \" + DATA_DIR + \"/\" + TEST_FILE)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "markdown", 135 | "metadata": {}, 136 | "source": [ 137 | "What files are available to train on in our data directory?" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [ 146 | "! ls $DATA_DIR" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "`train.csv` and `valid.csv` seem like good candidates, let's use those." 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": null, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "TRAIN_PATH = os.path.join(DATA_DIR, 'train.csv')\n", 163 | "VALID_PATH = os.path.join(DATA_DIR, 'valid.csv')" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "metadata": {}, 169 | "source": [ 170 | "### Data Exploration" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "Before we set about modeling, we can explore the data using cuDF. Let's just read in the training data." 
178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [ 186 | "train_df = cudf.read_csv(TRAIN_PATH, sep=',')" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "train_df.head()" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "The data is fairly wide, so we can select just one record and look at what the typical data is. cuDF doesn't support non-numeric types in the `values` call yet, but it's easy to take a small amount of data to Pandas to accomplish this." 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "train_df.loc[0:0].to_pandas().values" 212 | ] 213 | }, 214 | { 215 | "cell_type": "markdown", 216 | "metadata": {}, 217 | "source": [ 218 | "We can inspect the data types of the data as well." 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [ 227 | "train_df.columns.to_series().groupby(train_df.dtypes).groups" 228 | ] 229 | }, 230 | { 231 | "cell_type": "markdown", 232 | "metadata": {}, 233 | "source": [ 234 | "By repeating this process, we can assign columns into variables that link common data types. We're looking to predict `Sales`, so we'll denote that as our label." 
235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [ 243 | "CATEGORICAL_COLUMNS = [\n", 244 | " 'Store', 'DayOfWeek', 'Year', 'Month', 'Day', 'StateHoliday', 'CompetitionMonthsOpen',\n", 245 | " 'Promo2Weeks', 'StoreType', 'Assortment', 'PromoInterval', 'CompetitionOpenSinceYear', 'Promo2SinceYear',\n", 246 | " 'State', 'Week', 'Events', 'Promo_fw', 'Promo_bw', 'StateHoliday_fw', 'StateHoliday_bw',\n", 247 | " 'SchoolHoliday_fw', 'SchoolHoliday_bw'\n", 248 | "]\n", 249 | "\n", 250 | "CONTINUOUS_COLUMNS = [\n", 251 | " 'CompetitionDistance', 'Max_TemperatureC', 'Mean_TemperatureC', 'Min_TemperatureC',\n", 252 | " 'Max_Humidity', 'Mean_Humidity', 'Min_Humidity', 'Max_Wind_SpeedKm_h', \n", 253 | " 'Mean_Wind_SpeedKm_h', 'CloudCover', 'trend', 'trend_DE',\n", 254 | " 'AfterStateHoliday', 'BeforeStateHoliday', 'Promo', 'SchoolHoliday'\n", 255 | "]\n", 256 | "LABEL_COLUMNS = ['Sales']\n", 257 | "\n", 258 | "COLUMNS = CATEGORICAL_COLUMNS + CONTINUOUS_COLUMNS + LABEL_COLUMNS" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "### Workflows and Preprocessing\n", 266 | "A `Workflow` is used to represent the chains of feature engineering and preprocessing operations performed on a dataset, and is instantiated with a description of the dataset's schema so that it can keep track of how columns transform with each operation.\n", 267 | "\n", 268 | "_NOTE: As of this tutorial, NVT doesn't support transforming label columns. 
We'll pretend it's a regular continuous column during our feature engineering phase._" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": null, 274 | "metadata": {}, 275 | "outputs": [], 276 | "source": [ 277 | "proc = nvt.Workflow(\n", 278 | " cat_names=CATEGORICAL_COLUMNS,\n", 279 | " cont_names=CONTINUOUS_COLUMNS+LABEL_COLUMNS,\n", 280 | " label_name=LABEL_COLUMNS\n", 281 | ")" 282 | ] 283 | }, 284 | { 285 | "cell_type": "markdown", 286 | "metadata": {}, 287 | "source": [ 288 | "### Adding Operations to our Workflow\n", 289 | "We add operations to a `Workflow` by leveraging the `add_(cat|cont)_feature` and `add_(cat|cont)_preprocess` methods for categorical and continuous variables, respectively. When we're done adding ops, we call the `finalize` method to let the `Workflow` build a representation of its outputs. We use these operations to fill missing values, standardize the `Sales` column around 0 with a standard deviation of 1 (`LogOp`), normalize continuous columns, and transform categorical features into unique integer values (`Categorify`). Complete details about these functions are available on the [NVTabular's API documention site](https://nvidia.github.io/NVTabular/index.html)." 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": {}, 296 | "outputs": [], 297 | "source": [ 298 | "proc.add_cont_feature(nvt.ops.FillMissing())\n", 299 | "proc.add_cont_preprocess(nvt.ops.LogOp(columns=['Sales']))\n", 300 | "proc.add_cont_preprocess(nvt.ops.Normalize())\n", 301 | "proc.add_cat_preprocess(nvt.ops.Categorify())\n", 302 | "proc.finalize()" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": {}, 308 | "source": [ 309 | "### Datasets\n", 310 | "In general, the `Ops` in our `Workflow` will require measurements of statistical properties of our data in order to be leveraged. 
For example, the `Normalize` op requires measurements of the dataset mean and standard deviation, and the `Categorify` op requires an accounting of all the categories a particular feature can manifest. However, we frequently need to measure these properties across datasets which are too large to fit into GPU memory (or CPU memory for that matter) at once.\n", 311 | "\n", 312 | "NVTabular solves this by providing the `dataset` object, an iterator over manageable chunks of sets of parquet or csv files that can we can use to compute statistics in an online fashion (and, later, to train neural networks in batches loaded from disk). The size of those chunks will be determined by the `gpu_memory_frac` kwarg, which will load chunks whose memory footprint is equal to that fraction of available GPU memory.\n", 313 | "\n", 314 | "Larger chunks will lead to shorter run times due to the parallel-processing power of GPUs, but will constrain your memory and possibly lead to disk caching by expensive operations, thereby lowering efficiency." 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "GPU_MEMORY_FRAC = 0.2\n", 324 | "train_ds_iterator = nvt.Dataset(TRAIN_PATH, gpu_memory_frac=GPU_MEMORY_FRAC, columns=COLUMNS)\n", 325 | "valid_ds_iterator = nvt.Dataset(VALID_PATH, gpu_memory_frac=GPU_MEMORY_FRAC, columns=COLUMNS)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "markdown", 330 | "metadata": {}, 331 | "source": [ 332 | "Now that we have our datasets, we'll apply our `Workflow` to them and save the results out to parquet files for fast reading at train time. We'll also measure and record statistics on our training set using the `record_stats=True` kwarg so that our `Workflow` can use them at apply time." 
333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": null, 338 | "metadata": {}, 339 | "outputs": [], 340 | "source": [ 341 | "PREPROCESS_DIR = os.path.join(DATA_DIR, 'jp_ross')\n", 342 | "PREPROCESS_DIR_TRAIN = os.path.join(PREPROCESS_DIR, 'train')\n", 343 | "PREPROCESS_DIR_VALID = os.path.join(PREPROCESS_DIR, 'valid')\n", 344 | "! mkdir -p $PREPROCESS_DIR_TRAIN\n", 345 | "! mkdir -p $PREPROCESS_DIR_VALID" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "proc.apply(train_ds_iterator, apply_offline=True, record_stats=True, output_path=PREPROCESS_DIR_TRAIN, shuffle=False)\n", 355 | "proc.apply(valid_ds_iterator, apply_offline=True, record_stats=False, output_path=PREPROCESS_DIR_VALID, shuffle=False)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "markdown", 360 | "metadata": {}, 361 | "source": [ 362 | "### Finalize Columns\n", 363 | "The workflow will leverage the `Workflow.ds_to_tensors` method, which will map a dataset to its corresponding tensors. In order to make sure it runs correctly, we'll call the `create_final_cols` method to let the `Workflow` know to build the output dataset schema, and then we'll be sure to remove instances of the label column that got added to that schema when we performed processing on it." 
364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": {}, 370 | "outputs": [], 371 | "source": [ 372 | "proc.create_final_cols()\n", 373 | "# using log op and normalize on sales column causes it to get added to\n", 374 | "# continuous columns_ctx, so we'll remove it here\n", 375 | "while True:\n", 376 | " try:\n", 377 | " proc.columns_ctx['final']['cols']['continuous'].remove(LABEL_COLUMNS[0])\n", 378 | " except ValueError:\n", 379 | " break" 380 | ] 381 | }, 382 | { 383 | "cell_type": "markdown", 384 | "metadata": {}, 385 | "source": [ 386 | "## Training a Network\n", 387 | "Now that our data is preprocessed and saved out, we can leverage `dataset`s to read through the preprocessed parquet files in an online fashion to train neural networks! Even better, using the `dlpack` library, we can pass data loaded by cuDF's accelerated parquet reader to networks in TensorFlow.\n", 388 | "\n", 389 | "We'll start by setting some universal hyperparameters for our model and optimizer (without making any claims on the quality of these hyperparmeter choices). We leave it as an exercise to the attendee to experiment with the hyperparemeters." 
390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "BATCH_SIZE = 65536\n", 399 | "LEARNING_RATE = 1e-3\n", 400 | "EMBEDDING_DROPOUT_RATE = 0.04\n", 401 | "DROPOUT_RATES = [0.001, 0.01]\n", 402 | "HIDDEN_DIMS = [1000, 500]\n", 403 | "EPOCHS = 10\n", 404 | "\n", 405 | "# our categorical encoder provides a handy utility for coming up with default embedding sizes\n", 406 | "# based on the number of potential categories, so we'll just use those defaults\n", 407 | "EMBEDDING_TABLE_SHAPES = {\n", 408 | " column: shape for column, shape in\n", 409 | " nvt.ops.get_embedding_sizes(proc).items()\n", 410 | "}\n", 411 | "\n", 412 | "TRAIN_PATHS = sorted(glob.glob(os.path.join(PREPROCESS_DIR_TRAIN, '*.parquet')))\n", 413 | "VALID_PATHS = sorted(glob.glob(os.path.join(PREPROCESS_DIR_VALID, '*.parquet')))" 414 | ] 415 | }, 416 | { 417 | "cell_type": "markdown", 418 | "metadata": {}, 419 | "source": [ 420 | "## Data Loaders\n", 421 | "The first thing we need to do is set up the objects for getting data into our models" 422 | ] 423 | }, 424 | { 425 | "cell_type": "markdown", 426 | "metadata": {}, 427 | "source": [ 428 | "`KerasSequenceDataset` wraps a lightweight iterator around a `dataset` object to handle chunking, shuffling, and application of any workflows (which can be applied online as a preprocessing step). 
For column names, can use either a list of string names or a list of TensorFlow `feature_columns` that will be used to feed the network" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": null, 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [ 437 | "import tensorflow as tf\n", 438 | "\n", 439 | "# we can control how much memory to give tensorflow with this environment variable\n", 440 | "# IMPORTANT: make sure you do this before you initialize TF's runtime, otherwise\n", 441 | "# it's too late and TF will have claimed all free GPU memory\n", 442 | "os.environ['TF_MEMORY_ALLOCATION'] = \"8192\" # explicit MB\n", 443 | "os.environ['TF_MEMORY_ALLOCATION'] = \"0.5\" # fraction of free memory\n", 444 | "from nvtabular.tf_dataloader import KerasSequenceDataset\n", 445 | "\n", 446 | "# cheap wrapper to keep things some semblance of neat\n", 447 | "def make_categorical_embedding_column(name, dictionary_size, embedding_dim):\n", 448 | " return tf.feature_column.embedding_column(\n", 449 | " tf.feature_column.categorical_column_with_identity(name, dictionary_size),\n", 450 | " embedding_dim\n", 451 | " )\n", 452 | "\n", 453 | "# instantiate our columns\n", 454 | "categorical_columns = [\n", 455 | " make_categorical_embedding_column(name, *EMBEDDING_TABLE_SHAPES[name]) for\n", 456 | " name in CATEGORICAL_COLUMNS\n", 457 | "]\n", 458 | "continuous_columns = [\n", 459 | " tf.feature_column.numeric_column(name, (1,)) for name in CONTINUOUS_COLUMNS\n", 460 | "]\n", 461 | "\n", 462 | "# feed them to our datasets\n", 463 | "train_dataset_tf = KerasSequenceDataset(\n", 464 | " TRAIN_PATHS, # you could also use a glob pattern\n", 465 | " categorical_columns+continuous_columns,\n", 466 | " batch_size=BATCH_SIZE,\n", 467 | " label_name=LABEL_COLUMNS[0],\n", 468 | " shuffle=True,\n", 469 | " buffer_size=48 # how many batches to load at once\n", 470 | ")\n", 471 | "valid_dataset_tf = KerasSequenceDataset(\n", 472 | " VALID_PATHS, # you could also use 
a glob pattern\n", 473 | " categorical_columns+continuous_columns,\n", 474 | " batch_size=BATCH_SIZE*4,\n", 475 | " label_name=LABEL_COLUMNS[0],\n", 476 | " shuffle=False,\n", 477 | " buffer_size=12\n", 478 | ")" 479 | ] 480 | }, 481 | { 482 | "cell_type": "markdown", 483 | "metadata": {}, 484 | "source": [ 485 | "## Defining a Model\n", 486 | "Next we'll need to define the inputs that will feed our model and build an architecture on top of them. For now, we'll just stick to a simple MLP model." 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "Using Keras, we can define the layers of our model and their parameters explicitly. Here, for the sake of consistency, I've tried to recreate the model created by FastAI as faithfully as I can given their description [here](https://docs.fast.ai/tabular.models.html#TabularModel), without making any claims as to whether this is the _right_ model to use." 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": null, 499 | "metadata": {}, 500 | "outputs": [], 501 | "source": [ 502 | "# DenseFeatures layer needs a dictionary of {feature_name: input}\n", 503 | "categorical_inputs = {}\n", 504 | "for column_name in CATEGORICAL_COLUMNS:\n", 505 | " categorical_inputs[column_name] = tf.keras.Input(name=column_name, shape=(1,), dtype=tf.int64)\n", 506 | "categorical_embedding_layer = tf.keras.layers.DenseFeatures(categorical_columns)\n", 507 | "categorical_x = categorical_embedding_layer(categorical_inputs)\n", 508 | "categorical_x = tf.keras.layers.Dropout(EMBEDDING_DROPOUT_RATE)(categorical_x)\n", 509 | "\n", 510 | "# Just concatenating continuous, so can use a list\n", 511 | "continuous_inputs = []\n", 512 | "for column_name in CONTINUOUS_COLUMNS:\n", 513 | " continuous_inputs.append(tf.keras.Input(name=column_name, shape=(1,), dtype=tf.float32))\n", 514 | "continuous_embedding_layer = tf.keras.layers.Concatenate(axis=1)\n", 515 | "continuous_x = 
continuous_embedding_layer(continuous_inputs)\n", 516 | "continuous_x = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.1)(continuous_x)\n", 517 | "\n", 518 | "# concatenate and build MLP\n", 519 | "x = tf.keras.layers.Concatenate(axis=1)([categorical_x, continuous_x])\n", 520 | "for dim, dropout_rate in zip(HIDDEN_DIMS, DROPOUT_RATES):\n", 521 | " x = tf.keras.layers.Dense(dim, activation='relu')(x)\n", 522 | " x = tf.keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.1)(x)\n", 523 | " x = tf.keras.layers.Dropout(dropout_rate)(x)\n", 524 | "x = tf.keras.layers.Dense(1, activation='linear')(x)\n", 525 | "\n", 526 | "# combine all our inputs into a single list\n", 527 | "# (note that you can still use .fit, .predict, etc. on a dict\n", 528 | "# that maps input tensor names to input values)\n", 529 | "inputs = list(categorical_inputs.values()) + continuous_inputs\n", 530 | "tf_model = tf.keras.Model(inputs=inputs, outputs=x)" 531 | ] 532 | }, 533 | { 534 | "cell_type": "markdown", 535 | "metadata": {}, 536 | "source": [ 537 | "## Define Optimizer and Train\n", 538 | "This is probably the most conceptually consistent part between the frameworks: we'll define an objective and a method for optimizing it, then fit our model to our dataset iterators using that optimization scheme. We'll build a quick implementation of the metric Kaggle used in the original competition so that we can keep tabs on it during training.\n", 539 | "\n", 540 | "Submissions to the Rossmann Store Sales Kaggle competition were evaulated using Root Mean Square Percentage Error (RMSPE).\n", 541 | "\n", 542 | "$$\\textrm{RMSPE}=\\sqrt{\\frac{1}{n}\\sum_{i=1}^{n}\\left ( \\frac{y_i-\\hat{y_i}}{y_i} \\right )^2}$$\n", 543 | "\n", 544 | "Note that we're making an explicit choice to drop zeroes to maintain consistency with Kaggle." 
545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": null, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [ 553 | "def rmspe_tf(y_true, y_pred):\n", 554 | " # map back into \"true\" space by undoing transform\n", 555 | " y_true = y_true*proc.stats['stds']['Sales'] + proc.stats['means']['Sales']\n", 556 | " y_pred = y_pred*proc.stats['stds']['Sales'] + proc.stats['means']['Sales']\n", 557 | "\n", 558 | " # and then the log(1+x)\n", 559 | " y_true = tf.exp(y_true) - 1\n", 560 | " y_pred = tf.exp(y_pred) - 1\n", 561 | "\n", 562 | " # drop zeroes for stability (and consistency with Kaggle)\n", 563 | " where = tf.not_equal(y_true, 0.)\n", 564 | " y_true = y_true[where]\n", 565 | " y_pred = y_pred[where]\n", 566 | "\n", 567 | " percent_error = (y_true - y_pred) / y_true\n", 568 | " return tf.sqrt(tf.reduce_mean(percent_error**2))\n", 569 | "\n", 570 | "optimizer = tf.keras.optimizers.Adam(LEARNING_RATE)\n", 571 | "tf_model.compile(optimizer, 'mse', metrics=[rmspe_tf])\n", 572 | "history = tf_model.fit(\n", 573 | " train_dataset_tf,\n", 574 | " validation_data=valid_dataset_tf,\n", 575 | " epochs=EPOCHS\n", 576 | ")" 577 | ] 578 | }, 579 | { 580 | "cell_type": "markdown", 581 | "metadata": {}, 582 | "source": [ 583 | "This does fairly well straight away, with minimal tuning of hyperparameters and thought given to specific features and feature engineering. We also could reconsider the network structure, opting for something other than a simple MLP. In reality, it would take an RMSPE <= 0.10021 to beat the eventual winner of this specific Kaggle competition. So while this submission won't win that Kaggle competition, the hope was to illustrate how easy it is to process data using NVTabular and feed it to a neural network in TensorFlow." 
584 | ] 585 | }, 586 | { 587 | "cell_type": "markdown", 588 | "metadata": {}, 589 | "source": [ 590 | "### Acknowledgments\n", 591 | "\n", 592 | "This notebook was adapted for use in this tutorial from the [NVTabular repository](https://github.com/NVIDIA/NVTabular). It was originally created by Even Oldridge, Julio Perez, and Alec Gunny." 593 | ] 594 | } 595 | ], 596 | "metadata": { 597 | "file_extension": ".py", 598 | "kernelspec": { 599 | "display_name": "Python 3", 600 | "language": "python", 601 | "name": "python3" 602 | }, 603 | "language_info": { 604 | "codemirror_mode": { 605 | "name": "ipython", 606 | "version": 3 607 | }, 608 | "file_extension": ".py", 609 | "mimetype": "text/x-python", 610 | "name": "python", 611 | "nbconvert_exporter": "python", 612 | "pygments_lexer": "ipython3", 613 | "version": "3.7.8" 614 | }, 615 | "mimetype": "text/x-python", 616 | "name": "python", 617 | "npconvert_exporter": "python", 618 | "pygments_lexer": "ipython3", 619 | "version": 3 620 | }, 621 | "nbformat": 4, 622 | "nbformat_minor": 4 623 | } 624 | -------------------------------------------------------------------------------- /event_notebooks/KDD_2020/notebooks/parking/README.md: -------------------------------------------------------------------------------- 1 | # Accelerating and Expanding End-to-End Data Science Workflows with DL/ML Interoperability Using RAPIDS 2 | ## Analyzing the Paid Parking Occupancy dataset from Seattle Department of Transportation 3 | 4 | In this part of the hands-on tutorial session we have three notebooks: 5 | 6 | 1. [Where should I park?](codes/1_rapids_seattleParking.ipynb) Using this notebook we will find the parking spots that maximize your chances of having an empty spot when you arrive there. 7 | 2. [Where do I walk?](codes/2_rapids_seattleParking_graph.ipynb) With this notebook we will calculate the walking distance to parking spots instead of *as the crow flies* using haversine distance. 8 | 3. 
[Where really are the parking spots?](codes/3_rapids_seattleParking_parkingNodes.ipynb) Finally, in this notebook, we will *walk* in the right way, following the roads by adding additional nodes to the road graph. 9 | 10 | We will work with a dataset published by Seattle Department of Transportation called Paid Parking Occupancy that enumerates every single parking transaction in the city of Seattle. The dataset is published daily generating around 3GB of data monthly but in this workshop we will only be using two months' worth of data. Namely, we will be looking at the period of May and June of 2019. 11 | 12 | In order to run these notebooks you will need access to a machine or a compute instance in the cloud that has a GPU from NVIDIA. The GPU needs to be at least a Pascal or above family so any GTX 1000-series like 1080 Ti should work fine. 13 | 14 | We do support RAPIDS in AzureML and you can use our `dask_cloudprovider.AzureMLCluster` tool to quickly instantiate a Dask cluster running RAPIDS on Azure ML. You can install the dask-cloudprovider package using pip: `pip install dask-cloudprovider`. For an example of how to start the Dask Cluster, check [an example here](https://github.com/drabastomek/GTC/blob/master/SJ_2020/workshop/1_Setup/Setup.ipynb). **Note that you will need to provide your own `subscription_id`, `resource_id` and `workspace_name`.** 15 | 16 | 17 | The datasets we will use will automatically download when you use the notebooks. 
-------------------------------------------------------------------------------- /event_notebooks/KDD_2020/notebooks/parking/__patch/cuspatial_init_patched.py: -------------------------------------------------------------------------------- 1 | from .core.gis import ( 2 | directed_hausdorff_distance, 3 | haversine_distance, 4 | lonlat_to_cartesian, 5 | point_in_polygon, 6 | polygon_bounding_boxes, 7 | polyline_bounding_boxes, 8 | ) 9 | from .core.spatial_window import points_in_spatial_window 10 | from .core.trajectory import ( 11 | derive_trajectories, 12 | trajectory_bounding_boxes, 13 | trajectory_distances_and_speeds, 14 | ) 15 | from .io.shapefile import read_polygon_shapefile -------------------------------------------------------------------------------- /event_notebooks/KDD_2020/notebooks/parking/codes/config/GoogleMapsAPI.cred: -------------------------------------------------------------------------------- 1 | AIzaSyDiyIPOYUARNlDUtKXZmKbZ3WeZh-XnuwI -------------------------------------------------------------------------------- /event_notebooks/README.md: -------------------------------------------------------------------------------- 1 | # RAPIDS Events Notebook 2 | 3 | These notebooks are from recent events (Conferences or Meetups). 
4 | 5 | If you are looking for notebooks from previous conferences or events, please go to [Archived Conference Notebooks in Community Notebooks](https://github.com/rapidsai-community/notebooks-contrib/tree/community_relaunch/the_rapids_archive/archive_conference_notebooks) -------------------------------------------------------------------------------- /event_notebooks/TMLS_2020/notebooks/Taxi/Overview-Taxi.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Intro to RAPIDS using the New York City Yellow Taxi Data \n", 8 | "light on Data Science, heavy on comparisons.\n", 9 | "\n", 10 | "This notebook is for the The Toronto Machine Learning Summit, Nov 16 -29, 2020\n", 11 | "\n", 12 | "![TMLS](./img/TMLS.png)" 13 | ] 14 | }, 15 | { 16 | "cell_type": "markdown", 17 | "metadata": {}, 18 | "source": [ 19 | "This notebook includes\n", 20 | "\n", 21 | "* cudf - for basic ETL and some __statistical analysis__ \n", 22 | "* cuml - for __machine learning__\n", 23 | "* cugraph - for some __graph analysis__\n", 24 | "* cuxfilter - for __visualization__\n" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "metadata": {}, 30 | "source": [ 31 | "----\n", 32 | "# Setup" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "# load the libraries\n", 42 | "import cudf\n", 43 | "\n", 44 | "import numpy as np\n", 45 | "import pandas as pd\n", 46 | "import math\n", 47 | "\n", 48 | "import os\n", 49 | "import gc\n", 50 | "\n", 51 | "from collections import OrderedDict\n", 52 | "import argparse\n", 53 | "import datetime\n", 54 | "import time" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "try: \n", 64 | " import tqdm\n", 65 | "except ModuleNotFoundError:\n", 66 | " 
os.system('pip install tqdm')\n", 67 | " import tqdm" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": null, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "# Let's use Unified Memory (aka managed memory) so that we try and avoid OOM errors \n", 77 | "# start by importing the RAPIDS Memory Manager and then reinitializing with managed memory turn on\n", 78 | "import rmm\n", 79 | "\n", 80 | "rmm.reinitialize( \n", 81 | " managed_memory=True, # Use managed memory, this allows for oversubscription of the GPU\n", 82 | " pool_allocator=False, # default is False\n", 83 | " devices=0, # GPU device IDs to register. By default, registers only GPU 0.\n", 84 | ")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Download the data" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "top_dir = \"./\"\n", 101 | "data_dir = \"./nyctaxi\"" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "# Download Taxi data\n", 111 | "\n", 112 | "if os.path.exists(data_dir) == False:\n", 113 | " import nyctaxi_data\n", 114 | "\n", 115 | " print(\"downloading data\")\n", 116 | " nyctaxi_data.download_nyctaxi_data([\"2016\"], top_dir)\n", 117 | " " 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "----\n", 125 | "\n", 126 | "# cuDF - Accelerated Data Frame " 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "# get a list of files\n", 136 | "data_path = top_dir + \"nyctaxi/2016\"\n", 137 | "\n", 138 | "files = []\n", 139 | "\n", 140 | "for f in sorted(os.listdir(data_path)):\n", 141 | " if f[0:6] != 'yellow':\n", 142 | " continue\n", 143 | " \n", 144 | " fname = 
os.path.join(data_path, f)\n", 145 | " \n", 146 | " files.append(fname)\n" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "files" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "!du -sh $data_path" 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "## Loading data performance test" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "def read_pandas(f):\n", 181 | " start_t = time.time()\n", 182 | " df = pd.read_csv(f)\n", 183 | " end_t = time.time() - start_t\n", 184 | "\n", 185 | " return df, end_t" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "def read_cudf(f):\n", 195 | " start_t = time.time()\n", 196 | " df = cudf.read_csv(f)\n", 197 | " end_t = time.time() - start_t\n", 198 | "\n", 199 | " return df, end_t" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "_ = read_pandas(files[0])" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": null, 214 | "metadata": {}, 215 | "outputs": [], 216 | "source": [ 217 | "# Load data with Pandas\n", 218 | "\n", 219 | "data = []\n", 220 | "\n", 221 | "start_t = time.time()\n", 222 | "\n", 223 | "for f in files:\n", 224 | " print(\"\\treading \" + f, end = '')\n", 225 | " df, t = read_pandas(f)\n", 226 | " print(\" ... 
in time of \" + str(t) + \" seconds\")\n", 227 | " data.append(df)\n", 228 | " \n", 229 | "taxi_pdf = pd.concat(data)\n", 230 | "\n", 231 | "end_t = time.time()\n", 232 | "\n", 233 | "print(f\"loaded {len(taxi_pdf):,} records in {(end_t - start_t):2f} seconds\")\n", 234 | "\n", 235 | "del data" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "# Load data with RAPIDS cuDF\n", 245 | "\n", 246 | "data = []\n", 247 | "\n", 248 | "start_t = time.time()\n", 249 | "\n", 250 | "for f in files:\n", 251 | " print(\"\\treading \" + f, end = '')\n", 252 | " df, t = read_cudf(f)\n", 253 | " print(\" ... in time of \" + str(t)+ \" seconds\")\n", 254 | " data.append(df)\n", 255 | "\n", 256 | "taxi_gdf = cudf.concat(data)\n", 257 | "\n", 258 | "end_t = time.time()\n", 259 | "\n", 260 | "print(f\"loaded {len(taxi_gdf):,} records in {(end_t - start_t):2f} seconds\")\n", 261 | "\n", 262 | "del data" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": null, 275 | "metadata": {}, 276 | "outputs": [], 277 | "source": [ 278 | "taxi_gdf.head(5)" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "metadata": {}, 284 | "source": [ 285 | "## Sort Comparisons - Single Field" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "%%time\n", 295 | "sp = taxi_pdf.sort_values(by='trip_distance',ascending=False)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "sp.head(5)" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "%%time\n", 
314 | "sg = taxi_gdf.sort_values(by='trip_distance',ascending=False)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "sg.head(5)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "metadata": {}, 329 | "source": [ 330 | "## Group By - Single Column " 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "%%time\n", 340 | "gbp = taxi_pdf.groupby('passenger_count').count()" 341 | ] 342 | }, 343 | { 344 | "cell_type": "code", 345 | "execution_count": null, 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [ 349 | "gbp.head(5)" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": null, 355 | "metadata": {}, 356 | "outputs": [], 357 | "source": [ 358 | "%%time\n", 359 | "gbg = taxi_gdf.groupby('passenger_count').count()" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": null, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "gbg.head(5)" 369 | ] 370 | }, 371 | { 372 | "cell_type": "markdown", 373 | "metadata": {}, 374 | "source": [ 375 | "## Fun with Data" 376 | ] 377 | }, 378 | { 379 | "cell_type": "code", 380 | "execution_count": null, 381 | "metadata": {}, 382 | "outputs": [], 383 | "source": [ 384 | "%%time\n", 385 | "print(f\"Max fare was ${taxi_pdf['fare_amount'].max():,}\")" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "%%time\n", 395 | "print(f\"Max fare was ${taxi_gdf['fare_amount'].max():,}\")" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": null, 401 | "metadata": {}, 402 | "outputs": [], 403 | "source": [] 404 | }, 405 | { 406 | "cell_type": "code", 407 | "execution_count": null, 408 | "metadata": {}, 409 | "outputs": [], 410 | "source": [ 411 
| "# looking at that huge fare\n", 412 | "maxf = taxi_gdf['fare_amount'].max()\n", 413 | "taxi_gdf.query('fare_amount == @maxf')" 414 | ] 415 | }, 416 | { 417 | "cell_type": "code", 418 | "execution_count": null, 419 | "metadata": {}, 420 | "outputs": [], 421 | "source": [ 422 | "print(f\"Farthest trip was {taxi_gdf['trip_distance'].max():,} miles\")" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": {}, 429 | "outputs": [], 430 | "source": [ 431 | "# How long did it take to drive that distance?\n", 432 | "maxd= taxi_gdf['trip_distance'].max()\n", 433 | "taxi_gdf.query('trip_distance == @maxd')" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "### Changing data types" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": null, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "# change some data types\n", 450 | "taxi_gdf = taxi_gdf.astype({'tpep_pickup_datetime':'datetime64[ms]', 'tpep_dropoff_datetime':'datetime64[ms]'})" 451 | ] 452 | }, 453 | { 454 | "cell_type": "code", 455 | "execution_count": null, 456 | "metadata": {}, 457 | "outputs": [], 458 | "source": [] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": {}, 463 | "source": [ 464 | "### Filtering data" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": {}, 471 | "outputs": [], 472 | "source": [ 473 | "# filter out records with missing or outlier values\n", 474 | "query_frags = (\"(fare_amount > 0 and fare_amount < 500) \" +\n", 475 | " \"and (passenger_count > 0 and passenger_count < 6) \" +\n", 476 | " \"and (pickup_longitude > -75 and pickup_longitude < -73) \" +\n", 477 | " \"and (dropoff_longitude > -75 and dropoff_longitude < -73) \" +\n", 478 | " \"and (pickup_latitude > 40 and pickup_latitude < 42) \" +\n", 479 | " \"and (dropoff_latitude > 40 and dropoff_latitude < 42)\" +\n", 480 | " \"and 
(pickup_latitude != dropoff_latitude) \" +\n", 481 | " \"and (pickup_longitude != dropoff_longitude)\"\n", 482 | " )\n", 483 | "\n", 484 | "taxi_gdf = taxi_gdf.query(query_frags)" 485 | ] 486 | }, 487 | { 488 | "cell_type": "markdown", 489 | "metadata": {}, 490 | "source": [ 491 | "### Add some new features" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": null, 497 | "metadata": {}, 498 | "outputs": [], 499 | "source": [ 500 | "# easier to reference time by YYYY MM DD version a time stamps\n", 501 | "taxi_gdf['hour'] = taxi_gdf['tpep_pickup_datetime'].dt.hour\n", 502 | "taxi_gdf['year'] = taxi_gdf['tpep_pickup_datetime'].dt.year\n", 503 | "taxi_gdf['month'] = taxi_gdf['tpep_pickup_datetime'].dt.month\n", 504 | "taxi_gdf['day'] = taxi_gdf['tpep_pickup_datetime'].dt.day\n", 505 | "taxi_gdf['diff'] = taxi_gdf['tpep_dropoff_datetime'].astype('int64') - taxi_gdf['tpep_pickup_datetime'].astype('int64')" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "metadata": {}, 512 | "outputs": [], 513 | "source": [ 514 | "def day_of_the_week_kernel(day, month, year, day_of_week):\n", 515 | " for i, (d_1, m_1, y_1) in enumerate(zip(day, month, year)):\n", 516 | " if month[i] < 3:\n", 517 | " shift = month[i]\n", 518 | " else:\n", 519 | " shift = 0\n", 520 | " Y = year[i] - (month[i] < 3)\n", 521 | " y = Y - 2000\n", 522 | " c = 20\n", 523 | " d = day[i]\n", 524 | " m = month[i] + shift + 1\n", 525 | " day_of_week[i] = (d + math.floor(m * 2.6) + y + (y // 4) + (c // 4) - 2 * c) % 7\n", 526 | " \n", 527 | "taxi_gdf = taxi_gdf.apply_rows(\n", 528 | " day_of_the_week_kernel\n", 529 | " , incols = ['day', 'month', 'year']\n", 530 | " , outcols = {'day_of_week': np.int32}\n", 531 | " , kwargs = {}\n", 532 | " )" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": {}, 539 | "outputs": [], 540 | "source": [ 541 | "taxi_gdf.head(5)" 542 | ] 543 | }, 544 | { 545 | 
"cell_type": "markdown", 546 | "metadata": {}, 547 | "source": [ 548 | "---" 549 | ] 550 | }, 551 | { 552 | "cell_type": "markdown", 553 | "metadata": {}, 554 | "source": [ 555 | "## Basic Statistical Data Science\n", 556 | "\n", 557 | "### Look at some feature - by Hour" 558 | ] 559 | }, 560 | { 561 | "cell_type": "code", 562 | "execution_count": null, 563 | "metadata": {}, 564 | "outputs": [], 565 | "source": [ 566 | "# 1) Let's look at a plot of fare by hour\n", 567 | "%matplotlib inline\n", 568 | "taxi_gdf.groupby('hour').fare_amount.mean().to_pandas().sort_index().plot(legend=True)" 569 | ] 570 | }, 571 | { 572 | "cell_type": "code", 573 | "execution_count": null, 574 | "metadata": {}, 575 | "outputs": [], 576 | "source": [ 577 | "# 2) Tips by hour\n", 578 | "%matplotlib inline\n", 579 | "taxi_gdf.groupby('hour').tip_amount.mean().to_pandas().sort_index().plot(legend=True)" 580 | ] 581 | }, 582 | { 583 | "cell_type": "code", 584 | "execution_count": null, 585 | "metadata": {}, 586 | "outputs": [], 587 | "source": [ 588 | "# 3) Number of taxi rides by Hour\n", 589 | "%matplotlib inline\n", 590 | "taxi_gdf['hour'].groupby('hour').count().to_pandas().sort_index().plot(legend=True)" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": null, 596 | "metadata": {}, 597 | "outputs": [], 598 | "source": [ 599 | "# Look at what days are the busiest\n", 600 | "%matplotlib inline\n", 601 | "taxi_gdf.groupby('day_of_week').day_of_week.count().to_pandas().sort_index().plot(legend=True)" 602 | ] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": null, 607 | "metadata": {}, 608 | "outputs": [], 609 | "source": [ 610 | "# What days have the best tips\n", 611 | "%matplotlib inline\n", 612 | "taxi_gdf.groupby('day_of_week').tip_amount.mean().to_pandas().sort_index().plot(legend=True)" 613 | ] 614 | }, 615 | { 616 | "cell_type": "code", 617 | "execution_count": null, 618 | "metadata": {}, 619 | "outputs": [], 620 | "source": [] 621 | }, 
622 | { 623 | "cell_type": "markdown", 624 | "metadata": {}, 625 | "source": [ 626 | "# Dropping Columns" 627 | ] 628 | }, 629 | { 630 | "cell_type": "code", 631 | "execution_count": null, 632 | "metadata": {}, 633 | "outputs": [], 634 | "source": [ 635 | "taxi_gdf = taxi_gdf.drop('store_and_fwd_flag', axis=1)" 636 | ] 637 | }, 638 | { 639 | "cell_type": "code", 640 | "execution_count": null, 641 | "metadata": {}, 642 | "outputs": [], 643 | "source": [ 644 | "taxi_gdf.dtypes" 645 | ] 646 | }, 647 | { 648 | "cell_type": "markdown", 649 | "metadata": {}, 650 | "source": [ 651 | "---" 652 | ] 653 | }, 654 | { 655 | "cell_type": "markdown", 656 | "metadata": {}, 657 | "source": [ 658 | "# cuML - Accelerated Machine Learning" 659 | ] 660 | }, 661 | { 662 | "cell_type": "markdown", 663 | "metadata": {}, 664 | "source": [ 665 | "### In Corey's talk" 666 | ] 667 | }, 668 | { 669 | "cell_type": "markdown", 670 | "metadata": {}, 671 | "source": [ 672 | "---\n", 673 | "# cuGraph - Accelerated Graph Analytics\n", 674 | "\n", 675 | "We need vertex IDs to be integer values but what we have are lat-long pairs (float64). There are two way that we can address the issue. 
The hard way and an easy way" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": null, 681 | "metadata": {}, 682 | "outputs": [], 683 | "source": [ 684 | "import cugraph" 685 | ] 686 | }, 687 | { 688 | "cell_type": "code", 689 | "execution_count": null, 690 | "metadata": {}, 691 | "outputs": [], 692 | "source": [ 693 | "taxi_subset = taxi_gdf[['pickup_longitude', 'pickup_latitude','dropoff_longitude', 'dropoff_latitude', 'trip_distance']].reset_index()\n", 694 | "taxi_subset['count'] = 1\n", 695 | "del taxi_gdf" 696 | ] 697 | }, 698 | { 699 | "cell_type": "markdown", 700 | "metadata": {}, 701 | "source": [ 702 | "### Create vertices and edges the hard way" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": null, 708 | "metadata": {}, 709 | "outputs": [], 710 | "source": [ 711 | "# create node ID from lat-long combinatiuons\n", 712 | "nodes = [\n", 713 | " taxi_subset[['pickup_longitude', 'pickup_latitude']].drop_duplicates().rename(columns={'pickup_longitude': 'long', 'pickup_latitude': 'lat'})\n", 714 | " , taxi_subset[['dropoff_longitude', 'dropoff_latitude']].drop_duplicates().rename(columns={'dropoff_longitude': 'long', 'dropoff_latitude': 'lat'})\n", 715 | "]" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": null, 721 | "metadata": {}, 722 | "outputs": [], 723 | "source": [ 724 | "nodes = cudf.concat(nodes).drop_duplicates().reset_index(drop=True).reset_index().rename(columns={'index': 'id'})\n", 725 | "nodes.head(5)" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": null, 731 | "metadata": {}, 732 | "outputs": [], 733 | "source": [ 734 | "print('Total number of geo points in the dataset: {0:,}'.format(len(nodes)))" 735 | ] 736 | }, 737 | { 738 | "cell_type": "code", 739 | "execution_count": null, 740 | "metadata": {}, 741 | "outputs": [], 742 | "source": [ 743 | "edges = (\n", 744 | " taxi_subset[['pickup_longitude', 
'pickup_latitude','dropoff_longitude', 'dropoff_latitude', 'trip_distance']]\n", 745 | " .drop_duplicates()\n", 746 | " .rename(columns={'pickup_longitude': 'long', 'pickup_latitude': 'lat'})\n", 747 | " .merge(nodes, on=['lat', 'long'])\n", 748 | " .rename(columns={'long': 'pickup_longitude', 'lat': 'pickup_latitude', 'id': 'pickup_id', 'dropoff_longitude': 'long', 'dropoff_latitude': 'lat'})\n", 749 | " .merge(nodes, on=['lat', 'long'])\n", 750 | " .rename(columns={'long': 'dropoff_longitude', 'lat': 'dropoff_latitude', 'id': 'dropoff_id'})\n", 751 | ")[['pickup_id', 'dropoff_id', 'trip_distance']]\n", 752 | "\n", 753 | "edges.head(5)" 754 | ] 755 | }, 756 | { 757 | "cell_type": "code", 758 | "execution_count": null, 759 | "metadata": {}, 760 | "outputs": [], 761 | "source": [ 762 | "len(edges)" 763 | ] 764 | }, 765 | { 766 | "cell_type": "code", 767 | "execution_count": null, 768 | "metadata": {}, 769 | "outputs": [], 770 | "source": [ 771 | "g = cugraph.Graph()\n", 772 | "g.from_cudf_edgelist(edges, source='pickup_id', destination='dropoff_id')" 773 | ] 774 | }, 775 | { 776 | "cell_type": "markdown", 777 | "metadata": {}, 778 | "source": [ 779 | "## Pagerank" 780 | ] 781 | }, 782 | { 783 | "cell_type": "code", 784 | "execution_count": null, 785 | "metadata": {}, 786 | "outputs": [], 787 | "source": [ 788 | "%%time\n", 789 | "page = cugraph.pagerank(g, alpha=.85, max_iter=1000, tol=1.0e-05)" 790 | ] 791 | }, 792 | { 793 | "cell_type": "code", 794 | "execution_count": null, 795 | "metadata": {}, 796 | "outputs": [], 797 | "source": [ 798 | "page.sort_values(by='pagerank', ascending=False).head(5).to_pandas()" 799 | ] 800 | }, 801 | { 802 | "cell_type": "markdown", 803 | "metadata": {}, 804 | "source": [ 805 | "## Now the easy way" 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": null, 811 | "metadata": {}, 812 | "outputs": [], 813 | "source": [ 814 | "g2 = cugraph.Graph()\n", 815 | "g2.from_cudf_edgelist(taxi_subset, \n", 816 | " 
source=['pickup_longitude', 'pickup_latitude'], \n", 817 | " destination=['dropoff_longitude', 'dropoff_latitude'], \n", 818 | " edge_attr='count',\n", 819 | " renumber=True)" 820 | ] 821 | }, 822 | { 823 | "cell_type": "code", 824 | "execution_count": null, 825 | "metadata": {}, 826 | "outputs": [], 827 | "source": [ 828 | "page = cugraph.pagerank(g2, alpha=.85, max_iter=1000, tol=1.0e-05)\n", 829 | "page.sort_values(by='pagerank', ascending=False).head(5).to_pandas()" 830 | ] 831 | }, 832 | { 833 | "cell_type": "code", 834 | "execution_count": null, 835 | "metadata": {}, 836 | "outputs": [], 837 | "source": [] 838 | }, 839 | { 840 | "cell_type": "markdown", 841 | "metadata": {}, 842 | "source": [ 843 | "---" 844 | ] 845 | } 846 | ], 847 | "metadata": { 848 | "kernelspec": { 849 | "display_name": "cugraph_dev", 850 | "language": "python", 851 | "name": "cugraph_dev" 852 | }, 853 | "language_info": { 854 | "codemirror_mode": { 855 | "name": "ipython", 856 | "version": 3 857 | }, 858 | "file_extension": ".py", 859 | "mimetype": "text/x-python", 860 | "name": "python", 861 | "nbconvert_exporter": "python", 862 | "pygments_lexer": "ipython3", 863 | "version": "3.8.6" 864 | } 865 | }, 866 | "nbformat": 4, 867 | "nbformat_minor": 4 868 | } 869 | -------------------------------------------------------------------------------- /event_notebooks/TMLS_2020/notebooks/Taxi/img/TMLS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rapidsai-community/showcase/aa819ab5a0aab2d54eef07d37cd2874eeb9b25da/event_notebooks/TMLS_2020/notebooks/Taxi/img/TMLS.png -------------------------------------------------------------------------------- /event_notebooks/TMLS_2020/notebooks/Taxi/nyctaxi_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import urllib.request 4 | from tqdm import tqdm 5 | from itertools import chain 6 | 7 | def 
download_nyctaxi_data(years, path): 8 | taxi_years = [ 9 | "2014", 10 | "2015", 11 | "2016" 12 | ] 13 | 14 | if not set(years) <= set(taxi_years): 15 | print(years) 16 | print("years list not valid, please specify a sublist of") 17 | print(taxi_years) 18 | raise Exception("{years} list is not valid".format(years=years)) 19 | 20 | data_dir = os.path.abspath(os.path.join(path, "nyctaxi")) 21 | if not os.path.exists(data_dir): 22 | os.makedirs(data_dir) 23 | 24 | filenames = [] 25 | local_paths = [] 26 | for year in years: 27 | if year == "2016": 28 | start = 1 29 | end = 7 30 | else: 31 | start = 1 32 | end = 13 33 | if not os.path.exists(os.path.join(data_dir, year)): 34 | os.makedirs(os.path.join(data_dir, year)) 35 | for i in range(start, end): 36 | filename = "yellow_tripdata_{year}-{month:02d}.csv".format(year=year, month=i) 37 | filenames.append(filename) 38 | local_path = os.path.join(data_dir, year, filename) 39 | local_paths.append(local_path) 40 | 41 | for year in years: 42 | for idx, filename in enumerate(filenames): 43 | filename_elements = [filename_element.split('-') for filename_element in filename.split('_')] 44 | filename_elements = list(chain.from_iterable(filename_elements)) 45 | if year in filename_elements: 46 | url = "https://storage.googleapis.com/anaconda-public-data/nyc-taxi/csv/{year}/".format(year=year) + filename 47 | print("- Downloading " + url) 48 | if not os.path.exists(local_paths[idx]): 49 | with open(local_paths[idx], 'wb') as file: 50 | with urllib.request.urlopen(url) as resp: 51 | length = int(resp.getheader('content-length')) 52 | blocksize = max(4096, length // 100) 53 | with tqdm(total=length, file=sys.stdout) as pbar: 54 | while True: 55 | buff = resp.read(blocksize) 56 | if not buff: 57 | break 58 | file.write(buff) 59 | pbar.update(len(buff)) 60 | else: 61 | print("- File already exists locally") 62 | 63 | print("-------------------") 64 | print("-Download complete-") 65 | print("-------------------") 66 | 
import os
import requests
import tarfile
from tqdm import tqdm

def download_and_extract(dataset='us', directory='opencellid_data'):
    """Download an OpenCellID cell-tower archive and extract its CSV.

    Parameters
    ----------
    dataset : str
        Which dataset to fetch: ``'us'`` or ``'worldwide'``.
    directory : str
        Local directory for the archive and extracted CSV (created if missing).

    Raises
    ------
    ValueError
        If ``dataset`` is not ``'us'`` or ``'worldwide'``.
    requests.HTTPError
        If the HTTP download fails (non-2xx status).
    """
    # Source archives hosted by RAPIDS.
    urls = {
        'us': 'https://data.rapids.ai/cudf/datasets/cell_towers_us.tar.xz',
        'worldwide': 'https://data.rapids.ai/cudf/datasets/cell_towers.tar.xz'
    }

    # Check if the dataset parameter is valid
    if dataset not in urls:
        raise ValueError("Invalid dataset parameter. Use 'us' or 'worldwide'.")

    # Get the URL for the selected dataset
    url = urls[dataset]

    # Archive path, and the CSV path the archive is expected to yield.
    local_filename = os.path.join(directory, url.split('/')[-1])
    csv_filename = os.path.join(directory, url.split('/')[-1].replace('.tar.xz', '.csv'))

    # Create the directory if it doesn't exist
    os.makedirs(directory, exist_ok=True)

    # Nothing to do if the CSV file already exists.
    if os.path.exists(csv_filename):
        print(f"{csv_filename} already exists. Skipping download and extraction.")
        return

    # Download only if the tar.xz archive is not already cached locally.
    if not os.path.exists(local_filename):
        print(f"Downloading {dataset} dataset from {url}...")
        response = requests.get(url, stream=True)
        # Fail fast on HTTP errors; otherwise a 404/500 error page would be
        # silently written to disk as the "archive" and cached forever.
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))
        # Stream to disk with a progress bar.
        with open(local_filename, 'wb') as f, tqdm(
            desc=local_filename,
            total=total_size,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for chunk in response.iter_content(chunk_size=1024):
                size = f.write(chunk)
                bar.update(size)
        print(f"Downloaded {local_filename} successfully.")
    else:
        print(f"{local_filename} already exists. Skipping download.")

    # Extract only the CSV member(s), flattening any directory structure.
    print(f"Extracting {local_filename}...")
    with tarfile.open(local_filename, 'r:xz') as tar:
        members = tar.getmembers()
        csv_members = [m for m in members if m.name.endswith('.csv')]
        for member in csv_members:
            member.name = os.path.basename(member.name)  # Remove the directory structure
            tar.extract(member, path=directory)

    print(f"Extracted {local_filename} successfully.")
8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. 
For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /team_contributions/cuxfilter-tutorial/README.md: -------------------------------------------------------------------------------- 1 | # Cuxfilter Tutorial 2 | A brief notebook tutorial on [RAPIDS'](https://rapids.ai/) [cuxfilter](https://docs.rapids.ai/api/cuxfilter/stable/), a GPU accelerated cross-filtering dashboard library. 
from pyproj import Proj, Transformer

def transform_coords(df, x='x', y='y'):
    """Project columns *x*/*y* of *df* from EPSG:4326 (lat/lon degrees) to
    EPSG:3857 (Web Mercator metres), writing results to df['x'] and df['y'].

    NOTE(review): ``.to_array()`` implies a cudf-style dataframe — confirm
    the cudf version in use still provides it (newer releases use
    ``.to_numpy()``).
    """
    transform_4326_to_3857 = Transformer.from_crs('epsg:4326', 'epsg:3857')
    df['x'], df['y'] = transform_4326_to_3857.transform(df[x].to_array(), df[y].to_array())
    return df

def process_trips(data):
    """Enrich the raw trips frame and return a trimmed, reordered frame.

    Adds: projected start coordinates ('x','y'), a weekday/weekend flag
    ('day_type'), and a monotonically increasing week index across years
    ('all_time_week').
    """
    # Apply coordinate transformation to each trip's start point.
    trips = transform_coords(data, x='latitude_start', y='longitude_start')

    # Note: days 0-4 are weekdays, days 5-6 are weekends
    trips['day_type'] = 0
    trips.loc[trips.query('day>4').index, 'day_type'] = 1

    # Note: Data always has edge cases, such as the extra week anomalies of 2015 and 2016:
    # trips.groupby('year').week.max().to_pandas().to_dict() is {2014: 52, 2015: 53, 2016: 53, 2017: 52}
    # Since 2015 and 2016 have 53 weeks, we add 1 to the global week count for their following years - 2016 & 2017
    # (data.year/2016).astype('int') => returns 1 if year>=2016, else 0
    year0 = int(trips.year.min())  # 2014
    trips['all_time_week'] = data.week + 52 * (data.year - year0) + (data.year / 2016).astype('int')

    # Finally, we remove the unused columns and reorganize our dataframe.
    # ('from_station_name' was previously listed twice, yielding a duplicated
    # column; it is now selected once.)
    trips = trips[[
        'year', 'month', 'week', 'day', 'hour', 'gender', 'from_station_name',
        'from_station_id', 'to_station_id', 'x', 'y', 'to_station_name',
        'all_time_week', 'day_type'
    ]]

    return trips