├── README.md ├── demo ├── python-agg.ipynb ├── nuclio-pandas-agg.ipynb ├── nuclio-cudf-agg.ipynb └── benchmark_cudf_vs_pd.ipynb └── LICENSE /README.md: -------------------------------------------------------------------------------- 1 | # rapids 2 | nuclio integration and demos with NVIDIA RAPIDS 3 | -------------------------------------------------------------------------------- /demo/python-agg.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Python (Standalone)\n", 8 | "## Unified Data batching & Agg function" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### Installations" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 1, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "name": "stdout", 25 | "output_type": "stream", 26 | "text": [ 27 | "Collecting kafka\n", 28 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/21/71/73286e748ac5045b6a669c2fe44b03ac4c5d3d2af9291c4c6fc76438a9a9/kafka-1.3.5-py2.py3-none-any.whl (207kB)\n", 29 | "\u001b[K 100% |████████████████████████████████| 215kB 20.1MB/s ta 0:00:01\n", 30 | "\u001b[?25hInstalling collected packages: kafka\n", 31 | "Successfully installed kafka-1.3.5\n" 32 | ] 33 | } 34 | ], 35 | "source": [ 36 | "!pip install kafka" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "### Script" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "from kafka import KafkaConsumer\n", 53 | "import os\n", 54 | "import glob\n", 55 | "from datetime import datetime, timedelta\n", 56 | "import time\n", 57 | "import itertools\n", 58 | "import json\n", 59 | "\n", 60 | "# Select DF Factory\n", 61 | "import cudf as pd\n", 62 | "# import pandas as pd\n", 63 | "\n", 64 | "\n", 65 | "# Basic configuration\n", 66 | "metric_names = ['cpu_utilization', 'latency', 'packet_loss', 'throughput']\n", 67 | "batch_len = 100\n", 68 | "batch = list()\n", 69 | "\n", 70 | "# Kafka configuration\n", 71 | "topic = ''\n", 72 | "servers = []\n", 73 | "offset = 'earliest'\n", 74 | "\n", 75 | "def handler(event):\n", 76 | " '''\n", 77 | " Processing function\n", 78 | " '''\n", 79 | " global batch\n", 80 | " global metric_names\n", 81 | " \n", 82 | " # Aggregate event jsons\n", 83 | " batch.append(event.body)\n", 84 | " \n", 85 | " # Did we aggregate enough events for aggregation?\n", 86 | " if len(batch) >= interval:\n", 87 | " \n", 88 | " # Create a DataFrame from the batch of event jsons\n", 89 | " df = cudf.read_json('\\n'.join(batch), lines=True)\n", 90 | " df = df.reset_index(drop=True)\n", 91 | " \n", 92 | " # Perform aggregations\n", 93 | " df = df.groupby(['company']).\\\n", 94 | " agg({k: ['min', 'max', 'mean'] for k in metric_names})\n", 95 | " \n", 96 | " # Save to parquet\n", 97 | " filename = f'{time.time()}.parquet'\n", 98 | " filepath = os.path.join(sink, filename)\n", 99 | " new_index = [f'{e[0]}_{e[1]}' for e in list(df.columns)]\n", 100 | " df.columns = new_index\n", 101 | " df.to_parquet(filepath)\n", 102 | " \n", 103 | " # Reset batch\n", 104 | " batch = list()\n", 105 | "\n", 106 | "\n", 107 | "# Kafka handling\n", 108 | "consumer = KafkaConsumer(\n", 109 | " topic,\n", 110 | " bootstrap_servers=servers,\n", 111 | " auto_offset_reset='offset',\n", 112 | " value_deserializer=lambda x: x.decode('utf-8'))\n", 113 | "\n", 114 | "for message in consumer:\n", 115 | " message = message.value\n", 116 | " handler(message)" 117 | ] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": "Python 3", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.6.8" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 4 141 | } 142 | -------------------------------------------------------------------------------- /demo/nuclio-pandas-agg.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Nuclio\n", 8 | "## Unified Data batching & Agg function" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "# nuclio: ignore\n", 18 | "import nuclio" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Environment" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Base config" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "%nuclio: setting spec.triggers.hahttp.kind to 'http'\n", 45 | "%nuclio: setting spec.triggers.hahttp.maxWorkers to 1\n", 46 | "%nuclio: setting spec.triggers.hahttp.attributes.port to 31002\n", 47 | "%nuclio: setting spec.build.baseImage to 'python:3.6-jessie'\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "%%nuclio config\n", 53 | "\n", 54 | "# Kafka Trigger\n", 55 | "# spec.triggers.hakafka.kind = \"kafka\"\n", 56 | "# spec.trigger.url = \"1.1.1.1\"\n", 57 | "# spec.triggers.hakafka.attributes.topic = \"haproxy\"\n", 58 | "# spec.triggers.hakafka.attributes.partitions = [0, 1, 2]\n", 59 | "# spec.triggers.hakafka.attributes.sasl.enable: true\n", 60 | "# spec.triggers.hakafka.attributes.sasl.user: \"\"\n", 61 | "# spec.triggers.hakafka.attributes.sasl.password: \"\"\n", 62 | "\n", 63 | "# HTTP Trigger \n", 64 | "spec.triggers.hahttp.kind=\"http\"\n", 65 | "spec.triggers.hahttp.maxWorkers=1\n", 66 | "spec.triggers.hahttp.attributes.port=31002\n", 67 | "\n", 68 | "# Base image\n", 69 | "spec.build.baseImage = \"python:3.6-jessie\"" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### Env variables" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "%nuclio: setting 'SINK_PATH' environment variable\n", 89 | "%nuclio: setting 'INTERVAL' environment variable\n", 90 | "%nuclio: setting 'METRIC_NAMES' environment variable\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "%nuclio env SINK_PATH=./sink\n", 96 | "%nuclio env INTERVAL=2\n", 97 | "%nuclio env METRIC_NAMES=cpu_utilization,latency,packet_loss,throughput" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "### Build commands" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "metadata": {}, 111 | "outputs": [ 112 | { 113 | "name": "stdout", 114 | "output_type": "stream", 115 | "text": [ 116 | "Requirement already satisfied: pandas in /conda/lib/python3.6/site-packages (0.23.4)\n", 117 | "Requirement already satisfied: python-dateutil>=2.5.0 in /conda/lib/python3.6/site-packages (from pandas) (2.8.0)\n", 118 | "Requirement already satisfied: pytz>=2011k in /conda/lib/python3.6/site-packages (from pandas) (2019.1)\n", 119 | "Requirement already satisfied: numpy>=1.9.0 in /conda/lib/python3.6/site-packages (from pandas) (1.16.4)\n", 120 | "Requirement already satisfied: six>=1.5 in /conda/lib/python3.6/site-packages (from python-dateutil>=2.5.0->pandas) (1.12.0)\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "%%nuclio cmd\n", 126 | "pip install pandas" 127 | ] 128 | }, 129 | { 130 | "cell_type": "markdown", 131 | "metadata": {}, 132 | "source": [ 133 | "## Function" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 5, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "import os\n", 143 | "import glob\n", 144 | "from datetime import datetime, timedelta\n", 145 | "import time\n", 146 | "import pandas as pd\n", 147 | "import itertools\n", 148 | "import json\n", 149 | "\n", 150 | "# Define sink & Verify its available\n", 151 | "sink = os.getenv('SINK_PATH', './sink')\n", 152 | "os.makedirs(sink, exist_ok=True)\n", 153 | "\n", 154 | "# Expose metric names\n", 155 | "metric_names = os.environ['METRIC_NAMES']\n", 156 | "metric_names = metric_names.split(',')\n", 157 | "\n", 158 | "# Define batch & batch interval\n", 159 | "batch = list()\n", 160 | "interval = int(os.getenv('INTERVAL', 100))\n", 161 | "\n", 162 | "def handler(context, event):\n", 163 | " global batch\n", 164 | " global metric_names\n", 165 | " \n", 166 | " # Aggregate event jsons\n", 167 | " batch.append(event.body)\n", 168 | " \n", 169 | " # Did we aggregate enough events for aggregation?\n", 170 | " if len(batch) >= interval:\n", 171 | " \n", 172 | " # Create pandas DataFrame from the batch of event jsons\n", 173 | " df = pd.read_json('\\n'.join(batch), lines=True)\n", 174 | " df = df.reset_index(drop=True)\n", 175 | " \n", 176 | " # Perform aggregations\n", 177 | " df = df.groupby(['company']).\\\n", 178 | " agg({k: ['min', 'max', 'mean'] for k in metric_names})\n", 179 | " \n", 180 | " # Save to parquet\n", 181 | " filename = f'{time.time()}.parquet'\n", 182 | " filepath = os.path.join(sink, filename)\n", 183 | " new_index = [f'{e[0]}_{e[1]}' for e in list(df.columns)]\n", 184 | " df.columns = new_index\n", 185 | " df.to_parquet(filepath)\n", 186 | " \n", 187 | " # Reset batch\n", 188 | " batch = list()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "## Test" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 6, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "# nuclio: ignore\n", 205 | "event = nuclio.Event(body='{\"company\":\"Rios__Pope_and_Baird\",\"cpu_utilization\":70.6942165035,\"cpu_utilization_is_error\":false,\"latency\":3.1373003261,\"latency_is_error\":false,\"packet_loss\":0.0,\"packet_loss_is_error\":false,\"throughput\":249.7207880994,\"throughput_is_error\":false,\"timestamp\":1563795193534}')\n", 206 | "out = handler(context, event)\n", 207 | "out" 208 | ] 209 | }, 210 | { 211 | "cell_type": "markdown", 212 | "metadata": {}, 213 | "source": [ 214 | "## Deploy (If a nuclio cluster is available)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 7, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | "[nuclio.deploy] 2019-08-07 12:29:45,407 (info) Building processor image\n", 227 | "[nuclio.deploy] 2019-08-07 12:29:47,443 (info) Pushing image\n", 228 | "[nuclio.deploy] 2019-08-07 12:29:47,443 (info) Build complete\n", 229 | "[nuclio.deploy] 2019-08-07 12:29:51,488 (info) Function deploy complete\n", 230 | "[nuclio.deploy] 2019-08-07 12:29:51,494 done updating pd-batch-and-agg, function address: 3.120.15.118:31002\n", 231 | "%nuclio: function deployed\n" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "%nuclio deploy -p nvidia -n pd_batch_and_agg -c" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [] 245 | } 246 | ], 247 | "metadata": { 248 | "kernelspec": { 249 | "display_name": "Python 3", 250 | "language": "python", 251 | "name": "python3" 252 | }, 253 | "language_info": { 254 | "codemirror_mode": { 255 | "name": "ipython", 256 | "version": 3 257 | }, 258 | "file_extension": ".py", 259 | "mimetype": "text/x-python", 260 | "name": "python", 261 | "nbconvert_exporter": "python", 262 | "pygments_lexer": "ipython3", 263 | "version": "3.6.8" 264 | } 265 | }, 266 | "nbformat": 4, 267 | "nbformat_minor": 4 268 | } 269 | -------------------------------------------------------------------------------- /demo/nuclio-cudf-agg.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Nuclio\n", 8 | "## Unified Data batching & Agg function" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "# nuclio: ignore\n", 18 | "import nuclio" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Environment" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Base config" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "%nuclio: setting spec.triggers.hahttp.kind to 'http'\n", 45 | "%nuclio: setting spec.triggers.hahttp.maxWorkers to 1\n", 46 | "%nuclio: setting spec.triggers.hahttp.attributes.port to 31001\n", 47 | "%nuclio: setting spec.build.baseImage to 'rapidsai/rapidsai:cuda10.0-runtime-centos7'\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "%%nuclio config\n", 53 | "\n", 54 | "# Kafka Trigger\n", 55 | "# spec.triggers.hakafka.kind = \"kafka\"\n", 56 | "# spec.trigger.url = \"1.1.1.1\"\n", 57 | "# spec.triggers.hakafka.attributes.topic = \"haproxy\"\n", 58 | "# spec.triggers.hakafka.attributes.partitions = [0, 1, 2]\n", 59 | "# spec.triggers.hakafka.attributes.sasl.enable: true\n", 60 | "# spec.triggers.hakafka.attributes.sasl.user: \"\"\n", 61 | "# spec.triggers.hakafka.attributes.sasl.password: \"\"\n", 62 | "\n", 63 | "# HTTP Trigger \n", 64 | "spec.triggers.hahttp.kind=\"http\"\n", 65 | "spec.triggers.hahttp.maxWorkers=1\n", 66 | "spec.triggers.hahttp.attributes.port=31001\n", 67 | "\n", 68 | "# Base image\n", 69 | "spec.build.baseImage = \"rapidsai/rapidsai:cuda10.0-runtime-centos7\"" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### Env variables" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "%nuclio: setting 'SINK_PATH' environment variable\n", 89 | "%nuclio: setting 'INTERVAL' environment variable\n", 90 | "%nuclio: setting 'METRIC_NAMES' environment variable\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "%nuclio env SINK_PATH=./sink\n", 96 | "%nuclio env INTERVAL=2\n", 97 | "%nuclio env METRIC_NAMES=cpu_utilization,latency,packet_loss,throughput" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "## Function" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "import os\n", 114 | "import glob\n", 115 | "from datetime import datetime, timedelta\n", 116 | "import time\n", 117 | "import cudf\n", 118 | "import itertools\n", 119 | "import json\n", 120 | "\n", 121 | "# Define sink & Verify its available\n", 122 | "sink = os.getenv('SINK_PATH', './sink')\n", 123 | "os.makedirs(sink, exist_ok=True)\n", 124 | "\n", 125 | "# Expose metric names\n", 126 | "metric_names = os.environ['METRIC_NAMES']\n", 127 | "metric_names = metric_names.split(',')\n", 128 | "\n", 129 | "# Define batch & batch interval\n", 130 | "batch = list()\n", 131 | "interval = int(os.getenv('INTERVAL', 100))\n", 132 | "\n", 133 | "def handler(context, event):\n", 134 | " global batch\n", 135 | " global metric_names\n", 136 | " \n", 137 | " # Aggregate event jsons\n", 138 | " batch.append(event.body)\n", 139 | " \n", 140 | " # Did we aggregate enough events for aggregation?\n", 141 | " if len(batch) >= interval:\n", 142 | " \n", 143 | " # Create cudf DataFrame from the batch of event jsons\n", 144 | " df = cudf.read_json('\\n'.join(batch), lines=True)\n", 145 | " df = df.reset_index(drop=True)\n", 146 | " \n", 147 | " # Perform aggregations\n", 148 | " df = df.groupby(['company']).\\\n", 149 | " agg({k: ['min', 'max', 'mean'] for k in metric_names})\n", 150 | " \n", 151 | " # Save to parquet\n", 152 | " filename = f'{time.time()}.parquet'\n", 153 | " filepath = os.path.join(sink, filename)\n", 154 | " new_index = [f'{e[0]}_{e[1]}' for e in list(df.columns)]\n", 155 | " df.columns = new_index\n", 156 | " df.to_parquet(filepath)\n", 157 | " \n", 158 | " # Reset batch\n", 159 | " batch = list()" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "## Test" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 5, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "# nuclio: ignore\n", 176 | "event = nuclio.Event(body='{\"company\":\"Rios__Pope_and_Baird\",\"cpu_utilization\":70.6942165035,\"cpu_utilization_is_error\":false,\"latency\":3.1373003261,\"latency_is_error\":false,\"packet_loss\":0.0,\"packet_loss_is_error\":false,\"throughput\":249.7207880994,\"throughput_is_error\":false,\"timestamp\":1563795193534}')\n", 177 | "out = handler(context, event)\n", 178 | "out" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "metadata": {}, 184 | "source": [ 185 | "## Deploy (If a nuclio cluster is available)" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 6, 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "name": "stdout", 195 | "output_type": "stream", 196 | "text": [ 197 | "[nuclio.deploy] 2019-08-07 12:29:26,987 (info) Building processor image\n" 198 | ] 199 | }, 200 | { 201 | "name": "stderr", 202 | "output_type": "stream", 203 | "text": [ 204 | "INFO:(info) Building processor image\n" 205 | ] 206 | }, 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "[nuclio.deploy] 2019-08-07 12:29:30,024 (info) Pushing image\n" 212 | ] 213 | }, 214 | { 215 | "name": "stderr", 216 | "output_type": "stream", 217 | "text": [ 218 | "INFO:(info) Pushing image\n" 219 | ] 220 | }, 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "[nuclio.deploy] 2019-08-07 12:29:30,026 (info) Build complete\n" 226 | ] 227 | }, 228 | { 229 | "name": "stderr", 230 | "output_type": "stream", 231 | "text": [ 232 | "INFO:(info) Build complete\n" 233 | ] 234 | }, 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "[nuclio.deploy] 2019-08-07 12:29:34,085 (info) Function deploy complete\n" 240 | ] 241 | }, 242 | { 243 | "name": "stderr", 244 | "output_type": "stream", 245 | "text": [ 246 | "INFO:(info) Function deploy complete\n" 247 | ] 248 | }, 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "[nuclio.deploy] 2019-08-07 12:29:34,096 done updating cudf-batch-and-agg, function address: 3.120.15.118:31001\n" 254 | ] 255 | }, 256 | { 257 | "name": "stderr", 258 | "output_type": "stream", 259 | "text": [ 260 | "INFO:done updating cudf-batch-and-agg, function address: 3.120.15.118:31001\n" 261 | ] 262 | }, 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "%nuclio: function deployed\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "%nuclio deploy -p nvidia -n cudf_batch_and_agg -c" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "Python 3", 286 | "language": "python", 287 | "name": "python3" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | "name": "ipython", 292 | "version": 3 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython3", 299 | "version": "3.6.8" 300 | } 301 | }, 302 | "nbformat": 4, 303 | "nbformat_minor": 4 304 | } 305 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /demo/benchmark_cudf_vs_pd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Benchmark Pandas vs Cudf\n", 8 | "- Using *timeit*" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### System details" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "#### GPU" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "\n", 35 | "==============NVSMI LOG==============\n", 36 | "\n", 37 | "Timestamp : Mon Jul 22 11:32:33 2019\n", 38 | "Driver Version : 418.56\n", 39 | "CUDA Version : 10.1\n", 40 | "\n", 41 | "Attached GPUs : 1\n", 42 | "GPU 00000000:00:1E.0\n", 43 | " Product Name : Tesla V100-SXM2-16GB\n", 44 | " Product Brand : Tesla\n", 45 | " Display Mode : Enabled\n", 46 | " Display Active : Disabled\n", 47 | " Persistence Mode : Enabled\n", 48 | " Accounting Mode : Disabled\n", 49 | " Accounting Mode Buffer Size : 4000\n", 50 | " Driver Model\n", 51 | " Current : N/A\n", 52 | " Pending : N/A\n", 53 | " Serial Number : 0323217016780\n", 54 | " GPU UUID : GPU-3ec8803d-1d6d-b362-7a9d-57b78fe42967\n", 55 | " Minor Number : 0\n", 56 | " VBIOS Version : 88.00.4F.00.09\n", 57 | " MultiGPU Board : No\n", 58 | " Board ID : 0x1e\n", 59 | " GPU Part Number : 900-2G503-0000-000\n", 60 | " Inforom Version\n", 61 | " Image Version : G503.0201.00.03\n", 62 | " OEM Object : 1.1\n", 63 | " ECC Object : 5.0\n", 64 | " Power Management Object : N/A\n", 65 | " GPU Operation Mode\n", 66 | " Current : N/A\n", 67 | " Pending : N/A\n", 68 | " GPU Virtualization Mode\n", 69 | " Virtualization mode : Pass-Through\n", 70 | " IBMNPU\n", 71 | " Relaxed Ordering Mode : N/A\n", 72 | " PCI\n", 73 | " Bus : 0x00\n", 74 | " Device : 0x1E\n", 75 | " Domain : 0x0000\n", 76 | " Device Id : 0x1DB110DE\n", 77 | " Bus Id : 00000000:00:1E.0\n", 78 | " Sub System Id : 0x121210DE\n", 79 | " GPU Link Info\n", 80 | " PCIe Generation\n", 81 | " Max : 3\n", 82 | " Current : 3\n", 83 | " Link Width\n", 84 | " Max : 16x\n", 85 | " Current : 16x\n", 86 | " Bridge Chip\n", 87 | " Type : N/A\n", 88 | " Firmware : N/A\n", 89 | " Replays Since Reset : 0\n", 90 | " Replay Number Rollovers : 0\n", 91 | " Tx Throughput : 0 KB/s\n", 92 | " Rx Throughput : 0 KB/s\n", 93 | " Fan Speed : N/A\n", 94 | " Performance State : P0\n", 95 | " Clocks Throttle Reasons\n", 96 | " Idle : Active\n", 97 | " Applications Clocks Setting : Not Active\n", 98 | " SW Power Cap : Not Active\n", 99 | " HW Slowdown : Not Active\n", 100 | " HW Thermal Slowdown : Not Active\n", 101 | " HW Power Brake Slowdown : Not Active\n", 102 | " Sync Boost : Not Active\n", 103 | " SW Thermal Slowdown : Not Active\n", 104 | " Display Clock Setting : Not Active\n", 105 | " FB Memory Usage\n", 106 | " Total : 16130 MiB\n", 107 | " Used : 0 MiB\n", 108 | " Free : 16130 MiB\n", 109 | " BAR1 Memory Usage\n", 110 | " Total : 16384 MiB\n", 111 | " Used : 2 MiB\n", 112 | " Free : 16382 MiB\n", 113 | " Compute Mode : Default\n", 114 | " Utilization\n", 115 | " Gpu : 0 %\n", 116 | " Memory : 0 %\n", 117 | " Encoder : 0 %\n", 118 | " Decoder : 0 %\n", 119 | " Encoder Stats\n", 120 | " Active Sessions : 0\n", 121 | " Average FPS : 0\n", 122 | " Average Latency : 0\n", 123 | " FBC Stats\n", 124 | " Active Sessions : 0\n", 125 | " Average FPS : 0\n", 126 | " Average Latency : 0\n", 127 | " Ecc Mode\n", 128 | " Current : Enabled\n", 129 | " Pending : Enabled\n", 130 | " ECC Errors\n", 131 | " Volatile\n", 132 | " Single Bit \n", 133 | " Device Memory : 0\n", 134 | " Register File : 0\n", 135 | " L1 Cache : 0\n", 136 | " L2 Cache : 0\n", 137 | " Texture Memory : N/A\n", 138 | " Texture Shared : N/A\n", 139 | " CBU : N/A\n", 140 | " Total : 0\n", 141 | " Double Bit \n", 142 | " Device Memory : 0\n", 143 | " Register File : 0\n", 144 | " L1 Cache : 0\n", 145 | " L2 Cache : 0\n", 146 | " Texture Memory : N/A\n", 147 | " Texture Shared : N/A\n", 148 | " CBU : 0\n", 149 | " Total : 0\n", 150 | " Aggregate\n", 151 | " Single Bit \n", 152 | " Device Memory : 0\n", 153 | " Register File : 0\n", 154 | " L1 Cache : 0\n", 155 | " L2 Cache : 0\n", 156 | " Texture Memory : N/A\n", 157 | " Texture Shared : N/A\n", 158 | " CBU : N/A\n", 159 | " Total : 0\n", 160 | " Double Bit \n", 161 | " Device Memory : 0\n", 162 | " Register File : 0\n", 163 | " L1 Cache : 0\n", 164 | " L2 Cache : 0\n", 165 | " Texture Memory : N/A\n", 166 | " Texture Shared : N/A\n", 167 | " CBU : 0\n", 168 | " Total : 0\n", 169 | " Retired Pages\n", 170 | " Single Bit ECC : 0\n", 171 | " Double Bit ECC : 0\n", 172 | " Pending : No\n", 173 | " Temperature\n", 174 | " GPU Current Temp : 32 C\n", 175 | " GPU Shutdown Temp : 90 C\n", 176 | " GPU Slowdown Temp : 87 C\n", 177 | " GPU Max Operating Temp : 83 C\n", 178 | " Memory Current Temp : 28 C\n", 179 | " Memory Max Operating Temp : 85 C\n", 180 | " Power Readings\n", 181 | " Power Management : Supported\n", 182 | " Power Draw : 23.61 W\n", 183 | " Power Limit : 300.00 W\n", 184 | " Default Power Limit : 300.00 W\n", 185 | " Enforced Power Limit : 300.00 W\n", 186 | " Min Power Limit : 150.00 W\n", 187 | " Max Power Limit : 300.00 W\n", 188 | " Clocks\n", 189 | " Graphics : 135 MHz\n", 190 | " SM : 135 MHz\n", 191 | " Memory : 877 MHz\n", 192 | " Video : 555 MHz\n", 193 | " Applications Clocks\n", 194 | " Graphics : 1312 MHz\n", 195 | " Memory : 877 MHz\n", 196 | " Default Applications Clocks\n", 197 | " Graphics : 1312 MHz\n", 198 | " Memory : 877 MHz\n", 199 | " Max Clocks\n", 200 | " Graphics : 1530 MHz\n", 201 | " SM : 1530 MHz\n", 202 | " Memory : 877 MHz\n", 203 | " Video : 1372 MHz\n", 204 | " Max Customer Boost Clocks\n", 205 | " Graphics : 1530 MHz\n", 206 | " Clock Policy\n", 207 | " Auto Boost : N/A\n", 208 | " Auto Boost Default : N/A\n", 209 | " Processes : None\n", 210 | "\n" 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "!nvidia-smi -q" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "#### CPU" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 2, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "processor : 0\n", 235 | "vendor_id : GenuineIntel\n", 236 | "cpu family : 6\n", 237 | "model : 79\n", 238 | "model name : Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz\n", 239 | "stepping : 1\n", 240 | "microcode : 0xb000037\n", 241 | "cpu MHz : 2699.945\n", 242 | "cache size : 46080 KB\n", 243 | "physical id : 0\n", 244 | "siblings : 8\n", 245 | "core id : 0\n", 246 | "cpu cores : 4\n", 247 | "apicid : 0\n", 248 | "initial apicid : 0\n", 249 | "fpu : yes\n", 250 | "fpu_exception : yes\n", 251 | "cpuid level : 13\n", 252 | "wp : yes\n", 253 | "\u001b[K:\u001b[K" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "!less /proc/cpuinfo" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": {}, 264 | "source": [ 265 | "## Benchmark setup" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "### Installations\n", 273 | "Install our v3io-generator to create our 1gb dataset for the benchmark" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 3, 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "name": "stdout", 283 | "output_type": "stream", 284 | "text": [ 285 | "Looking in indexes: https://test.pypi.org/simple/\n", 286 | "Requirement already up-to-date: v3io-generator in /User/.pythonlibs/lib/python3.6/site-packages (0.0.27.dev0)\n" 287 | ] 288 | } 289 | ], 290 | "source": [ 291 | "!pip install pytimeparse\n", 292 | "!pip install -i https://test.pypi.org/simple/ v3io-generator --upgrade\n", 293 | "!pin install faker" 294 | ] 295 | }, 296 | { 297 | "cell_type": "markdown", 298 | "metadata": {}, 299 | "source": [ 300 | "### Configurations" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": 2, 306 | "metadata": {}, 307 | "outputs": [], 308 | "source": [ 309 | "# Benchmark configurations\n", 310 | "metric_names = ['cpu_utilization', 'latency', 'packet_loss', 'throughput']\n", 311 | "nlargest = 10" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "### Imports" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 1, 324 | "metadata": {}, 325 | "outputs": [], 326 | "source": [ 327 | "import os\n", 328 | "import yaml\n", 329 | "import time\n", 330 | "import datetime\n", 331 | "import json\n", 332 | "import itertools\n", 333 | "\n", 334 | "# Generator\n", 335 | "from v3io_generator import metrics_generator, deployment_generator\n", 336 | "\n", 337 | "# Dataframes\n", 338 | "import cudf\n", 339 | "import pandas as pd" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "### Create data source\n", 347 | "Using our V3IO-Generator we will create a timeseries network-operations dataset for 100 companies including 4 metrics (cpu utilization, latency, throughput, packet loss).\n", 348 | "\n", 349 | "We will then write the dataset to a json file to be used as our source" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 6, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "data": { 359 | "text/html": [ 360 | "
\n", 361 | "\n", 374 | "\n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | "
companycpu_utilizationlatencypacket_lossthroughput
0Rios__Pope_and_Baird0000
1Ross__Calderon_and_Brown0000
2Jackson_PLC0000
3Reyes_Group0000
4Carr-Reyes0000
\n", 428 | "
" 429 | ], 430 | "text/plain": [ 431 | " company cpu_utilization latency packet_loss throughput\n", 432 | "0 Rios__Pope_and_Baird 0 0 0 0\n", 433 | "1 Ross__Calderon_and_Brown 0 0 0 0\n", 434 | "2 Jackson_PLC 0 0 0 0\n", 435 | "3 Reyes_Group 0 0 0 0\n", 436 | "4 Carr-Reyes 0 0 0 0" 437 | ] 438 | }, 439 | "execution_count": 6, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "# Create meta-data factory\n", 446 | "dep_gen = deployment_generator.deployment_generator()\n", 447 | "faker=dep_gen.get_faker()\n", 448 | "\n", 449 | "# Design meta-data\n", 450 | "dep_gen.add_level(name='company',number=100,level_type=faker.company)\n", 451 | "\n", 452 | "# Generate deployment structure\n", 453 | "deployment_df = dep_gen.generate_deployment()\n", 454 | "\n", 455 | "# Setup initial values\n", 456 | "for metric in metrics:\n", 457 | " deployment_df[metric] = 0\n", 458 | "\n", 459 | "deployment_df.head()" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 7, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "metrics_configuration = yaml.safe_load(\"\"\"\n", 469 | "errors: {length_in_ticks: 50, rate_in_ticks: 150}\n", 470 | "timestamps: {interval: 5s, stochastic_interval: false}\n", 471 | "metrics:\n", 472 | " cpu_utilization:\n", 473 | " accuracy: 2\n", 474 | " distribution: normal\n", 475 | " distribution_params: {mu: 70, noise: 0, sigma: 10}\n", 476 | " is_threshold_below: true\n", 477 | " past_based_value: false\n", 478 | " produce_max: false\n", 479 | " produce_min: false\n", 480 | " validation:\n", 481 | " distribution: {max: 1, min: -1, validate: false}\n", 482 | " metric: {max: 100, min: 0, validate: true}\n", 483 | " latency:\n", 484 | " accuracy: 2\n", 485 | " distribution: normal\n", 486 | " distribution_params: {mu: 0, noise: 0, sigma: 5}\n", 487 | " is_threshold_below: true\n", 488 | " past_based_value: false\n", 489 | " produce_max: false\n", 490 | " produce_min: false\n", 491 | " validation:\n", 492 | " distribution: {max: 1, min: -1, validate: false}\n", 493 | " metric: {max: 100, min: 0, validate: true}\n", 494 | " packet_loss:\n", 495 | " accuracy: 0\n", 496 | " distribution: normal\n", 497 | " distribution_params: {mu: 0, noise: 0, sigma: 2}\n", 498 | " is_threshold_below: true\n", 499 | " past_based_value: false\n", 500 | " produce_max: false\n", 501 | " produce_min: false\n", 502 | " validation:\n", 503 | " distribution: {max: 1, min: -1, validate: false}\n", 504 | " metric: {max: 50, min: 0, validate: true}\n", 505 | " throughput:\n", 506 | " accuracy: 2\n", 507 | " distribution: normal\n", 508 | " distribution_params: {mu: 250, noise: 0, sigma: 20}\n", 509 | " is_threshold_below: false\n", 510 | " past_based_value: false\n", 511 | " produce_max: false\n", 512 | " produce_min: false\n", 513 | " validation:\n", 514 | " distribution: {max: 1, min: -1, validate: false}\n", 515 | " metric: {max: 300, min: 0, validate: true}\n", 516 | "\"\"\")" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 8, 522 | "metadata": {}, 523 | "outputs": [], 524 | "source": [ 525 | "met_gen = metrics_generator.Generator_df(metrics_configuration, \n", 526 | " user_hierarchy=deployment_df, \n", 527 | " initial_timestamp=time.time())" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 9, 533 | "metadata": {}, 534 | "outputs": [], 535 | "source": [ 536 | "source_file = '/tmp/ops.logs'\n", 537 | "metrics = met_gen.generate_range(start_time=datetime.datetime.now(),\n", 538 | " end_time=datetime.datetime.now()+datetime.timedelta(hours=62),\n", 539 | " as_df=True,\n", 540 | " as_iterator=False)\n", 541 | "\n", 542 | "# Generate file from metrics\n", 543 | "with open(source_file, 'w') as f:\n", 544 | " metrics_batch = metrics\n", 545 | " metrics_batch.to_json(f,\n", 546 | " orient='records',\n", 547 | " lines=True)" 548 | ] 549 | }, 550 | { 551 | "cell_type": "markdown", 552 | "metadata": {}, 553 | "source": [ 554 | "## Target file size validation\n", 555 | "Set target size (in MB) for the test file" 556 | ] 557 | }, 558 | { 559 | "cell_type": "code", 560 | "execution_count": 10, 561 | "metadata": {}, 562 | "outputs": [ 563 | { 564 | "name": "stdout", 565 | "output_type": "stream", 566 | "text": [ 567 | "total 0\n", 568 | "drwxr-xr-x 2 50 nogroup 0 Jul 22 11:32 .ipynb_checkpoints\n", 569 | "-rw-r--r-- 1 50 nogroup 1.2G Jul 22 07:17 ops-1gb.logs\n", 570 | "-rw-r--r-- 1 50 nogroup 1.2G Jul 22 11:40 ops.logs\n" 571 | ] 572 | } 573 | ], 574 | "source": [ 575 | "!ls -lah data" 576 | ] 577 | }, 578 | { 579 | "cell_type": "code", 580 | "execution_count": 11, 581 | "metadata": {}, 582 | "outputs": [ 583 | { 584 | "name": "stdout", 585 | "output_type": "stream", 586 | "text": [ 587 | "{\"company\":\"Rios__Pope_and_Baird\",\"cpu_utilization\":70.6942165035,\"cpu_utilization_is_error\":false,\"latency\":3.1373003261,\"latency_is_error\":false,\"packet_loss\":0.0,\"packet_loss_is_error\":false,\"throughput\":249.7207880994,\"throughput_is_error\":false,\"timestamp\":1563795193534}\n", 588 | "{\"company\":\"Ross__Calderon_and_Brown\",\"cpu_utilization\":56.540474522,\"cpu_utilization_is_error\":false,\"latency\":0.0,\"latency_is_error\":false,\"packet_loss\":0.0,\"packet_loss_is_error\":false,\"throughput\":261.9362588938,\"throughput_is_error\":false,\"timestamp\":1563795193534}\n", 589 | "{\"company\":\"Jackson_PLC\",\"cpu_utilization\":75.7476859549,\"cpu_utilization_is_error\":false,\"latency\":0.0,\"latency_is_error\":false,\"packet_loss\":1.3991427041,\"packet_loss_is_error\":false,\"throughput\":221.8819458316,\"throughput_is_error\":false,\"timestamp\":1563795193534}\n", 590 | "{\"company\":\"Reyes_Group\",\"cpu_utilization\":61.4657850595,\"cpu_utilization_is_error\":false,\"latency\":0.0,\"latency_is_error\":false,\"packet_loss\":1.7039267608,\"packet_loss_is_error\":false,\"throughput\":232.4317426357,\"throughput_is_error\":false,\"timestamp\":1563795193534}\n", 591 | "{\"company\":\"Carr-Reyes\",\"cpu_utilization\":49.9688296158,\"cpu_utilization_is_error\":false,\"latency\":0.4559851085,\"latency_is_error\":false,\"packet_loss\":0.0,\"packet_loss_is_error\":false,\"throughput\":276.0540712289,\"throughput_is_error\":false,\"timestamp\":1563795193534}\n", 592 | "{\"company\":\"Wilson_and_Sons\",\"cpu_utilization\":66.0718786965,\"cpu_utilization_is_error\":false,\"latency\":0.0,\"latency_is_error\":false,\"packet_loss\":0.2971395649,\"packet_loss_is_error\":false,\"throughput\":264.7801359998,\"throughput_is_error\":false,\"timestamp\":1563795193534}\n", 593 | "{\"company\":\"Nolan__Norton_and_Best\",\"cpu_utilization\":63.0572487273,\"cpu_utilization_is_error\":false,\"latency\":0.0,\"latency_is_error\":false,\"packet_loss\":0.0,\"packet_loss_is_error\":false,\"throughput\":264.033448601,\"throughput_is_error\":false,\"timestamp\":1563795193534}\n", 594 | "{\"company\":\"Brandt_LLC\",\"cpu_utilization\":90.6382973539,\"cpu_utilization_is_error\":false,\"latency\":1.9411864095,\"latency_is_error\":false,\"packet_loss\":2.527867727,\"packet_loss_is_error\":false,\"throughput\":232.8742238776,\"throughput_is_error\":false,\"timestamp\":1563795193534}\n", 595 | "{\"company\":\"Williams-Collins\",\"cpu_utilization\":64.1690341745,\"cpu_utilization_is_error\":false,\"latency\":0.0,\"latency_is_error\":false,\"packet_loss\":3.2983607178,\"packet_loss_is_error\":false,\"throughput\":264.0176356426,\"throughput_is_error\":false,\"timestamp\":1563795193534}\n", 596 | "{\"company\":\"Williams__Hutchinson_and_Harrison\",\"cpu_utilization\":60.6966740664,\"cpu_utilization_is_error\":false,\"latency\":0.0,\"latency_is_error\":false,\"packet_loss\":3.1742385435,\"packet_loss_is_error\":false,\"throughput\":230.875976139,\"throughput_is_error\":false,\"timestamp\":1563795193534}\n" 597 | ] 598 | } 599 | ], 600 | "source": [ 601 | "!head data/ops.logs" 602 | ] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "metadata": {}, 607 | "source": [ 608 | "## Benchmark\n", 609 | "\n", 610 | "### Flow\n", 611 | "- Read file\n", 612 | "- Compute aggregations\n", 613 | "- get nlargest()" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 3, 619 | "metadata": {}, 620 | "outputs": [], 621 | "source": [ 622 | "benchmark_file = source_file" 623 | ] 624 | }, 625 | { 626 | "cell_type": "markdown", 627 | "metadata": {}, 628 | "source": [ 629 | "#### cudf" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 7, 635 | "metadata": {}, 636 | "outputs": [ 637 | { 638 | "name": "stdout", 639 | "output_type": "stream", 640 | "text": [ 641 | "1.44 s ± 23.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 642 | ] 643 | } 644 | ], 645 | "source": [ 646 | "%%timeit\n", 647 | "\n", 648 | "# Read file\n", 649 | "gdf = cudf.read_json(benchmark_file, lines=True)\n", 650 | "\n", 651 | "# Perform aggregation\n", 652 | "ggdf = gdf.groupby(['company']).\\\n", 653 | " agg({k: ['min', 'max', 'mean'] for k in metric_names})\n", 654 | "\n", 655 | "# Get N Largest (From original df)\n", 656 | "raw_nlargest = gdf.nlargest(nlargest, 'cpu_utilization')" 657 | ] 658 | }, 659 | { 660 | "cell_type": "markdown", 661 | "metadata": {}, 662 | "source": [ 663 | "#### Pandas" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": 8, 669 | "metadata": {}, 670 | "outputs": [ 671 | { 672 | "name": "stdout", 673 | "output_type": "stream", 674 | "text": [ 675 | "43.4 s ± 627 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 676 | ] 677 | } 678 | ], 679 | "source": [ 680 | "%%timeit\n", 681 | "\n", 682 | "# Read file\n", 683 | "pdf = pd.read_json(benchmark_file, lines=True)\n", 684 | "\n", 685 | "# Perform aggregation\n", 686 | "gpdf = pdf.groupby(['company']).\\\n", 687 | " agg({k: ['min', 'max', 'mean'] for k in metric_names})\n", 688 | "\n", 689 | "# Get N Largest (From original df)\n", 690 | "raw_nlargest = pdf.nlargest(nlargest, 'cpu_utilization')" 691 | ] 692 | }, 693 | { 694 | "cell_type": "markdown", 695 | "metadata": {}, 696 | "source": [ 697 | "## Test loading times" 698 | ] 699 | }, 700 | { 701 | "cell_type": "markdown", 702 | "metadata": {}, 703 | "source": [ 704 | "#### cudf" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": 35, 710 | "metadata": {}, 711 | "outputs": [ 712 | { 713 | "name": "stdout", 714 | "output_type": "stream", 715 | "text": [ 716 | "1.2 s ± 120 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 717 | ] 718 | } 719 | ], 720 | "source": [ 721 | "%%timeit\n", 722 | "gdf = cudf.read_json(benchmark_file, lines=True)" 723 | ] 724 | }, 725 | { 726 | "cell_type": "markdown", 727 | "metadata": {}, 728 | "source": [ 729 | "#### Pandas" 730 | ] 731 | }, 732 | { 733 | "cell_type": "code", 734 | "execution_count": 36, 735 | "metadata": {}, 736 | "outputs": [ 737 | { 738 | "name": "stdout", 739 | "output_type": "stream", 740 | "text": [ 741 | "41.1 s ± 651 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 742 | ] 743 | } 744 | ], 745 | "source": [ 746 | "%%timeit\n", 747 | "gdf = pd.read_json(benchmark_file, lines=True)" 748 | ] 749 | }, 750 | { 751 | "cell_type": "markdown", 752 | "metadata": {}, 753 | "source": [ 754 | "## Test aggregation\n", 755 | "Load the files to memory so we can %timeit on the aggregations only" 756 | ] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": 4, 761 | "metadata": {}, 762 | "outputs": [], 763 | "source": [ 764 | "gdf = cudf.read_json(benchmark_file, lines=True)\n", 765 | "pdf = pd.read_json(benchmark_file, lines=True)" 766 | ] 767 | }, 768 | { 769 | "cell_type": "markdown", 770 | "metadata": {}, 771 | "source": [ 772 | "#### cudf" 773 | ] 774 | }, 775 | { 776 | "cell_type": "code", 777 | "execution_count": 5, 778 | "metadata": {}, 779 | "outputs": [ 780 | { 781 | "name": "stdout", 782 | "output_type": "stream", 783 | "text": [ 784 | "212 ms ± 14.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 785 | ] 786 | } 787 | ], 788 | "source": [ 789 | "%%timeit\n", 790 | "\n", 791 | "ggdf = gdf.groupby(['company']).\\\n", 792 | " agg({k: ['min', 'max', 'mean'] for k in metric_names})\n", 793 | "raw_nlargest = gdf.nlargest(nlargest, 'cpu_utilization')" 794 | ] 795 | }, 796 | { 797 | "cell_type": "markdown", 798 | "metadata": {}, 799 | "source": [ 800 | "#### Pandas" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": 6, 806 | "metadata": {}, 807 | "outputs": [ 808 | { 809 | "name": "stdout", 810 | "output_type": "stream", 811 | "text": [ 812 | "2.17 s ± 72.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 813 | ] 814 | } 815 | ], 816 | "source": [ 817 | "%%timeit\n", 818 | "\n", 819 | "gpdf = pdf.groupby(['company']).\\\n", 820 | " agg({k: ['min', 'max', 'mean'] for k in metric_names})\n", 821 | "raw_nlargest = pdf.nlargest(nlargest, 'cpu_utilization')" 822 | ] 823 | }, 824 | { 825 | "cell_type": "code", 826 | "execution_count": null, 827 | "metadata": {}, 828 | "outputs": [], 829 | "source": [] 830 | } 831 | ], 832 | "metadata": { 833 | "kernelspec": { 834 | "display_name": "Python 3", 835 | "language": "python", 836 | "name": "python3" 837 | }, 838 | "language_info": { 839 | "codemirror_mode": { 840 | "name": "ipython", 841 | "version": 3 842 | }, 843 | "file_extension": ".py", 844 | "mimetype": "text/x-python", 845 | "name": "python", 846 | "nbconvert_exporter": "python", 847 | "pygments_lexer": "ipython3", 848 | "version": "3.6.8" 849 | } 850 | }, 851 | "nbformat": 4, 852 | "nbformat_minor": 4 853 | } 854 | --------------------------------------------------------------------------------