├── README.md
├── demo
│   ├── python-agg.ipynb
│   ├── nuclio-pandas-agg.ipynb
│   ├── nuclio-cudf-agg.ipynb
│   └── benchmark_cudf_vs_pd.ipynb
└── LICENSE
/README.md:
--------------------------------------------------------------------------------
1 | # rapids
2 | nuclio integration and demos with NVIDIA RAPIDS
--------------------------------------------------------------------------------
/demo/python-agg.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Python (Standalone)\n",
8 | "## Unified data batching & aggregation function"
9 | ]
10 | },
11 | {
12 | "cell_type": "markdown",
13 | "metadata": {},
14 | "source": [
15 | "### Installations"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 1,
21 | "metadata": {},
22 | "outputs": [
23 | {
24 | "name": "stdout",
25 | "output_type": "stream",
26 | "text": [
27 | "Collecting kafka\n",
28 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/21/71/73286e748ac5045b6a669c2fe44b03ac4c5d3d2af9291c4c6fc76438a9a9/kafka-1.3.5-py2.py3-none-any.whl (207kB)\n",
29 | "\u001b[K 100% |████████████████████████████████| 215kB 20.1MB/s ta 0:00:01\n",
30 | "\u001b[?25hInstalling collected packages: kafka\n",
31 | "Successfully installed kafka-1.3.5\n"
32 | ]
33 | }
34 | ],
35 | "source": [
36 | "!pip install kafka"
37 | ]
38 | },
39 | {
40 | "cell_type": "markdown",
41 | "metadata": {},
42 | "source": [
43 | "### Script"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": null,
49 | "metadata": {},
50 | "outputs": [],
51 | "source": [
52 | "from kafka import KafkaConsumer\n",
53 | "import os\n",
54 | "import glob\n",
55 | "import time\n",
56 | "import json\n",
57 | "\n",
58 | "# Select the DataFrame backend: cudf (GPU) or pandas (CPU)\n",
59 | "import cudf as df_lib\n",
60 | "# import pandas as df_lib\n",
61 | "\n",
62 | "\n",
63 | "# Basic configuration\n",
64 | "metric_names = ['cpu_utilization', 'latency', 'packet_loss', 'throughput']\n",
65 | "batch_len = 100\n",
66 | "batch = list()\n",
67 | "\n",
68 | "# Define the sink and verify it is available\n",
69 | "sink = './sink'\n",
70 | "os.makedirs(sink, exist_ok=True)\n",
71 | "\n",
72 | "# Kafka configuration\n",
73 | "topic = ''\n",
74 | "servers = []\n",
75 | "offset = 'earliest'\n",
76 | "\n",
77 | "def handler(body):\n",
78 | "    '''\n",
79 | "    Batch incoming event bodies; once the batch is full,\n",
80 | "    aggregate it and flush the result to a parquet file\n",
81 | "    '''\n",
82 | "    global batch\n",
83 | "    \n",
84 | "    # Aggregate event jsons\n",
85 | "    batch.append(body)\n",
86 | "    \n",
87 | "    # Did we batch enough events to aggregate?\n",
88 | "    if len(batch) >= batch_len:\n",
89 | "        \n",
90 | "        # Create a DataFrame from the batch of event jsons\n",
91 | "        df = df_lib.read_json('\n'.join(batch), lines=True)\n",
92 | "        df = df.reset_index(drop=True)\n",
93 | "        \n",
94 | "        # Perform aggregations\n",
95 | "        df = df.groupby(['company']).\\\n",
96 | "            agg({k: ['min', 'max', 'mean'] for k in metric_names})\n",
97 | "        \n",
98 | "        # Save to parquet (flatten the aggregation column index first)\n",
99 | "        filename = f'{time.time()}.parquet'\n",
100 | "        filepath = os.path.join(sink, filename)\n",
101 | "        new_index = [f'{e[0]}_{e[1]}' for e in list(df.columns)]\n",
102 | "        df.columns = new_index\n",
103 | "        df.to_parquet(filepath)\n",
104 | "        \n",
105 | "        # Reset batch\n",
106 | "        batch = list()\n",
107 | "\n",
108 | "\n",
109 | "# Kafka handling\n",
110 | "consumer = KafkaConsumer(\n",
111 | "    topic,\n",
112 | "    bootstrap_servers=servers,\n",
113 | "    auto_offset_reset=offset,\n",
114 | "    value_deserializer=lambda x: x.decode('utf-8'))\n",
115 | "\n",
116 | "for message in consumer:\n",
117 | "    handler(message.value)"
118 | ]
119 | },
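{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Local smoke test (optional)\n",
"A minimal sketch for exercising `handler` without a Kafka broker; run it with the blocking consumer loop above commented out. The field names mirror the generator schema used elsewhere in this repo, while the sample values and the `Acme` company name are made up for illustration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Feed synthetic events straight into the handler (no broker needed)\n",
"sample = json.dumps({\n",
"    'company': 'Acme',\n",
"    'cpu_utilization': 55.0,\n",
"    'latency': 1.2,\n",
"    'packet_loss': 0.0,\n",
"    'throughput': 250.0,\n",
"})\n",
"for _ in range(batch_len):\n",
"    handler(sample)\n",
"\n",
"# One aggregated parquet file per full batch should now sit in the sink\n",
"glob.glob(os.path.join(sink, '*.parquet'))"
]
}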
113 | "\n", 114 | "for message in consumer:\n", 115 | " message = message.value\n", 116 | " handler(message)" 117 | ] 118 | } 119 | ], 120 | "metadata": { 121 | "kernelspec": { 122 | "display_name": "Python 3", 123 | "language": "python", 124 | "name": "python3" 125 | }, 126 | "language_info": { 127 | "codemirror_mode": { 128 | "name": "ipython", 129 | "version": 3 130 | }, 131 | "file_extension": ".py", 132 | "mimetype": "text/x-python", 133 | "name": "python", 134 | "nbconvert_exporter": "python", 135 | "pygments_lexer": "ipython3", 136 | "version": "3.6.8" 137 | } 138 | }, 139 | "nbformat": 4, 140 | "nbformat_minor": 4 141 | } 142 | -------------------------------------------------------------------------------- /demo/nuclio-pandas-agg.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Nuclio\n", 8 | "## Unified Data batching & Agg function" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "# nuclio: ignore\n", 18 | "import nuclio" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Environment" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Base config" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "%nuclio: setting spec.triggers.hahttp.kind to 'http'\n", 45 | "%nuclio: setting spec.triggers.hahttp.maxWorkers to 1\n", 46 | "%nuclio: setting spec.triggers.hahttp.attributes.port to 31002\n", 47 | "%nuclio: setting spec.build.baseImage to 'python:3.6-jessie'\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "%%nuclio config\n", 53 | "\n", 54 | "# Kafka Trigger\n", 55 | "# spec.triggers.hakafka.kind = \"kafka\"\n", 56 | "# spec.trigger.url = \"1.1.1.1\"\n", 57 | "# spec.triggers.hakafka.attributes.topic = \"haproxy\"\n", 58 | "# spec.triggers.hakafka.attributes.partitions = [0, 1, 2]\n", 59 | "# spec.triggers.hakafka.attributes.sasl.enable: true\n", 60 | "# spec.triggers.hakafka.attributes.sasl.user: \"\"\n", 61 | "# spec.triggers.hakafka.attributes.sasl.password: \"\"\n", 62 | "\n", 63 | "# HTTP Trigger \n", 64 | "spec.triggers.hahttp.kind=\"http\"\n", 65 | "spec.triggers.hahttp.maxWorkers=1\n", 66 | "spec.triggers.hahttp.attributes.port=31002\n", 67 | "\n", 68 | "# Base image\n", 69 | "spec.build.baseImage = \"python:3.6-jessie\"" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### Env variables" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 3, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "%nuclio: setting 'SINK_PATH' environment variable\n", 89 | "%nuclio: setting 'INTERVAL' environment variable\n", 90 | "%nuclio: setting 'METRIC_NAMES' environment variable\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "%nuclio env SINK_PATH=./sink\n", 96 | "%nuclio env INTERVAL=2\n", 97 | "%nuclio env METRIC_NAMES=cpu_utilization,latency,packet_loss,throughput" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "### Build commands" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 4, 110 | "metadata": {}, 111 | "outputs": 
112 | {
113 | "name": "stdout",
114 | "output_type": "stream",
115 | "text": [
116 | "Requirement already satisfied: pandas in /conda/lib/python3.6/site-packages (0.23.4)\n",
117 | "Requirement already satisfied: python-dateutil>=2.5.0 in /conda/lib/python3.6/site-packages (from pandas) (2.8.0)\n",
118 | "Requirement already satisfied: pytz>=2011k in /conda/lib/python3.6/site-packages (from pandas) (2019.1)\n",
119 | "Requirement already satisfied: numpy>=1.9.0 in /conda/lib/python3.6/site-packages (from pandas) (1.16.4)\n",
120 | "Requirement already satisfied: six>=1.5 in /conda/lib/python3.6/site-packages (from python-dateutil>=2.5.0->pandas) (1.12.0)\n"
121 | ]
122 | }
123 | ],
124 | "source": [
125 | "%%nuclio cmd\n",
126 | "pip install pandas pyarrow"
127 | ]
128 | },
129 | {
130 | "cell_type": "markdown",
131 | "metadata": {},
132 | "source": [
133 | "## Function"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 5,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "import os\n",
143 | "import glob\n",
144 | "from datetime import datetime, timedelta\n",
145 | "import time\n",
146 | "import pandas as pd\n",
147 | "import itertools\n",
148 | "import json\n",
149 | "\n",
150 | "# Define the sink and verify it is available\n",
151 | "sink = os.getenv('SINK_PATH', './sink')\n",
152 | "os.makedirs(sink, exist_ok=True)\n",
153 | "\n",
154 | "# Expose metric names\n",
155 | "metric_names = os.environ['METRIC_NAMES']\n",
156 | "metric_names = metric_names.split(',')\n",
157 | "\n",
158 | "# Define batch & batch interval\n",
159 | "batch = list()\n",
160 | "interval = int(os.getenv('INTERVAL', 100))\n",
161 | "\n",
162 | "def handler(context, event):\n",
163 | "    global batch\n",
164 | "    global metric_names\n",
165 | "    \n",
166 | "    # Aggregate event jsons\n",
167 | "    batch.append(event.body)\n",
168 | "    \n",
169 | "    # Did we batch enough events to aggregate?\n",
170 | "    if len(batch) >= interval:\n",
171 | "        \n",
172 | "        # Create pandas DataFrame from the batch of event jsons\n",
173 | "        df = pd.read_json('\n'.join(batch), lines=True)\n",
174 | "        df = df.reset_index(drop=True)\n",
175 | "        \n",
176 | "        # Perform aggregations\n",
177 | "        df = df.groupby(['company']).\\\n",
178 | "            agg({k: ['min', 'max', 'mean'] for k in metric_names})\n",
179 | "        \n",
180 | "        # Save to parquet (flatten the aggregation column index first)\n",
181 | "        filename = f'{time.time()}.parquet'\n",
182 | "        filepath = os.path.join(sink, filename)\n",
183 | "        new_index = [f'{e[0]}_{e[1]}' for e in list(df.columns)]\n",
184 | "        df.columns = new_index\n",
185 | "        df.to_parquet(filepath)\n",
186 | "        \n",
187 | "        # Reset batch\n",
188 | "        batch = list()"
189 | ]
190 | },
191 | {
192 | "cell_type": "markdown",
193 | "metadata": {},
194 | "source": [
195 | "## Test"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 6,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "# nuclio: ignore\n",
205 | "event = nuclio.Event(body='{\"company\":\"Rios__Pope_and_Baird\",\"cpu_utilization\":70.6942165035,\"cpu_utilization_is_error\":false,\"latency\":3.1373003261,\"latency_is_error\":false,\"packet_loss\":0.0,\"packet_loss_is_error\":false,\"throughput\":249.7207880994,\"throughput_is_error\":false,\"timestamp\":1563795193534}')\n",
206 | "out = handler(context, event)\n",
207 | "out"
208 | ]
209 | },
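{
"cell_type": "markdown",
"metadata": {},
"source": [
"`INTERVAL` is set to 2 above, so one more event completes the batch and should flush an aggregated parquet file to the sink. A small sketch to verify this locally; the second record reuses the generator's schema with rounded illustrative values, and reading the file back assumes a parquet engine such as pyarrow is installed:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# nuclio: ignore\n",
"# A second event completes the batch (interval=2) and triggers the flush\n",
"event2 = nuclio.Event(body='{\"company\":\"Jackson_PLC\",\"cpu_utilization\":75.7,\"cpu_utilization_is_error\":false,\"latency\":0.0,\"latency_is_error\":false,\"packet_loss\":1.4,\"packet_loss_is_error\":false,\"throughput\":221.9,\"throughput_is_error\":false,\"timestamp\":1563795193534}')\n",
"handler(context, event2)\n",
"\n",
"# The sink should now hold one aggregated parquet file\n",
"files = glob.glob(os.path.join(sink, '*.parquet'))\n",
"pd.read_parquet(files[-1]) if files else 'no parquet written yet'"
]
},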
"execution_count": 7, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | "[nuclio.deploy] 2019-08-07 12:29:45,407 (info) Building processor image\n", 227 | "[nuclio.deploy] 2019-08-07 12:29:47,443 (info) Pushing image\n", 228 | "[nuclio.deploy] 2019-08-07 12:29:47,443 (info) Build complete\n", 229 | "[nuclio.deploy] 2019-08-07 12:29:51,488 (info) Function deploy complete\n", 230 | "[nuclio.deploy] 2019-08-07 12:29:51,494 done updating pd-batch-and-agg, function address: 3.120.15.118:31002\n", 231 | "%nuclio: function deployed\n" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "%nuclio deploy -p nvidia -n pd_batch_and_agg -c" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [] 245 | } 246 | ], 247 | "metadata": { 248 | "kernelspec": { 249 | "display_name": "Python 3", 250 | "language": "python", 251 | "name": "python3" 252 | }, 253 | "language_info": { 254 | "codemirror_mode": { 255 | "name": "ipython", 256 | "version": 3 257 | }, 258 | "file_extension": ".py", 259 | "mimetype": "text/x-python", 260 | "name": "python", 261 | "nbconvert_exporter": "python", 262 | "pygments_lexer": "ipython3", 263 | "version": "3.6.8" 264 | } 265 | }, 266 | "nbformat": 4, 267 | "nbformat_minor": 4 268 | } 269 | -------------------------------------------------------------------------------- /demo/nuclio-cudf-agg.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Nuclio\n", 8 | "## Unified Data batching & Agg function" 9 | ] 10 | }, 11 | { 12 | "cell_type": "code", 13 | "execution_count": 1, 14 | "metadata": {}, 15 | "outputs": [], 16 | "source": [ 17 | "# nuclio: ignore\n", 18 | "import nuclio" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Environment" 26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Base config" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 2, 38 | "metadata": {}, 39 | "outputs": [ 40 | { 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "%nuclio: setting spec.triggers.hahttp.kind to 'http'\n", 45 | "%nuclio: setting spec.triggers.hahttp.maxWorkers to 1\n", 46 | "%nuclio: setting spec.triggers.hahttp.attributes.port to 31001\n", 47 | "%nuclio: setting spec.build.baseImage to 'rapidsai/rapidsai:cuda10.0-runtime-centos7'\n" 48 | ] 49 | } 50 | ], 51 | "source": [ 52 | "%%nuclio config\n", 53 | "\n", 54 | "# Kafka Trigger\n", 55 | "# spec.triggers.hakafka.kind = \"kafka\"\n", 56 | "# spec.trigger.url = \"1.1.1.1\"\n", 57 | "# spec.triggers.hakafka.attributes.topic = \"haproxy\"\n", 58 | "# spec.triggers.hakafka.attributes.partitions = [0, 1, 2]\n", 59 | "# spec.triggers.hakafka.attributes.sasl.enable: true\n", 60 | "# spec.triggers.hakafka.attributes.sasl.user: \"\"\n", 61 | "# spec.triggers.hakafka.attributes.sasl.password: \"\"\n", 62 | "\n", 63 | "# HTTP Trigger \n", 64 | "spec.triggers.hahttp.kind=\"http\"\n", 65 | "spec.triggers.hahttp.maxWorkers=1\n", 66 | "spec.triggers.hahttp.attributes.port=31001\n", 67 | "\n", 68 | "# Base image\n", 69 | "spec.build.baseImage = \"rapidsai/rapidsai:cuda10.0-runtime-centos7\"" 70 | ] 71 | }, 72 | { 73 | "cell_type": "markdown", 74 | "metadata": {}, 75 | "source": [ 76 | "### Env variables" 77 | ] 78 | }, 
79 | {
80 | "cell_type": "code",
81 | "execution_count": 3,
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "name": "stdout",
86 | "output_type": "stream",
87 | "text": [
88 | "%nuclio: setting 'SINK_PATH' environment variable\n",
89 | "%nuclio: setting 'INTERVAL' environment variable\n",
90 | "%nuclio: setting 'METRIC_NAMES' environment variable\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "%nuclio env SINK_PATH=./sink\n",
96 | "%nuclio env INTERVAL=2\n",
97 | "%nuclio env METRIC_NAMES=cpu_utilization,latency,packet_loss,throughput"
98 | ]
99 | },
100 | {
101 | "cell_type": "markdown",
102 | "metadata": {},
103 | "source": [
104 | "## Function"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 4,
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "import os\n",
114 | "import glob\n",
115 | "from datetime import datetime, timedelta\n",
116 | "import time\n",
117 | "import cudf\n",
118 | "import itertools\n",
119 | "import json\n",
120 | "\n",
121 | "# Define the sink and verify it is available\n",
122 | "sink = os.getenv('SINK_PATH', './sink')\n",
123 | "os.makedirs(sink, exist_ok=True)\n",
124 | "\n",
125 | "# Expose metric names\n",
126 | "metric_names = os.environ['METRIC_NAMES']\n",
127 | "metric_names = metric_names.split(',')\n",
128 | "\n",
129 | "# Define batch & batch interval\n",
130 | "batch = list()\n",
131 | "interval = int(os.getenv('INTERVAL', 100))\n",
132 | "\n",
133 | "def handler(context, event):\n",
134 | "    global batch\n",
135 | "    global metric_names\n",
136 | "    \n",
137 | "    # Aggregate event jsons\n",
138 | "    batch.append(event.body)\n",
139 | "    \n",
140 | "    # Did we batch enough events to aggregate?\n",
141 | "    if len(batch) >= interval:\n",
142 | "        \n",
143 | "        # Create cudf DataFrame from the batch of event jsons\n",
144 | "        df = cudf.read_json('\n'.join(batch), lines=True)\n",
145 | "        df = df.reset_index(drop=True)\n",
146 | "        \n",
147 | "        # Perform aggregations\n",
148 | "        df = df.groupby(['company']).\\\n",
149 | "            agg({k: ['min', 'max', 'mean'] for k in metric_names})\n",
150 | "        \n",
151 | "        # Save to parquet (flatten the aggregation column index first)\n",
152 | "        filename = f'{time.time()}.parquet'\n",
153 | "        filepath = os.path.join(sink, filename)\n",
154 | "        new_index = [f'{e[0]}_{e[1]}' for e in list(df.columns)]\n",
155 | "        df.columns = new_index\n",
156 | "        df.to_parquet(filepath)\n",
157 | "        \n",
158 | "        # Reset batch\n",
159 | "        batch = list()"
160 | ]
161 | },
162 | {
163 | "cell_type": "markdown",
164 | "metadata": {},
165 | "source": [
166 | "## Test"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": 5,
172 | "metadata": {},
173 | "outputs": [],
174 | "source": [
175 | "# nuclio: ignore\n",
176 | "event = nuclio.Event(body='{\"company\":\"Rios__Pope_and_Baird\",\"cpu_utilization\":70.6942165035,\"cpu_utilization_is_error\":false,\"latency\":3.1373003261,\"latency_is_error\":false,\"packet_loss\":0.0,\"packet_loss_is_error\":false,\"throughput\":249.7207880994,\"throughput_is_error\":false,\"timestamp\":1563795193534}')\n",
177 | "out = handler(context, event)\n",
178 | "out"
179 | ]
180 | },
181 | {
182 | "cell_type": "markdown",
183 | "metadata": {},
184 | "source": [
185 | "## Deploy (If a nuclio cluster is available)"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 6,
191 | "metadata": {},
192 | "outputs": [
193 | {
194 | "name": "stdout",
195 | "output_type": "stream",
196 | "text": [
197 | "[nuclio.deploy] 2019-08-07 12:29:26,987 (info) Building processor image\n"
198 | ]
199 | },
200 
| { 201 | "name": "stderr", 202 | "output_type": "stream", 203 | "text": [ 204 | "INFO:(info) Building processor image\n" 205 | ] 206 | }, 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "[nuclio.deploy] 2019-08-07 12:29:30,024 (info) Pushing image\n" 212 | ] 213 | }, 214 | { 215 | "name": "stderr", 216 | "output_type": "stream", 217 | "text": [ 218 | "INFO:(info) Pushing image\n" 219 | ] 220 | }, 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "[nuclio.deploy] 2019-08-07 12:29:30,026 (info) Build complete\n" 226 | ] 227 | }, 228 | { 229 | "name": "stderr", 230 | "output_type": "stream", 231 | "text": [ 232 | "INFO:(info) Build complete\n" 233 | ] 234 | }, 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "[nuclio.deploy] 2019-08-07 12:29:34,085 (info) Function deploy complete\n" 240 | ] 241 | }, 242 | { 243 | "name": "stderr", 244 | "output_type": "stream", 245 | "text": [ 246 | "INFO:(info) Function deploy complete\n" 247 | ] 248 | }, 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "[nuclio.deploy] 2019-08-07 12:29:34,096 done updating cudf-batch-and-agg, function address: 3.120.15.118:31001\n" 254 | ] 255 | }, 256 | { 257 | "name": "stderr", 258 | "output_type": "stream", 259 | "text": [ 260 | "INFO:done updating cudf-batch-and-agg, function address: 3.120.15.118:31001\n" 261 | ] 262 | }, 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "%nuclio: function deployed\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "%nuclio deploy -p nvidia -n cudf_batch_and_agg -c" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [] 281 | } 282 | ], 283 | "metadata": { 284 | "kernelspec": { 285 | "display_name": "Python 3", 286 | "language": "python", 287 | "name": "python3" 288 | }, 289 | "language_info": { 290 | "codemirror_mode": { 291 | "name": "ipython", 292 | "version": 3 293 | }, 294 | "file_extension": ".py", 295 | "mimetype": "text/x-python", 296 | "name": "python", 297 | "nbconvert_exporter": "python", 298 | "pygments_lexer": "ipython3", 299 | "version": "3.6.8" 300 | } 301 | }, 302 | "nbformat": 4, 303 | "nbformat_minor": 4 304 | } 305 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 
22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /demo/benchmark_cudf_vs_pd.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Benchmark Pandas vs Cudf\n", 8 | "- Using *timeit*" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "metadata": {}, 14 | "source": [ 15 | "### System details" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "#### GPU" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 1, 28 | "metadata": {}, 29 | "outputs": [ 30 | { 31 | "name": "stdout", 32 | "output_type": "stream", 33 | "text": [ 34 | "\n", 35 | "==============NVSMI LOG==============\n", 36 | "\n", 37 | "Timestamp : Mon Jul 22 11:32:33 2019\n", 38 | "Driver Version : 418.56\n", 39 | "CUDA Version : 10.1\n", 40 | "\n", 41 | "Attached GPUs : 1\n", 42 | "GPU 00000000:00:1E.0\n", 43 | " Product Name : Tesla V100-SXM2-16GB\n", 44 | " Product Brand : Tesla\n", 45 | " Display Mode : Enabled\n", 46 | " Display Active : Disabled\n", 47 | " Persistence Mode : Enabled\n", 48 | " Accounting Mode : Disabled\n", 49 | " Accounting Mode Buffer Size : 4000\n", 50 | " Driver Model\n", 51 | " Current : N/A\n", 52 | " Pending : N/A\n", 53 | " Serial Number : 0323217016780\n", 54 | " GPU UUID : GPU-3ec8803d-1d6d-b362-7a9d-57b78fe42967\n", 55 | " Minor Number : 0\n", 56 | " VBIOS Version : 88.00.4F.00.09\n", 57 | " MultiGPU Board : No\n", 58 | " Board ID : 0x1e\n", 59 | " GPU Part Number : 900-2G503-0000-000\n", 60 | " Inforom Version\n", 61 | " Image Version : G503.0201.00.03\n", 62 | " OEM Object : 1.1\n", 63 | " ECC Object : 5.0\n", 64 | " Power Management Object : N/A\n", 65 | " GPU Operation Mode\n", 66 | " Current : N/A\n", 67 | " Pending : N/A\n", 68 | " GPU Virtualization Mode\n", 69 | " Virtualization mode : Pass-Through\n", 70 | " IBMNPU\n", 71 | " Relaxed Ordering Mode : N/A\n", 72 | " PCI\n", 73 | " Bus : 0x00\n", 74 | " Device : 0x1E\n", 75 | " Domain : 0x0000\n", 76 | " Device Id : 0x1DB110DE\n", 77 | " Bus Id : 00000000:00:1E.0\n", 78 | " Sub System Id : 0x121210DE\n", 79 | " GPU Link Info\n", 80 | " PCIe Generation\n", 81 | " Max : 3\n", 82 | " Current : 3\n", 83 | " Link Width\n", 84 | " Max : 16x\n", 85 | " Current : 16x\n", 86 | " Bridge Chip\n", 87 | " Type : N/A\n", 88 | " Firmware : N/A\n", 89 | " Replays Since Reset : 0\n", 90 | " Replay Number Rollovers : 0\n", 91 | " Tx Throughput : 0 KB/s\n", 92 | " Rx Throughput : 0 KB/s\n", 93 | " Fan Speed : N/A\n", 94 | " Performance State : P0\n", 95 | " Clocks Throttle Reasons\n", 96 | " Idle : Active\n", 97 | " Applications Clocks Setting : Not Active\n", 98 | " SW Power Cap : Not Active\n", 99 | " HW Slowdown : Not Active\n", 100 | " HW Thermal Slowdown : Not Active\n", 101 | " HW Power Brake Slowdown : Not Active\n", 102 | " Sync Boost : Not Active\n", 103 | " SW Thermal Slowdown : Not Active\n", 104 | " Display Clock Setting : Not Active\n", 105 | " FB Memory Usage\n", 106 | " Total : 16130 MiB\n", 107 | " Used : 0 MiB\n", 108 | " Free : 16130 MiB\n", 109 | " BAR1 Memory Usage\n", 110 | " Total : 16384 MiB\n", 111 | " Used : 2 MiB\n", 112 | " Free : 16382 MiB\n", 113 | " Compute Mode : Default\n", 114 | " Utilization\n", 115 | " Gpu : 0 %\n", 116 | " Memory : 0 %\n", 117 | " Encoder : 0 %\n", 118 | " Decoder : 0 %\n", 119 | " Encoder Stats\n", 120 | " Active Sessions : 0\n", 121 | " Average FPS : 0\n", 122 | " Average 
Latency : 0\n", 123 | " FBC Stats\n", 124 | " Active Sessions : 0\n", 125 | " Average FPS : 0\n", 126 | " Average Latency : 0\n", 127 | " Ecc Mode\n", 128 | " Current : Enabled\n", 129 | " Pending : Enabled\n", 130 | " ECC Errors\n", 131 | " Volatile\n", 132 | " Single Bit \n", 133 | " Device Memory : 0\n", 134 | " Register File : 0\n", 135 | " L1 Cache : 0\n", 136 | " L2 Cache : 0\n", 137 | " Texture Memory : N/A\n", 138 | " Texture Shared : N/A\n", 139 | " CBU : N/A\n", 140 | " Total : 0\n", 141 | " Double Bit \n", 142 | " Device Memory : 0\n", 143 | " Register File : 0\n", 144 | " L1 Cache : 0\n", 145 | " L2 Cache : 0\n", 146 | " Texture Memory : N/A\n", 147 | " Texture Shared : N/A\n", 148 | " CBU : 0\n", 149 | " Total : 0\n", 150 | " Aggregate\n", 151 | " Single Bit \n", 152 | " Device Memory : 0\n", 153 | " Register File : 0\n", 154 | " L1 Cache : 0\n", 155 | " L2 Cache : 0\n", 156 | " Texture Memory : N/A\n", 157 | " Texture Shared : N/A\n", 158 | " CBU : N/A\n", 159 | " Total : 0\n", 160 | " Double Bit \n", 161 | " Device Memory : 0\n", 162 | " Register File : 0\n", 163 | " L1 Cache : 0\n", 164 | " L2 Cache : 0\n", 165 | " Texture Memory : N/A\n", 166 | " Texture Shared : N/A\n", 167 | " CBU : 0\n", 168 | " Total : 0\n", 169 | " Retired Pages\n", 170 | " Single Bit ECC : 0\n", 171 | " Double Bit ECC : 0\n", 172 | " Pending : No\n", 173 | " Temperature\n", 174 | " GPU Current Temp : 32 C\n", 175 | " GPU Shutdown Temp : 90 C\n", 176 | " GPU Slowdown Temp : 87 C\n", 177 | " GPU Max Operating Temp : 83 C\n", 178 | " Memory Current Temp : 28 C\n", 179 | " Memory Max Operating Temp : 85 C\n", 180 | " Power Readings\n", 181 | " Power Management : Supported\n", 182 | " Power Draw : 23.61 W\n", 183 | " Power Limit : 300.00 W\n", 184 | " Default Power Limit : 300.00 W\n", 185 | " Enforced Power Limit : 300.00 W\n", 186 | " Min Power Limit : 150.00 W\n", 187 | " Max Power Limit : 300.00 W\n", 188 | " Clocks\n", 189 | " Graphics : 135 MHz\n", 190 | " SM : 135 MHz\n", 191 | " Memory : 877 MHz\n", 192 | " Video : 555 MHz\n", 193 | " Applications Clocks\n", 194 | " Graphics : 1312 MHz\n", 195 | " Memory : 877 MHz\n", 196 | " Default Applications Clocks\n", 197 | " Graphics : 1312 MHz\n", 198 | " Memory : 877 MHz\n", 199 | " Max Clocks\n", 200 | " Graphics : 1530 MHz\n", 201 | " SM : 1530 MHz\n", 202 | " Memory : 877 MHz\n", 203 | " Video : 1372 MHz\n", 204 | " Max Customer Boost Clocks\n", 205 | " Graphics : 1530 MHz\n", 206 | " Clock Policy\n", 207 | " Auto Boost : N/A\n", 208 | " Auto Boost Default : N/A\n", 209 | " Processes : None\n", 210 | "\n" 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "!nvidia-smi -q" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "#### CPU" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 2, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stdout", 232 | "output_type": "stream", 233 | "text": [ 234 | "processor : 0\n", 235 | "vendor_id : GenuineIntel\n", 236 | "cpu family : 6\n", 237 | "model : 79\n", 238 | "model name : Intel(R) Xeon(R) CPU E5-2686 v4 @ 2.30GHz\n", 239 | "stepping : 1\n", 240 | "microcode : 0xb000037\n", 241 | "cpu MHz : 2699.945\n", 242 | "cache size : 46080 KB\n", 243 | "physical id : 0\n", 244 | "siblings : 8\n", 245 | "core id : 0\n", 246 | "cpu cores : 4\n", 247 | "apicid : 0\n", 248 | "initial apicid : 0\n", 249 | "fpu : yes\n", 250 | "fpu_exception : yes\n", 251 | "cpuid level : 13\n", 252 | "wp : yes\n", 253 | "\u001b[K:\u001b[K" 254 | ] 255 | 
}
256 | ],
257 | "source": [
258 | "!less /proc/cpuinfo"
259 | ]
260 | },
261 | {
262 | "cell_type": "markdown",
263 | "metadata": {},
264 | "source": [
265 | "## Benchmark setup"
266 | ]
267 | },
268 | {
269 | "cell_type": "markdown",
270 | "metadata": {},
271 | "source": [
272 | "### Installations\n",
273 | "Install our v3io-generator to create a 1 GB dataset for the benchmark"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 3,
279 | "metadata": {},
280 | "outputs": [
281 | {
282 | "name": "stdout",
283 | "output_type": "stream",
284 | "text": [
285 | "Looking in indexes: https://test.pypi.org/simple/\n",
286 | "Requirement already up-to-date: v3io-generator in /User/.pythonlibs/lib/python3.6/site-packages (0.0.27.dev0)\n"
287 | ]
288 | }
289 | ],
290 | "source": [
291 | "!pip install pytimeparse\n",
292 | "!pip install -i https://test.pypi.org/simple/ v3io-generator --upgrade\n",
293 | "!pip install faker"
294 | ]
295 | },
296 | {
297 | "cell_type": "markdown",
298 | "metadata": {},
299 | "source": [
300 | "### Configurations"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": 2,
306 | "metadata": {},
307 | "outputs": [],
308 | "source": [
309 | "# Benchmark configurations\n",
310 | "metric_names = ['cpu_utilization', 'latency', 'packet_loss', 'throughput']\n",
311 | "nlargest = 10"
312 | ]
313 | },
314 | {
315 | "cell_type": "markdown",
316 | "metadata": {},
317 | "source": [
318 | "### Imports"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 1,
324 | "metadata": {},
325 | "outputs": [],
326 | "source": [
327 | "import os\n",
328 | "import yaml\n",
329 | "import time\n",
330 | "import datetime\n",
331 | "import json\n",
332 | "import itertools\n",
333 | "\n",
334 | "# Generator\n",
335 | "from v3io_generator import metrics_generator, deployment_generator\n",
336 | "\n",
337 | "# Dataframes\n",
338 | "import cudf\n",
339 | "import pandas as pd"
340 | ]
341 | },
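{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reproducibility it helps to record the exact library versions under test; a small sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Record the versions being benchmarked\n",
"print('cudf  :', cudf.__version__)\n",
"print('pandas:', pd.__version__)"
]
},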
\n", 361 | "\n", 374 | "\n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | " \n", 403 | " \n", 404 | " \n", 405 | " \n", 406 | " \n", 407 | " \n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | "
companycpu_utilizationlatencypacket_lossthroughput
0Rios__Pope_and_Baird0000
1Ross__Calderon_and_Brown0000
2Jackson_PLC0000
3Reyes_Group0000
4Carr-Reyes0000
\n", 428 | "
" 429 | ], 430 | "text/plain": [ 431 | " company cpu_utilization latency packet_loss throughput\n", 432 | "0 Rios__Pope_and_Baird 0 0 0 0\n", 433 | "1 Ross__Calderon_and_Brown 0 0 0 0\n", 434 | "2 Jackson_PLC 0 0 0 0\n", 435 | "3 Reyes_Group 0 0 0 0\n", 436 | "4 Carr-Reyes 0 0 0 0" 437 | ] 438 | }, 439 | "execution_count": 6, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "# Create meta-data factory\n", 446 | "dep_gen = deployment_generator.deployment_generator()\n", 447 | "faker=dep_gen.get_faker()\n", 448 | "\n", 449 | "# Design meta-data\n", 450 | "dep_gen.add_level(name='company',number=100,level_type=faker.company)\n", 451 | "\n", 452 | "# Generate deployment structure\n", 453 | "deployment_df = dep_gen.generate_deployment()\n", 454 | "\n", 455 | "# Setup initial values\n", 456 | "for metric in metrics:\n", 457 | " deployment_df[metric] = 0\n", 458 | "\n", 459 | "deployment_df.head()" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 7, 465 | "metadata": {}, 466 | "outputs": [], 467 | "source": [ 468 | "metrics_configuration = yaml.safe_load(\"\"\"\n", 469 | "errors: {length_in_ticks: 50, rate_in_ticks: 150}\n", 470 | "timestamps: {interval: 5s, stochastic_interval: false}\n", 471 | "metrics:\n", 472 | " cpu_utilization:\n", 473 | " accuracy: 2\n", 474 | " distribution: normal\n", 475 | " distribution_params: {mu: 70, noise: 0, sigma: 10}\n", 476 | " is_threshold_below: true\n", 477 | " past_based_value: false\n", 478 | " produce_max: false\n", 479 | " produce_min: false\n", 480 | " validation:\n", 481 | " distribution: {max: 1, min: -1, validate: false}\n", 482 | " metric: {max: 100, min: 0, validate: true}\n", 483 | " latency:\n", 484 | " accuracy: 2\n", 485 | " distribution: normal\n", 486 | " distribution_params: {mu: 0, noise: 0, sigma: 5}\n", 487 | " is_threshold_below: true\n", 488 | " past_based_value: false\n", 489 | " produce_max: false\n", 490 | " produce_min: false\n", 491 | " validation:\n", 492 | " distribution: {max: 1, min: -1, validate: false}\n", 493 | " metric: {max: 100, min: 0, validate: true}\n", 494 | " packet_loss:\n", 495 | " accuracy: 0\n", 496 | " distribution: normal\n", 497 | " distribution_params: {mu: 0, noise: 0, sigma: 2}\n", 498 | " is_threshold_below: true\n", 499 | " past_based_value: false\n", 500 | " produce_max: false\n", 501 | " produce_min: false\n", 502 | " validation:\n", 503 | " distribution: {max: 1, min: -1, validate: false}\n", 504 | " metric: {max: 50, min: 0, validate: true}\n", 505 | " throughput:\n", 506 | " accuracy: 2\n", 507 | " distribution: normal\n", 508 | " distribution_params: {mu: 250, noise: 0, sigma: 20}\n", 509 | " is_threshold_below: false\n", 510 | " past_based_value: false\n", 511 | " produce_max: false\n", 512 | " produce_min: false\n", 513 | " validation:\n", 514 | " distribution: {max: 1, min: -1, validate: false}\n", 515 | " metric: {max: 300, min: 0, validate: true}\n", 516 | "\"\"\")" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 8, 522 | "metadata": {}, 523 | "outputs": [], 524 | "source": [ 525 | "met_gen = metrics_generator.Generator_df(metrics_configuration, \n", 526 | " user_hierarchy=deployment_df, \n", 527 | " initial_timestamp=time.time())" 528 | ] 529 | }, 530 | { 531 | "cell_type": "code", 532 | "execution_count": 9, 533 | "metadata": {}, 534 | "outputs": [], 535 | "source": [ 536 | "source_file = '/tmp/ops.logs'\n", 537 | "metrics = 
530 | {
531 | "cell_type": "code",
532 | "execution_count": 9,
533 | "metadata": {},
534 | "outputs": [],
535 | "source": [
536 | "source_file = 'data/ops.logs'\n",
537 | "metrics = met_gen.generate_range(start_time=datetime.datetime.now(),\n",
538 | "                                 end_time=datetime.datetime.now()+datetime.timedelta(hours=62),\n",
539 | "                                 as_df=True,\n",
540 | "                                 as_iterator=False)\n",
541 | "\n",
542 | "# Write the generated metrics to our line-delimited JSON source file\n",
543 | "with open(source_file, 'w') as f:\n",
544 | "    # orient='records' with lines=True emits one JSON document per row\n",
545 | "    metrics.to_json(f,\n",
546 | "                    orient='records',\n",
547 | "                    lines=True)"
548 | ]
549 | },
550 | {
551 | "cell_type": "markdown",
552 | "metadata": {},
553 | "source": [
554 | "## Generated file size validation\n",
555 | "Verify the generated source file is roughly the 1 GB target size"
556 | ]
557 | },
558 | {
559 | "cell_type": "code",
560 | "execution_count": 10,
561 | "metadata": {},
562 | "outputs": [
563 | {
564 | "name": "stdout",
565 | "output_type": "stream",
566 | "text": [
567 | "total 0\n",
568 | "drwxr-xr-x 2 50 nogroup 0 Jul 22 11:32 .ipynb_checkpoints\n",
569 | "-rw-r--r-- 1 50 nogroup 1.2G Jul 22 07:17 ops-1gb.logs\n",
570 | "-rw-r--r-- 1 50 nogroup 1.2G Jul 22 11:40 ops.logs\n"
571 | ]
572 | }
573 | ],
574 | "source": [
575 | "!ls -lah data"
576 | ]
577 | },
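{
"cell_type": "markdown",
"metadata": {},
"source": [
"The same check in Python, if you prefer it over the shell (`os.path.getsize` returns bytes); a small sketch:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Report the generated file size in GB\n",
"size_gb = os.path.getsize(source_file) / 1024 ** 3\n",
"print(f'{source_file}: {size_gb:.2f} GB')"
]
},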
"{\"company\":\"Brandt_LLC\",\"cpu_utilization\":90.6382973539,\"cpu_utilization_is_error\":false,\"latency\":1.9411864095,\"latency_is_error\":false,\"packet_loss\":2.527867727,\"packet_loss_is_error\":false,\"throughput\":232.8742238776,\"throughput_is_error\":false,\"timestamp\":1563795193534}\n", 595 | "{\"company\":\"Williams-Collins\",\"cpu_utilization\":64.1690341745,\"cpu_utilization_is_error\":false,\"latency\":0.0,\"latency_is_error\":false,\"packet_loss\":3.2983607178,\"packet_loss_is_error\":false,\"throughput\":264.0176356426,\"throughput_is_error\":false,\"timestamp\":1563795193534}\n", 596 | "{\"company\":\"Williams__Hutchinson_and_Harrison\",\"cpu_utilization\":60.6966740664,\"cpu_utilization_is_error\":false,\"latency\":0.0,\"latency_is_error\":false,\"packet_loss\":3.1742385435,\"packet_loss_is_error\":false,\"throughput\":230.875976139,\"throughput_is_error\":false,\"timestamp\":1563795193534}\n" 597 | ] 598 | } 599 | ], 600 | "source": [ 601 | "!head data/ops.logs" 602 | ] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "metadata": {}, 607 | "source": [ 608 | "## Benchmark\n", 609 | "\n", 610 | "### Flow\n", 611 | "- Read file\n", 612 | "- Compute aggregations\n", 613 | "- get nlargest()" 614 | ] 615 | }, 616 | { 617 | "cell_type": "code", 618 | "execution_count": 3, 619 | "metadata": {}, 620 | "outputs": [], 621 | "source": [ 622 | "benchmark_file = source_file" 623 | ] 624 | }, 625 | { 626 | "cell_type": "markdown", 627 | "metadata": {}, 628 | "source": [ 629 | "#### cudf" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 7, 635 | "metadata": {}, 636 | "outputs": [ 637 | { 638 | "name": "stdout", 639 | "output_type": "stream", 640 | "text": [ 641 | "1.44 s ± 23.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 642 | ] 643 | } 644 | ], 645 | "source": [ 646 | "%%timeit\n", 647 | "\n", 648 | "# Read file\n", 649 | "gdf = cudf.read_json(benchmark_file, lines=True)\n", 650 | "\n", 651 | "# Perform aggregation\n", 652 | "ggdf = gdf.groupby(['company']).\\\n", 653 | " agg({k: ['min', 'max', 'mean'] for k in metric_names})\n", 654 | "\n", 655 | "# Get N Largest (From original df)\n", 656 | "raw_nlargest = gdf.nlargest(nlargest, 'cpu_utilization')" 657 | ] 658 | }, 659 | { 660 | "cell_type": "markdown", 661 | "metadata": {}, 662 | "source": [ 663 | "#### Pandas" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": 8, 669 | "metadata": {}, 670 | "outputs": [ 671 | { 672 | "name": "stdout", 673 | "output_type": "stream", 674 | "text": [ 675 | "43.4 s ± 627 ms per loop (mean ± std. dev. 
693 | {
694 | "cell_type": "markdown",
695 | "metadata": {},
696 | "source": [
697 | "## Test loading times"
698 | ]
699 | },
700 | {
701 | "cell_type": "markdown",
702 | "metadata": {},
703 | "source": [
704 | "#### cudf"
705 | ]
706 | },
707 | {
708 | "cell_type": "code",
709 | "execution_count": 35,
710 | "metadata": {},
711 | "outputs": [
712 | {
713 | "name": "stdout",
714 | "output_type": "stream",
715 | "text": [
716 | "1.2 s ± 120 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
717 | ]
718 | }
719 | ],
720 | "source": [
721 | "%%timeit\n",
722 | "gdf = cudf.read_json(benchmark_file, lines=True)"
723 | ]
724 | },
725 | {
726 | "cell_type": "markdown",
727 | "metadata": {},
728 | "source": [
729 | "#### Pandas"
730 | ]
731 | },
732 | {
733 | "cell_type": "code",
734 | "execution_count": 36,
735 | "metadata": {},
736 | "outputs": [
737 | {
738 | "name": "stdout",
739 | "output_type": "stream",
740 | "text": [
741 | "41.1 s ± 651 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
742 | ]
743 | }
744 | ],
745 | "source": [
746 | "%%timeit\n",
747 | "pdf = pd.read_json(benchmark_file, lines=True)"
748 | ]
749 | },
750 | {
751 | "cell_type": "markdown",
752 | "metadata": {},
753 | "source": [
754 | "## Test aggregation\n",
755 | "Load the files into memory so we can %timeit the aggregations only"
756 | ]
757 | },
758 | {
759 | "cell_type": "code",
760 | "execution_count": 4,
761 | "metadata": {},
762 | "outputs": [],
763 | "source": [
764 | "gdf = cudf.read_json(benchmark_file, lines=True)\n",
765 | "pdf = pd.read_json(benchmark_file, lines=True)"
766 | ]
767 | },
768 | {
769 | "cell_type": "markdown",
770 | "metadata": {},
771 | "source": [
772 | "#### cudf"
773 | ]
774 | },
775 | {
776 | "cell_type": "code",
777 | "execution_count": 5,
778 | "metadata": {},
779 | "outputs": [
780 | {
781 | "name": "stdout",
782 | "output_type": "stream",
783 | "text": [
784 | "212 ms ± 14.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
785 | ]
786 | }
787 | ],
788 | "source": [
789 | "%%timeit\n",
790 | "\n",
791 | "ggdf = gdf.groupby(['company']).\\\n",
792 | "    agg({k: ['min', 'max', 'mean'] for k in metric_names})\n",
793 | "raw_nlargest = gdf.nlargest(nlargest, 'cpu_utilization')"
794 | ]
795 | },
796 | {
797 | "cell_type": "markdown",
798 | "metadata": {},
799 | "source": [
800 | "#### Pandas"
801 | ]
802 | },
803 | {
804 | "cell_type": "code",
805 | "execution_count": 6,
806 | "metadata": {},
807 | "outputs": [
808 | {
809 | "name": "stdout",
810 | "output_type": "stream",
811 | "text": [
812 | "2.17 s ± 72.8 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 813 | ] 814 | } 815 | ], 816 | "source": [ 817 | "%%timeit\n", 818 | "\n", 819 | "gpdf = pdf.groupby(['company']).\\\n", 820 | " agg({k: ['min', 'max', 'mean'] for k in metric_names})\n", 821 | "raw_nlargest = pdf.nlargest(nlargest, 'cpu_utilization')" 822 | ] 823 | }, 824 | { 825 | "cell_type": "code", 826 | "execution_count": null, 827 | "metadata": {}, 828 | "outputs": [], 829 | "source": [] 830 | } 831 | ], 832 | "metadata": { 833 | "kernelspec": { 834 | "display_name": "Python 3", 835 | "language": "python", 836 | "name": "python3" 837 | }, 838 | "language_info": { 839 | "codemirror_mode": { 840 | "name": "ipython", 841 | "version": 3 842 | }, 843 | "file_extension": ".py", 844 | "mimetype": "text/x-python", 845 | "name": "python", 846 | "nbconvert_exporter": "python", 847 | "pygments_lexer": "ipython3", 848 | "version": "3.6.8" 849 | } 850 | }, 851 | "nbformat": 4, 852 | "nbformat_minor": 4 853 | } 854 | --------------------------------------------------------------------------------