├── .gitignore ├── .ipynb_checkpoints └── benchmarking-checkpoint.ipynb ├── README.md ├── benchmarking.ipynb └── random_people.csv /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | 3 | -------------------------------------------------------------------------------- /.ipynb_checkpoints/benchmarking-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import dask.dataframe as ddf\n", 10 | "import time\n", 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "df = ddf.read_csv('random_people.csv')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "dfo.head(1)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "df['bonus'] = df['salary']*.5" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "dfo = pd.read_csv('random_people.csv')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "dfo['bonus'] = dfo['salary']*.5" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "def benchmark(function, function_name):\n", 66 | " start = time.time()\n", 67 | " function()\n", 68 | " end = time.time()\n", 69 | " print(\"{0} seconds for {1}\".format((end - start), function_name))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | 
"outputs": [], 77 | "source": [ 78 | "def get_bonus(df):\n", 79 | " df['bonus'] = df['salary']*.5" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "def test_1():\n", 89 | " get_bonus(df)\n", 90 | "def test_2():\n", 91 | " get_bonus(dfo)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "benchmark(test_1, 'dataframe nuevo')\n", 101 | "benchmark(test_2, 'dataframe viejo')" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "df3.head(10)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "df2 = pd.concat([dfo for _ in range(1000)])\n", 120 | "\n", 121 | "df3 = pd.concat([df2 for _ in range(500)])" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "dfn = ddf.from_pandas(df3, npartitions=8)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "del dfn" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "def test_big():\n", 149 | " get_bonus(dfn)\n", 150 | "def test_big_old():\n", 151 | " get_bonus(df3)\n", 152 | "\n", 153 | "def get_big_mean():\n", 154 | " return dfn.salary.mean().compute()\n", 155 | "def get_big_mean_old():\n", 156 | " return df3.salary.mean()\n", 157 | "\n", 158 | "def get_big_max():\n", 159 | " return dfn.salary.max().compute()\n", 160 | "def get_big_max_old():\n", 161 | " return df3.salary.max()\n", 162 | "\n", 163 | "def get_big_sum():\n", 164 | " return 
dfn.salary.sum().compute()\n", 165 | "def get_big_sum_old():\n", 166 | " return df3.salary.sum()\n", 167 | "\n", 168 | "def filter_df():\n", 169 | " df = dfn[dfn['salary']>5000]\n", 170 | "def filter_df_old():\n", 171 | " df = df3[df3['salary']>5000]\n", 172 | " \n", 173 | "def run_benchmarks():\n", 174 | " for i,f in enumerate([test_big, #test_big_old,\n", 175 | " get_big_mean,# get_big_mean_old,\n", 176 | " get_big_max, #get_big_max_old,\n", 177 | " get_big_sum, #get_big_sum_old,\n", 178 | " filter_df,#filter_df_old\n", 179 | " ]):\n", 180 | " benchmark(f, f.__name__)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "def f(x):\n", 190 | " return (13*x+5)%7\n", 191 | "\n", 192 | "def apply_random_old():\n", 193 | " df3['random']= df3['salary'].apply(f)\n", 194 | " \n", 195 | "def apply_random():\n", 196 | " dfn['random']= dfn['salary'].apply(f).compute()\n", 197 | "\n", 198 | "def value_count_test():\n", 199 | " dfn.salary.value_counts().compute()\n", 200 | "\n", 201 | "def value_count_test_old():\n", 202 | " df3.salary.value_counts()\n", 203 | " " 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "run_benchmarks()\n", 213 | "benchmark(apply_random, apply_random.__name__)\n", 214 | "#benchmark(apply_random_old, apply_random_old.__name__)\n", 215 | "benchmark(value_count_test, value_count_test.__name__)\n", 216 | "#benchmark(value_count_test_old, value_count_test_old.__name__)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "dfn.salary.value_counts().compute()" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "dfn.head(10)" 235 | ] 236 | }, 237 | { 238 | "cell_type": 
"code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [] 243 | } 244 | ], 245 | "metadata": { 246 | "kernelspec": { 247 | "display_name": "Python 2", 248 | "language": "python", 249 | "name": "python2" 250 | }, 251 | "language_info": { 252 | "codemirror_mode": { 253 | "name": "ipython", 254 | "version": 2 255 | }, 256 | "file_extension": ".py", 257 | "mimetype": "text/x-python", 258 | "name": "python", 259 | "nbconvert_exporter": "python", 260 | "pygments_lexer": "ipython2", 261 | "version": "2.7.12" 262 | } 263 | }, 264 | "nbformat": 4, 265 | "nbformat_minor": 2 266 | } 267 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | This is a benchmark I ran on Dask DataFrames, comparing them to regular Pandas DataFrames. 2 | See for yourself, or read my Medium article for the results here: 3 | 4 | https://towardsdatascience.com/trying-out-dask-dataframes-in-python-for-fast-data-analysis-in-parallel-aa960c18a915 5 | -------------------------------------------------------------------------------- /benchmarking.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import dask.dataframe as ddf\n", 10 | "import time\n", 11 | "import pandas as pd" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "df = ddf.read_csv('random_people.csv')" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "dfo.head(1)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "df['bonus'] = 
df['salary']*.5" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "dfo = pd.read_csv('random_people.csv')" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "dfo['bonus'] = dfo['salary']*.5" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "def benchmark(function, function_name):\n", 66 | " start = time.time()\n", 67 | " function()\n", 68 | " end = time.time()\n", 69 | " print(\"{0} seconds for {1}\".format((end - start), function_name))" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "def get_bonus(df):\n", 79 | " df['bonus'] = df['salary']*.5" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "def test_1():\n", 89 | " get_bonus(df)\n", 90 | "def test_2():\n", 91 | " get_bonus(dfo)" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "benchmark(test_1, 'dataframe nuevo')\n", 101 | "benchmark(test_2, 'dataframe viejo')" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "df3.head(10)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": null, 116 | "metadata": {}, 117 | "outputs": [], 118 | "source": [ 119 | "df2 = pd.concat([dfo for _ in range(1000)])\n", 120 | "\n", 121 | "df3 = pd.concat([df2 for _ in range(500)])" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "dfn = ddf.from_pandas(df3, 
npartitions=8)" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "del dfn" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "def test_big():\n", 149 | " get_bonus(dfn)\n", 150 | "def test_big_old():\n", 151 | " get_bonus(df3)\n", 152 | "\n", 153 | "def get_big_mean():\n", 154 | " return dfn.salary.mean().compute()\n", 155 | "def get_big_mean_old():\n", 156 | " return df3.salary.mean()\n", 157 | "\n", 158 | "def get_big_max():\n", 159 | " return dfn.salary.max().compute()\n", 160 | "def get_big_max_old():\n", 161 | " return df3.salary.max()\n", 162 | "\n", 163 | "def get_big_sum():\n", 164 | " return dfn.salary.sum().compute()\n", 165 | "def get_big_sum_old():\n", 166 | " return df3.salary.sum()\n", 167 | "\n", 168 | "def filter_df():\n", 169 | " df = dfn[dfn['salary']>5000]\n", 170 | "def filter_df_old():\n", 171 | " df = df3[df3['salary']>5000]\n", 172 | " \n", 173 | "def run_benchmarks():\n", 174 | " for i,f in enumerate([test_big, #test_big_old,\n", 175 | " get_big_mean,# get_big_mean_old,\n", 176 | " get_big_max, #get_big_max_old,\n", 177 | " get_big_sum, #get_big_sum_old,\n", 178 | " filter_df,#filter_df_old\n", 179 | " ]):\n", 180 | " benchmark(f, f.__name__)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "def f(x):\n", 190 | " return (13*x+5)%7\n", 191 | "\n", 192 | "def apply_random_old():\n", 193 | " df3['random']= df3['salary'].apply(f)\n", 194 | " \n", 195 | "def apply_random():\n", 196 | " dfn['random']= dfn['salary'].apply(f).compute()\n", 197 | "\n", 198 | "def value_count_test():\n", 199 | " dfn.salary.value_counts().compute()\n", 200 | "\n", 201 | "def value_count_test_old():\n", 202 | " df3.salary.value_counts()\n", 203 | " " 204 | ] 205 | }, 206 | 
{ 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "run_benchmarks()\n", 213 | "benchmark(apply_random, apply_random.__name__)\n", 214 | "#benchmark(apply_random_old, apply_random_old.__name__)\n", 215 | "benchmark(value_count_test, value_count_test.__name__)\n", 216 | "#benchmark(value_count_test_old, value_count_test_old.__name__)" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "dfn.salary.value_counts().compute()" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "metadata": {}, 232 | "outputs": [], 233 | "source": [ 234 | "dfn.head(10)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [] 243 | } 244 | ], 245 | "metadata": { 246 | "kernelspec": { 247 | "display_name": "Python 2", 248 | "language": "python", 249 | "name": "python2" 250 | }, 251 | "language_info": { 252 | "codemirror_mode": { 253 | "name": "ipython", 254 | "version": 2 255 | }, 256 | "file_extension": ".py", 257 | "mimetype": "text/x-python", 258 | "name": "python", 259 | "nbconvert_exporter": "python", 260 | "pygments_lexer": "ipython2", 261 | "version": "2.7.12" 262 | } 263 | }, 264 | "nbformat": 4, 265 | "nbformat_minor": 2 266 | } 267 | -------------------------------------------------------------------------------- /random_people.csv: -------------------------------------------------------------------------------- 1 | ,name,surname,salary 2 | 0,Henry,Joneson,5000 3 | 1,Albert,Goodman,10000 4 | 2,William,Goodman,10000 5 | 3,John,Joneson,10000 6 | 4,Albert,Black,10000 7 | 5,Henry,Joneson,12000 8 | 6,Richard,Green,5500 9 | 7,Henry,Joneson,11000 10 | 8,Henry,Goodman,12000 11 | 9,Albert,Joneson,11000 12 | 10,William,Joneson,10000 13 | 11,John,White,10000 14 | 12,Henry,Black,11000 15 | 
13,Albert,Goodman,10000 16 | 14,Richard,Green,5500 17 | 15,Henry,Black,13500 18 | 16,Richard,White,11000 19 | 17,Albert,Black,5500 20 | 18,Henry,Green,10000 21 | 19,Albert,Joneson,11000 22 | 20,William,Goodman,12000 23 | 21,William,Goodman,5000 24 | 22,John,Green,9500 25 | 23,John,Black,13500 26 | 24,Richard,Green,13500 27 | 25,Henry,Joneson,12000 28 | 26,Henry,Goodman,10000 29 | 27,John,Joneson,9500 30 | 28,Henry,Goodman,11000 31 | 29,William,Green,12000 32 | 30,Henry,Goodman,10000 33 | 31,Richard,Black,10000 34 | 32,Richard,Joneson,5500 35 | 33,Richard,Joneson,5000 36 | 34,Henry,Black,9500 37 | 35,John,White,13500 38 | 36,Henry,Green,11000 39 | 37,John,Black,5500 40 | 38,William,Green,12000 41 | 39,Albert,Green,10000 42 | 40,Richard,Joneson,9500 43 | 41,William,Joneson,12000 44 | 42,John,Joneson,10000 45 | 43,William,Black,10000 46 | 44,Albert,Black,12000 47 | 45,John,Goodman,13500 48 | 46,John,Joneson,10000 49 | 47,John,Joneson,9500 50 | 48,Richard,Black,9500 51 | 49,Albert,White,10000 52 | --------------------------------------------------------------------------------