├── 20190510-RStats-NYC.ipynb ├── 20190725-Parquet-Dict-Benchmark.ipynb ├── 20190726-Parquet-Float-Encoding.ipynb ├── 20190802-Parquet-Dict-Decoding-Benchmark.ipynb ├── 20190803-ARROW-6060.ipynb ├── 20190815-Parquet-Direct-Dictionary-Write.ipynb ├── 20190830-VLDB-FlightDemo.ipynb ├── 20190919file_benchmarks ├── FeatherCompression.ipynb ├── all_read_results.csv ├── all_results.csv ├── all_write_results.csv ├── benchmark.R ├── benchmark.py ├── file_sizes.csv ├── generate_results.sh ├── glue_results.py ├── i9-9880H-1 │ ├── all_results.csv │ ├── plot.png │ ├── py_results.csv │ └── r_results.csv ├── i9-9880H-4 │ ├── all_results.csv │ ├── plot.png │ ├── py_results.csv │ └── r_results.csv ├── i9-9880H-8 │ ├── all_results.csv │ ├── plot.png │ ├── py_results.csv │ └── r_results.csv ├── make_feather_plots.R ├── make_plots.R ├── py_read_results_1.csv ├── py_read_results_4.csv ├── py_write_results_1.csv ├── py_write_results_4.csv ├── r_read_results_1.csv ├── r_read_results_4.csv ├── r_write_results_1.csv └── r_write_results_4.csv ├── 20200402pandas_load ├── .ipynb_checkpoints │ └── Untitled-checkpoint.ipynb └── Untitled.ipynb ├── 20200509wideparquet └── WideParquet.ipynb ├── peak_use.py └── scripts ├── 20190903_parquet_benchmark.py └── arrow7305.py /20190725-Parquet-Dict-Benchmark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [] 9 | } 10 | ], 11 | "metadata": { 12 | "kernelspec": { 13 | "display_name": "Python 3", 14 | "language": "python", 15 | "name": "python3" 16 | }, 17 | "language_info": { 18 | "codemirror_mode": { 19 | "name": "ipython", 20 | "version": 3 21 | }, 22 | "file_extension": ".py", 23 | "mimetype": "text/x-python", 24 | "name": "python", 25 | "nbconvert_exporter": "python", 26 | "pygments_lexer": "ipython3", 27 | "version": "3.7.3" 28 | } 29 | }, 30 | "nbformat": 4, 31 | 
"nbformat_minor": 2 32 | } 33 | -------------------------------------------------------------------------------- /20190726-Parquet-Float-Encoding.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pyarrow as pa\n", 17 | "import pyarrow.parquet as pq\n", 18 | "import numpy as np\n", 19 | "import pandas as pd\n", 20 | "arr = pa.array([np.nan] * 10000000)\n", 21 | "t = pa.Table.from_arrays([arr], names=['f0'])" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 2, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "88.1 ms ± 1.68 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "%timeit pq.write_table(t, '/home/wesm/tmp/nans.parquet')" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": null, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [] 47 | } 48 | ], 49 | "metadata": { 50 | "kernelspec": { 51 | "display_name": "Python 3", 52 | "language": "python", 53 | "name": "python3" 54 | }, 55 | "language_info": { 56 | "codemirror_mode": { 57 | "name": "ipython", 58 | "version": 3 59 | }, 60 | "file_extension": ".py", 61 | "mimetype": "text/x-python", 62 | "name": "python", 63 | "nbconvert_exporter": "python", 64 | "pygments_lexer": "ipython3", 65 | "version": "3.7.3" 66 | } 67 | }, 68 | "nbformat": 4, 69 | "nbformat_minor": 2 70 | } 71 | -------------------------------------------------------------------------------- /20190802-Parquet-Dict-Decoding-Benchmark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | 
"execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pyarrow as pa\n", 11 | "import pyarrow.parquet as pq\n", 12 | "import pandas as pd\n", 13 | "from pandas.util.testing import rands\n", 14 | " \n", 15 | "NUNIQUE = 1000\n", 16 | "STRING_SIZE = 50\n", 17 | "LENGTH = 10_000_000\n", 18 | "REPEATS = LENGTH // NUNIQUE\n", 19 | "\n", 20 | "data = [rands(STRING_SIZE) for i in range(NUNIQUE)] * REPEATS\n", 21 | "table = pa.table([data], names=['f0'])\n", 22 | "\n", 23 | "out_stream = pa.BufferOutputStream()\n", 24 | "pq.write_table(table, out_stream)\n", 25 | "contents = out_stream.getvalue()" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 6, 31 | "metadata": {}, 32 | "outputs": [ 33 | { 34 | "data": { 35 | "text/plain": [ 36 | "1129939" 37 | ] 38 | }, 39 | "execution_count": 6, 40 | "metadata": {}, 41 | "output_type": "execute_result" 42 | } 43 | ], 44 | "source": [ 45 | "len(contents)" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 12, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "0\n" 58 | ] 59 | } 60 | ], 61 | "source": [ 62 | "import gc\n", 63 | "class memory_use:\n", 64 | " \n", 65 | " def __init__(self):\n", 66 | " self.start_use = pa.total_allocated_bytes()\n", 67 | " \n", 68 | " def __enter__(self):\n", 69 | " return\n", 70 | " \n", 71 | " def __exit__(self, type, value, traceback):\n", 72 | " gc.collect()\n", 73 | " print(pa.total_allocated_bytes() - self.start_use)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 13, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "541250112\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "with memory_use():\n", 91 | " memory_use_no_dict = pq.read_table(pa.BufferReader(contents))" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | 
"execution_count": 15, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "41304128\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "with memory_use():\n", 109 | " memory_use_dict = pq.read_table(pa.BufferReader(contents), read_dictionary=['f0'])" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 16, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "1.79 s ± 7.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 122 | ] 123 | } 124 | ], 125 | "source": [ 126 | "%timeit memory_use_no_dict = pq.read_table(pa.BufferReader(contents))" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": 17, 132 | "metadata": {}, 133 | "outputs": [ 134 | { 135 | "name": "stdout", 136 | "output_type": "stream", 137 | "text": [ 138 | "106 ms ± 11.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 139 | ] 140 | } 141 | ], 142 | "source": [ 143 | "%timeit memory_use_dict = pq.read_table(pa.BufferReader(contents), read_dictionary=['f0'])" 144 | ] 145 | }, 146 | { 147 | "cell_type": "code", 148 | "execution_count": 18, 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "data": { 153 | "text/plain": [ 154 | "516.1763305664062" 155 | ] 156 | }, 157 | "execution_count": 18, 158 | "metadata": {}, 159 | "output_type": "execute_result" 160 | } 161 | ], 162 | "source": [ 163 | "541250112 / (1 << 20)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 19, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/plain": [ 174 | "39.39068603515625" 175 | ] 176 | }, 177 | "execution_count": 19, 178 | "metadata": {}, 179 | "output_type": "execute_result" 180 | } 181 | ], 182 | "source": [ 183 | "41304128 / (1 << 20)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": {}, 190 | "outputs": 
[], 191 | "source": [] 192 | } 193 | ], 194 | "metadata": { 195 | "kernelspec": { 196 | "display_name": "Python 3", 197 | "language": "python", 198 | "name": "python3" 199 | }, 200 | "language_info": { 201 | "codemirror_mode": { 202 | "name": "ipython", 203 | "version": 3 204 | }, 205 | "file_extension": ".py", 206 | "mimetype": "text/x-python", 207 | "name": "python", 208 | "nbconvert_exporter": "python", 209 | "pygments_lexer": "ipython3", 210 | "version": "3.7.3" 211 | } 212 | }, 213 | "nbformat": 4, 214 | "nbformat_minor": 2 215 | } 216 | -------------------------------------------------------------------------------- /20190803-ARROW-6060.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Change in memory use: 76502016\n", 13 | "Change in peak use: 5843859776\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import pandas as pd\n", 19 | "from pandas.util.testing import rands\n", 20 | "\n", 21 | "import pyarrow as pa\n", 22 | "import pyarrow.parquet as pq\n", 23 | "\n", 24 | "import gc\n", 25 | "class memory_use:\n", 26 | " \n", 27 | " def __init__(self):\n", 28 | " self.start_use = pa.total_allocated_bytes() \n", 29 | " self.pool = pa.default_memory_pool()\n", 30 | " self.start_peak_use = self.pool.max_memory()\n", 31 | " \n", 32 | " def __enter__(self):\n", 33 | " return\n", 34 | " \n", 35 | " def __exit__(self, type, value, traceback):\n", 36 | " gc.collect()\n", 37 | " print(\"Change in memory use: {}\"\n", 38 | " .format(pa.total_allocated_bytes() - self.start_use))\n", 39 | " print(\"Change in peak use: {}\"\n", 40 | " .format(self.pool.max_memory() - self.start_peak_use))\n", 41 | "\n", 42 | "with memory_use():\n", 43 | " table = pq.read_table('/home/wesm/Downloads/demofile.parquet') " 44 | ] 45 | }, 46 | { 47 | "cell_type": 
"code", 48 | "execution_count": 2, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "Change in memory use: 34499968\n", 56 | "Change in peak use: 5801857728\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "with memory_use():\n", 62 | " table = pq.read_table('/home/wesm/Downloads/demofile.parquet', columns=['body'])" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 5, 68 | "metadata": {}, 69 | "outputs": [ 70 | { 71 | "data": { 72 | "text/plain": [ 73 | "pyarrow.Table\n", 74 | "archived: bool\n", 75 | "author: string\n", 76 | "author_flair_css_class: string\n", 77 | "author_flair_text: string\n", 78 | "body: string\n", 79 | "controversiality: int64\n", 80 | "created_utc: string\n", 81 | "distinguished: string\n", 82 | "downs: int64\n", 83 | "edited: string\n", 84 | "gilded: int64\n", 85 | "id: string\n", 86 | "link_id: string\n", 87 | "name: string\n", 88 | "parent_id: string\n", 89 | "retrieved_on: int64\n", 90 | "score: int64\n", 91 | "score_hidden: bool\n", 92 | "subreddit: string\n", 93 | "subreddit_id: string\n", 94 | "ups: int64" 95 | ] 96 | }, 97 | "execution_count": 5, 98 | "metadata": {}, 99 | "output_type": "execute_result" 100 | } 101 | ], 102 | "source": [ 103 | "table" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": null, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "def generate_strings(length, nunique, string_length=10):\n", 113 | " unique_values = [rands(string_length) for i in range(nunique)]\n", 114 | " values = unique_values * (length // nunique)\n", 115 | " return values\n", 116 | "\n", 117 | "df = pd.DataFrame()\n", 118 | "df['a'] = generate_strings(100000000, 10000)\n", 119 | "df['b'] = generate_strings(100000000, 10000)\n", 120 | "df.to_parquet('/tmp/test.parquet')" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 2, 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | 
"name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "Change in memory use: 825560448\n", 133 | "Change in peak use: 1484772224\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "with memory_use():\n", 139 | " table = pq.read_table('/tmp/test.parquet', read_dictionary=['a', 'b'])" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 2, 145 | "metadata": {}, 146 | "outputs": [], 147 | "source": [ 148 | "pool = pa.default_memory_pool()" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 3, 154 | "metadata": {}, 155 | "outputs": [ 156 | { 157 | "data": { 158 | "text/plain": [ 159 | "0" 160 | ] 161 | }, 162 | "execution_count": 3, 163 | "metadata": {}, 164 | "output_type": "execute_result" 165 | } 166 | ], 167 | "source": [ 168 | "pool.max_memory()" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": 4, 174 | "metadata": {}, 175 | "outputs": [], 176 | "source": [] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 7, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "-rw-r--r-- 1 wesm wesm 274263652 Aug 3 14:19 /tmp/test.parquet\r\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "!ls -l /tmp/*.parquet" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 8, 198 | "metadata": {}, 199 | "outputs": [ 200 | { 201 | "name": "stdout", 202 | "output_type": "stream", 203 | "text": [ 204 | "Change in memory use: 2825000192\n", 205 | "Change in peak use: 3827684608\n" 206 | ] 207 | } 208 | ], 209 | "source": [] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 10, 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "data": { 218 | "text/plain": [ 219 | "20585786752" 220 | ] 221 | }, 222 | "execution_count": 10, 223 | "metadata": {}, 224 | "output_type": "execute_result" 225 | } 226 | ], 227 | "source": [ 228 | "pool.max_memory()" 229 | ] 230 | 
}, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [] 237 | } 238 | ], 239 | "metadata": { 240 | "kernelspec": { 241 | "display_name": "Python 3", 242 | "language": "python", 243 | "name": "python3" 244 | }, 245 | "language_info": { 246 | "codemirror_mode": { 247 | "name": "ipython", 248 | "version": 3 249 | }, 250 | "file_extension": ".py", 251 | "mimetype": "text/x-python", 252 | "name": "python", 253 | "nbconvert_exporter": "python", 254 | "pygments_lexer": "ipython3", 255 | "version": "3.7.3" 256 | } 257 | }, 258 | "nbformat": 4, 259 | "nbformat_minor": 2 260 | } 261 | -------------------------------------------------------------------------------- /20190815-Parquet-Direct-Dictionary-Write.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import numpy as np\n", 10 | "import pyarrow as pa\n", 11 | "import pyarrow.parquet as pq\n", 12 | "import pandas as pd\n", 13 | "from pandas.util.testing import rands\n", 14 | " \n", 15 | "NUNIQUE = 1000\n", 16 | "STRING_SIZE = 50\n", 17 | "LENGTH = 10_000_000\n", 18 | "REPEATS = LENGTH // NUNIQUE\n", 19 | "\n", 20 | "uniques = np.array([rands(STRING_SIZE) for i in range(NUNIQUE)], dtype='O')\n", 21 | "indices = np.random.randint(0, NUNIQUE, size=LENGTH).astype('i4') \n", 22 | "data = uniques.take(indices)" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import gc\n", 32 | "class memory_use:\n", 33 | " \n", 34 | " def __init__(self):\n", 35 | " self.start_use = pa.total_allocated_bytes() \n", 36 | " self.pool = pa.default_memory_pool()\n", 37 | " self.start_peak_use = self.pool.max_memory()\n", 38 | " \n", 39 | " def __enter__(self):\n", 40 | " return\n", 41 | " \n", 42 | " def 
__exit__(self, type, value, traceback):\n", 43 | " gc.collect()\n", 44 | " print(\"Change in memory use: {}\"\n", 45 | " .format(pa.total_allocated_bytes() - self.start_use))\n", 46 | " print(\"Change in peak use: {}\"\n", 47 | " .format(self.pool.max_memory() - self.start_peak_use))" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 3, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "dict_data = pa.DictionaryArray.from_arrays(indices, uniques)" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 4, 62 | "metadata": {}, 63 | "outputs": [ 64 | { 65 | "data": { 66 | "text/plain": [ 67 | "72320" 68 | ] 69 | }, 70 | "execution_count": 4, 71 | "metadata": {}, 72 | "output_type": "execute_result" 73 | } 74 | ], 75 | "source": [ 76 | "pa.default_memory_pool().max_memory()" 77 | ] 78 | }, 79 | { 80 | "cell_type": "code", 81 | "execution_count": 5, 82 | "metadata": {}, 83 | "outputs": [ 84 | { 85 | "name": "stdout", 86 | "output_type": "stream", 87 | "text": [ 88 | "Change in memory use: 16777216\n", 89 | "Change in peak use: 753475648\n" 90 | ] 91 | } 92 | ], 93 | "source": [ 94 | "table = pa.table([dict_data], names=['f0'])\n", 95 | "with memory_use():\n", 96 | " out_stream = pa.BufferOutputStream()\n", 97 | " pq.write_table(table, out_stream)\n", 98 | " contents = out_stream.getvalue()" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 6, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "820 ms ± 11.3 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "%%timeit\n", 116 | "out_stream = pa.BufferOutputStream()\n", 117 | "pq.write_table(table, out_stream)\n", 118 | "contents = out_stream.getvalue()" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 7, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "data": { 128 | "text/plain": [ 129 | "12576182" 130 | ] 131 | }, 132 | "execution_count": 7, 133 | "metadata": {}, 134 | "output_type": "execute_result" 135 | } 136 | ], 137 | "source": [ 138 | "len(contents)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 8, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "495 ms ± 8.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "%timeit returned_table = pq.read_table(contents)" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 9, 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "93.1 ms ± 3.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 168 | ] 169 | } 170 | ], 171 | "source": [ 172 | "%timeit returned_table = pq.read_table(contents, read_dictionary=['f0'])" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 10, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "dense_data = dict_data.cast(pa.utf8())\n", 182 | "table = pa.table([dense_data], names=['f0'])" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 11, 188 | "metadata": {}, 189 | "outputs": [ 190 | { 191 | "name": "stdout", 192 | "output_type": "stream", 193 | "text": [ 194 | "405 ms ± 27.5 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" 195 | ] 196 | } 197 | ], 198 | "source": [ 199 | "%%timeit\n", 200 | "out_stream = pa.BufferOutputStream()\n", 201 | "pq.write_table(table, out_stream)\n", 202 | "contents = out_stream.getvalue()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 12, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "out_stream = pa.BufferOutputStream()\n", 212 | "pq.write_table(table, out_stream)\n", 213 | "contents = out_stream.getvalue()" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 13, 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "430 ms ± 8.12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "%%timeit\n", 231 | "returned_table = pq.read_table(contents)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 14, 237 | "metadata": {}, 238 | "outputs": [ 239 | { 240 | "data": { 241 | "text/plain": [ 242 | "pyarrow.Table\n", 243 | "f0: string" 244 | ] 245 | }, 246 | "execution_count": 14, 247 | "metadata": {}, 248 | "output_type": "execute_result" 249 | } 250 | ], 251 | "source": [ 252 | "pq.read_table(contents)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [] 261 | } 262 | ], 263 | "metadata": { 264 | "kernelspec": { 265 | "display_name": "Python 3", 266 | "language": "python", 267 | "name": "python3" 268 | }, 269 | "language_info": { 270 | "codemirror_mode": { 271 | "name": "ipython", 272 | "version": 3 273 | }, 274 | "file_extension": ".py", 275 | "mimetype": "text/x-python", 276 | "name": "python", 277 | "nbconvert_exporter": "python", 278 | "pygments_lexer": "ipython3", 279 | "version": "3.7.3" 280 | } 281 | }, 282 | "nbformat": 4, 283 | "nbformat_minor": 2 284 | } 285 | 
-------------------------------------------------------------------------------- /20190830-VLDB-FlightDemo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pyarrow as pa\n", 10 | "import pyarrow.parquet as pq\n", 11 | "import pyarrow.flight as flight\n", 12 | "import numpy as np\n", 13 | "import pandas as pd\n", 14 | "import time\n", 15 | "import threading" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": null, 21 | "metadata": {}, 22 | "outputs": [], 23 | "source": [ 24 | "class DemoServer(flight.FlightServerBase):\n", 25 | " \n", 26 | " def __init__(self):\n", 27 | " self._cache = {}\n", 28 | " \n", 29 | " def list_actions(self, context):\n", 30 | " return [flight.ActionType('list-tables', 'List stored tables'),\n", 31 | " flight.ActionType('drop-table', 'Drop a stored table')]\n", 32 | "\n", 33 | " # -----------------------------------------------------------------\n", 34 | " # Implement actions\n", 35 | " \n", 36 | " def do_action(self, context, action):\n", 37 | " handlers = {\n", 38 | " 'list-tables': self._list_tables,\n", 39 | " 'drop-table': self._drop_table\n", 40 | " } \n", 41 | " handler = handlers.get(action.type)\n", 42 | " if not handler:\n", 43 | " raise NotImplementedError \n", 44 | " return handlers[action.type](action)\n", 45 | " \n", 46 | " def _drop_table(self, action):\n", 47 | " del self._cache[action.body]\n", 48 | " \n", 49 | " def _list_tables(self, action):\n", 50 | " return iter([flight.Result(cache_key) \n", 51 | " for cache_key in sorted(self._cache.keys())])\n", 52 | "\n", 53 | " # -----------------------------------------------------------------\n", 54 | " # Implement puts\n", 55 | " \n", 56 | " def do_put(self, context, descriptor, reader, writer):\n", 57 | " self._cache[descriptor.command] = reader.read_all()\n", 58 | " \n", 
59 | " # -----------------------------------------------------------------\n", 60 | " # Implement gets\n", 61 | "\n", 62 | " def do_get(self, context, ticket):\n", 63 | " table = self._cache[ticket.ticket]\n", 64 | " return flight.RecordBatchStream(table)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "import contextlib\n", 74 | "import socket\n", 75 | "def find_free_port():\n", 76 | " # Find a free port\n", 77 | " sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n", 78 | " with contextlib.closing(sock) as sock:\n", 79 | " sock.bind(('', 0))\n", 80 | " sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)\n", 81 | " port = sock.getsockname()[1]\n", 82 | " return port\n", 83 | "\n", 84 | "def wait_for_available(client):\n", 85 | " deadline = time.time() + 5.0\n", 86 | " while True:\n", 87 | " try:\n", 88 | " list(client.list_flights())\n", 89 | " except Exception as e:\n", 90 | " if 'Connect Failed' in str(e):\n", 91 | " if time.time() < deadline:\n", 92 | " time.sleep(0.025)\n", 93 | " continue\n", 94 | " else:\n", 95 | " raise\n", 96 | " break" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "port = 1337\n", 106 | "location = flight.Location.for_grpc_tcp(\"localhost\", find_free_port())\n", 107 | "location" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "server = DemoServer()\n", 117 | "server.init(location)\n", 118 | "\n", 119 | "thread = threading.Thread(target=lambda: server.run(), daemon=True)\n", 120 | "thread.start()\n", 121 | "\n", 122 | "client = flight.FlightClient.connect(location)\n", 123 | "wait_for_available(client)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | 
"source": [ 132 | "client.list_actions()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": null, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "def list_tables(client):\n", 142 | " action = flight.Action('list-tables', b'')\n", 143 | " return [x.body.to_pybytes().decode('utf8') for x in client.do_action(action)] \n", 144 | "\n", 145 | "# def drop_table(client):\n", 146 | "\n", 147 | "list_tables(client)" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": null, 153 | "metadata": {}, 154 | "outputs": [], 155 | "source": [ 156 | "def cache_table_in_server(name, table):\n", 157 | " desc = flight.FlightDescriptor.for_command(name.encode('utf8'))\n", 158 | " put_writer, put_meta_reader = client.do_put(desc, table.schema)\n", 159 | " put_writer.write(table)\n", 160 | " put_writer.close()\n", 161 | " \n", 162 | " \n", 163 | "def get_table(name):\n", 164 | " reader = client.do_get(flight.Ticket(name.encode('utf8')))\n", 165 | " return reader.read_all()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "table = pa.table([pa.array([1,2,3,4,5])], names=['f0'])\n", 175 | "cache_table_in_server('table1', table)" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": null, 181 | "metadata": {}, 182 | "outputs": [], 183 | "source": [ 184 | "list_tables(client)" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "cache_table_in_server('table2', table)\n", 194 | "cache_table_in_server('table3', table)\n", 195 | "cache_table_in_server('table4', table)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "list_tables(client)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | 
"execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "get_table('table1')" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [ 222 | "import pandas as pd\n", 223 | "fec = pd.read_csv('/home/wesm/code/pydata-book/datasets/fec/P00000001-ALL.csv')\n", 224 | "fec.head()\n", 225 | "def coerce_int(x):\n", 226 | " try:\n", 227 | " return int(x)\n", 228 | " except:\n", 229 | " return -1\n", 230 | "\n", 231 | "fec['contbr_zip'] = fec['contbr_zip'].map(coerce_int).astype(np.int64)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": 17, 237 | "metadata": {}, 238 | "outputs": [], 239 | "source": [ 240 | "fec_table = pa.table(fec)" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 18, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "fec_table = pa.concat_tables([fec_table] * 10)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 19, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "CPU times: user 425 ms, sys: 1.13 s, total: 1.56 s\n", 262 | "Wall time: 1.16 s\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "%%time\n", 268 | "cache_table_in_server('fec_table', fec_table)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 20, 274 | "metadata": {}, 275 | "outputs": [ 276 | { 277 | "data": { 278 | "text/plain": [ 279 | "['fec_table', 'table1', 'table2', 'table3', 'table4']" 280 | ] 281 | }, 282 | "execution_count": 20, 283 | "metadata": {}, 284 | "output_type": "execute_result" 285 | } 286 | ], 287 | "source": [ 288 | "list_tables(client)" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": 21, 294 | "metadata": {}, 295 | "outputs": [ 296 | { 297 | "name": "stdout", 298 | "output_type": "stream", 299 | "text": [ 300 | "CPU 
times: user 404 ms, sys: 995 ms, total: 1.4 s\n", 301 | "Wall time: 1.1 s\n" 302 | ] 303 | } 304 | ], 305 | "source": [ 306 | "%%time \n", 307 | "\n", 308 | "fec_table_received = get_table('fec_table')" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": null, 314 | "metadata": {}, 315 | "outputs": [], 316 | "source": [] 317 | } 318 | ], 319 | "metadata": { 320 | "kernelspec": { 321 | "display_name": "Python 3", 322 | "language": "python", 323 | "name": "python3" 324 | }, 325 | "language_info": { 326 | "codemirror_mode": { 327 | "name": "ipython", 328 | "version": 3 329 | }, 330 | "file_extension": ".py", 331 | "mimetype": "text/x-python", 332 | "name": "python", 333 | "nbconvert_exporter": "python", 334 | "pygments_lexer": "ipython3", 335 | "version": "3.7.3" 336 | } 337 | }, 338 | "nbformat": 4, 339 | "nbformat_minor": 2 340 | } 341 | -------------------------------------------------------------------------------- /20190919file_benchmarks/FeatherCompression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 4, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "using 8 cpu cores\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "# flake8: noqa\n", 18 | "\n", 19 | "import pyarrow.feather as feather\n", 20 | "import pandas as pd\n", 21 | "import json\n", 22 | "import numpy as np\n", 23 | "import pyarrow as pa\n", 24 | "import pyarrow.parquet as pq\n", 25 | "from pandas.util.testing import rands\n", 26 | "import gc\n", 27 | "import os\n", 28 | "import time\n", 29 | "\n", 30 | "pa.set_cpu_count(8)\n", 31 | "\n", 32 | "print(f\"using {pa.cpu_count()} cpu cores\")\n", 33 | " \n", 34 | "\n", 35 | "def get_timing(f, niter=1):\n", 36 | " start = time.clock_gettime(time.CLOCK_REALTIME)\n", 37 | " for i in range(niter):\n", 38 | " f()\n", 39 | " result = 
(time.clock_gettime(time.CLOCK_REALTIME) - start) / niter\n", 40 | " return result\n", 41 | "\n", 42 | "\n", 43 | "files = {\n", 44 | " 'fanniemae': {\n", 45 | " 'base': '2016Q4',\n", 46 | " 'source': {\n", 47 | " 'path': '2016Q4.txt',\n", 48 | " 'sep': '|',\n", 49 | " 'header': None\n", 50 | " }\n", 51 | " },\n", 52 | " 'nyctaxi': {\n", 53 | " 'base': 'yellow_tripdata_2010-01',\n", 54 | " 'source': {\n", 55 | " 'path': 'yellow_tripdata_2010-01.csv',\n", 56 | " 'sep': ',',\n", 57 | " 'header': 0\n", 58 | " }\n", 59 | " }\n", 60 | "}\n", 61 | "\n", 62 | "\n", 63 | "compression_cases = [\n", 64 | " (None, None), # uncompressed\n", 65 | " ('zstd', 1), # minimal compression\n", 66 | " ('zstd', 10), # moderate\n", 67 | " ('lz4', None) # LZ4 doesn't support compression level\n", 68 | "]\n", 69 | "\n", 70 | "\n", 71 | "def write_files(files, chunksize=1<<16):\n", 72 | " statistics = []\n", 73 | " for name, info in files.items():\n", 74 | " source = info['source']\n", 75 | " print(\"reading {}\".format(source['path']))\n", 76 | " df = pd.read_csv(source['path'], sep=source['sep'], \n", 77 | " header=source['header'], \n", 78 | " low_memory=False)\n", 79 | " if source['header'] is None:\n", 80 | " df.columns = ['f{}'.format(i) for i in range(len(df.columns))]\n", 81 | "\n", 82 | " t = (pa.Table.from_pandas(df, preserve_index=False)\n", 83 | " .replace_schema_metadata(None))\n", 84 | " for compression, compression_level in compression_cases:\n", 85 | " path = '{}_{}_{}.feather'.format(info['base'], \n", 86 | " compression or 'uncompressed',\n", 87 | " compression_level)\n", 88 | " print((name, compression, compression_level))\n", 89 | " tm = get_timing(lambda: \n", 90 | " feather.write_feather(df, path, compression=compression,\n", 91 | " compression_level=compression_level,\n", 92 | " chunksize=chunksize))\n", 93 | " file_size = os.stat(path).st_size\n", 94 | " result = name, compression, compression_level, file_size, tm\n", 95 | " print(result)\n", 96 | " 
statistics.append(result)\n", 97 | " return statistics\n", 98 | "\n", 99 | "def get_read_results():\n", 100 | " all_results = []\n", 101 | " for name, info in files.items():\n", 102 | " for compression, compression_level in compression_cases:\n", 103 | " path = '{}_{}_{}.feather'.format(info['base'], \n", 104 | " compression or 'uncompressed',\n", 105 | " compression_level)\n", 106 | " read_time = get_timing(lambda: feather.read_table(path, memory_map=False),\n", 107 | " niter=5)\n", 108 | " result = name, compression, compression_level, read_time\n", 109 | " print(result)\n", 110 | " all_results.append(result) \n", 111 | " return all_results" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 5, 124 | "metadata": {}, 125 | "outputs": [ 126 | { 127 | "name": "stdout", 128 | "output_type": "stream", 129 | "text": [ 130 | "1024\n", 131 | "reading 2016Q4.txt\n", 132 | "('fanniemae', None, None)\n", 133 | "('fanniemae', None, None, 5084410194, 11.884642839431763)\n", 134 | "('fanniemae', 'zstd', 1)\n", 135 | "('fanniemae', 'zstd', 1, 501955562, 11.54361605644226)\n", 136 | "('fanniemae', 'zstd', 10)\n", 137 | "('fanniemae', 'zstd', 10, 439460538, 40.15117073059082)\n", 138 | "('fanniemae', 'lz4', None)\n", 139 | "('fanniemae', 'lz4', None, 765604482, 12.363005876541138)\n", 140 | "reading yellow_tripdata_2010-01.csv\n", 141 | "('nyctaxi', None, None)\n", 142 | "('nyctaxi', None, None, 2522035242, 6.970196723937988)\n", 143 | "('nyctaxi', 'zstd', 1)\n", 144 | "('nyctaxi', 'zstd', 1, 878914098, 7.667033433914185)\n", 145 | "('nyctaxi', 'zstd', 10)\n", 146 | "('nyctaxi', 'zstd', 10, 828266042, 32.220927715301514)\n", 147 | "('nyctaxi', 'lz4', None)\n", 148 | "('nyctaxi', 'lz4', None, 1262344938, 7.114352226257324)\n", 149 | "('fanniemae', None, None, 2.2620407581329345)\n", 150 | "('fanniemae', 'zstd', 1, 
3.4737910270690917)\n", 151 | "('fanniemae', 'zstd', 10, 3.4430580615997313)\n", 152 | "('fanniemae', 'lz4', None, 3.521429014205933)\n", 153 | "('nyctaxi', None, None, 1.0237845420837401)\n", 154 | "('nyctaxi', 'zstd', 1, 1.8016125202178954)\n", 155 | "('nyctaxi', 'zstd', 10, 1.7049409389495849)\n", 156 | "('nyctaxi', 'lz4', None, 1.3043041229248047)\n", 157 | "2048\n", 158 | "reading 2016Q4.txt\n", 159 | "('fanniemae', None, None)\n", 160 | "('fanniemae', None, None, 5063114554, 11.932640790939331)\n", 161 | "('fanniemae', 'zstd', 1)\n", 162 | "('fanniemae', 'zstd', 1, 468753626, 9.218210458755493)\n", 163 | "('fanniemae', 'zstd', 10)\n", 164 | "('fanniemae', 'zstd', 10, 401064538, 40.00880241394043)\n", 165 | "('fanniemae', 'lz4', None)\n", 166 | "('fanniemae', 'lz4', None, 701361578, 9.259565353393555)\n", 167 | "reading yellow_tripdata_2010-01.csv\n", 168 | "('nyctaxi', None, None)\n", 169 | "('nyctaxi', None, None, 2513790386, 5.759558200836182)\n", 170 | "('nyctaxi', 'zstd', 1)\n", 171 | "('nyctaxi', 'zstd', 1, 851430546, 6.4997947216033936)\n", 172 | "('nyctaxi', 'zstd', 10)\n", 173 | "('nyctaxi', 'zstd', 10, 790773018, 33.690829277038574)\n", 174 | "('nyctaxi', 'lz4', None)\n", 175 | "('nyctaxi', 'lz4', None, 1223064234, 5.975880861282349)\n", 176 | "('fanniemae', None, None, 1.610342788696289)\n", 177 | "('fanniemae', 'zstd', 1, 1.983039951324463)\n", 178 | "('fanniemae', 'zstd', 10, 1.9032105445861816)\n", 179 | "('fanniemae', 'lz4', None, 1.7990120887756347)\n", 180 | "('nyctaxi', None, None, 0.8817797660827636)\n", 181 | "('nyctaxi', 'zstd', 1, 1.2915375709533692)\n", 182 | "('nyctaxi', 'zstd', 10, 1.13835711479187)\n", 183 | "('nyctaxi', 'lz4', None, 0.8505313873291016)\n", 184 | "4096\n", 185 | "reading 2016Q4.txt\n", 186 | "('fanniemae', None, None)\n", 187 | "('fanniemae', None, None, 5052804778, 10.22159743309021)\n", 188 | "('fanniemae', 'zstd', 1)\n", 189 | "('fanniemae', 'zstd', 1, 473501522, 8.019737958908081)\n", 190 | "('fanniemae', 'zstd', 
10)\n", 191 | "('fanniemae', 'zstd', 10, 384761498, 13.248246908187866)\n", 192 | "('fanniemae', 'lz4', None)\n", 193 | "('fanniemae', 'lz4', None, 666704194, 7.61299991607666)\n", 194 | "reading yellow_tripdata_2010-01.csv\n", 195 | "('nyctaxi', None, None)\n", 196 | "('nyctaxi', None, None, 2509671706, 6.375310659408569)\n", 197 | "('nyctaxi', 'zstd', 1)\n", 198 | "('nyctaxi', 'zstd', 1, 841720058, 5.634358882904053)\n", 199 | "('nyctaxi', 'zstd', 10)\n", 200 | "('nyctaxi', 'zstd', 10, 765991802, 23.161847591400146)\n", 201 | "('nyctaxi', 'lz4', None)\n", 202 | "('nyctaxi', 'lz4', None, 1165201354, 6.004603624343872)\n", 203 | "('fanniemae', None, None, 1.341871976852417)\n", 204 | "('fanniemae', 'zstd', 1, 1.2426270961761474)\n", 205 | "('fanniemae', 'zstd', 10, 1.113413667678833)\n", 206 | "('fanniemae', 'lz4', None, 1.0141475200653076)\n", 207 | "('nyctaxi', None, None, 0.7986891269683838)\n", 208 | "('nyctaxi', 'zstd', 1, 0.974519681930542)\n", 209 | "('nyctaxi', 'zstd', 10, 0.8223378658294678)\n", 210 | "('nyctaxi', 'lz4', None, 0.5664512634277343)\n", 211 | "8192\n", 212 | "reading 2016Q4.txt\n", 213 | "('fanniemae', None, None)\n", 214 | "('fanniemae', None, None, 5048174170, 10.155773162841797)\n", 215 | "('fanniemae', 'zstd', 1)\n", 216 | "('fanniemae', 'zstd', 1, 476147690, 7.544380187988281)\n", 217 | "('fanniemae', 'zstd', 10)\n", 218 | "('fanniemae', 'zstd', 10, 380904258, 13.942293882369995)\n", 219 | "('fanniemae', 'lz4', None)\n", 220 | "('fanniemae', 'lz4', None, 648217594, 7.258745193481445)\n", 221 | "reading yellow_tripdata_2010-01.csv\n", 222 | "('nyctaxi', None, None)\n", 223 | "('nyctaxi', None, None, 2507611258, 6.470987319946289)\n", 224 | "('nyctaxi', 'zstd', 1)\n", 225 | "('nyctaxi', 'zstd', 1, 837304882, 5.931333303451538)\n", 226 | "('nyctaxi', 'zstd', 10)\n", 227 | "('nyctaxi', 'zstd', 10, 739310474, 29.601118326187134)\n", 228 | "('nyctaxi', 'lz4', None)\n", 229 | "('nyctaxi', 'lz4', None, 1144720050, 4.887480974197388)\n", 230 | 
"('fanniemae', None, None, 1.3168220043182373)\n", 231 | "('fanniemae', 'zstd', 1, 0.8572097301483155)\n", 232 | "('fanniemae', 'zstd', 10, 0.7228690624237061)\n", 233 | "('fanniemae', 'lz4', None, 0.6564846992492676)\n", 234 | "('nyctaxi', None, None, 0.7386976718902588)\n", 235 | "('nyctaxi', 'zstd', 1, 0.9264132499694824)\n", 236 | "('nyctaxi', 'zstd', 10, 0.7089903354644775)\n", 237 | "('nyctaxi', 'lz4', None, 0.46931772232055663)\n", 238 | "16384\n", 239 | "reading 2016Q4.txt\n", 240 | "('fanniemae', None, None)\n", 241 | "('fanniemae', None, None, 5046354402, 10.359640121459961)\n", 242 | "('fanniemae', 'zstd', 1)\n", 243 | "('fanniemae', 'zstd', 1, 488072882, 6.634678363800049)\n", 244 | "('fanniemae', 'zstd', 10)\n", 245 | "('fanniemae', 'zstd', 10, 386850010, 14.295108318328857)\n", 246 | "('fanniemae', 'lz4', None)\n", 247 | "('fanniemae', 'lz4', None, 644333354, 6.482739210128784)\n", 248 | "reading yellow_tripdata_2010-01.csv\n", 249 | "('nyctaxi', None, None)\n", 250 | "('nyctaxi', None, None, 2506582282, 5.567317008972168)\n", 251 | "('nyctaxi', 'zstd', 1)\n", 252 | "('nyctaxi', 'zstd', 1, 833835922, 4.956018924713135)\n", 253 | "('nyctaxi', 'zstd', 10)\n", 254 | "('nyctaxi', 'zstd', 10, 709229218, 17.30007767677307)\n", 255 | "('nyctaxi', 'lz4', None)\n", 256 | "('nyctaxi', 'lz4', None, 1179681450, 5.5779945850372314)\n", 257 | "('fanniemae', None, None, 1.3266838550567628)\n", 258 | "('fanniemae', 'zstd', 1, 0.7207117557525635)\n", 259 | "('fanniemae', 'zstd', 10, 0.5619686603546142)\n", 260 | "('fanniemae', 'lz4', None, 0.5085867404937744)\n", 261 | "('nyctaxi', None, None, 0.7293866634368896)\n", 262 | "('nyctaxi', 'zstd', 1, 0.780490779876709)\n", 263 | "('nyctaxi', 'zstd', 10, 0.6338376045227051)\n", 264 | "('nyctaxi', 'lz4', None, 0.42446208000183105)\n", 265 | "32768\n", 266 | "reading 2016Q4.txt\n", 267 | "('fanniemae', None, None)\n", 268 | "('fanniemae', None, None, 5045772882, 11.194675922393799)\n", 269 | "('fanniemae', 'zstd', 1)\n", 270 
| "('fanniemae', 'zstd', 1, 494361698, 6.307297229766846)\n", 271 | "('fanniemae', 'zstd', 10)\n", 272 | "('fanniemae', 'zstd', 10, 394216642, 16.57004427909851)\n", 273 | "('fanniemae', 'lz4', None)\n", 274 | "('fanniemae', 'lz4', None, 640424914, 6.438863277435303)\n", 275 | "reading yellow_tripdata_2010-01.csv\n", 276 | "('nyctaxi', None, None)\n", 277 | "('nyctaxi', None, None, 2506066506, 5.98804497718811)\n", 278 | "('nyctaxi', 'zstd', 1)\n", 279 | "('nyctaxi', 'zstd', 1, 817758394, 4.760921478271484)\n", 280 | "('nyctaxi', 'zstd', 10)\n", 281 | "('nyctaxi', 'zstd', 10, 675626410, 19.773839712142944)\n", 282 | "('nyctaxi', 'lz4', None)\n", 283 | "('nyctaxi', 'lz4', None, 1176543226, 5.565099239349365)\n", 284 | "('fanniemae', None, None, 1.207357358932495)\n", 285 | "('fanniemae', 'zstd', 1, 0.6379957675933838)\n", 286 | "('fanniemae', 'zstd', 10, 0.5131874561309815)\n", 287 | "('fanniemae', 'lz4', None, 0.45996761322021484)\n", 288 | "('nyctaxi', None, None, 0.6317520141601562)\n", 289 | "('nyctaxi', 'zstd', 1, 0.7357310771942138)\n", 290 | "('nyctaxi', 'zstd', 10, 0.5581299781799316)\n", 291 | "('nyctaxi', 'lz4', None, 0.37372236251831054)\n", 292 | "65536\n", 293 | "reading 2016Q4.txt\n", 294 | "('fanniemae', None, None)\n", 295 | "('fanniemae', None, None, 5045771154, 11.179830074310303)\n", 296 | "('fanniemae', 'zstd', 1)\n", 297 | "('fanniemae', 'zstd', 1, 524046410, 6.3280885219573975)\n", 298 | "('fanniemae', 'zstd', 10)\n", 299 | "('fanniemae', 'zstd', 10, 395368482, 14.682528018951416)\n", 300 | "('fanniemae', 'lz4', None)\n", 301 | "('fanniemae', 'lz4', None, 638440418, 5.975476264953613)\n", 302 | "reading yellow_tripdata_2010-01.csv\n", 303 | "('nyctaxi', None, None)\n", 304 | "('nyctaxi', None, None, 2505808570, 5.9450695514678955)\n", 305 | "('nyctaxi', 'zstd', 1)\n", 306 | "('nyctaxi', 'zstd', 1, 821964938, 5.244204044342041)\n", 307 | "('nyctaxi', 'zstd', 10)\n", 308 | "('nyctaxi', 'zstd', 10, 651798442, 19.96653389930725)\n", 309 | 
"('nyctaxi', 'lz4', None)\n", 310 | "('nyctaxi', 'lz4', None, 1174964650, 5.419882297515869)\n", 311 | "('fanniemae', None, None, 1.0205121994018556)\n", 312 | "('fanniemae', 'zstd', 1, 0.5739494800567627)\n", 313 | "('fanniemae', 'zstd', 10, 0.4582984924316406)\n", 314 | "('fanniemae', 'lz4', None, 0.41712336540222167)\n", 315 | "('nyctaxi', None, None, 0.5486010074615478)\n", 316 | "('nyctaxi', 'zstd', 1, 0.6663787841796875)\n", 317 | "('nyctaxi', 'zstd', 10, 0.5117742538452148)\n", 318 | "('nyctaxi', 'lz4', None, 0.34208340644836427)\n" 319 | ] 320 | } 321 | ], 322 | "source": [ 323 | "chunksizes = [1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15,\n", 324 | " 1 << 16]\n", 325 | "\n", 326 | "results_by_chunksize = {}\n", 327 | "for chunksize in chunksizes:\n", 328 | " print(chunksize)\n", 329 | " write_results = write_files(files, chunksize=chunksize)\n", 330 | " read_results = get_read_results() \n", 331 | " results_by_chunksize[chunksize] = write_results, read_results" 332 | ] 333 | }, 334 | { 335 | "cell_type": "code", 336 | "execution_count": null, 337 | "metadata": {}, 338 | "outputs": [], 339 | "source": [] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 6, 344 | "metadata": {}, 345 | "outputs": [], 346 | "source": [ 347 | "reads = []\n", 348 | "writes = []\n", 349 | "\n", 350 | "for chunksize, (write_results, read_results) in results_by_chunksize.items():\n", 351 | " write_results = pd.DataFrame.from_records(\n", 352 | " write_results, columns=['dataset', 'codec', 'codec_level', \n", 353 | " 'file_size', 'write_time'])\n", 354 | " read_results = pd.DataFrame.from_records(\n", 355 | " read_results, columns=['dataset', 'codec', 'codec_level', \n", 356 | " 'read_time'])\n", 357 | " write_results['chunksize'] = chunksize\n", 358 | " read_results['chunksize'] = chunksize\n", 359 | " \n", 360 | " reads.append(read_results)\n", 361 | " writes.append(write_results)\n", 362 | " \n", 363 | "reads = pd.concat(reads, ignore_index=True)\n", 
364 | "writes = pd.concat(writes, ignore_index=True)\n", 365 | "\n", 366 | "def munge_codecs(codec_s, codec_level_s):\n", 367 | " results = []\n", 368 | " codec_s = codec_s.fillna('uncompressed')\n", 369 | " for codec, codec_level in zip(codec_s, codec_level_s):\n", 370 | " if pd.isnull(codec_level):\n", 371 | " results.append(codec)\n", 372 | " else:\n", 373 | " results.append(codec + '-' + str(int(codec_level)))\n", 374 | " return results\n", 375 | "\n", 376 | "reads['codec'] = munge_codecs(reads['codec'], reads.pop('codec_level'))\n", 377 | "writes['codec'] = munge_codecs(writes['codec'], writes.pop('codec_level'))" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 13, 383 | "metadata": {}, 384 | "outputs": [], 385 | "source": [ 386 | "%matplotlib notebook" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": 7, 392 | "metadata": {}, 393 | "outputs": [], 394 | "source": [ 395 | "reads.to_csv('ipc_read_parallel.csv')\n", 396 | "writes.to_csv('ipc_write_parallel.csv')" 397 | ] 398 | }, 399 | { 400 | "cell_type": "code", 401 | "execution_count": 8, 402 | "metadata": {}, 403 | "outputs": [ 404 | { 405 | "data": { 406 | "text/html": [ 407 | "
\n", 408 | "\n", 421 | "\n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | " \n", 455 | " \n", 456 | " \n", 457 | " \n", 458 | " \n", 459 | " \n", 460 | " \n", 461 | " \n", 462 | " \n", 463 | " \n", 464 | " \n", 465 | " \n", 466 | " \n", 467 | " \n", 468 | " \n", 469 | " \n", 470 | " \n", 471 | " \n", 472 | " \n", 473 | " \n", 474 | " \n", 475 | " \n", 476 | " \n", 477 | " \n", 478 | " \n", 479 | " \n", 480 | " \n", 481 | " \n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | " \n", 529 | " \n", 530 | " \n", 531 | " \n", 532 | " \n", 533 | " \n", 534 | " \n", 535 | " \n", 536 | " \n", 537 | " \n", 538 | " \n", 539 | " \n", 540 | " \n", 541 | " \n", 542 | " \n", 543 | " \n", 544 | " \n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " 
\n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | " \n", 592 | " \n", 593 | " \n", 594 | " \n", 595 | " \n", 596 | " \n", 597 | " \n", 598 | " \n", 599 | " \n", 600 | " \n", 601 | " \n", 602 | " \n", 603 | " \n", 604 | " \n", 605 | " \n", 606 | " \n", 607 | " \n", 608 | " \n", 609 | " \n", 610 | " \n", 611 | " \n", 612 | " \n", 613 | " \n", 614 | " \n", 615 | " \n", 616 | " \n", 617 | " \n", 618 | " \n", 619 | " \n", 620 | " \n", 621 | " \n", 622 | " \n", 623 | " \n", 624 | " \n", 625 | " \n", 626 | " \n", 627 | " \n", 628 | " \n", 629 | " \n", 630 | " \n", 631 | " \n", 632 | " \n", 633 | " \n", 634 | " \n", 635 | " \n", 636 | " \n", 637 | " \n", 638 | " \n", 639 | " \n", 640 | " \n", 641 | " \n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | 
" \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | " \n", 773 | " \n", 774 | " \n", 775 | " \n", 776 | " \n", 777 | " \n", 778 | " \n", 779 | " \n", 780 | " \n", 781 | " \n", 782 | " \n", 783 | " \n", 784 | " \n", 785 | " \n", 786 | " \n", 787 | " \n", 788 | " \n", 789 | " \n", 790 | " \n", 791 | " \n", 792 | " \n", 793 | " \n", 794 | " \n", 795 | " \n", 796 | " \n", 797 | " \n", 798 | " \n", 799 | " \n", 800 | " \n", 801 | " \n", 802 | " \n", 803 | " \n", 804 | " \n", 805 | " \n", 806 | " \n", 807 | " \n", 808 | " \n", 809 | " \n", 810 | " \n", 811 | " \n", 812 | " \n", 813 | " \n", 814 | " \n", 815 | " \n", 816 | " \n", 817 | " \n", 818 | " \n", 819 | " \n", 820 | " \n", 821 | " \n", 822 | " \n", 823 | " \n", 824 | " \n", 825 | "
datasetcodecread_timechunksize
0fanniemaeuncompressed1.2850941024
1fanniemaezstd-13.5862691024
2fanniemaezstd-103.7045911024
3fanniemaelz43.5909861024
4nyctaxiuncompressed0.6085891024
5nyctaxizstd-11.8911271024
6nyctaxizstd-101.6657661024
7nyctaxilz41.2277171024
8fanniemaeuncompressed0.4721912048
9fanniemaezstd-12.0574202048
10fanniemaezstd-101.7989602048
11fanniemaelz41.6340482048
12nyctaxiuncompressed0.1714632048
13nyctaxizstd-11.2872502048
14nyctaxizstd-101.0457732048
15nyctaxilz40.7308722048
16fanniemaeuncompressed0.1937994096
17fanniemaezstd-11.3149034096
18fanniemaezstd-101.1079554096
19fanniemaelz40.9833374096
20nyctaxiuncompressed0.0893004096
21nyctaxizstd-11.0095194096
22nyctaxizstd-100.8068824096
23nyctaxilz40.4958444096
24fanniemaeuncompressed0.1015958192
25fanniemaezstd-10.8970388192
26fanniemaezstd-100.7139818192
27fanniemaelz40.6359398192
28nyctaxiuncompressed0.0373418192
29nyctaxizstd-10.7348908192
30nyctaxizstd-100.6067308192
31nyctaxilz40.3500308192
32fanniemaeuncompressed0.04857916384
33fanniemaezstd-10.76009816384
34fanniemaezstd-100.55990516384
35fanniemaelz40.48168316384
36nyctaxiuncompressed0.01613516384
37nyctaxizstd-10.69735116384
38nyctaxizstd-100.55672316384
39nyctaxilz40.29760216384
40fanniemaeuncompressed0.02181732768
41fanniemaezstd-10.63826332768
42fanniemaezstd-100.47689232768
43fanniemaelz40.41804432768
44nyctaxiuncompressed0.00824232768
45nyctaxizstd-10.71237932768
46nyctaxizstd-100.53014132768
47nyctaxilz40.28829032768
48fanniemaeuncompressed0.01054665536
49fanniemaezstd-10.59558565536
50fanniemaezstd-100.44069465536
51fanniemaelz40.39517465536
52nyctaxiuncompressed0.00675065536
53nyctaxizstd-10.59305765536
54nyctaxizstd-100.46795265536
55nyctaxilz40.27778365536
\n", 826 | "
" 827 | ], 828 | "text/plain": [ 829 | " dataset codec read_time chunksize\n", 830 | "0 fanniemae uncompressed 1.285094 1024\n", 831 | "1 fanniemae zstd-1 3.586269 1024\n", 832 | "2 fanniemae zstd-10 3.704591 1024\n", 833 | "3 fanniemae lz4 3.590986 1024\n", 834 | "4 nyctaxi uncompressed 0.608589 1024\n", 835 | "5 nyctaxi zstd-1 1.891127 1024\n", 836 | "6 nyctaxi zstd-10 1.665766 1024\n", 837 | "7 nyctaxi lz4 1.227717 1024\n", 838 | "8 fanniemae uncompressed 0.472191 2048\n", 839 | "9 fanniemae zstd-1 2.057420 2048\n", 840 | "10 fanniemae zstd-10 1.798960 2048\n", 841 | "11 fanniemae lz4 1.634048 2048\n", 842 | "12 nyctaxi uncompressed 0.171463 2048\n", 843 | "13 nyctaxi zstd-1 1.287250 2048\n", 844 | "14 nyctaxi zstd-10 1.045773 2048\n", 845 | "15 nyctaxi lz4 0.730872 2048\n", 846 | "16 fanniemae uncompressed 0.193799 4096\n", 847 | "17 fanniemae zstd-1 1.314903 4096\n", 848 | "18 fanniemae zstd-10 1.107955 4096\n", 849 | "19 fanniemae lz4 0.983337 4096\n", 850 | "20 nyctaxi uncompressed 0.089300 4096\n", 851 | "21 nyctaxi zstd-1 1.009519 4096\n", 852 | "22 nyctaxi zstd-10 0.806882 4096\n", 853 | "23 nyctaxi lz4 0.495844 4096\n", 854 | "24 fanniemae uncompressed 0.101595 8192\n", 855 | "25 fanniemae zstd-1 0.897038 8192\n", 856 | "26 fanniemae zstd-10 0.713981 8192\n", 857 | "27 fanniemae lz4 0.635939 8192\n", 858 | "28 nyctaxi uncompressed 0.037341 8192\n", 859 | "29 nyctaxi zstd-1 0.734890 8192\n", 860 | "30 nyctaxi zstd-10 0.606730 8192\n", 861 | "31 nyctaxi lz4 0.350030 8192\n", 862 | "32 fanniemae uncompressed 0.048579 16384\n", 863 | "33 fanniemae zstd-1 0.760098 16384\n", 864 | "34 fanniemae zstd-10 0.559905 16384\n", 865 | "35 fanniemae lz4 0.481683 16384\n", 866 | "36 nyctaxi uncompressed 0.016135 16384\n", 867 | "37 nyctaxi zstd-1 0.697351 16384\n", 868 | "38 nyctaxi zstd-10 0.556723 16384\n", 869 | "39 nyctaxi lz4 0.297602 16384\n", 870 | "40 fanniemae uncompressed 0.021817 32768\n", 871 | "41 fanniemae zstd-1 0.638263 32768\n", 872 | "42 fanniemae 
zstd-10 0.476892 32768\n", 873 | "43 fanniemae lz4 0.418044 32768\n", 874 | "44 nyctaxi uncompressed 0.008242 32768\n", 875 | "45 nyctaxi zstd-1 0.712379 32768\n", 876 | "46 nyctaxi zstd-10 0.530141 32768\n", 877 | "47 nyctaxi lz4 0.288290 32768\n", 878 | "48 fanniemae uncompressed 0.010546 65536\n", 879 | "49 fanniemae zstd-1 0.595585 65536\n", 880 | "50 fanniemae zstd-10 0.440694 65536\n", 881 | "51 fanniemae lz4 0.395174 65536\n", 882 | "52 nyctaxi uncompressed 0.006750 65536\n", 883 | "53 nyctaxi zstd-1 0.593057 65536\n", 884 | "54 nyctaxi zstd-10 0.467952 65536\n", 885 | "55 nyctaxi lz4 0.277783 65536" 886 | ] 887 | }, 888 | "execution_count": 8, 889 | "metadata": {}, 890 | "output_type": "execute_result" 891 | } 892 | ], 893 | "source": [ 894 | "reads" 895 | ] 896 | }, 897 | { 898 | "cell_type": "code", 899 | "execution_count": null, 900 | "metadata": {}, 901 | "outputs": [], 902 | "source": [] 903 | } 904 | ], 905 | "metadata": { 906 | "kernelspec": { 907 | "display_name": "Python 3", 908 | "language": "python", 909 | "name": "python3" 910 | }, 911 | "language_info": { 912 | "codemirror_mode": { 913 | "name": "ipython", 914 | "version": 3 915 | }, 916 | "file_extension": ".py", 917 | "mimetype": "text/x-python", 918 | "name": "python", 919 | "nbconvert_exporter": "python", 920 | "pygments_lexer": "ipython3", 921 | "version": "3.7.6" 922 | } 923 | }, 924 | "nbformat": 4, 925 | "nbformat_minor": 4 926 | } 927 | -------------------------------------------------------------------------------- /20190919file_benchmarks/all_read_results.csv: -------------------------------------------------------------------------------- 1 | expr,time,dataset,output_type,nthreads,language 2 | csv_fread,17.535751806,fanniemae,R data.frame,1,R 3 | fst (UNC),5.83356695,fanniemae,R data.frame,1,R 4 | fst (c=50),5.875382178,fanniemae,R data.frame,1,R 5 | feather V1,10.078519502799999,fanniemae,R data.frame,1,R 6 | feather V2 (UNC),4.7198155451999995,fanniemae,R data.frame,1,R 7 | 
feather V2 (LZ4),5.852145495199999,fanniemae,R data.frame,1,R 8 | feather V2 (ZSTD),7.77908361,fanniemae,R data.frame,1,R 9 | parquet (UNC),9.4933916048,fanniemae,R data.frame,1,R 10 | parquet (SNAPPY),9.911315661200002,fanniemae,R data.frame,1,R 11 | RDS (UNC),30.2670197082,fanniemae,R data.frame,1,R 12 | RDS (C),41.482849064199996,fanniemae,R data.frame,1,R 13 | csv_fread,23.370041255,nyctaxi,R data.frame,1,R 14 | fst (UNC),13.017416436,nyctaxi,R data.frame,1,R 15 | fst (c=50),12.6347099714,nyctaxi,R data.frame,1,R 16 | feather V1,13.443664009399999,nyctaxi,R data.frame,1,R 17 | feather V2 (UNC),11.3714301042,nyctaxi,R data.frame,1,R 18 | feather V2 (LZ4),13.29604463,nyctaxi,R data.frame,1,R 19 | feather V2 (ZSTD),14.5943007722,nyctaxi,R data.frame,1,R 20 | parquet (UNC),13.1586667582,nyctaxi,R data.frame,1,R 21 | parquet (SNAPPY),13.958228992,nyctaxi,R data.frame,1,R 22 | RDS (UNC),22.211784820200002,nyctaxi,R data.frame,1,R 23 | RDS (C),30.765105346200002,nyctaxi,R data.frame,1,R 24 | parquet (UNC),6.126083183288574,fanniemae,arrow Table,1,Python 25 | parquet (UNC),9.3643874168396,fanniemae,pandas,1,Python 26 | parquet (SNAPPY),6.056532478332518,fanniemae,arrow Table,1,Python 27 | parquet (SNAPPY),9.177780771255494,fanniemae,pandas,1,Python 28 | feather V2 (UNC),4.354116058349609,fanniemae,pandas,1,Python 29 | feather V2 (LZ4),4.396533584594726,fanniemae,pandas,1,Python 30 | feather V2 (ZSTD),5.775776481628418,fanniemae,pandas,1,Python 31 | feather V2 (UNC),1.0860649585723876,fanniemae,arrow Table,1,Python 32 | feather V2 (LZ4),1.0962132453918456,fanniemae,arrow Table,1,Python 33 | feather V2 (ZSTD),2.531323909759521,fanniemae,arrow Table,1,Python 34 | parquet (UNC),2.2780594348907472,nyctaxi,arrow Table,1,Python 35 | parquet (UNC),9.222453880310061,nyctaxi,pandas,1,Python 36 | parquet (SNAPPY),2.8247000694274904,nyctaxi,arrow Table,1,Python 37 | parquet (SNAPPY),9.735122680664062,nyctaxi,pandas,1,Python 38 | feather V2 
(UNC),7.608278465270996,nyctaxi,pandas,1,Python 39 | feather V2 (LZ4),7.784061861038206,nyctaxi,pandas,1,Python 40 | feather V2 (ZSTD),9.633673095703122,nyctaxi,pandas,1,Python 41 | feather V2 (UNC),0.5403317451477051,nyctaxi,arrow Table,1,Python 42 | feather V2 (LZ4),0.9643253803253172,nyctaxi,arrow Table,1,Python 43 | feather V2 (ZSTD),2.7800182342529296,nyctaxi,arrow Table,1,Python 44 | csv_fread,8.036938666600001,fanniemae,R data.frame,4,R 45 | fst (UNC),6.3416014972,fanniemae,R data.frame,4,R 46 | fst (c=50),5.0547549678,fanniemae,R data.frame,4,R 47 | feather V1,9.799018014,fanniemae,R data.frame,4,R 48 | feather V2 (UNC),5.0542017474,fanniemae,R data.frame,4,R 49 | feather V2 (LZ4),4.928118181,fanniemae,R data.frame,4,R 50 | feather V2 (ZSTD),5.5355538286,fanniemae,R data.frame,4,R 51 | parquet (UNC),6.281569166600001,fanniemae,R data.frame,4,R 52 | parquet (SNAPPY),6.3922376926,fanniemae,R data.frame,4,R 53 | RDS (UNC),29.8928874914,fanniemae,R data.frame,4,R 54 | RDS (C),41.273872293800004,fanniemae,R data.frame,4,R 55 | csv_fread,18.312046954,nyctaxi,R data.frame,4,R 56 | fst (UNC),11.9693504656,nyctaxi,R data.frame,4,R 57 | fst (c=50),13.4391470686,nyctaxi,R data.frame,4,R 58 | feather V1,12.034649945,nyctaxi,R data.frame,4,R 59 | feather V2 (UNC),11.0239614322,nyctaxi,R data.frame,4,R 60 | feather V2 (LZ4),11.592801001,nyctaxi,R data.frame,4,R 61 | feather V2 (ZSTD),12.704684877,nyctaxi,R data.frame,4,R 62 | parquet (UNC),12.225668849,nyctaxi,R data.frame,4,R 63 | parquet (SNAPPY),12.0044663816,nyctaxi,R data.frame,4,R 64 | RDS (UNC),21.847153904,nyctaxi,R data.frame,4,R 65 | RDS (C),30.735937022799998,nyctaxi,R data.frame,4,R 66 | parquet (UNC),1.841284704208374,fanniemae,arrow Table,4,Python 67 | parquet (UNC),4.0880148887634284,fanniemae,pandas,4,Python 68 | parquet (SNAPPY),1.8786502361297608,fanniemae,arrow Table,4,Python 69 | parquet (SNAPPY),4.165652704238892,fanniemae,pandas,4,Python 70 | feather V2 
(UNC),3.5610058307647705,fanniemae,pandas,4,Python 71 | feather V2 (LZ4),2.778682994842529,fanniemae,pandas,4,Python 72 | feather V2 (ZSTD),3.0616337299346923,fanniemae,pandas,4,Python 73 | feather V2 (UNC),1.1269856452941895,fanniemae,arrow Table,4,Python 74 | feather V2 (LZ4),0.4898182392120362,fanniemae,arrow Table,4,Python 75 | feather V2 (ZSTD),0.8093690395355224,fanniemae,arrow Table,4,Python 76 | parquet (UNC),0.6995339870452881,nyctaxi,arrow Table,4,Python 77 | parquet (UNC),7.4361457347869875,nyctaxi,pandas,4,Python 78 | parquet (SNAPPY),0.78084397315979,nyctaxi,arrow Table,4,Python 79 | parquet (SNAPPY),7.540273284912108,nyctaxi,pandas,4,Python 80 | feather V2 (UNC),7.369460582733153,nyctaxi,pandas,4,Python 81 | feather V2 (LZ4),7.119231033325195,nyctaxi,pandas,4,Python 82 | feather V2 (ZSTD),7.537483549118043,nyctaxi,pandas,4,Python 83 | feather V2 (UNC),0.6116453170776367,nyctaxi,arrow Table,4,Python 84 | feather V2 (LZ4),0.4065845012664795,nyctaxi,arrow Table,4,Python 85 | feather V2 (ZSTD),0.8925417900085449,nyctaxi,arrow Table,4,Python 86 | -------------------------------------------------------------------------------- /20190919file_benchmarks/all_results.csv: -------------------------------------------------------------------------------- 1 | expr,time,dataset,output_type,nthreads,language 2 | R csv_fread,17.8678609836,fanniemae,R data.frame,1,R 3 | R fst,5.7596893994,fanniemae,R data.frame,1,R 4 | feather (UNC),4.4072281468000005,fanniemae,R data.frame,1,R 5 | feather (LZ4),6.0330426373999995,fanniemae,R data.frame,1,R 6 | feather (ZSTD),7.526674342,fanniemae,R data.frame,1,R 7 | parquet (SNAPPY),10.0367648462,fanniemae,R data.frame,1,R 8 | R csv_fread,24.648797387400002,nyctaxi,R data.frame,1,R 9 | R fst,13.142260905799999,nyctaxi,R data.frame,1,R 10 | feather (UNC),10.96529547,nyctaxi,R data.frame,1,R 11 | feather (LZ4),11.801702598,nyctaxi,R data.frame,1,R 12 | feather (ZSTD),14.2444990752,nyctaxi,R data.frame,1,R 13 | parquet 
(SNAPPY),13.157420057,nyctaxi,R data.frame,1,R 14 | parquet (SNAPPY),5.7931968688964846,fanniemae,arrow Table,1,Python 15 | parquet (SNAPPY),9.107409811019897,fanniemae,pandas,1,Python 16 | feather (UNC),4.035408067703247,fanniemae,pandas,1,Python 17 | feather (LZ4),4.295090818405152,fanniemae,pandas,1,Python 18 | feather (ZSTD),5.678592157363893,fanniemae,pandas,1,Python 19 | feather (UNC),1.2464978694915771,fanniemae,arrow Table,1,Python 20 | feather (LZ4),1.062558937072754,fanniemae,arrow Table,1,Python 21 | feather (ZSTD),2.471682643890381,fanniemae,arrow Table,1,Python 22 | parquet (SNAPPY),2.7657272338867194,nyctaxi,arrow Table,1,Python 23 | parquet (SNAPPY),9.840531587600706,nyctaxi,pandas,1,Python 24 | feather (UNC),7.5906150341033936,nyctaxi,pandas,1,Python 25 | feather (LZ4),7.9236814975738525,nyctaxi,pandas,1,Python 26 | feather (ZSTD),9.791791486740113,nyctaxi,pandas,1,Python 27 | feather (UNC),0.6637681007385254,nyctaxi,arrow Table,1,Python 28 | feather (LZ4),1.0227035522460937,nyctaxi,arrow Table,1,Python 29 | feather (ZSTD),2.77500696182251,nyctaxi,arrow Table,1,Python 30 | R csv_fread,8.381513095,fanniemae,R data.frame,4,R 31 | R fst,4.8154870964,fanniemae,R data.frame,4,R 32 | feather (UNC),4.8105564258,fanniemae,R data.frame,4,R 33 | feather (LZ4),5.4882766928,fanniemae,R data.frame,4,R 34 | feather (ZSTD),5.986291964,fanniemae,R data.frame,4,R 35 | parquet (SNAPPY),6.7089619354,fanniemae,R data.frame,4,R 36 | R csv_fread,19.3027468002,nyctaxi,R data.frame,4,R 37 | R fst,13.0800444294,nyctaxi,R data.frame,4,R 38 | feather (UNC),11.8721187678,nyctaxi,R data.frame,4,R 39 | feather (LZ4),12.5549529788,nyctaxi,R data.frame,4,R 40 | feather (ZSTD),12.829650966600001,nyctaxi,R data.frame,4,R 41 | parquet (SNAPPY),12.7536964852,nyctaxi,R data.frame,4,R 42 | parquet (SNAPPY),1.8717081069946289,fanniemae,arrow Table,4,Python 43 | parquet (SNAPPY),4.098778772354127,fanniemae,pandas,4,Python 44 | feather (UNC),3.539084482192993,fanniemae,pandas,4,Python 45 | 
feather (LZ4),2.8530110359191894,fanniemae,pandas,4,Python 46 | feather (ZSTD),3.06166353225708,fanniemae,pandas,4,Python 47 | feather (UNC),1.3176395416259763,fanniemae,arrow Table,4,Python 48 | feather (LZ4),0.4744390964508057,fanniemae,arrow Table,4,Python 49 | feather (ZSTD),0.7838622570037842,fanniemae,arrow Table,4,Python 50 | parquet (SNAPPY),0.8635732173919678,nyctaxi,arrow Table,4,Python 51 | parquet (SNAPPY),7.623702335357666,nyctaxi,pandas,4,Python 52 | feather (UNC),7.328182792663574,nyctaxi,pandas,4,Python 53 | feather (LZ4),7.2832419872283936,nyctaxi,pandas,4,Python 54 | feather (ZSTD),8.017264556884765,nyctaxi,pandas,4,Python 55 | feather (UNC),0.6738637924194336,nyctaxi,arrow Table,4,Python 56 | feather (LZ4),0.4330804347991944,nyctaxi,arrow Table,4,Python 57 | feather (ZSTD),0.9005756855010987,nyctaxi,arrow Table,4,Python 58 | R csv_fread,8.235247531,fanniemae,R data.frame,8,R 59 | R fst,4.5943393692,fanniemae,R data.frame,8,R 60 | feather (UNC),4.7164801714,fanniemae,R data.frame,8,R 61 | feather (LZ4),4.6001075036,fanniemae,R data.frame,8,R 62 | feather (ZSTD),5.166106334399999,fanniemae,R data.frame,8,R 63 | parquet (SNAPPY),5.9058646954,fanniemae,R data.frame,8,R 64 | R csv_fread,17.998316013,nyctaxi,R data.frame,8,R 65 | R fst,13.064559282,nyctaxi,R data.frame,8,R 66 | feather (UNC),11.93319899,nyctaxi,R data.frame,8,R 67 | feather (LZ4),12.5654696644,nyctaxi,R data.frame,8,R 68 | feather (ZSTD),12.1251017998,nyctaxi,R data.frame,8,R 69 | parquet (SNAPPY),11.4879469076,nyctaxi,R data.frame,8,R 70 | parquet (SNAPPY),1.3059203624725342,fanniemae,arrow Table,8,Python 71 | parquet (SNAPPY),3.710281848907471,fanniemae,pandas,8,Python 72 | feather (UNC),3.67109489440918,fanniemae,pandas,8,Python 73 | feather (LZ4),2.8483234405517583,fanniemae,pandas,8,Python 74 | feather (ZSTD),2.943112850189209,fanniemae,pandas,8,Python 75 | feather (UNC),1.3228723049163815,fanniemae,arrow Table,8,Python 76 | feather (LZ4),0.4322311401367188,fanniemae,arrow 
Table,8,Python 77 | feather (ZSTD),0.5514030933380127,fanniemae,arrow Table,8,Python 78 | parquet (SNAPPY),0.7145666599273681,nyctaxi,arrow Table,8,Python 79 | parquet (SNAPPY),7.5506598472595226,nyctaxi,pandas,8,Python 80 | feather (UNC),7.442094039916992,nyctaxi,pandas,8,Python 81 | feather (LZ4),7.163635158538819,nyctaxi,pandas,8,Python 82 | feather (ZSTD),7.376304483413696,nyctaxi,pandas,8,Python 83 | feather (UNC),0.638268232345581,nyctaxi,arrow Table,8,Python 84 | feather (LZ4),0.3298566818237305,nyctaxi,arrow Table,8,Python 85 | feather (ZSTD),0.576887559890747,nyctaxi,arrow Table,8,Python 86 | -------------------------------------------------------------------------------- /20190919file_benchmarks/all_write_results.csv: -------------------------------------------------------------------------------- 1 | expr,time,dataset,output_type,nthreads,language 2 | fst (UNC),7.00702392,fanniemae,R data.frame,1,R 3 | fst (c=50),4.385196419,fanniemae,R data.frame,1,R 4 | feather V1,8.656647228,fanniemae,R data.frame,1,R 5 | feather V2 (UNC),10.040626659,fanniemae,R data.frame,1,R 6 | feather V2 (LZ4),10.818098194,fanniemae,R data.frame,1,R 7 | feather V2 (ZSTD),11.438481575,fanniemae,R data.frame,1,R 8 | parquet (UNC),10.434816898,fanniemae,R data.frame,1,R 9 | parquet (SNAPPY),10.800951873,fanniemae,R data.frame,1,R 10 | RDS (C),76.929230341,fanniemae,R data.frame,1,R 11 | RDS (UNC),24.216423401,fanniemae,R data.frame,1,R 12 | fst (UNC),4.08787925,nyctaxi,R data.frame,1,R 13 | fst (c=50),3.950344461,nyctaxi,R data.frame,1,R 14 | feather V1,5.97229482,nyctaxi,R data.frame,1,R 15 | feather V2 (UNC),5.888590985,nyctaxi,R data.frame,1,R 16 | feather V2 (LZ4),8.325439328,nyctaxi,R data.frame,1,R 17 | feather V2 (ZSTD),10.223231254,nyctaxi,R data.frame,1,R 18 | parquet (UNC),7.71564074,nyctaxi,R data.frame,1,R 19 | parquet (SNAPPY),8.585539352,nyctaxi,R data.frame,1,R 20 | RDS (C),104.898052261,nyctaxi,R data.frame,1,R 21 | RDS (UNC),10.739751088,nyctaxi,R data.frame,1,R 22 
| parquet (UNC),6.220219850540161,fanniemae,arrow Table,1,Python 23 | parquet (UNC),12.395264983177185,fanniemae,pandas,1,Python 24 | parquet (SNAPPY),6.694774866104126,fanniemae,arrow Table,1,Python 25 | parquet (SNAPPY),13.161320447921753,fanniemae,pandas,1,Python 26 | feather V2 (UNC),12.677234172821045,fanniemae,pandas,1,Python 27 | feather V2 (UNC),6.397535443305969,fanniemae,arrow Table,1,Python 28 | feather V2 (LZ4),8.32238781452179,fanniemae,pandas,1,Python 29 | feather V2 (LZ4),2.2326916456222534,fanniemae,arrow Table,1,Python 30 | feather V2 (ZSTD),10.61594545841217,fanniemae,pandas,1,Python 31 | feather V2 (ZSTD),4.308579444885254,fanniemae,arrow Table,1,Python 32 | parquet (UNC),4.5986950397491455,nyctaxi,arrow Table,1,Python 33 | parquet (UNC),9.009780049324037,nyctaxi,pandas,1,Python 34 | parquet (SNAPPY),5.70121443271637,nyctaxi,arrow Table,1,Python 35 | parquet (SNAPPY),10.175373315811155,nyctaxi,pandas,1,Python 36 | feather V2 (UNC),7.1334041357040405,nyctaxi,pandas,1,Python 37 | feather V2 (UNC),3.112175464630127,nyctaxi,arrow Table,1,Python 38 | feather V2 (LZ4),7.4143136739730835,nyctaxi,pandas,1,Python 39 | feather V2 (LZ4),3.567118763923645,nyctaxi,arrow Table,1,Python 40 | feather V2 (ZSTD),11.283223748207092,nyctaxi,pandas,1,Python 41 | feather V2 (ZSTD),6.928452372550964,nyctaxi,arrow Table,1,Python 42 | fst (UNC),7.758567831,fanniemae,R data.frame,4,R 43 | fst (c=50),3.700873556,fanniemae,R data.frame,4,R 44 | feather V1,7.08059183,fanniemae,R data.frame,4,R 45 | feather V2 (UNC),10.413025112,fanniemae,R data.frame,4,R 46 | feather V2 (LZ4),10.818213516,fanniemae,R data.frame,4,R 47 | feather V2 (ZSTD),11.563816777,fanniemae,R data.frame,4,R 48 | parquet (UNC),10.814584911,fanniemae,R data.frame,4,R 49 | parquet (SNAPPY),11.152511189,fanniemae,R data.frame,4,R 50 | RDS (C),78.42714811,fanniemae,R data.frame,4,R 51 | RDS (UNC),24.919762665,fanniemae,R data.frame,4,R 52 | fst (UNC),4.399914353,nyctaxi,R data.frame,4,R 53 | fst 
(c=50),3.305661431,nyctaxi,R data.frame,4,R 54 | feather V1,5.47744372,nyctaxi,R data.frame,4,R 55 | feather V2 (UNC),5.864371601,nyctaxi,R data.frame,4,R 56 | feather V2 (LZ4),8.494803995,nyctaxi,R data.frame,4,R 57 | feather V2 (ZSTD),10.073068744,nyctaxi,R data.frame,4,R 58 | parquet (UNC),7.675560036,nyctaxi,R data.frame,4,R 59 | parquet (SNAPPY),8.428579617,nyctaxi,R data.frame,4,R 60 | RDS (C),108.234060692,nyctaxi,R data.frame,4,R 61 | RDS (UNC),10.717121094,nyctaxi,R data.frame,4,R 62 | parquet (UNC),6.162686586380005,fanniemae,arrow Table,4,Python 63 | parquet (UNC),11.565850496292114,fanniemae,pandas,4,Python 64 | parquet (SNAPPY),6.410535216331482,fanniemae,arrow Table,4,Python 65 | parquet (SNAPPY),11.6298109292984,fanniemae,pandas,4,Python 66 | feather V2 (UNC),11.104193806648254,fanniemae,pandas,4,Python 67 | feather V2 (UNC),5.889622092247009,fanniemae,arrow Table,4,Python 68 | feather V2 (LZ4),6.612253308296204,fanniemae,pandas,4,Python 69 | feather V2 (LZ4),1.306950330734253,fanniemae,arrow Table,4,Python 70 | feather V2 (ZSTD),7.202290296554565,fanniemae,pandas,4,Python 71 | feather V2 (ZSTD),1.8320761919021609,fanniemae,arrow Table,4,Python 72 | parquet (UNC),4.338123440742494,nyctaxi,arrow Table,4,Python 73 | parquet (UNC),8.028993129730225,nyctaxi,pandas,4,Python 74 | parquet (SNAPPY),5.622675895690918,nyctaxi,arrow Table,4,Python 75 | parquet (SNAPPY),9.33586835861206,nyctaxi,pandas,4,Python 76 | feather V2 (UNC),6.233096599578857,nyctaxi,pandas,4,Python 77 | feather V2 (UNC),2.994387269020081,nyctaxi,arrow Table,4,Python 78 | feather V2 (LZ4),5.67785370349884,nyctaxi,pandas,4,Python 79 | feather V2 (LZ4),2.289505124092102,nyctaxi,arrow Table,4,Python 80 | feather V2 (ZSTD),6.161942005157472,nyctaxi,pandas,4,Python 81 | feather V2 (ZSTD),2.7954366207122803,nyctaxi,arrow Table,4,Python 82 | -------------------------------------------------------------------------------- /20190919file_benchmarks/benchmark.R: 
# Materialize every on-disk format the read benchmark needs for one dataset.
#
# `base` is the dataset's file stem (an element of `files`). The table is
# loaded from "<base>_snappy.parquet" and re-written next to it as Feather V1,
# fst (compress=0 and compress=50) and RDS (compressed and uncompressed).
create_files <- function(base) {
  df <- arrow::read_parquet(str_c(base, "_snappy.parquet"))
  feather::write_feather(df, str_c(base, "_v1.feather"))
  fst::write_fst(df, str_c(base, "_0.fst"), compress=0)
  fst::write_fst(df, str_c(base, "_50.fst"), compress=50)
  saveRDS(df, str_c(base, "_compressed.rds"), compress=TRUE)
  saveRDS(df, str_c(base, "_uncompressed.rds"), compress=FALSE)
}

# Time reading dataset number `index` (a position in `files`/`names`/`seps`)
# from every supported file format.
#
# Each reader runs 5 times. Returns one row per expression with the mean
# `time` (microbenchmark reports nanoseconds) and a `dataset` column holding
# the dataset's logical name.
do_benchmark <- function(index) {
  base <- files[index]
  sep <- seps[index]

  csv_path <- str_c("data/", base, ".csv")
  feather_v1_path <- str_c(base, "_v1.feather")
  feather_unc_path <- str_c(base, "_uncompressed.feather")
  feather_lz4_path <- str_c(base, "_lz4.feather")
  feather_zstd_path <- str_c(base, "_zstd.feather")
  fst_0_path <- str_c(base, "_0.fst")
  fst_50_path <- str_c(base, "_50.fst")
  parquet_unc_path <- str_c(base, "_uncompressed.parquet")
  parquet_snappy_path <- str_c(base, "_snappy.parquet")
  rds_unc_path <- str_c(base, "_uncompressed.rds")
  rds_compressed_path <- str_c(base, "_compressed.rds")

  mbm <- microbenchmark(
    csv_fread=data.table::fread(csv_path, sep=sep, header=FALSE),
    fst_unc=fst::read_fst(fst_0_path),
    fst_50=fst::read_fst(fst_50_path),
    feather_v1=feather::read_feather(feather_v1_path),
    feather_unc=arrow::read_feather(feather_unc_path),
    feather_lz4=arrow::read_feather(feather_lz4_path),
    feather_zstd=arrow::read_feather(feather_zstd_path),
    parquet_unc=arrow::read_parquet(parquet_unc_path),
    parquet_snappy=arrow::read_parquet(parquet_snappy_path),
    rds_unc=readRDS(rds_unc_path),
    rds_compressed=readRDS(rds_compressed_path),
    times=5
  )
  # Collapse the per-iteration timings to one mean per expression.
  mbm <- data.frame(mbm) %>% dplyr::group_by(expr) %>% dplyr::summarize(time=mean(time))
  mbm$dataset <- names[index]
  mbm
}

# Time writing dataset number `index` to every supported file format.
#
# The data frame is loaded once from "<base>_snappy.parquet"; each writer then
# runs a single time. Returns the same shape as do_benchmark(): one row per
# expression with its `time` and the `dataset` name.
do_write_benchmark <- function(index) {
  base <- files[index]

  df <- arrow::read_parquet(str_c(base, "_snappy.parquet"))

  mbm <- microbenchmark(
    fst_unc=fst::write_fst(df, str_c(base, "_0.fst"), compress=0),
    fst_50=fst::write_fst(df, str_c(base, "_50.fst"), compress=50),
    feather_v1=feather::write_feather(df, str_c(base, "_v1.feather")),
    feather_unc=arrow::write_feather(df, str_c(base, "_unc_r.feather"),
                                     compression="uncompressed"),
    # The compressed Feather cases must go through write_feather(); calling
    # write_parquet() here would time the Parquet writer under a Feather label.
    feather_lz4=arrow::write_feather(df, str_c(base, "_lz4_r.feather"),
                                     compression="lz4"),
    feather_zstd=arrow::write_feather(df, str_c(base, "_zstd_r.feather"),
                                      compression="zstd"),
    parquet_unc=arrow::write_parquet(df, str_c(base, "_unc_r.parquet"),
                                     compression="uncompressed"),
    parquet_snappy=arrow::write_parquet(df, str_c(base, "_snappy_r.parquet"),
                                        compression="snappy"),
    rds_compressed=saveRDS(df, str_c(base, "_compressed.rds"), compress=TRUE),
    rds_unc=saveRDS(df, str_c(base, "_uncompressed.rds"), compress=FALSE),
    times=1
  )
  mbm <- data.frame(mbm) %>% dplyr::group_by(expr) %>% dplyr::summarize(time=mean(time))
  mbm$dataset <- names[index]
  mbm
}

# Run create_files() for every dataset listed in `files`.
generate_files <- function() {
  for (base in files) {
    create_files(base)
  }
}
def get_timing(f, niter):
    """Return the mean wall-clock time, in seconds, of one call to ``f``.

    Parameters
    ----------
    f : callable
        Zero-argument callable to time.
    niter : int
        Number of back-to-back calls to average over; must be at least 1.

    Raises
    ------
    ValueError
        If ``niter`` is smaller than 1, since the mean would be undefined.
    """
    if niter < 1:
        raise ValueError('niter must be >= 1, got {!r}'.format(niter))
    # perf_counter() is monotonic and high resolution. CLOCK_REALTIME can jump
    # when the system clock is adjusted, and clock_gettime() is Unix-only.
    start = time.perf_counter()
    for _ in range(niter):
        f()
    return (time.perf_counter() - start) / niter


class Benchmarker:
    """Time Parquet and Feather reads and writes for one dataset.

    Parameters
    ----------
    file_info : dict
        One value of the module-level ``files`` mapping: ``'base'`` is the
        file stem shared by all generated files and ``'source'`` describes
        the original CSV (``'path'``, ``'sep'``, ``'header'``).
    """

    def __init__(self, file_info):
        self.base = file_info['base']
        (self.csv_path,
         self.sep,
         self.header) = unpack(file_info['source'], 'path', 'sep', 'header')

        # Every benchmarked file lives next to the script and is named after
        # the dataset stem plus a format/codec suffix.
        self.parquet_unc_path = '{}_uncompressed.parquet'.format(self.base)
        self.parquet_snappy_path = '{}_snappy.parquet'.format(self.base)
        self.feather_unc_path = '{}_uncompressed.feather'.format(self.base)
        self.feather_lz4_path = '{}_lz4.feather'.format(self.base)
        self.feather_zstd_path = '{}_zstd.feather'.format(self.base)

    def bench_read(self, niter=5):
        """Time reading each file into an Arrow table or a pandas DataFrame.

        Returns a DataFrame with columns ``expr``, ``output_type`` and
        ``mean`` (seconds per read, averaged over ``niter`` reads).
        """
        cases = [
            ('parquet (UNC)', 'arrow Table',
             lambda: pq.read_table(self.parquet_unc_path, memory_map=False)),
            ('parquet (UNC)', 'pandas',
             lambda: (pq.read_table(self.parquet_unc_path, memory_map=False)
                      .to_pandas())),
            ('parquet (SNAPPY)', 'arrow Table',
             lambda: pq.read_table(self.parquet_snappy_path,
                                   memory_map=False)),
            ('parquet (SNAPPY)', 'pandas',
             lambda: (pq.read_table(self.parquet_snappy_path, memory_map=False)
                      .to_pandas())),
            ('feather V2 (UNC)', 'pandas',
             lambda: feather.read_feather(self.feather_unc_path,
                                          memory_map=False)),
            ('feather V2 (LZ4)', 'pandas',
             lambda: feather.read_feather(self.feather_lz4_path,
                                          memory_map=False)),
            ('feather V2 (ZSTD)', 'pandas',
             lambda: feather.read_feather(self.feather_zstd_path,
                                          memory_map=False)),
            ('feather V2 (UNC)', 'arrow Table',
             lambda: feather.read_table(self.feather_unc_path,
                                        memory_map=False)),
            ('feather V2 (LZ4)', 'arrow Table',
             lambda: feather.read_table(self.feather_lz4_path,
                                        memory_map=False)),
            ('feather V2 (ZSTD)', 'arrow Table',
             lambda: feather.read_table(self.feather_zstd_path,
                                        memory_map=False)),
        ]

        return self._bench_cases(cases, niter)

    def bench_write(self, niter=2):
        """Time writing the dataset to each format from Arrow and from pandas.

        The source CSV is loaded once up front and is not part of any timing.
        The Parquet 'pandas' cases rebuild the Arrow table inside the timed
        call, so they include the pandas-to-Arrow conversion. The files
        written here are the ones ``bench_read`` reads.

        Returns a DataFrame shaped like the one from ``bench_read``.
        """
        print("Reading text file: {}".format(self.csv_path))
        df = pd.read_csv(self.csv_path, sep=self.sep, header=self.header,
                         low_memory=False)
        if self.header is None:
            # Headerless CSVs get integer column labels; replace them with
            # string names before converting to Arrow.
            df.columns = ['f{}'.format(i) for i in range(len(df.columns))]

        def _get_table(df):
            # Drop the index and the pandas schema metadata from the table.
            return (pa.Table.from_pandas(df, preserve_index=False)
                    .replace_schema_metadata(None))

        t = _get_table(df)

        cases = [
            ('parquet (UNC)', 'arrow Table',
             lambda: pq.write_table(t, self.parquet_unc_path,
                                    compression='NONE')),
            ('parquet (UNC)', 'pandas',
             lambda: pq.write_table(_get_table(df), self.parquet_unc_path,
                                    compression='NONE')),
            ('parquet (SNAPPY)', 'arrow Table',
             lambda: pq.write_table(t, self.parquet_snappy_path)),
            ('parquet (SNAPPY)', 'pandas',
             lambda: pq.write_table(_get_table(df), self.parquet_snappy_path)),
            ('feather V2 (UNC)', 'pandas',
             lambda: feather.write_feather(df, self.feather_unc_path,
                                           compression='uncompressed')),
            ('feather V2 (UNC)', 'arrow Table',
             lambda: feather.write_feather(t, self.feather_unc_path,
                                           compression='uncompressed')),
            ('feather V2 (LZ4)', 'pandas',
             lambda: feather.write_feather(df, self.feather_lz4_path,
                                           compression='lz4')),
            ('feather V2 (LZ4)', 'arrow Table',
             lambda: feather.write_feather(t, self.feather_lz4_path,
                                           compression='lz4')),
            ('feather V2 (ZSTD)', 'pandas',
             lambda: feather.write_feather(df, self.feather_zstd_path,
                                           compression='zstd')),
            ('feather V2 (ZSTD)', 'arrow Table',
             lambda: feather.write_feather(t, self.feather_zstd_path,
                                           compression='zstd'))
        ]

        return self._bench_cases(cases, niter)

    def _bench_cases(self, cases, niter):
        """Time every ``(name, output_type, callable)`` case ``niter`` times.

        Progress is printed as each case starts and finishes. Returns a
        DataFrame with columns ``expr``, ``output_type`` and ``mean``.
        """
        results = []
        for name, output_type, f in cases:
            print(name)
            result = (name, output_type, get_timing(f, niter))
            print(result)
            results.append(result)
        return pd.DataFrame.from_records(results,
                                         columns=['expr', 'output_type',
                                                  'mean'])


def unpack(d, *fields):
    """Yield ``d[field]`` for each requested field, in the order given."""
    return (d[f] for f in fields)


# Datasets to benchmark, keyed by the logical name used in the result files.
files = {
    'fanniemae': {
        'base': '2016Q4',
        'source': {
            'path': 'data/2016Q4.csv',
            'sep': '|',
            'header': None
        }
    },
    'nyctaxi': {
        'base': 'yellow_tripdata_2010-01',
        'source': {
            'path': 'data/yellow_tripdata_2010-01.csv',
            'sep': ',',
            'header': 0
        }
    }
}


def run_benchmarks(num_threads, what='read'):
    """Run the read or write benchmarks for every dataset in ``files``.

    Parameters
    ----------
    num_threads : int
        Size to set the Arrow CPU thread pool to before benchmarking.
    what : {'read', 'write'}
        Which benchmark suite to run.

    Returns
    -------
    pandas.DataFrame
        The per-dataset results stacked together, with a ``dataset`` column.

    Raises
    ------
    ValueError
        If ``what`` is neither ``'read'`` nor ``'write'``.
    """
    # Reject a bad mode before changing the global thread-pool size.
    if what not in ('read', 'write'):
        raise ValueError(what)

    pa.set_cpu_count(num_threads)

    all_results = []
    for name, info in files.items():
        benchmarker = Benchmarker(info)
        if what == 'read':
            print("Benchmarking reads")
            file_results = benchmarker.bench_read()
        else:
            print("Benchmarking writes")
            file_results = benchmarker.bench_write()
        file_results['dataset'] = name
        all_results.append(file_results)

    print(all_results)
    return pd.concat(all_results, ignore_index=True)
# Arrow CPU thread-pool sizes to benchmark; each one gets its own pair of
# result files.
num_threads_cases = [1, 4]

for nthreads in num_threads_cases:
    # Writes run first: bench_write() produces the Parquet/Feather files that
    # the read benchmarks then load.
    write_results = run_benchmarks(nthreads, what='write')
    write_results.to_csv('py_write_results_{}.csv'.format(nthreads))

    read_results = run_benchmarks(nthreads, what='read')
    read_results.to_csv('py_read_results_{}.csv'.format(nthreads))

# for nthreads in num_threads_cases:
#     run_benchmarks(nthreads)

# NOTE(review): the tuples below look like timings kept from an earlier
# run -- confirm before relying on them.
# ('pyarrow.parquet', 1.5470361709594727)
# ('pyarrow.parquet-pandas', 2.925654172897339)
# ('pyarrow.feather', 1.6384665012359618)
def munge_results(kind='read', num_threads_cases=(1, 4)):
    """Combine the per-thread-count R and Python result files into one table.

    For every thread count this reads ``r_<kind>_results_<n>.csv`` and
    ``py_<kind>_results_<n>.csv`` from the current directory, normalizes the
    benchmark labels, converts the R timings from nanoseconds to seconds and
    tags each row with its thread count and language.

    Parameters
    ----------
    kind : str
        Which result files to read, ``'read'`` or ``'write'``.
    num_threads_cases : iterable of int
        Thread counts for which result files exist.

    Returns
    -------
    pandas.DataFrame
        Columns ``expr``, ``time``, ``dataset``, ``output_type``,
        ``nthreads`` and ``language``.
    """
    # Map the raw benchmark names emitted by either script to display labels;
    # names without an entry pass through unchanged.
    expr_rename = {
        'parquet_unc': 'parquet (UNC)',
        'parquet_snappy': 'parquet (SNAPPY)',
        'feather_v1': 'feather V1',
        'feather_unc': 'feather V2 (UNC)',
        'feather_lz4': 'feather V2 (LZ4)',
        'feather_zstd': 'feather V2 (ZSTD)',
        'fst_unc': 'fst (UNC)',
        'fst_50': 'fst (c=50)',
        'rds_unc': 'RDS (UNC)',
        'rds_compressed': 'RDS (C)',
        'pyarrow.parquet': 'parquet (SNAPPY)',
        'pyarrow.feather (UNC)': 'feather V2 (UNC)',
        'pyarrow.feather (LZ4)': 'feather V2 (LZ4)',
        'pyarrow.feather (ZSTD)': 'feather V2 (ZSTD)',
    }
    output_type_rename = {
        'pyarrow.Table': 'arrow Table',
    }

    pieces = []
    for num_threads in num_threads_cases:
        r_results = pd.read_csv('r_{}_results_{}.csv'.format(kind,
                                                             num_threads))
        # Copy the column subset so the assignments below modify an
        # independent frame rather than a slice of the one just read.
        r_results = r_results[['expr', 'time', 'dataset']].copy()
        r_results['output_type'] = "R data.frame"
        # The R timings are in nanoseconds; the Python ones are in seconds.
        r_results['time'] /= 1e9
        r_results['nthreads'] = num_threads
        r_results['language'] = 'R'
        r_results['expr'] = r_results['expr'].map(
            lambda x: expr_rename.get(x, x))

        py_results = pd.read_csv('py_{}_results_{}.csv'.format(kind,
                                                               num_threads))
        py_results = py_results[['expr', 'output_type', 'mean',
                                 'dataset']].copy()
        py_results['time'] = py_results.pop('mean')
        py_results['nthreads'] = num_threads
        py_results['language'] = 'Python'
        py_results['expr'] = py_results['expr'].map(
            lambda x: expr_rename.get(x, x))
        py_results['output_type'] = py_results['output_type'].map(
            lambda x: output_type_rename.get(x, x))

        pieces.extend([r_results, py_results])
    return pd.concat(pieces, ignore_index=True, sort=False)
rds,21.7943077156,nyctaxi,R data.frame 9 | R csv_fread,21.743098532599998,nyctaxi,R data.frame 10 | R feather_old,13.1421169332,nyctaxi,R data.frame 11 | R fst,13.226063631799999,nyctaxi,R data.frame 12 | R feather_arrow,11.358103880200002,nyctaxi,R data.frame 13 | R parquet,13.9190224234,nyctaxi,R data.frame 14 | pyarrow.parquet,5.198975515365602,fanniemae,arrow Table 15 | pyarrow.parquet-pandas,7.051469707489014,fanniemae,pandas 16 | pyarrow.feather,1.979597759246826,fanniemae,pandas 17 | pyarrow.parquet,2.888606691360473,nyctaxi,arrow Table 18 | pyarrow.parquet-pandas,9.884848737716675,nyctaxi,pandas 19 | pyarrow.feather,6.670159721374513,nyctaxi,pandas 20 | -------------------------------------------------------------------------------- /20190919file_benchmarks/i9-9880H-1/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ursa-labs/notebooks/fef623e363546b76333a58d31dd58805248a596d/20190919file_benchmarks/i9-9880H-1/plot.png -------------------------------------------------------------------------------- /20190919file_benchmarks/i9-9880H-1/py_results.csv: -------------------------------------------------------------------------------- 1 | ,expr,output_type,mean,dataset 2 | 0,pyarrow.parquet,arrow Table,5.198975515365601,fanniemae 3 | 1,pyarrow.parquet-pandas,pandas,7.051469707489014,fanniemae 4 | 2,pyarrow.feather,pandas,1.9795977592468261,fanniemae 5 | 3,pyarrow.parquet,arrow Table,2.8886066913604735,nyctaxi 6 | 4,pyarrow.parquet-pandas,pandas,9.884848737716675,nyctaxi 7 | 5,pyarrow.feather,pandas,6.670159721374512,nyctaxi 8 | -------------------------------------------------------------------------------- /20190919file_benchmarks/i9-9880H-1/r_results.csv: -------------------------------------------------------------------------------- 1 | "","expr","time","dataset" 2 | "1","rds",24258071124,"fanniemae" 3 | "2","csv_fread",11890215172.2,"fanniemae" 4 | 
"3","feather_old",5277115407.8,"fanniemae" 5 | "4","fst",4220241413.4,"fanniemae" 6 | "5","feather_arrow",2983408761.8,"fanniemae" 7 | "6","parquet",7969573458,"fanniemae" 8 | "7","rds",21794307715.6,"nyctaxi" 9 | "8","csv_fread",21743098532.6,"nyctaxi" 10 | "9","feather_old",13142116933.2,"nyctaxi" 11 | "10","fst",13226063631.8,"nyctaxi" 12 | "11","feather_arrow",11358103880.2,"nyctaxi" 13 | "12","parquet",13919022423.4,"nyctaxi" 14 | -------------------------------------------------------------------------------- /20190919file_benchmarks/i9-9880H-4/all_results.csv: -------------------------------------------------------------------------------- 1 | expr,time,dataset,output_type 2 | R rds,24.7440853608,fanniemae,R data.frame 3 | R csv_fread,5.4296275238,fanniemae,R data.frame 4 | R feather_old,5.2222283298,fanniemae,R data.frame 5 | R fst,3.3813570264,fanniemae,R data.frame 6 | R feather_arrow,2.9662292186,fanniemae,R data.frame 7 | R parquet,4.6544630666,fanniemae,R data.frame 8 | R rds,22.135398477200003,nyctaxi,R data.frame 9 | R csv_fread,17.687647606,nyctaxi,R data.frame 10 | R feather_old,11.989569364200001,nyctaxi,R data.frame 11 | R fst,12.0112101424,nyctaxi,R data.frame 12 | R feather_arrow,11.617949409200001,nyctaxi,R data.frame 13 | R parquet,13.0886089094,nyctaxi,R data.frame 14 | pyarrow.parquet,2.1267578125,fanniemae,arrow Table 15 | pyarrow.parquet-pandas,3.518295383453369,fanniemae,pandas 16 | pyarrow.feather,1.6831360816955567,fanniemae,pandas 17 | pyarrow.parquet,1.1050359725952148,nyctaxi,arrow Table 18 | pyarrow.parquet-pandas,7.481458854675293,nyctaxi,pandas 19 | pyarrow.feather,6.5046766757965075,nyctaxi,pandas 20 | -------------------------------------------------------------------------------- /20190919file_benchmarks/i9-9880H-4/plot.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ursa-labs/notebooks/fef623e363546b76333a58d31dd58805248a596d/20190919file_benchmarks/i9-9880H-4/plot.png -------------------------------------------------------------------------------- /20190919file_benchmarks/i9-9880H-4/py_results.csv: -------------------------------------------------------------------------------- 1 | ,expr,output_type,mean,dataset 2 | 0,pyarrow.parquet,arrow Table,2.1267578125,fanniemae 3 | 1,pyarrow.parquet-pandas,pandas,3.5182953834533692,fanniemae 4 | 2,pyarrow.feather,pandas,1.6831360816955567,fanniemae 5 | 3,pyarrow.parquet,arrow Table,1.1050359725952148,nyctaxi 6 | 4,pyarrow.parquet-pandas,pandas,7.481458854675293,nyctaxi 7 | 5,pyarrow.feather,pandas,6.504676675796508,nyctaxi 8 | -------------------------------------------------------------------------------- /20190919file_benchmarks/i9-9880H-4/r_results.csv: -------------------------------------------------------------------------------- 1 | "","expr","time","dataset" 2 | "1","rds",24744085360.8,"fanniemae" 3 | "2","csv_fread",5429627523.8,"fanniemae" 4 | "3","feather_old",5222228329.8,"fanniemae" 5 | "4","fst",3381357026.4,"fanniemae" 6 | "5","feather_arrow",2966229218.6,"fanniemae" 7 | "6","parquet",4654463066.6,"fanniemae" 8 | "7","rds",22135398477.2,"nyctaxi" 9 | "8","csv_fread",17687647606,"nyctaxi" 10 | "9","feather_old",11989569364.2,"nyctaxi" 11 | "10","fst",12011210142.4,"nyctaxi" 12 | "11","feather_arrow",11617949409.2,"nyctaxi" 13 | "12","parquet",13088608909.4,"nyctaxi" 14 | -------------------------------------------------------------------------------- /20190919file_benchmarks/i9-9880H-8/all_results.csv: -------------------------------------------------------------------------------- 1 | expr,time,dataset,output_type 2 | R rds,25.963613643200002,fanniemae,R data.frame 3 | R csv_fread,5.125688092600001,fanniemae,R data.frame 4 | R feather_old,5.8978863578,fanniemae,R data.frame 5 | R fst,3.6207146728,fanniemae,R data.frame 6 | R 
feather_arrow,3.285127359,fanniemae,R data.frame 7 | R parquet,4.608878230399999,fanniemae,R data.frame 8 | R rds,22.5701864218,nyctaxi,R data.frame 9 | R csv_fread,17.681116847200002,nyctaxi,R data.frame 10 | R feather_old,13.7390440426,nyctaxi,R data.frame 11 | R fst,13.188127108200002,nyctaxi,R data.frame 12 | R feather_arrow,12.2201220736,nyctaxi,R data.frame 13 | R parquet,12.6165632024,nyctaxi,R data.frame 14 | pyarrow.parquet,1.6406285285949709,fanniemae,arrow Table 15 | pyarrow.parquet-pandas,3.035256814956665,fanniemae,pandas 16 | pyarrow.feather,1.6025235176086423,fanniemae,pandas 17 | pyarrow.parquet,0.9039567470550536,nyctaxi,arrow Table 18 | pyarrow.parquet-pandas,6.945300436019897,nyctaxi,pandas 19 | pyarrow.feather,6.311788606643678,nyctaxi,pandas 20 | -------------------------------------------------------------------------------- /20190919file_benchmarks/i9-9880H-8/plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ursa-labs/notebooks/fef623e363546b76333a58d31dd58805248a596d/20190919file_benchmarks/i9-9880H-8/plot.png -------------------------------------------------------------------------------- /20190919file_benchmarks/i9-9880H-8/py_results.csv: -------------------------------------------------------------------------------- 1 | ,expr,output_type,mean,dataset 2 | 0,pyarrow.parquet,arrow Table,1.6406285285949707,fanniemae 3 | 1,pyarrow.parquet-pandas,pandas,3.035256814956665,fanniemae 4 | 2,pyarrow.feather,pandas,1.6025235176086425,fanniemae 5 | 3,pyarrow.parquet,arrow Table,0.9039567470550537,nyctaxi 6 | 4,pyarrow.parquet-pandas,pandas,6.945300436019897,nyctaxi 7 | 5,pyarrow.feather,pandas,6.311788606643677,nyctaxi 8 | -------------------------------------------------------------------------------- /20190919file_benchmarks/i9-9880H-8/r_results.csv: -------------------------------------------------------------------------------- 1 | "","expr","time","dataset" 2 | 
"1","rds",25963613643.2,"fanniemae" 3 | "2","csv_fread",5125688092.6,"fanniemae" 4 | "3","feather_old",5897886357.8,"fanniemae" 5 | "4","fst",3620714672.8,"fanniemae" 6 | "5","feather_arrow",3285127359,"fanniemae" 7 | "6","parquet",4608878230.4,"fanniemae" 8 | "7","rds",22570186421.8,"nyctaxi" 9 | "8","csv_fread",17681116847.2,"nyctaxi" 10 | "9","feather_old",13739044042.6,"nyctaxi" 11 | "10","fst",13188127108.2,"nyctaxi" 12 | "11","feather_arrow",12220122073.6,"nyctaxi" 13 | "12","parquet",12616563202.4,"nyctaxi" 14 | -------------------------------------------------------------------------------- /20190919file_benchmarks/make_feather_plots.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | 3 | # install.packages("stringi") 4 | 5 | setwd("~/code/notebooks/20190919file_benchmarks/") 6 | 7 | reads <- read.csv("ipc_read_parallel.csv") 8 | writes <- read.csv("ipc_write_parallel.csv") 9 | 10 | writes 11 | 12 | # file size 13 | ggplot(writes, aes(fill=factor(chunksize), y=file_size, x=codec)) + 14 | facet_grid(rows=vars(dataset)) + 15 | geom_bar(position="dodge", stat="identity") + 16 | coord_flip() 17 | 18 | ggsave("ipc_file_size.png", width=10, height=4) 19 | 20 | # write time 21 | ggplot(writes, aes(fill=factor(chunksize), y=write_time, x=codec)) + 22 | facet_grid(rows=vars(dataset)) + 23 | geom_bar(position="dodge", stat="identity") + 24 | coord_flip() 25 | 26 | ggsave("ipc_write_time.png", width=10, height=4) 27 | 28 | # read time 29 | ggplot(reads, aes(fill=factor(chunksize), y=read_time, x=codec)) + 30 | facet_grid(rows=vars(dataset)) + 31 | geom_bar(position="dodge", stat="identity") + 32 | coord_flip() 33 | 34 | ggsave("ipc_read_time.png", width=10, height=4) -------------------------------------------------------------------------------- /20190919file_benchmarks/make_plots.R: -------------------------------------------------------------------------------- 1 | library(ggplot2) 2 | library(dplyr) 3 | 4 | 
5 | read_results <- read.csv("all_read_results.csv") 6 | write_results <- read.csv("all_write_results.csv") 7 | # Add a row for the Fannie Mae CSV file size, as reported in the previous post 8 | file_sizes <- rbind( 9 | read.csv("file_sizes.csv", stringsAsFactors = FALSE), 10 | data.frame( 11 | dataset = "fanniemae", 12 | file_type = "CSV", 13 | size = 1.52*1024, 14 | stringsAsFactors = FALSE 15 | ) 16 | ) 17 | 18 | # Color mapping 19 | cols <- c( 20 | "feather V1" = "steelblue", 21 | "feather V2 (UNC)" = "steelblue", 22 | "feather V2 (LZ4)" = "steelblue", 23 | "feather V2 (ZSTD)" = "steelblue", 24 | "parquet (SNAPPY)" = "steelblue1", 25 | "parquet (UNC)" = "steelblue1", 26 | "fst (C=50)" = "wheat4", 27 | "fst (UNC)" = "wheat4", 28 | "RDS (C)" = "gray", 29 | "RDS (UNC)" = "gray", 30 | "csv_fread" = "wheat3", 31 | "CSV" = "wheat3" 32 | ) 33 | 34 | # This is ugly but it makes the graph labels prettier 35 | munge_labels <- function (x) { 36 | sub("csv_fread", "CSV (data.table::fread)", 37 | sub("UNC", "Uncompressed", 38 | sub("feather", "Feather", 39 | sub("parquet", "Parquet", 40 | sub("[Cc]=", "ZSTD, ", 41 | sub("ZSTD", "ZSTD, 1", 42 | sub("\\(C\\)", "(GZIP)", 43 | sub("V1", "V1 (Uncompressed)", 44 | x)))))))) 45 | } 46 | names(cols) <- munge_labels(names(cols)) 47 | 48 | fix_formats <- function(x) { 49 | # This applies the pretty names and reorders the factor levels so that 50 | # they print in the order we want 51 | levels(x) <- munge_labels(levels(x)) 52 | factor(x, levels = rev(c( 53 | "Feather V1 (Uncompressed)", 54 | "Feather V2 (Uncompressed)", 55 | "Feather V2 (LZ4)", 56 | "Feather V2 (ZSTD, 1)", 57 | "Parquet (Uncompressed)", 58 | "Parquet (SNAPPY)", 59 | "RDS (Uncompressed)", 60 | "RDS (GZIP)", 61 | "CSV", 62 | "CSV (data.table::fread)", 63 | "fst (Uncompressed)", 64 | "fst (ZSTD, 50)" 65 | )) 66 | ) 67 | } 68 | 69 | benchmark_plot <- function(data) { 70 | # Since we do the same thing for most of the graphs, collect plotting logic here 71 | ggplot(data, 
aes(y=time, fill=expr, x=expr)) + 72 | facet_wrap(vars(output_type), ncol=1) + 73 | geom_col(position="dodge") + 74 | theme_minimal() + 75 | scale_fill_manual(values = cols) + 76 | coord_flip() + 77 | theme( 78 | legend.position = "none", 79 | panel.grid.major.y = element_blank() 80 | ) 81 | } 82 | 83 | 84 | ### Reading 85 | read_results$expr <- fix_formats(read_results$expr) 86 | read_results$Threads <- factor(read_results$nthreads) 87 | # All 88 | ggplot(read_results, aes(fill=Threads, y=time, x=expr)) + 89 | facet_grid(rows=vars(output_type), col=vars(dataset)) + 90 | geom_bar(position="dodge", stat="identity") + 91 | coord_flip() + 92 | theme_minimal() + 93 | theme(legend.position = "right") + 94 | labs(x = "Format", y = "Time to read (s)", title = "") 95 | ggsave("20200414_read_full.png", width=10, height=6) 96 | 97 | # Python and Arrow only 98 | read_results %>% 99 | filter(nthreads == 4 & dataset == "fanniemae" & language == "Python") %>% 100 | benchmark_plot() + 101 | scale_y_continuous(breaks = seq(0, 10, 2), limits = c(0, 10)) + 102 | labs(x = "", y = "Time to read (s)", title = "") 103 | ggsave("20200414_read_py.png", width=10, height=3) 104 | 105 | # R (and drop RDS because it is out of range) 106 | read_results %>% 107 | filter(nthreads == 4 & dataset == "fanniemae" & language == "R" & !grepl("^RDS", expr)) %>% 108 | benchmark_plot() + 109 | scale_y_continuous(breaks = seq(0, 10, 2), limits = c(0, 10)) + 110 | labs(x = "", y = "Time to read (s)", title = "") 111 | ggsave("20200414_read_r.png", width=10, height=3) 112 | 113 | ### Writing 114 | write_results$expr <- fix_formats(write_results$expr) 115 | write_results$Threads <- factor(write_results$nthreads) 116 | # All 117 | ggplot(write_results, aes(fill=Threads, y=time, x=expr)) + 118 | facet_grid(rows=vars(output_type), col=vars(dataset)) + 119 | geom_bar(position="dodge", stat="identity") + 120 | coord_flip() + 121 | labs(x = "Format", y = "Time (s)", title = "Write speeds") 122 | 
ggsave("20200414_write_full.png", width=10, height=6) 123 | 124 | # Python and Arrow only 125 | write_results %>% 126 | filter(nthreads == 4 & dataset == "fanniemae" & language == "Python") %>% 127 | benchmark_plot() + 128 | scale_y_continuous(breaks = seq(0, 12, 2), limits = c(0, 12)) + 129 | labs(x = "", y = "Time to write (s)", title = "") 130 | ggsave("20200414_write_py.png", width=10, height=3) 131 | 132 | # R 133 | write_results %>% 134 | filter(nthreads == 4 & dataset == "fanniemae" & language == "R" & !grepl("^RDS", expr)) %>% 135 | benchmark_plot() + 136 | scale_y_continuous(breaks = seq(0, 12, 2), limits = c(0, 12)) + 137 | labs(x = "", y = "Time to write (s)", title = "") 138 | ggsave("20200414_write_r.png", width=10, height=3) 139 | 140 | ### File sizes 141 | file_sizes$file_type <- fix_formats(as.factor(file_sizes$file_type)) 142 | ggplot(file_sizes[file_sizes$dataset == "fanniemae",], aes(y=size/1024, file_type, fill = file_type)) + 143 | geom_col(position="dodge") + 144 | theme_minimal() + 145 | scale_fill_manual(values = cols) + 146 | coord_flip() + 147 | theme( 148 | legend.position = "none", 149 | panel.grid.major.y = element_blank() 150 | ) + 151 | labs(y = "File size (GB)", x = "", title = "") 152 | ggsave("20200414_file_sizes.png", width=10, height=3) 153 | -------------------------------------------------------------------------------- /20190919file_benchmarks/py_read_results_1.csv: -------------------------------------------------------------------------------- 1 | ,expr,output_type,mean,dataset 2 | 0,parquet (UNC),arrow Table,6.126083183288574,fanniemae 3 | 1,parquet (UNC),pandas,9.3643874168396,fanniemae 4 | 2,parquet (SNAPPY),arrow Table,6.056532478332519,fanniemae 5 | 3,parquet (SNAPPY),pandas,9.177780771255494,fanniemae 6 | 4,feather V2 (UNC),pandas,4.354116058349609,fanniemae 7 | 5,feather V2 (LZ4),pandas,4.396533584594726,fanniemae 8 | 6,feather V2 (ZSTD),pandas,5.775776481628418,fanniemae 9 | 7,feather V2 (UNC),arrow 
Table,1.0860649585723876,fanniemae 10 | 8,feather V2 (LZ4),arrow Table,1.0962132453918456,fanniemae 11 | 9,feather V2 (ZSTD),arrow Table,2.5313239097595215,fanniemae 12 | 10,parquet (UNC),arrow Table,2.2780594348907472,nyctaxi 13 | 11,parquet (UNC),pandas,9.22245388031006,nyctaxi 14 | 12,parquet (SNAPPY),arrow Table,2.8247000694274904,nyctaxi 15 | 13,parquet (SNAPPY),pandas,9.735122680664062,nyctaxi 16 | 14,feather V2 (UNC),pandas,7.608278465270996,nyctaxi 17 | 15,feather V2 (LZ4),pandas,7.784061861038208,nyctaxi 18 | 16,feather V2 (ZSTD),pandas,9.633673095703125,nyctaxi 19 | 17,feather V2 (UNC),arrow Table,0.5403317451477051,nyctaxi 20 | 18,feather V2 (LZ4),arrow Table,0.9643253803253173,nyctaxi 21 | 19,feather V2 (ZSTD),arrow Table,2.7800182342529296,nyctaxi 22 | -------------------------------------------------------------------------------- /20190919file_benchmarks/py_read_results_4.csv: -------------------------------------------------------------------------------- 1 | ,expr,output_type,mean,dataset 2 | 0,parquet (UNC),arrow Table,1.841284704208374,fanniemae 3 | 1,parquet (UNC),pandas,4.088014888763428,fanniemae 4 | 2,parquet (SNAPPY),arrow Table,1.8786502361297608,fanniemae 5 | 3,parquet (SNAPPY),pandas,4.165652704238892,fanniemae 6 | 4,feather V2 (UNC),pandas,3.5610058307647705,fanniemae 7 | 5,feather V2 (LZ4),pandas,2.778682994842529,fanniemae 8 | 6,feather V2 (ZSTD),pandas,3.0616337299346923,fanniemae 9 | 7,feather V2 (UNC),arrow Table,1.1269856452941895,fanniemae 10 | 8,feather V2 (LZ4),arrow Table,0.48981823921203616,fanniemae 11 | 9,feather V2 (ZSTD),arrow Table,0.8093690395355224,fanniemae 12 | 10,parquet (UNC),arrow Table,0.6995339870452881,nyctaxi 13 | 11,parquet (UNC),pandas,7.4361457347869875,nyctaxi 14 | 12,parquet (SNAPPY),arrow Table,0.78084397315979,nyctaxi 15 | 13,parquet (SNAPPY),pandas,7.540273284912109,nyctaxi 16 | 14,feather V2 (UNC),pandas,7.369460582733154,nyctaxi 17 | 15,feather V2 (LZ4),pandas,7.119231033325195,nyctaxi 18 | 16,feather 
V2 (ZSTD),pandas,7.537483549118042,nyctaxi 19 | 17,feather V2 (UNC),arrow Table,0.6116453170776367,nyctaxi 20 | 18,feather V2 (LZ4),arrow Table,0.4065845012664795,nyctaxi 21 | 19,feather V2 (ZSTD),arrow Table,0.8925417900085449,nyctaxi 22 | -------------------------------------------------------------------------------- /20190919file_benchmarks/py_write_results_1.csv: -------------------------------------------------------------------------------- 1 | ,expr,output_type,mean,dataset 2 | 0,parquet (UNC),arrow Table,6.220219850540161,fanniemae 3 | 1,parquet (UNC),pandas,12.395264983177185,fanniemae 4 | 2,parquet (SNAPPY),arrow Table,6.694774866104126,fanniemae 5 | 3,parquet (SNAPPY),pandas,13.161320447921753,fanniemae 6 | 4,feather V2 (UNC),pandas,12.677234172821045,fanniemae 7 | 5,feather V2 (UNC),arrow Table,6.397535443305969,fanniemae 8 | 6,feather V2 (LZ4),pandas,8.32238781452179,fanniemae 9 | 7,feather V2 (LZ4),arrow Table,2.2326916456222534,fanniemae 10 | 8,feather V2 (ZSTD),pandas,10.61594545841217,fanniemae 11 | 9,feather V2 (ZSTD),arrow Table,4.308579444885254,fanniemae 12 | 10,parquet (UNC),arrow Table,4.5986950397491455,nyctaxi 13 | 11,parquet (UNC),pandas,9.009780049324036,nyctaxi 14 | 12,parquet (SNAPPY),arrow Table,5.70121443271637,nyctaxi 15 | 13,parquet (SNAPPY),pandas,10.175373315811157,nyctaxi 16 | 14,feather V2 (UNC),pandas,7.1334041357040405,nyctaxi 17 | 15,feather V2 (UNC),arrow Table,3.112175464630127,nyctaxi 18 | 16,feather V2 (LZ4),pandas,7.4143136739730835,nyctaxi 19 | 17,feather V2 (LZ4),arrow Table,3.567118763923645,nyctaxi 20 | 18,feather V2 (ZSTD),pandas,11.283223748207092,nyctaxi 21 | 19,feather V2 (ZSTD),arrow Table,6.928452372550964,nyctaxi 22 | -------------------------------------------------------------------------------- /20190919file_benchmarks/py_write_results_4.csv: -------------------------------------------------------------------------------- 1 | ,expr,output_type,mean,dataset 2 | 0,parquet (UNC),arrow 
Table,6.162686586380005,fanniemae 3 | 1,parquet (UNC),pandas,11.565850496292114,fanniemae 4 | 2,parquet (SNAPPY),arrow Table,6.410535216331482,fanniemae 5 | 3,parquet (SNAPPY),pandas,11.6298109292984,fanniemae 6 | 4,feather V2 (UNC),pandas,11.104193806648254,fanniemae 7 | 5,feather V2 (UNC),arrow Table,5.889622092247009,fanniemae 8 | 6,feather V2 (LZ4),pandas,6.612253308296204,fanniemae 9 | 7,feather V2 (LZ4),arrow Table,1.306950330734253,fanniemae 10 | 8,feather V2 (ZSTD),pandas,7.202290296554565,fanniemae 11 | 9,feather V2 (ZSTD),arrow Table,1.8320761919021606,fanniemae 12 | 10,parquet (UNC),arrow Table,4.338123440742493,nyctaxi 13 | 11,parquet (UNC),pandas,8.028993129730225,nyctaxi 14 | 12,parquet (SNAPPY),arrow Table,5.622675895690918,nyctaxi 15 | 13,parquet (SNAPPY),pandas,9.33586835861206,nyctaxi 16 | 14,feather V2 (UNC),pandas,6.233096599578857,nyctaxi 17 | 15,feather V2 (UNC),arrow Table,2.9943872690200806,nyctaxi 18 | 16,feather V2 (LZ4),pandas,5.67785370349884,nyctaxi 19 | 17,feather V2 (LZ4),arrow Table,2.289505124092102,nyctaxi 20 | 18,feather V2 (ZSTD),pandas,6.161942005157471,nyctaxi 21 | 19,feather V2 (ZSTD),arrow Table,2.7954366207122803,nyctaxi 22 | -------------------------------------------------------------------------------- /20190919file_benchmarks/r_read_results_1.csv: -------------------------------------------------------------------------------- 1 | "","expr","time","dataset" 2 | "1","csv_fread",17535751806,"fanniemae" 3 | "2","fst_unc",5833566950,"fanniemae" 4 | "3","fst_50",5875382178,"fanniemae" 5 | "4","feather_v1",10078519502.8,"fanniemae" 6 | "5","feather_unc",4719815545.2,"fanniemae" 7 | "6","feather_lz4",5852145495.2,"fanniemae" 8 | "7","feather_zstd",7779083610,"fanniemae" 9 | "8","parquet_unc",9493391604.8,"fanniemae" 10 | "9","parquet_snappy",9911315661.2,"fanniemae" 11 | "10","rds_unc",30267019708.2,"fanniemae" 12 | "11","rds_compressed",41482849064.2,"fanniemae" 13 | "12","csv_fread",23370041255,"nyctaxi" 14 | 
"13","fst_unc",13017416436,"nyctaxi" 15 | "14","fst_50",12634709971.4,"nyctaxi" 16 | "15","feather_v1",13443664009.4,"nyctaxi" 17 | "16","feather_unc",11371430104.2,"nyctaxi" 18 | "17","feather_lz4",13296044630,"nyctaxi" 19 | "18","feather_zstd",14594300772.2,"nyctaxi" 20 | "19","parquet_unc",13158666758.2,"nyctaxi" 21 | "20","parquet_snappy",13958228992,"nyctaxi" 22 | "21","rds_unc",22211784820.2,"nyctaxi" 23 | "22","rds_compressed",30765105346.2,"nyctaxi" 24 | -------------------------------------------------------------------------------- /20190919file_benchmarks/r_read_results_4.csv: -------------------------------------------------------------------------------- 1 | "","expr","time","dataset" 2 | "1","csv_fread",8036938666.6,"fanniemae" 3 | "2","fst_unc",6341601497.2,"fanniemae" 4 | "3","fst_50",5054754967.8,"fanniemae" 5 | "4","feather_v1",9799018014,"fanniemae" 6 | "5","feather_unc",5054201747.4,"fanniemae" 7 | "6","feather_lz4",4928118181,"fanniemae" 8 | "7","feather_zstd",5535553828.6,"fanniemae" 9 | "8","parquet_unc",6281569166.6,"fanniemae" 10 | "9","parquet_snappy",6392237692.6,"fanniemae" 11 | "10","rds_unc",29892887491.4,"fanniemae" 12 | "11","rds_compressed",41273872293.8,"fanniemae" 13 | "12","csv_fread",18312046954,"nyctaxi" 14 | "13","fst_unc",11969350465.6,"nyctaxi" 15 | "14","fst_50",13439147068.6,"nyctaxi" 16 | "15","feather_v1",12034649945,"nyctaxi" 17 | "16","feather_unc",11023961432.2,"nyctaxi" 18 | "17","feather_lz4",11592801001,"nyctaxi" 19 | "18","feather_zstd",12704684877,"nyctaxi" 20 | "19","parquet_unc",12225668849,"nyctaxi" 21 | "20","parquet_snappy",12004466381.6,"nyctaxi" 22 | "21","rds_unc",21847153904,"nyctaxi" 23 | "22","rds_compressed",30735937022.8,"nyctaxi" 24 | -------------------------------------------------------------------------------- /20190919file_benchmarks/r_write_results_1.csv: -------------------------------------------------------------------------------- 1 | "","expr","time","dataset" 2 | 
"1","fst_unc",7007023920,"fanniemae" 3 | "2","fst_50",4385196419,"fanniemae" 4 | "3","feather_v1",8656647228,"fanniemae" 5 | "4","feather_unc",10040626659,"fanniemae" 6 | "5","feather_lz4",10818098194,"fanniemae" 7 | "6","feather_zstd",11438481575,"fanniemae" 8 | "7","parquet_unc",10434816898,"fanniemae" 9 | "8","parquet_snappy",10800951873,"fanniemae" 10 | "9","rds_compressed",76929230341,"fanniemae" 11 | "10","rds_unc",24216423401,"fanniemae" 12 | "11","fst_unc",4087879250,"nyctaxi" 13 | "12","fst_50",3950344461,"nyctaxi" 14 | "13","feather_v1",5972294820,"nyctaxi" 15 | "14","feather_unc",5888590985,"nyctaxi" 16 | "15","feather_lz4",8325439328,"nyctaxi" 17 | "16","feather_zstd",10223231254,"nyctaxi" 18 | "17","parquet_unc",7715640740,"nyctaxi" 19 | "18","parquet_snappy",8585539352,"nyctaxi" 20 | "19","rds_compressed",104898052261,"nyctaxi" 21 | "20","rds_unc",10739751088,"nyctaxi" 22 | -------------------------------------------------------------------------------- /20190919file_benchmarks/r_write_results_4.csv: -------------------------------------------------------------------------------- 1 | "","expr","time","dataset" 2 | "1","fst_unc",7758567831,"fanniemae" 3 | "2","fst_50",3700873556,"fanniemae" 4 | "3","feather_v1",7080591830,"fanniemae" 5 | "4","feather_unc",10413025112,"fanniemae" 6 | "5","feather_lz4",10818213516,"fanniemae" 7 | "6","feather_zstd",11563816777,"fanniemae" 8 | "7","parquet_unc",10814584911,"fanniemae" 9 | "8","parquet_snappy",11152511189,"fanniemae" 10 | "9","rds_compressed",78427148110,"fanniemae" 11 | "10","rds_unc",24919762665,"fanniemae" 12 | "11","fst_unc",4399914353,"nyctaxi" 13 | "12","fst_50",3305661431,"nyctaxi" 14 | "13","feather_v1",5477443720,"nyctaxi" 15 | "14","feather_unc",5864371601,"nyctaxi" 16 | "15","feather_lz4",8494803995,"nyctaxi" 17 | "16","feather_zstd",10073068744,"nyctaxi" 18 | "17","parquet_unc",7675560036,"nyctaxi" 19 | "18","parquet_snappy",8428579617,"nyctaxi" 20 | "19","rds_compressed",108234060692,"nyctaxi" 
21 | "20","rds_unc",10717121094,"nyctaxi" 22 | -------------------------------------------------------------------------------- /20200402pandas_load/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import pyarrow as pa\n", 11 | "import numpy as np\n", 12 | "\n", 13 | "num_rows = 1_000_000\n", 14 | "num_columns = 100\n", 15 | "arr = np.random.randn(num_rows)\n", 16 | "dict_of_numpy_arrays = {\n", 17 | " 'f{}'.format(i): arr\n", 18 | " for i in range(num_columns)\n", 19 | "}" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 4, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 8, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "85.7 ms ± 4.55 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "%timeit df = pd.DataFrame(dict_of_numpy_arrays)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 11, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "49.7 ms ± 4.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "timeit df = pa.table(dict_of_numpy_arrays).to_pandas()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 12, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "66.9 ms ± 4.6 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "timeit df = pa.table(dict_of_numpy_arrays).to_pandas(use_threads=False)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [] 93 | } 94 | ], 95 | "metadata": { 96 | "kernelspec": { 97 | "display_name": "Python 3", 98 | "language": "python", 99 | "name": "python3" 100 | }, 101 | "language_info": { 102 | "codemirror_mode": { 103 | "name": "ipython", 104 | "version": 3 105 | }, 106 | "file_extension": ".py", 107 | "mimetype": "text/x-python", 108 | "name": "python", 109 | "nbconvert_exporter": "python", 110 | "pygments_lexer": "ipython3", 111 | "version": "3.7.6" 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 4 116 | } 117 | -------------------------------------------------------------------------------- /20200402pandas_load/Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import pyarrow as pa\n", 11 | "import numpy as np\n", 12 | "\n", 13 | "num_rows = 1_000_000\n", 14 | "num_columns = 100\n", 15 | "arr = np.random.randn(num_rows)\n", 16 | "dict_of_numpy_arrays = {\n", 17 | " 'f{}'.format(i): arr\n", 18 | " for i in range(num_columns)\n", 19 | "}" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 3, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 4, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 8, 39 | "metadata": {}, 40 | "outputs": [ 41 | { 42 | "name": "stdout", 43 | "output_type": "stream", 44 | "text": [ 45 | "85.7 ms ± 4.55 ms per loop (mean ± std. dev. 
of 7 runs, 10 loops each)\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "%timeit df = pd.DataFrame(dict_of_numpy_arrays)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 11, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "49.7 ms ± 4.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "timeit df = pa.table(dict_of_numpy_arrays).to_pandas()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 12, 73 | "metadata": {}, 74 | "outputs": [ 75 | { 76 | "name": "stdout", 77 | "output_type": "stream", 78 | "text": [ 79 | "66.9 ms ± 4.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "timeit df = pa.table(dict_of_numpy_arrays).to_pandas(use_threads=False)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": null, 90 | "metadata": {}, 91 | "outputs": [], 92 | "source": [] 93 | } 94 | ], 95 | "metadata": { 96 | "kernelspec": { 97 | "display_name": "Python 3", 98 | "language": "python", 99 | "name": "python3" 100 | }, 101 | "language_info": { 102 | "codemirror_mode": { 103 | "name": "ipython", 104 | "version": 3 105 | }, 106 | "file_extension": ".py", 107 | "mimetype": "text/x-python", 108 | "name": "python", 109 | "nbconvert_exporter": "python", 110 | "pygments_lexer": "ipython3", 111 | "version": "3.7.6" 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 4 116 | } 117 | -------------------------------------------------------------------------------- /20200509wideparquet/WideParquet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pyarrow as pa\n", 10 | "import pyarrow.parquet as pq\n", 11 | "import numpy as np\n", 12 | "import os\n", 13 | 
"import pandas as pd\n", 14 | "import time\n", 15 | "\n", 16 | "pa.set_cpu_count(8)\n", 17 | "\n", 18 | "def get_timing(f, niter):\n", 19 | " start = time.clock_gettime(time.CLOCK_REALTIME)\n", 20 | " for i in range(niter):\n", 21 | " f()\n", 22 | " result = (time.clock_gettime(time.CLOCK_REALTIME) - start) / niter\n", 23 | " return result" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [], 31 | "source": [ 32 | "total_num_values = 100_000_000" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 4, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "def make_example_table(num_cols):\n", 42 | " num_rows = total_num_values // num_cols\n", 43 | " \n", 44 | " values = np.arange(num_rows)\n", 45 | " \n", 46 | " return pa.table([values] * num_cols, \n", 47 | " names=['f{}'.format(i) for i in range(num_cols)])\n", 48 | " " 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 5, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "t1 = make_example_table(100)\n", 58 | "t2 = make_example_table(100000)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 6, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "import pyarrow.feather as fth" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 17, 73 | "metadata": {}, 74 | "outputs": [], 75 | "source": [ 76 | "fth.write_feather(t1, 't1.arrow', compression=None)\n", 77 | "fth.write_feather(t2, 't2.arrow', compression=None)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 18, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "total 2519376\r\n", 90 | "-rw------- 1 wesm wesm 800088522 May 9 16:44 t1.arrow\r\n", 91 | "-rw------- 1 wesm wesm 815199522 May 9 16:44 t2.arrow\r\n", 92 | "-rw------- 1 wesm wesm 964513790 May 9 16:23 test.parquet\r\n", 93 | "-rw------- 1 
wesm wesm 17815 May 9 16:42 WideParquet.ipynb\r\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "!ls -l" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": 15, 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "118 ms ± 1.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "%timeit fth.read_table('t1.arrow')" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 16, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "398 ms ± 10.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "%timeit fth.read_table('t2.arrow')" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 13, 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "fth.write_feather?" 
142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 27, 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "10\n", 154 | "100\n", 155 | "1000\n", 156 | "10000\n", 157 | "100000\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "num_cols_cases = [10, 100, 1000, 10000, 100000]\n", 163 | "\n", 164 | "file_path = 'test.parquet'\n", 165 | "\n", 166 | "file_sizes = {}\n", 167 | "read_times = {}\n", 168 | "\n", 169 | "for num_cols in num_cols_cases:\n", 170 | " print(num_cols)\n", 171 | "\n", 172 | " table = make_example_table(num_cols)\n", 173 | " \n", 174 | " pq.write_table(table, file_path, compression='NONE')\n", 175 | " \n", 176 | " file_sizes[num_cols] = os.stat(file_path).st_size\n", 177 | " read_times[num_cols] = get_timing(lambda: pq.read_table(file_path), 10)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 28, 183 | "metadata": {}, 184 | "outputs": [ 185 | { 186 | "data": { 187 | "text/plain": [ 188 | "{10: 802850724,\n", 189 | " 100: 827973796,\n", 190 | " 1000: 1013094899,\n", 191 | " 10000: 979191400,\n", 192 | " 100000: 964513790}" 193 | ] 194 | }, 195 | "execution_count": 28, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 | } 199 | ], 200 | "source": [ 201 | "file_sizes" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 29, 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "{10: 0.10510084629058838,\n", 213 | " 100: 0.07333686351776122,\n", 214 | " 1000: 0.12494065761566162,\n", 215 | " 10000: 0.5032524585723877,\n", 216 | " 100000: 3.8229554891586304}" 217 | ] 218 | }, 219 | "execution_count": 29, 220 | "metadata": {}, 221 | "output_type": "execute_result" 222 | } 223 | ], 224 | "source": [ 225 | "read_times" 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": 30, 231 | "metadata": {}, 232 | "outputs": [ 233 | { 234 
| "data": { 235 | "text/plain": [ 236 | "" 237 | ] 238 | }, 239 | "execution_count": 30, 240 | "metadata": {}, 241 | "output_type": "execute_result" 242 | }, 243 | { 244 | "data": { 245 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEgCAYAAACkfIiyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAQ40lEQVR4nO3dbYxcZ3nG8f9Vm6BSAqGxScEvOCoO4A8EgUlQ1aqhCHBAqkspJQE1TRRkRRBKPyAwX9pShAillSjKi2WBCfnQutCi4oIbI9EGWkEgNoUUJ3VihbxsTInDS1CLaORw98NMMuNhvDs24z27z/x/0spznvNk9s7R6tpnn7nnTKoKSdLy9wtdFyBJmg4DXZIaYaBLUiMMdElqhIEuSY0w0CWpEZ0GepJdSR5K8q0J5j4nyReS3J7kliRrF6NGSVouul6h3whsmXDuXwI3VdULgT8HPnC6ipKk5ajTQK+qLwHfHx5L8qtJbk5yIMm/JXl+/9Qm4Av9x/8KbF3EUiVpyet6hT7OTuDtVfUS4J3A9f3xbwKv7z9+HXBmkrM7qE+SlqSVXRcwLMlTgV8DPpXk8eEn9/99J3BtksuBLwEPAscWu0ZJWqqWVKDT+4vhh1X1otETVXUE+F14IvhfX1WPLHJ9krRkLaktl6r6EfDtJG8ASM/5/cerkjxe73uAXR2VKUlLUtdti38LfAV4XpK5JFcCbwauTPJN4CCDFz8vAg4luQs4B3h/ByVL0pIVb58rSW1YUlsukqRTZ6BLUiM663JZtWpVbdiwoatvL0nL0oEDBx6uqtXjznUW6Bs2bGD//v1dfXtJWpaS3Heic265SFIjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUiAUDfaHP/ezfEfEjSQ73P+/zxdMvU5K0kEneWHQjcC1w0wnOXwxs7H9dCNzQ/1fqxIbtn+u6BO695rVdl6AZtOAKfdznfo7YSu/Dm6uqbgXOSvKsaRUoSZrMNPbQ1wAPDB3P9cckSYtoGoGeMWNjb7KeZFuS/Un2Hz16dArfWpL0uGkE+hywbuh4LXBk3MSq2llVm6tq8+rVY28WJkk6RdMI9D3AZf1ul5cBj1TVd6bwvJKkk7Bgl0v/cz8vAlYlmQP+FHgSQFXtAPYCrwEOAz8GrjhdxUqSTmzBQK+qSxc4X8DbplaRpKmxhXO2+E5RSWqEgS5JjejsI+gkaTHNwvaTK3RJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREruy5A07Fh++e6LoF7r3lt1yVIM80VuiQ1YqJAT7IlyaEkh5NsH3P+6Un+Kck3kxxMcsX0S5UkzWfBQE+yArgOuBjYBFyaZNPItLcBd1TV+cBFwF8lOWPKtUqS5jHJCv0C4HBV3VNVjwK7ga0jcwo4M0mApwLfB45NtVJJ0rwmeVF0DfDA0PEccOHInGuBPcAR4EzgjVX106lUOA9fCJSkgUlW6BkzViPHrwa+ATwbeBFwbZKn/cwTJduS7E+y/+jRoyddrCTpxCYJ9Dlg3dDxWnor8WFXAJ+unsPAt4Hnjz5RVe2sqs1VtXn16tWnWrMkaYxJAv02YGOSc/svdF5Cb3tl2P3AKwCSnAM8D7hnmoVKkua34B56VR1LcjWwD1gB7Kqqg0
mu6p/fAbwPuDHJf9Lbonl3VT18GuuWJI2Y6J2iVbUX2DsytmPo8RHgVdMtTZJ0MnynqCQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakREwV6ki1JDiU5nGT7CeZclOQbSQ4m+eJ0y5QkLWTlQhOSrACuA14JzAG3JdlTVXcMzTkLuB7YUlX3J3nm6SpYkjTeJCv0C4DDVXVPVT0K7Aa2jsx5E/DpqrofoKoemm6ZkqSFTBLoa4AHho7n+mPDzgOekeSWJAeSXDatAiVJk1lwywXImLEa8zwvAV4B/CLwlSS3VtVdxz1Rsg3YBrB+/fqTr1aSdEKTrNDngHVDx2uBI2Pm3FxV/1tVDwNfAs4ffaKq2llVm6tq8+rVq0+1ZknSGJME+m3AxiTnJjkDuATYMzLnM8BvJFmZ5CnAhcCd0y1VkjSfBbdcqupYkquBfcAKYFdVHUxyVf/8jqq6M8nNwO3AT4GPVtW3TmfhkqTjTbKHTlXtBfaOjO0YOf4Q8KHplSZJOhm+U1SSGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUiIkCPcmWJIeSHE6yfZ55L03yWJLfm16JkqRJLBjoSVYA1wEXA5uAS5NsOsG8DwL7pl2kJGlhk6zQLwAOV9U9VfUosBvYOmbe24F/AB6aYn2SpAlNEuhrgAeGjuf6Y09IsgZ4HbBjeqVJkk7GJIGeMWM1cvxh4N1V9di8T5RsS7I/yf6jR49OWqMkaQIrJ5gzB6wbOl4LHBmZsxnYnQRgFfCaJMeq6h+HJ1XVTmAnwObNm0d/KUiSfg6TBPptwMYk5wIPApcAbxqeUFXnPv44yY3AZ0fDXJJ0ei0Y6FV1LMnV9LpXVgC7qupgkqv65903l6QlYJIVOlW1F9g7MjY2yKvq8p+/LEnSyfKdopLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGTBToSbYkOZTkcJLtY86/Ocnt/a8vJzl/+qVKkuazYKAnWQFcB1wMbAIuTbJpZNq3gd+sqhcC7wN2TrtQSdL8JlmhXwAcrqp7qupRYDewdXhCVX25qn7QP7wVWDvdMiVJC5kk0NcADwwdz/XHTuRK4J9/nqIkSSdv5QRzMmasxk5MXk4v0H/9BOe3AdsA1q9fP2GJkqRJTLJCnwPWDR2vBY6MTkryQuCjwNaq+t64J6qqnVW1uao2r169+lTqlSSdwCSBfhuwMcm5Sc4ALgH2DE9Ish74NPAHVXXX9MuUJC1kwS2XqjqW5GpgH7AC2FVVB5Nc1T+/A/gT4Gzg+iQAx6pq8+krW5I0apI9dKpqL7B3ZGzH0OO3AG+ZbmmSpJPhO0UlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SW
qEgS5JjZgo0JNsSXIoyeEk28ecT5KP9M/fnuTF0y9VkjSfBQM9yQrgOuBiYBNwaZJNI9MuBjb2v7YBN0y5TknSAiZZoV8AHK6qe6rqUWA3sHVkzlbgpuq5FTgrybOmXKskaR6TBPoa4IGh47n+2MnOkSSdRisnmJMxY3UKc0iyjd6WDMD/JDk0wfc/3VYBD5/qf5wPTrGS7nktBrwWA16LgaVwLZ5zohOTBPocsG7oeC1w5BTmUFU7gZ0TfM9Fk2R/VW3uuo6lwGsx4LUY8FoMLPVrMcmWy23AxiTnJjkDuATYMzJnD3BZv9vlZcAjVfWdKdcqSZrHgiv0qjqW5GpgH7AC2FVVB5Nc1T+/A9gLvAY4DPwYuOL0lSxJGmeSLReqai+90B4e2zH0uIC3Tbe0RbOktoA65rUY8FoMeC0GlvS1SC+LJUnLnW/9l6RGGOiS1AgDXZIaYaBLUiMm6nJpRZKnA+8BfgdY3R9+CPgMcE1V/bCr2rqQJPTu1bOG3jt7jwBfqxl8pTzJq+n9XAxfi89U1c2dFtahJL9Mr4ntB13X0pV+Zmzh+J+LfUs1K2Zthf5J4AfARVV1dlWdDby8P/apTitbZEleBdwN/Bm99xC8FngvcHf/3MxI8mHgHcAXgb8APtR//EdJ/rrL2hZbkvVJdic5CnwVuC3JQ/2xDd1Wt7iSXAZ8HbgIeArwS/Ty4kD/3JIzU22LSQ5V1fNO9lyLktwJXFxV946MnwvsraoXdFJYB5LcVVXnjRkPcFdVbeygrE4k+QrwYeDvq+qx/tgK4A3AH1fVy7qsbzH17zV14ehqPMkzgK+O+5np2qyt0O9L8q4k5zw+kOScJO/m+LtFzoKV9O7BM+pB4EmLXEvXfpLkgjHjLwV+stjFdGxVVf3d42EOUFWPVdVu4OwO6+pCGHOTQeCnjL8hYedmag8deCOwHfhikmf2x75L7140b+isqm7sovfn9G4Gv8zW0btXz8c6q6oblwM3JDmTwS+5dcCP+udmyYEk1wOf4Pifiz8E/qOzqrrxfuDrST7P4FqsB14JvK+zquYxU1su80lyRVV9vOs6FlP/k6d+m94LPqEXZnuq6o5OC+tIkl9h6FpU1X93XNKi69+A70p6H1pz3M8F8LGq+r8Oy1t0/e2VV3P8tdi3VF8oNtD7ktxfVeu7rkPdsONH81kuHT8zteWS5PYTnQLOOcG5JtnCOdDv6rmeXtfPg/3htcBzk7y1qj7fWXEdsIWzJ8l6el1PvwU80hvK04B/AbaPNhQsBTO1Qk/yXXp/Po3+lg3w5ap69uJX1Y0k++j9YH7i8a2F/pbD5cArquqVHZa3qOz4Gei3cJ4H3MTg9YS1wGXA3VX1jq5qW2zLseNn1gL9Y8DHq+rfx5z7m6p6UwdldcIWzoEkdwMvqKpjI+NnAHdU1XO7qWzx2cI5kOTuE/3/zneuSzO15VJVV85zbmbCvO++JO+it0L/LvRaOOmt0GethdOOn4GfJLmgqr42Mj6LLZzLruNnplboGui/er+dXjfDaAvnNUv9xZ9ps+OnJ8mLgRuAcS2cb62qA13VttiWY8ePga6fMYstnDqeLZzLk4GunzFrLZx2/BzPFs6B5dbxY6DPqAVaOM+rqicvZj1dsuNnYL4WTnpbLjPTwrkcO34M9BllC+eAHT8DtnAOLMeOn1m7OZcGPgs8taruG/m6F7il29IWnTdtG/CmbQPL7qZtrtA18+z4GUjyHuD3gXEtnJ+sqg90VdtiW44dPwa6NI9Z7PixhfN4y6njx0CX5jFrHT863nLr+Jmpd4pK43jTtgFbOAeW403bXKFr5tnxM2AL58By7PhxhS4NOn6+MXoiyS2LX06nNlTVB4cH+sF+TZIrOqqpK8uu48dA18zzpm3H8aZtA8vupm1uuUh6gi2cx1tuHT8GuqSJzGIL53LjO0UlTeq9XRewmJI8Pck1Sf4ryff6X3f2x87qur5x3EOX9ARbOI/zSXodPxeN6f
j5FLDkOn7ccpH0BFs4B5bjTdtcoUsaZgvnwLLr+HGFLkljLMeOHwNdkk7SUu34MdAl6SQt1Zu2uYcuSWMsx44fA12SxjuHeTp+Fr+chRnokjTesuv4cQ9dkhrhW/8lqREGuiQ1wkCXpEYY6JLUCANdkhrx/4TufEphIn9cAAAAAElFTkSuQmCC\n", 246 | "text/plain": [ 247 | "
" 248 | ] 249 | }, 250 | "metadata": { 251 | "needs_background": "light" 252 | }, 253 | "output_type": "display_data" 254 | } 255 | ], 256 | "source": [ 257 | "pd.Series(file_sizes).plot.bar()" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 21, 263 | "metadata": {}, 264 | "outputs": [ 265 | { 266 | "data": { 267 | "text/plain": [ 268 | "" 269 | ] 270 | }, 271 | "execution_count": 21, 272 | "metadata": {}, 273 | "output_type": "execute_result" 274 | }, 275 | { 276 | "data": { 277 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEVCAYAAADwyx6sAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAARcUlEQVR4nO3dbYxcZ3nG8f9VYwotlLR4S9LYjpEwFS9SIXJNEF9cCiUJUdMP0AYkUiIkizcVJCQa+ABFVdXQSojSQCxLCRCJkoaCQgROA1J5i9oEHNcJJAZiIWhMUmICOFi8yfTuhzlh1pPZndl4do/3mf9PGuXMc56duXO0uvb4mfucSVUhSVr/fq3vAiRJs2GgS1IjDHRJaoSBLkmNMNAlqREGuiQ14jF9vfGmTZtq27Ztfb29JK1Lt99++/eramHcvt4Cfdu2bezfv7+vt5ekdSnJd5ba55KLJDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqRG9XVgkSWtp2+Wf7rsEvn3FS1f19T1Dl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRkwM9CSPS/LlJHckuSvJu8bM2ZXkWJKD3eMdq1OuJGkp01z6/3PghVV1PMlG4JYkN1XVrSPzvlRVF82+REnSNCYGelUVcLx7urF71GoWJUlauanW0JNsSHIQeAD4bFXdNmba87tlmZuSPGumVUqSJpoq0Kvql1X1HGAzsDPJs0emHADOqao/AP4ZuGHc6yTZnWR/kv1Hjx49lbolSSNW1OVSVT8CPg+cPzL+UFUd77b3ARuTbBrz83urakdV7VhYWHj0VUuSHmGaLpeFJGd0248HXgR8fWTOmUnSbe/sXvfB2ZcrSVrKNF0uZwEfTrKBQVBfX1WfSvJagKraA7wMeF2SE8BPgUu6D1MlSWtkmi6XO4Hnjhnfs2j7SuDK2ZYmSVoJrxSVpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGjEx0JM8LsmXk9yR5K4k7xozJ0nel+RwkjuTnLs65UqSljLxS6KBnwMvrKrjSTYCtyS5qapuXTTnAmB793gecFX3X0nSGpl4hl4Dx7unG7tHjUy7GLi2m3srcEaSs2ZbqiRpOVOtoSfZkOQg8ADw2aq6bWTK2cC9i54f6cYkSWtkqkCvql9W1XOAzcDOJM8emZJxPzY6kGR3kv1J9h89enTl1UqSlrSiLpeq+hHweeD8kV1HgC2Lnm8G7hvz83urakdV7VhYWFhhqZKk5UzT5bKQ5Ixu+/HAi4Cvj0y7Ebi063Y5DzhWVffPvFpJ0pKm6XI5C/hwkg0M/gBcX1WfSvJagKraA+wDLgQOAz
8BLluleiVJS5gY6FV1J/DcMeN7Fm0X8IbZliZJWgmvFJWkRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1IhpviR6S5LPJTmU5K4kbxozZ1eSY0kOdo93rE65kqSlTPMl0SeAt1TVgSRPBG5P8tmquntk3peq6qLZlyhJmsbEM/Squr+qDnTbPwYOAWevdmGSpJVZ0Rp6km3Ac4Hbxux+fpI7ktyU5FkzqE2StALTLLkAkOQJwMeBN1fVQyO7DwDnVNXxJBcCNwDbx7zGbmA3wNatWx910ZKkR5rqDD3JRgZh/pGq+sTo/qp6qKqOd9v7gI1JNo2Zt7eqdlTVjoWFhVMsXZK02DRdLgGuBg5V1XuWmHNmN48kO7vXfXCWhUqSljfNkssLgFcBX01ysBt7O7AVoKr2AC8DXpfkBPBT4JKqqlWoV5K0hImBXlW3AJkw50rgylkVJUlaOa8UlaRGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhoxMdCTbEnyuSSHktyV5E1j5iTJ+5IcTnJnknNXp1xJ0lImfkk0cAJ4S1UdSPJE4PYkn62quxfNuQDY3j2eB1zV/VeStEYmnqFX1f1VdaDb/jFwCDh7ZNrFwLU1cCtwRpKzZl6tJGlJK1pDT7INeC5w28ius4F7Fz0/wiNDX5K0iqYO9CRPAD4OvLmqHhrdPeZHasxr7E6yP8n+o0ePrqxSSdKypgr0JBsZhPlHquoTY6YcAbYser4ZuG90UlXtraodVbVjYWHh0dQrSVrCNF0uAa4GDlXVe5aYdiNwadftch5wrKrun2GdkqQJpulyeQHwKuCrSQ52Y28HtgJU1R5gH3AhcBj4CXDZ7EuVJC1nYqBX1S2MXyNfPKeAN8yqKEnSynmlqCQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktSIiYGe5JokDyT52hL7dyU5luRg93jH7MuUJE0y8UuigQ8BVwLXLjPnS1V10UwqkiQ9KhPP0Kvqi8AP1qAWSdIpmNUa+vOT3JHkpiTPmtFrSpJWYJoll0kOAOdU1fEkFwI3ANvHTUyyG9gNsHXr1hm8tSTpYad8hl5VD1XV8W57H7AxyaYl5u6tqh1VtWNhYeFU31qStMgpB3qSM5Ok297ZveaDp/q6kqSVmbjkkuSjwC5gU5IjwDuBjQBVtQd4GfC6JCeAnwKXVFWtWsWSpLEmBnpVvWLC/isZtDVKknrklaKS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhoxMdCTXJPkgSRfW2J/krwvyeEkdyY5d/ZlSpImmeYM/UPA+cvsvwDY3j12A1edelmSpJWaGOhV9UXgB8tMuRi4tgZuBc5IctasCpQkTWcWa+hnA/cuen6kG5MkraFZBHrGjNXYicnuJPuT7D969OgM3lqS9LBZBPoRYMui55uB+8ZNrKq9VbWjqnYsLCzM4K0lSQ+bRaDfCFzadbucBxyrqvtn8LqSpBV4zKQJST4K7AI2JTkCvBPYCFBVe4B9wIXAYeAnwGWrVawkaWkTA72qXjFhfwFvmFlFkqRHxStFJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEZMFehJzk/yjSSHk1w+Zv+uJMeSHO
we75h9qZKk5Uz8kugkG4D3Ay8GjgBfSXJjVd09MvVLVXXRKtQoSZrCNGfoO4HDVfWtqvoFcB1w8eqWJUlaqWkC/Wzg3kXPj3Rjo56f5I4kNyV51kyqkyRNbeKSC5AxYzXy/ABwTlUdT3IhcAOw/REvlOwGdgNs3bp1haVKkpYzzRn6EWDLouebgfsWT6iqh6rqeLe9D9iYZNPoC1XV3qraUVU7FhYWTqFsSdKoac7QvwJsT/JU4LvAJcArF09IcibwvaqqJDsZ/KF4cNbFSlqZbZd/uu8S+PYVL+27hLkxMdCr6kSSNwI3AxuAa6rqriSv7fbvAV4GvC7JCeCnwCVVNbosI0laRdOcoT+8jLJvZGzPou0rgStnW5okaSW8UlSSGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGTHU/dGk98Vt6NK88Q5ekRqzrM3TPxCRpaF0Huob84yZpqiWXJOcn+UaSw0kuH7M/Sd7X7b8zybmzL1WStJyJgZ5kA/B+4ALgmcArkjxzZNoFwPbusRu4asZ1SpImmOYMfSdwuKq+VVW/AK4DLh6ZczFwbQ3cCpyR5KwZ1ypJWsY0gX42cO+i50e6sZXOkSStomk+FM2YsXoUc0iym8GSDMDxJN+Y4v1X2ybg+4/2h/PuGVbSP4/FkMdiyGMxdDoci3OW2jFNoB8Btix6vhm471HMoar2AnuneM81k2R/Ve3ou47TgcdiyGMx5LEYOt2PxTRLLl8Btid5apLHApcAN47MuRG4tOt2OQ84VlX3z7hWSdIyJp6hV9WJJG8EbgY2ANdU1V1JXtvt3wPsAy4EDgM/AS5bvZIlSeNMdWFRVe1jENqLx/Ys2i7gDbMtbc2cVktAPfNYDHkshjwWQ6f1scggiyVJ650355KkRhjoktQIA12SGmGgS1Ij5ur2uUmeBLwN+DNgoRt+APgkcEVV/aiv2vqQJAzu1XM2gyt77wO+XHP4SXmSlzD4vVh8LD5ZVf/ea2E9SvI7DJrYfth3LX3pMuN8Tv69uPl0zYp5O0O/HvghsKuqnlxVTwb+qBv7WK+VrbEkfwLcA/wNg2sIXgq8C7in2zc3krwXeBPwBeAfgH/stv8qyT/1WdtaS7I1yXVJjgK3AV9J8kA3tq3f6tZWkkuBA8Au4DeA32SQF7d3+047c9W2mOQbVfX7K93XoiSHgAuq6tsj408F9lXVM3oprAdJvllVTx8zHuCbVbW9h7J6keS/gPcC/1ZVv+zGNgAvB95cVef1Wd9a6u419bzRs/Ekvw3cNu53pm/zdob+nSRvTfKUhweSPCXJX3Py3SLnwWMY3INn1HeBjWtcS99+lmTnmPE/BH621sX0bFNV/evDYQ5QVb+squuAJ/dYVx/CmJsMAv/H+BsS9m6u1tCBvwAuB76Q5He7se8xuBfNy3urqh/XMPjn9HUM/5htYXCvnqt7q6ofrwauSvJEhn/ktgAPdfvmye1JPgB8mJN/L/4S+O/equrH3wEHknyG4bHYCrwY+NveqlrGXC25LCfJZVX1wb7rWEvdN0/9KYMPfMIgzG6sqrt7LawnSc5k0bGoqv/tuaQ1192A7zUMvrTmpN8L4Oqq+nmP5a25bnnlJZx8LG4+XT8oNtA7Sf6nqrb2XYf6YcePlrNeOn7masklyZ1L7QKessS+JtnCOdR19XyAQdfPd7vhzcDTkry+qj7TW3E9sIVzIMlWBl1PLwSODYbyW8B/AJePNhScDubqDD3J9xj882n0r2yA/6yq31v7qvqR5GYGv5gffnhpoVtyeDXwx1X14h7LW1N2/Ax1LZxPB65l+HnCZuBS4J6qelNfta219djxM2+BfjXwwaq6Zcy+f6mqV/ZQVi9s4RxKcg/wjKo6MTL+WODuqnpaP5WtPVs4h5Lcs9T/73L7+jRXSy5V9Zpl9s1NmHe+k+StDM7QvweDFk
4GZ+jz1sJpx8/Qz5LsrKovj4zPYwvnuuv4maszdA11n95fzqCbYbSF84rT/cOfWbPjZyDJucBVwLgWztdX1e191bbW1mPHj4GuR5jHFk6dzBbO9clA1yPMWwunHT8ns4VzaL11/Bjoc2pCC+fTq+rX17KePtnxM7RcCyeDJZe5aeFcjx0/BvqcsoVzyI6fIVs4h9Zjx8+83ZxLQ58CnlBV3xl5fBv4fL+lrTlv2jbkTduG1t1N2zxD19yz42coyduAPwfGtXBeX1V/31dta209dvwY6NIy5rHjxxbOk62njh8DXVrGvHX86GTrreNnrq4Ulcbxpm1DtnAOrcebtnmGrrlnx8+QLZxD67HjxzN0adjxc3B0R5LPr305vdpWVe9ePNAF+xVJLuuppr6su44fA11zz5u2ncSbtg2tu5u2ueQi6Vds4TzZeuv4MdAlTWUeWzjXG68UlTStd/VdwFpK8qQkVyT5epIHu8ehbuyMvusbxzV0Sb9iC+dJrmfQ8bNrTMfPx4DTruPHJRdJv2IL59B6vGmbZ+iSFrOFc2jddfx4hi5JY6zHjh8DXZJW6HTt+DHQJWmFTtebtrmGLkljrMeOHwNdksZ7Cst0/Kx9OZMZ6JI03rrr+HENXZIa4aX/ktQIA12SGmGgS1IjDHRJaoSBLkmN+H+NiAucWRIu8wAAAABJRU5ErkJggg==\n", 278 | "text/plain": [ 279 | "
class memory_use:
    """Context manager that reports Arrow memory growth, in units of ``GB``.

    Baselines (currently allocated bytes and the default pool's peak
    allocation) are captured when the object is constructed.  On exit the
    garbage collector is run and the change in each figure is printed,
    divided by the module-level ``GB`` constant.

    Exceptions raised inside the ``with`` body are never suppressed.
    """

    def __init__(self):
        self.start_use = pa.total_allocated_bytes()
        self.pool = pa.default_memory_pool()
        self.start_peak_use = self.pool.max_memory()

    def __enter__(self):
        # Return the instance so ``with memory_use() as m`` binds the
        # tracker itself rather than None.
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Collect first so unreachable Python objects that still hold Arrow
        # buffers do not inflate the "current use" figure.
        gc.collect()
        print("Change in memory use: {}"
              .format((pa.total_allocated_bytes() - self.start_use) / GB))

        peak = self.pool.max_memory()
        if peak is None or self.start_peak_use is None:
            # NOTE(review): pyarrow documents max_memory() as returning None
            # when the pool cannot compute a peak -- confirm for the pool
            # backends in use.  Subtracting None would raise TypeError.
            print("Change in peak use: unknown")
        else:
            print("Change in peak use: {}"
                  .format((peak - self.start_peak_use) / GB))
def _sequential_indices(nunique, length):
    """Return ``length`` int32 indices covering 0..nunique-1 in sorted runs.

    When ``length`` is a multiple of ``nunique`` this is identical to
    ``np.arange(nunique).repeat(length // nunique)``.  Otherwise the runs
    differ in size by at most one element so that exactly ``length`` indices
    are produced instead of silently truncating.
    """
    if nunique < 1:
        raise ValueError("nunique must be at least 1")
    # Position i is assigned unique number floor(i * nunique / length).
    # int64 arithmetic keeps the product from overflowing; max() guards the
    # division for an empty request.
    positions = np.arange(length, dtype='i8')
    return (positions * nunique // max(length, 1)).astype('i4')


def generate_strings(string_size, nunique, length, random_order=True):
    """Build an object array of ``length`` strings drawn from a small pool.

    Parameters
    ----------
    string_size : int
        Passed to ``rands`` for every pool entry.
    nunique : int
        Number of pool entries generated with ``rands``.
    length : int
        Number of elements in the returned array.
    random_order : bool, default True
        If True, pool entries are picked uniformly at random; otherwise each
        pool entry appears in one contiguous run, in pool order.

    Returns
    -------
    numpy.ndarray of dtype object
    """
    uniques = np.array([rands(string_size) for _ in range(nunique)],
                       dtype='O')
    if random_order:
        indices = np.random.randint(0, nunique, size=length).astype('i4')
    else:
        # The previous ``uniques.repeat(length // nunique)`` returned fewer
        # than ``length`` values whenever length % nunique != 0.
        indices = _sequential_indices(nunique, length)
    return uniques.take(indices)


def generate_dict_strings(string_size, nunique, length, random_order=True):
    """Dictionary-encoded counterpart of ``generate_strings``.

    Takes the same parameters and returns a ``pa.DictionaryArray`` whose
    dictionary is the pool of ``nunique`` strings and whose int32 indices
    follow the same random / sequential layout.
    """
    uniques = np.array([rands(string_size) for _ in range(nunique)],
                       dtype='O')
    if random_order:
        indices = np.random.randint(0, nunique, size=length).astype('i4')
    else:
        indices = _sequential_indices(nunique, length)
    return pa.DictionaryArray.from_arrays(indices, uniques)
def get_timing(f, niter):
    """Return the mean wall-clock seconds per call of ``f`` over ``niter`` runs.

    Parameters
    ----------
    f : callable
        Zero-argument callable to time.
    niter : int
        Number of consecutive calls; must be at least 1.

    Returns
    -------
    float
        Total elapsed time divided by ``niter``.

    Raises
    ------
    ValueError
        If ``niter`` is less than 1.
    """
    if niter < 1:
        raise ValueError("niter must be at least 1")

    gc_was_enabled = gc.isenabled()
    # Switch the collector off *before* starting the clock so the toggle
    # itself is not part of the measurement.
    gc.disable()
    try:
        # perf_counter is monotonic and available on every platform, unlike
        # clock_gettime(CLOCK_REALTIME), which is Unix-only and can jump when
        # the system clock is adjusted.
        start = time.perf_counter()
        for _ in range(niter):
            f()
        elapsed = time.perf_counter() - start
    finally:
        # Restore the collector even if ``f`` raised.
        if gc_was_enabled:
            gc.enable()
    gc.collect()
    return elapsed / niter
# Top-level experiment: repeatedly read a CSV with pandas, write it back out
# as Parquet, and log the process RSS (via log_) after every step so memory
# growth across iterations can be observed.

# Not referenced by any of the statements below.
path = '/home/wesm/Downloads/big.snappy.parquet'

# Gzipped CSV that is re-read on every iteration of the first loop.
CSV_PATH = '/home/wesm/Downloads/50mb.csv.gz'

# NOTE(review): presumably tells jemalloc to hand freed pages back to the OS
# immediately, so RSS reflects live allocations -- confirm against the
# pyarrow documentation.
pa.jemalloc_set_decay_ms(0)

log_("Starting")

# Ten rounds of read -> write -> pause, logging RSS after each stage.
for i in range(10):
    df = pd.read_csv(CSV_PATH)
    log_("Read CSV")

    df.to_parquet('out.parquet')
    log_("Wrote Parquet")

    time.sleep(1)
    log_(f"Waited 1 second")

# for i in range(10):
#     time.sleep(0.1)
#     elapsed = "%.2f" % (0.1 * (i + 1))
#     log_(f"{elapsed} seconds elapsed")


# Idle for ten more seconds, logging RSS once per second.  ``df`` from the
# final iteration above is still referenced while this runs.
for i in range(10):
    time.sleep(1)
    log_(f"Waited 1 second")