├── 20190510-RStats-NYC.ipynb
├── 20190725-Parquet-Dict-Benchmark.ipynb
├── 20190726-Parquet-Float-Encoding.ipynb
├── 20190802-Parquet-Dict-Decoding-Benchmark.ipynb
├── 20190803-ARROW-6060.ipynb
├── 20190815-Parquet-Direct-Dictionary-Write.ipynb
├── 20190830-VLDB-FlightDemo.ipynb
├── 20190919file_benchmarks
│   ├── FeatherCompression.ipynb
│   ├── all_read_results.csv
│   ├── all_results.csv
│   ├── all_write_results.csv
│   ├── benchmark.R
│   ├── benchmark.py
│   ├── file_sizes.csv
│   ├── generate_results.sh
│   ├── glue_results.py
│   ├── i9-9880H-1
│   │   ├── all_results.csv
│   │   ├── plot.png
│   │   ├── py_results.csv
│   │   └── r_results.csv
│   ├── i9-9880H-4
│   │   ├── all_results.csv
│   │   ├── plot.png
│   │   ├── py_results.csv
│   │   └── r_results.csv
│   ├── i9-9880H-8
│   │   ├── all_results.csv
│   │   ├── plot.png
│   │   ├── py_results.csv
│   │   └── r_results.csv
│   ├── make_feather_plots.R
│   ├── make_plots.R
│   ├── py_read_results_1.csv
│   ├── py_read_results_4.csv
│   ├── py_write_results_1.csv
│   ├── py_write_results_4.csv
│   ├── r_read_results_1.csv
│   ├── r_read_results_4.csv
│   ├── r_write_results_1.csv
│   └── r_write_results_4.csv
├── 20200402pandas_load
│   ├── .ipynb_checkpoints
│   │   └── Untitled-checkpoint.ipynb
│   └── Untitled.ipynb
├── 20200509wideparquet
│   └── WideParquet.ipynb
├── peak_use.py
└── scripts
    ├── 20190903_parquet_benchmark.py
    └── arrow7305.py


/20190725-Parquet-Dict-Benchmark.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": null,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": []
 9 |   }
10 |  ],
11 |  "metadata": {
12 |   "kernelspec": {
13 |    "display_name": "Python 3",
14 |    "language": "python",
15 |    "name": "python3"
16 |   },
17 |   "language_info": {
18 |    "codemirror_mode": {
19 |     "name": "ipython",
20 |     "version": 3
21 |    },
22 |    "file_extension": ".py",
23 |    "mimetype": "text/x-python",
24 |    "name": "python",
25 |    "nbconvert_exporter": "python",
26 |    "pygments_lexer": "ipython3",
27 |    "version": "3.7.3"
28 |   }
29 |  },
30 |  "nbformat": 4,
31 |  "nbformat_minor": 2
32 | }
33 | 


--------------------------------------------------------------------------------
/20190726-Parquet-Float-Encoding.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 1,
 6 |    "metadata": {},
 7 |    "outputs": [],
 8 |    "source": []
 9 |   },
10 |   {
11 |    "cell_type": "code",
12 |    "execution_count": 1,
13 |    "metadata": {},
14 |    "outputs": [],
15 |    "source": [
16 |     "import pyarrow as pa\n",
17 |     "import pyarrow.parquet as pq\n",
18 |     "import numpy as np\n",
19 |     "import pandas as pd\n",
20 |     "arr = pa.array([np.nan] * 10000000)\n",
21 |     "t = pa.Table.from_arrays([arr], names=['f0'])"
22 |    ]
23 |   },
24 |   {
25 |    "cell_type": "code",
26 |    "execution_count": 2,
27 |    "metadata": {},
28 |    "outputs": [
29 |     {
30 |      "name": "stdout",
31 |      "output_type": "stream",
32 |      "text": [
33 |       "88.1 ms ± 1.68 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
34 |      ]
35 |     }
36 |    ],
37 |    "source": [
38 |     "%timeit pq.write_table(t, '/home/wesm/tmp/nans.parquet')"
39 |    ]
40 |   },
41 |   {
42 |    "cell_type": "code",
43 |    "execution_count": null,
44 |    "metadata": {},
45 |    "outputs": [],
46 |    "source": []
47 |   }
48 |  ],
49 |  "metadata": {
50 |   "kernelspec": {
51 |    "display_name": "Python 3",
52 |    "language": "python",
53 |    "name": "python3"
54 |   },
55 |   "language_info": {
56 |    "codemirror_mode": {
57 |     "name": "ipython",
58 |     "version": 3
59 |    },
60 |    "file_extension": ".py",
61 |    "mimetype": "text/x-python",
62 |    "name": "python",
63 |    "nbconvert_exporter": "python",
64 |    "pygments_lexer": "ipython3",
65 |    "version": "3.7.3"
66 |   }
67 |  },
68 |  "nbformat": 4,
69 |  "nbformat_minor": 2
70 | }
71 | 


--------------------------------------------------------------------------------
/20190802-Parquet-Dict-Decoding-Benchmark.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 5,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import numpy as np\n",
 10 |     "import pyarrow as pa\n",
 11 |     "import pyarrow.parquet as pq\n",
 12 |     "import pandas as pd\n",
 13 |     "from pandas.util.testing import rands\n",
 14 |     "        \n",
 15 |     "NUNIQUE = 1000\n",
 16 |     "STRING_SIZE = 50\n",
 17 |     "LENGTH = 10_000_000\n",
 18 |     "REPEATS = LENGTH // NUNIQUE\n",
 19 |     "\n",
 20 |     "data = [rands(STRING_SIZE) for i in range(NUNIQUE)] * REPEATS\n",
 21 |     "table = pa.table([data], names=['f0'])\n",
 22 |     "\n",
 23 |     "out_stream = pa.BufferOutputStream()\n",
 24 |     "pq.write_table(table, out_stream)\n",
 25 |     "contents = out_stream.getvalue()"
 26 |    ]
 27 |   },
 28 |   {
 29 |    "cell_type": "code",
 30 |    "execution_count": 6,
 31 |    "metadata": {},
 32 |    "outputs": [
 33 |     {
 34 |      "data": {
 35 |       "text/plain": [
 36 |        "1129939"
 37 |       ]
 38 |      },
 39 |      "execution_count": 6,
 40 |      "metadata": {},
 41 |      "output_type": "execute_result"
 42 |     }
 43 |    ],
 44 |    "source": [
 45 |     "len(contents)"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": 12,
 51 |    "metadata": {},
 52 |    "outputs": [
 53 |     {
 54 |      "name": "stdout",
 55 |      "output_type": "stream",
 56 |      "text": [
 57 |       "0\n"
 58 |      ]
 59 |     }
 60 |    ],
 61 |    "source": [
 62 |     "import gc\n",
 63 |     "class memory_use:\n",
 64 |     "    \n",
 65 |     "    def __init__(self):\n",
 66 |     "        self.start_use = pa.total_allocated_bytes()\n",
 67 |     "    \n",
 68 |     "    def __enter__(self):\n",
 69 |     "        return\n",
 70 |     "    \n",
 71 |     "    def __exit__(self, type, value, traceback):\n",
 72 |     "        gc.collect()\n",
 73 |     "        print(pa.total_allocated_bytes() - self.start_use)"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "code",
 78 |    "execution_count": 13,
 79 |    "metadata": {},
 80 |    "outputs": [
 81 |     {
 82 |      "name": "stdout",
 83 |      "output_type": "stream",
 84 |      "text": [
 85 |       "541250112\n"
 86 |      ]
 87 |     }
 88 |    ],
 89 |    "source": [
 90 |     "with memory_use():\n",
 91 |     "    memory_use_no_dict = pq.read_table(pa.BufferReader(contents))"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 15,
 97 |    "metadata": {},
 98 |    "outputs": [
 99 |     {
100 |      "name": "stdout",
101 |      "output_type": "stream",
102 |      "text": [
103 |       "41304128\n"
104 |      ]
105 |     }
106 |    ],
107 |    "source": [
108 |     "with memory_use():\n",
109 |     "    memory_use_dict = pq.read_table(pa.BufferReader(contents), read_dictionary=['f0'])"
110 |    ]
111 |   },
112 |   {
113 |    "cell_type": "code",
114 |    "execution_count": 16,
115 |    "metadata": {},
116 |    "outputs": [
117 |     {
118 |      "name": "stdout",
119 |      "output_type": "stream",
120 |      "text": [
121 |       "1.79 s ± 7.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
122 |      ]
123 |     }
124 |    ],
125 |    "source": [
126 |     "%timeit memory_use_no_dict = pq.read_table(pa.BufferReader(contents))"
127 |    ]
128 |   },
129 |   {
130 |    "cell_type": "code",
131 |    "execution_count": 17,
132 |    "metadata": {},
133 |    "outputs": [
134 |     {
135 |      "name": "stdout",
136 |      "output_type": "stream",
137 |      "text": [
138 |       "106 ms ± 11.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
139 |      ]
140 |     }
141 |    ],
142 |    "source": [
143 |     "%timeit memory_use_dict = pq.read_table(pa.BufferReader(contents), read_dictionary=['f0'])"
144 |    ]
145 |   },
146 |   {
147 |    "cell_type": "code",
148 |    "execution_count": 18,
149 |    "metadata": {},
150 |    "outputs": [
151 |     {
152 |      "data": {
153 |       "text/plain": [
154 |        "516.1763305664062"
155 |       ]
156 |      },
157 |      "execution_count": 18,
158 |      "metadata": {},
159 |      "output_type": "execute_result"
160 |     }
161 |    ],
162 |    "source": [
163 |     "541250112 / (1 << 20)"
164 |    ]
165 |   },
166 |   {
167 |    "cell_type": "code",
168 |    "execution_count": 19,
169 |    "metadata": {},
170 |    "outputs": [
171 |     {
172 |      "data": {
173 |       "text/plain": [
174 |        "39.39068603515625"
175 |       ]
176 |      },
177 |      "execution_count": 19,
178 |      "metadata": {},
179 |      "output_type": "execute_result"
180 |     }
181 |    ],
182 |    "source": [
183 |     "41304128 / (1 << 20)"
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": null,
189 |    "metadata": {},
190 |    "outputs": [],
191 |    "source": []
192 |   }
193 |  ],
194 |  "metadata": {
195 |   "kernelspec": {
196 |    "display_name": "Python 3",
197 |    "language": "python",
198 |    "name": "python3"
199 |   },
200 |   "language_info": {
201 |    "codemirror_mode": {
202 |     "name": "ipython",
203 |     "version": 3
204 |    },
205 |    "file_extension": ".py",
206 |    "mimetype": "text/x-python",
207 |    "name": "python",
208 |    "nbconvert_exporter": "python",
209 |    "pygments_lexer": "ipython3",
210 |    "version": "3.7.3"
211 |   }
212 |  },
213 |  "nbformat": 4,
214 |  "nbformat_minor": 2
215 | }
216 | 


--------------------------------------------------------------------------------
/20190803-ARROW-6060.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stdout",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "Change in memory use: 76502016\n",
 13 |       "Change in peak use: 5843859776\n"
 14 |      ]
 15 |     }
 16 |    ],
 17 |    "source": [
 18 |     "import pandas as pd\n",
 19 |     "from pandas.util.testing import rands\n",
 20 |     "\n",
 21 |     "import pyarrow as pa\n",
 22 |     "import pyarrow.parquet as pq\n",
 23 |     "\n",
 24 |     "import gc\n",
 25 |     "class memory_use:\n",
 26 |     "    \n",
 27 |     "    def __init__(self):\n",
 28 |     "        self.start_use = pa.total_allocated_bytes()        \n",
 29 |     "        self.pool = pa.default_memory_pool()\n",
 30 |     "        self.start_peak_use = self.pool.max_memory()\n",
 31 |     "        \n",
 32 |     "    def __enter__(self):\n",
 33 |     "        return\n",
 34 |     "    \n",
 35 |     "    def __exit__(self, type, value, traceback):\n",
 36 |     "        gc.collect()\n",
 37 |     "        print(\"Change in memory use: {}\"\n",
 38 |     "              .format(pa.total_allocated_bytes() - self.start_use))\n",
 39 |     "        print(\"Change in peak use: {}\"\n",
 40 |     "              .format(self.pool.max_memory() - self.start_peak_use))\n",
 41 |     "\n",
 42 |     "with memory_use():\n",
 43 |     "    table = pq.read_table('/home/wesm/Downloads/demofile.parquet')        "
 44 |    ]
 45 |   },
 46 |   {
 47 |    "cell_type": "code",
 48 |    "execution_count": 2,
 49 |    "metadata": {},
 50 |    "outputs": [
 51 |     {
 52 |      "name": "stdout",
 53 |      "output_type": "stream",
 54 |      "text": [
 55 |       "Change in memory use: 34499968\n",
 56 |       "Change in peak use: 5801857728\n"
 57 |      ]
 58 |     }
 59 |    ],
 60 |    "source": [
 61 |     "with memory_use():\n",
 62 |     "    table = pq.read_table('/home/wesm/Downloads/demofile.parquet', columns=['body'])"
 63 |    ]
 64 |   },
 65 |   {
 66 |    "cell_type": "code",
 67 |    "execution_count": 5,
 68 |    "metadata": {},
 69 |    "outputs": [
 70 |     {
 71 |      "data": {
 72 |       "text/plain": [
 73 |        "pyarrow.Table\n",
 74 |        "archived: bool\n",
 75 |        "author: string\n",
 76 |        "author_flair_css_class: string\n",
 77 |        "author_flair_text: string\n",
 78 |        "body: string\n",
 79 |        "controversiality: int64\n",
 80 |        "created_utc: string\n",
 81 |        "distinguished: string\n",
 82 |        "downs: int64\n",
 83 |        "edited: string\n",
 84 |        "gilded: int64\n",
 85 |        "id: string\n",
 86 |        "link_id: string\n",
 87 |        "name: string\n",
 88 |        "parent_id: string\n",
 89 |        "retrieved_on: int64\n",
 90 |        "score: int64\n",
 91 |        "score_hidden: bool\n",
 92 |        "subreddit: string\n",
 93 |        "subreddit_id: string\n",
 94 |        "ups: int64"
 95 |       ]
 96 |      },
 97 |      "execution_count": 5,
 98 |      "metadata": {},
 99 |      "output_type": "execute_result"
100 |     }
101 |    ],
102 |    "source": [
103 |     "table"
104 |    ]
105 |   },
106 |   {
107 |    "cell_type": "code",
108 |    "execution_count": null,
109 |    "metadata": {},
110 |    "outputs": [],
111 |    "source": [
112 |     "def generate_strings(length, nunique, string_length=10):\n",
113 |     "    unique_values = [rands(string_length) for i in range(nunique)]\n",
114 |     "    values = unique_values * (length // nunique)\n",
115 |     "    return values\n",
116 |     "\n",
117 |     "df = pd.DataFrame()\n",
118 |     "df['a'] = generate_strings(100000000, 10000)\n",
119 |     "df['b'] = generate_strings(100000000, 10000)\n",
120 |     "df.to_parquet('/tmp/test.parquet')"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": 2,
126 |    "metadata": {},
127 |    "outputs": [
128 |     {
129 |      "name": "stdout",
130 |      "output_type": "stream",
131 |      "text": [
132 |       "Change in memory use: 825560448\n",
133 |       "Change in peak use: 1484772224\n"
134 |      ]
135 |     }
136 |    ],
137 |    "source": [
138 |     "with memory_use():\n",
139 |     "    table = pq.read_table('/tmp/test.parquet', read_dictionary=['a', 'b'])"
140 |    ]
141 |   },
142 |   {
143 |    "cell_type": "code",
144 |    "execution_count": 2,
145 |    "metadata": {},
146 |    "outputs": [],
147 |    "source": [
148 |     "pool = pa.default_memory_pool()"
149 |    ]
150 |   },
151 |   {
152 |    "cell_type": "code",
153 |    "execution_count": 3,
154 |    "metadata": {},
155 |    "outputs": [
156 |     {
157 |      "data": {
158 |       "text/plain": [
159 |        "0"
160 |       ]
161 |      },
162 |      "execution_count": 3,
163 |      "metadata": {},
164 |      "output_type": "execute_result"
165 |     }
166 |    ],
167 |    "source": [
168 |     "pool.max_memory()"
169 |    ]
170 |   },
171 |   {
172 |    "cell_type": "code",
173 |    "execution_count": 4,
174 |    "metadata": {},
175 |    "outputs": [],
176 |    "source": []
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": 7,
181 |    "metadata": {},
182 |    "outputs": [
183 |     {
184 |      "name": "stdout",
185 |      "output_type": "stream",
186 |      "text": [
187 |       "-rw-r--r-- 1 wesm wesm 274263652 Aug  3 14:19 /tmp/test.parquet\r\n"
188 |      ]
189 |     }
190 |    ],
191 |    "source": [
192 |     "!ls -l /tmp/*.parquet"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "code",
197 |    "execution_count": 8,
198 |    "metadata": {},
199 |    "outputs": [
200 |     {
201 |      "name": "stdout",
202 |      "output_type": "stream",
203 |      "text": [
204 |       "Change in memory use: 2825000192\n",
205 |       "Change in peak use: 3827684608\n"
206 |      ]
207 |     }
208 |    ],
209 |    "source": []
210 |   },
211 |   {
212 |    "cell_type": "code",
213 |    "execution_count": 10,
214 |    "metadata": {},
215 |    "outputs": [
216 |     {
217 |      "data": {
218 |       "text/plain": [
219 |        "20585786752"
220 |       ]
221 |      },
222 |      "execution_count": 10,
223 |      "metadata": {},
224 |      "output_type": "execute_result"
225 |     }
226 |    ],
227 |    "source": [
228 |     "pool.max_memory()"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": null,
234 |    "metadata": {},
235 |    "outputs": [],
236 |    "source": []
237 |   }
238 |  ],
239 |  "metadata": {
240 |   "kernelspec": {
241 |    "display_name": "Python 3",
242 |    "language": "python",
243 |    "name": "python3"
244 |   },
245 |   "language_info": {
246 |    "codemirror_mode": {
247 |     "name": "ipython",
248 |     "version": 3
249 |    },
250 |    "file_extension": ".py",
251 |    "mimetype": "text/x-python",
252 |    "name": "python",
253 |    "nbconvert_exporter": "python",
254 |    "pygments_lexer": "ipython3",
255 |    "version": "3.7.3"
256 |   }
257 |  },
258 |  "nbformat": 4,
259 |  "nbformat_minor": 2
260 | }
261 | 


--------------------------------------------------------------------------------
/20190815-Parquet-Direct-Dictionary-Write.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import numpy as np\n",
 10 |     "import pyarrow as pa\n",
 11 |     "import pyarrow.parquet as pq\n",
 12 |     "import pandas as pd\n",
 13 |     "from pandas.util.testing import rands\n",
 14 |     "        \n",
 15 |     "NUNIQUE = 1000\n",
 16 |     "STRING_SIZE = 50\n",
 17 |     "LENGTH = 10_000_000\n",
 18 |     "REPEATS = LENGTH // NUNIQUE\n",
 19 |     "\n",
 20 |     "uniques = np.array([rands(STRING_SIZE) for i in range(NUNIQUE)], dtype='O')\n",
 21 |     "indices = np.random.randint(0, NUNIQUE, size=LENGTH).astype('i4')        \n",
 22 |     "data = uniques.take(indices)"
 23 |    ]
 24 |   },
 25 |   {
 26 |    "cell_type": "code",
 27 |    "execution_count": 2,
 28 |    "metadata": {},
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "import gc\n",
 32 |     "class memory_use:\n",
 33 |     "    \n",
 34 |     "    def __init__(self):\n",
 35 |     "        self.start_use = pa.total_allocated_bytes()        \n",
 36 |     "        self.pool = pa.default_memory_pool()\n",
 37 |     "        self.start_peak_use = self.pool.max_memory()\n",
 38 |     "        \n",
 39 |     "    def __enter__(self):\n",
 40 |     "        return\n",
 41 |     "    \n",
 42 |     "    def __exit__(self, type, value, traceback):\n",
 43 |     "        gc.collect()\n",
 44 |     "        print(\"Change in memory use: {}\"\n",
 45 |     "              .format(pa.total_allocated_bytes() - self.start_use))\n",
 46 |     "        print(\"Change in peak use: {}\"\n",
 47 |     "              .format(self.pool.max_memory() - self.start_peak_use))"
 48 |    ]
 49 |   },
 50 |   {
 51 |    "cell_type": "code",
 52 |    "execution_count": 3,
 53 |    "metadata": {},
 54 |    "outputs": [],
 55 |    "source": [
 56 |     "dict_data = pa.DictionaryArray.from_arrays(indices, uniques)"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "code",
 61 |    "execution_count": 4,
 62 |    "metadata": {},
 63 |    "outputs": [
 64 |     {
 65 |      "data": {
 66 |       "text/plain": [
 67 |        "72320"
 68 |       ]
 69 |      },
 70 |      "execution_count": 4,
 71 |      "metadata": {},
 72 |      "output_type": "execute_result"
 73 |     }
 74 |    ],
 75 |    "source": [
 76 |     "pa.default_memory_pool().max_memory()"
 77 |    ]
 78 |   },
 79 |   {
 80 |    "cell_type": "code",
 81 |    "execution_count": 5,
 82 |    "metadata": {},
 83 |    "outputs": [
 84 |     {
 85 |      "name": "stdout",
 86 |      "output_type": "stream",
 87 |      "text": [
 88 |       "Change in memory use: 16777216\n",
 89 |       "Change in peak use: 753475648\n"
 90 |      ]
 91 |     }
 92 |    ],
 93 |    "source": [
 94 |     "table = pa.table([dict_data], names=['f0'])\n",
 95 |     "with memory_use():\n",
 96 |     "    out_stream = pa.BufferOutputStream()\n",
 97 |     "    pq.write_table(table, out_stream)\n",
 98 |     "    contents = out_stream.getvalue()"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 6,
104 |    "metadata": {},
105 |    "outputs": [
106 |     {
107 |      "name": "stdout",
108 |      "output_type": "stream",
109 |      "text": [
110 |       "820 ms ± 11.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
111 |      ]
112 |     }
113 |    ],
114 |    "source": [
115 |     "%%timeit\n",
116 |     "out_stream = pa.BufferOutputStream()\n",
117 |     "pq.write_table(table, out_stream)\n",
118 |     "contents = out_stream.getvalue()"
119 |    ]
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": 7,
124 |    "metadata": {},
125 |    "outputs": [
126 |     {
127 |      "data": {
128 |       "text/plain": [
129 |        "12576182"
130 |       ]
131 |      },
132 |      "execution_count": 7,
133 |      "metadata": {},
134 |      "output_type": "execute_result"
135 |     }
136 |    ],
137 |    "source": [
138 |     "len(contents)"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": 8,
144 |    "metadata": {},
145 |    "outputs": [
146 |     {
147 |      "name": "stdout",
148 |      "output_type": "stream",
149 |      "text": [
150 |       "495 ms ± 8.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
151 |      ]
152 |     }
153 |    ],
154 |    "source": [
155 |     "%timeit returned_table = pq.read_table(contents)"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": 9,
161 |    "metadata": {},
162 |    "outputs": [
163 |     {
164 |      "name": "stdout",
165 |      "output_type": "stream",
166 |      "text": [
167 |       "93.1 ms ± 3.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
168 |      ]
169 |     }
170 |    ],
171 |    "source": [
172 |     "%timeit returned_table = pq.read_table(contents, read_dictionary=['f0'])"
173 |    ]
174 |   },
175 |   {
176 |    "cell_type": "code",
177 |    "execution_count": 10,
178 |    "metadata": {},
179 |    "outputs": [],
180 |    "source": [
181 |     "dense_data = dict_data.cast(pa.utf8())\n",
182 |     "table = pa.table([dense_data], names=['f0'])"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 11,
188 |    "metadata": {},
189 |    "outputs": [
190 |     {
191 |      "name": "stdout",
192 |      "output_type": "stream",
193 |      "text": [
194 |       "405 ms ± 27.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
195 |      ]
196 |     }
197 |    ],
198 |    "source": [
199 |     "%%timeit\n",
200 |     "out_stream = pa.BufferOutputStream()\n",
201 |     "pq.write_table(table, out_stream)\n",
202 |     "contents = out_stream.getvalue()"
203 |    ]
204 |   },
205 |   {
206 |    "cell_type": "code",
207 |    "execution_count": 12,
208 |    "metadata": {},
209 |    "outputs": [],
210 |    "source": [
211 |     "out_stream = pa.BufferOutputStream()\n",
212 |     "pq.write_table(table, out_stream)\n",
213 |     "contents = out_stream.getvalue()"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": 13,
219 |    "metadata": {},
220 |    "outputs": [
221 |     {
222 |      "name": "stdout",
223 |      "output_type": "stream",
224 |      "text": [
225 |       "430 ms ± 8.12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
226 |      ]
227 |     }
228 |    ],
229 |    "source": [
230 |     "%%timeit\n",
231 |     "returned_table = pq.read_table(contents)"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": 14,
237 |    "metadata": {},
238 |    "outputs": [
239 |     {
240 |      "data": {
241 |       "text/plain": [
242 |        "pyarrow.Table\n",
243 |        "f0: string"
244 |       ]
245 |      },
246 |      "execution_count": 14,
247 |      "metadata": {},
248 |      "output_type": "execute_result"
249 |     }
250 |    ],
251 |    "source": [
252 |     "pq.read_table(contents)"
253 |    ]
254 |   },
255 |   {
256 |    "cell_type": "code",
257 |    "execution_count": null,
258 |    "metadata": {},
259 |    "outputs": [],
260 |    "source": []
261 |   }
262 |  ],
263 |  "metadata": {
264 |   "kernelspec": {
265 |    "display_name": "Python 3",
266 |    "language": "python",
267 |    "name": "python3"
268 |   },
269 |   "language_info": {
270 |    "codemirror_mode": {
271 |     "name": "ipython",
272 |     "version": 3
273 |    },
274 |    "file_extension": ".py",
275 |    "mimetype": "text/x-python",
276 |    "name": "python",
277 |    "nbconvert_exporter": "python",
278 |    "pygments_lexer": "ipython3",
279 |    "version": "3.7.3"
280 |   }
281 |  },
282 |  "nbformat": 4,
283 |  "nbformat_minor": 2
284 | }
285 | 


--------------------------------------------------------------------------------
/20190830-VLDB-FlightDemo.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pyarrow as pa\n",
 10 |     "import pyarrow.parquet as pq\n",
 11 |     "import pyarrow.flight as flight\n",
 12 |     "import numpy as np\n",
 13 |     "import pandas as pd\n",
 14 |     "import time\n",
 15 |     "import threading"
 16 |    ]
 17 |   },
 18 |   {
 19 |    "cell_type": "code",
 20 |    "execution_count": null,
 21 |    "metadata": {},
 22 |    "outputs": [],
 23 |    "source": [
 24 |     "class DemoServer(flight.FlightServerBase):\n",
 25 |     "    \n",
 26 |     "    def __init__(self):\n",
 27 |     "        self._cache = {}\n",
 28 |     "    \n",
 29 |     "    def list_actions(self, context):\n",
 30 |     "        return [flight.ActionType('list-tables', 'List stored tables'),\n",
 31 |     "                flight.ActionType('drop-table', 'Drop a stored table')]\n",
 32 |     "\n",
 33 |     "    # -----------------------------------------------------------------\n",
 34 |     "    # Implement actions\n",
 35 |     "    \n",
 36 |     "    def do_action(self, context, action):\n",
 37 |     "        handlers = {\n",
 38 |     "            'list-tables': self._list_tables,\n",
 39 |     "            'drop-table': self._drop_table\n",
 40 |     "        }        \n",
 41 |     "        handler = handlers.get(action.type)\n",
 42 |     "        if not handler:\n",
 43 |     "            raise NotImplementedError   \n",
 44 |     "        return handlers[action.type](action)\n",
 45 |     "        \n",
 46 |     "    def _drop_table(self, action):\n",
 47 |     "        del self._cache[action.body]\n",
 48 |     "        \n",
 49 |     "    def _list_tables(self, action):\n",
 50 |     "        return iter([flight.Result(cache_key) \n",
 51 |     "                     for cache_key in sorted(self._cache.keys())])\n",
 52 |     "\n",
 53 |     "    # -----------------------------------------------------------------\n",
 54 |     "    # Implement puts\n",
 55 |     "    \n",
 56 |     "    def do_put(self, context, descriptor, reader, writer):\n",
 57 |     "        self._cache[descriptor.command] = reader.read_all()\n",
 58 |     "        \n",
 59 |     "    # -----------------------------------------------------------------\n",
 60 |     "    # Implement gets\n",
 61 |     "\n",
 62 |     "    def do_get(self, context, ticket):\n",
 63 |     "        table = self._cache[ticket.ticket]\n",
 64 |     "        return flight.RecordBatchStream(table)"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": null,
 70 |    "metadata": {},
 71 |    "outputs": [],
 72 |    "source": [
 73 |     "import contextlib\n",
 74 |     "import socket\n",
 75 |     "def find_free_port():\n",
 76 |     "    # Find a free port\n",
 77 |     "    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n",
 78 |     "    with contextlib.closing(sock) as sock:\n",
 79 |     "        sock.bind(('', 0))\n",
 80 |     "        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)\n",
 81 |     "        port = sock.getsockname()[1]\n",
 82 |     "    return port\n",
 83 |     "\n",
 84 |     "def wait_for_available(client):\n",
 85 |     "    deadline = time.time() + 5.0\n",
 86 |     "    while True:\n",
 87 |     "        try:\n",
 88 |     "            list(client.list_flights())\n",
 89 |     "        except Exception as e:\n",
 90 |     "            if 'Connect Failed' in str(e):\n",
 91 |     "                if time.time() < deadline:\n",
 92 |     "                    time.sleep(0.025)\n",
 93 |     "                    continue\n",
 94 |     "                else:\n",
 95 |     "                    raise\n",
 96 |     "        break"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": null,
102 |    "metadata": {},
103 |    "outputs": [],
104 |    "source": [
105 |     "port = 1337\n",
106 |     "location = flight.Location.for_grpc_tcp(\"localhost\", find_free_port())\n",
107 |     "location"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": null,
113 |    "metadata": {},
114 |    "outputs": [],
115 |    "source": [
116 |     "server = DemoServer()\n",
117 |     "server.init(location)\n",
118 |     "\n",
119 |     "thread = threading.Thread(target=lambda: server.run(), daemon=True)\n",
120 |     "thread.start()\n",
121 |     "\n",
122 |     "client = flight.FlightClient.connect(location)\n",
123 |     "wait_for_available(client)"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": null,
129 |    "metadata": {},
130 |    "outputs": [],
131 |    "source": [
132 |     "client.list_actions()"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": null,
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "def list_tables(client):\n",
142 |     "    action = flight.Action('list-tables', b'')\n",
143 |     "    return [x.body.to_pybytes().decode('utf8') for x in client.do_action(action)]    \n",
144 |     "\n",
145 |     "# TODO: add a drop_table(client, name) helper that sends the 'drop-table' action\n",
146 |     "\n",
147 |     "list_tables(client)"
148 |    ]
149 |   },
150 |   {
151 |    "cell_type": "code",
152 |    "execution_count": null,
153 |    "metadata": {},
154 |    "outputs": [],
155 |    "source": [
156 |     "def cache_table_in_server(name, table):\n",
157 |     "    desc = flight.FlightDescriptor.for_command(name.encode('utf8'))\n",
158 |     "    put_writer, put_meta_reader = client.do_put(desc, table.schema)\n",
159 |     "    put_writer.write(table)\n",
160 |     "    put_writer.close()\n",
161 |     "    \n",
162 |     "    \n",
163 |     "def get_table(name):\n",
164 |     "    reader = client.do_get(flight.Ticket(name.encode('utf8')))\n",
165 |     "    return reader.read_all()"
166 |    ]
167 |   },
168 |   {
169 |    "cell_type": "code",
170 |    "execution_count": null,
171 |    "metadata": {},
172 |    "outputs": [],
173 |    "source": [
174 |     "table = pa.table([pa.array([1,2,3,4,5])], names=['f0'])\n",
175 |     "cache_table_in_server('table1', table)"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": null,
181 |    "metadata": {},
182 |    "outputs": [],
183 |    "source": [
184 |     "list_tables(client)"
185 |    ]
186 |   },
187 |   {
188 |    "cell_type": "code",
189 |    "execution_count": null,
190 |    "metadata": {},
191 |    "outputs": [],
192 |    "source": [
193 |     "cache_table_in_server('table2', table)\n",
194 |     "cache_table_in_server('table3', table)\n",
195 |     "cache_table_in_server('table4', table)"
196 |    ]
197 |   },
198 |   {
199 |    "cell_type": "code",
200 |    "execution_count": null,
201 |    "metadata": {},
202 |    "outputs": [],
203 |    "source": [
204 |     "list_tables(client)"
205 |    ]
206 |   },
207 |   {
208 |    "cell_type": "code",
209 |    "execution_count": null,
210 |    "metadata": {},
211 |    "outputs": [],
212 |    "source": [
213 |     "get_table('table1')"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": null,
219 |    "metadata": {},
220 |    "outputs": [],
221 |    "source": [
222 |     "import pandas as pd\n",
223 |     "fec = pd.read_csv('/home/wesm/code/pydata-book/datasets/fec/P00000001-ALL.csv')\n",
224 |     "fec.head()\n",
225 |     "def coerce_int(x):  # best-effort int parse; returns -1 sentinel when x is not int-convertible\n",
226 |     "    try:\n",
227 |     "        return int(x)\n",
228 |     "    except (TypeError, ValueError, OverflowError):  # narrow: a bare except would also swallow KeyboardInterrupt\n",
229 |     "        return -1\n",
230 |     "\n",
231 |     "fec['contbr_zip'] = fec['contbr_zip'].map(coerce_int).astype(np.int64)"
232 |    ]
233 |   },
234 |   {
235 |    "cell_type": "code",
236 |    "execution_count": 17,
237 |    "metadata": {},
238 |    "outputs": [],
239 |    "source": [
240 |     "fec_table = pa.table(fec)"
241 |    ]
242 |   },
243 |   {
244 |    "cell_type": "code",
245 |    "execution_count": 18,
246 |    "metadata": {},
247 |    "outputs": [],
248 |    "source": [
249 |     "fec_table = pa.concat_tables([fec_table] * 10)"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "code",
254 |    "execution_count": 19,
255 |    "metadata": {},
256 |    "outputs": [
257 |     {
258 |      "name": "stdout",
259 |      "output_type": "stream",
260 |      "text": [
261 |       "CPU times: user 425 ms, sys: 1.13 s, total: 1.56 s\n",
262 |       "Wall time: 1.16 s\n"
263 |      ]
264 |     }
265 |    ],
266 |    "source": [
267 |     "%%time\n",
268 |     "cache_table_in_server('fec_table', fec_table)"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "code",
273 |    "execution_count": 20,
274 |    "metadata": {},
275 |    "outputs": [
276 |     {
277 |      "data": {
278 |       "text/plain": [
279 |        "['fec_table', 'table1', 'table2', 'table3', 'table4']"
280 |       ]
281 |      },
282 |      "execution_count": 20,
283 |      "metadata": {},
284 |      "output_type": "execute_result"
285 |     }
286 |    ],
287 |    "source": [
288 |     "list_tables(client)"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": 21,
294 |    "metadata": {},
295 |    "outputs": [
296 |     {
297 |      "name": "stdout",
298 |      "output_type": "stream",
299 |      "text": [
300 |       "CPU times: user 404 ms, sys: 995 ms, total: 1.4 s\n",
301 |       "Wall time: 1.1 s\n"
302 |      ]
303 |     }
304 |    ],
305 |    "source": [
306 |     "%%time \n",
307 |     "\n",
308 |     "fec_table_received = get_table('fec_table')"
309 |    ]
310 |   },
311 |   {
312 |    "cell_type": "code",
313 |    "execution_count": null,
314 |    "metadata": {},
315 |    "outputs": [],
316 |    "source": []
317 |   }
318 |  ],
319 |  "metadata": {
320 |   "kernelspec": {
321 |    "display_name": "Python 3",
322 |    "language": "python",
323 |    "name": "python3"
324 |   },
325 |   "language_info": {
326 |    "codemirror_mode": {
327 |     "name": "ipython",
328 |     "version": 3
329 |    },
330 |    "file_extension": ".py",
331 |    "mimetype": "text/x-python",
332 |    "name": "python",
333 |    "nbconvert_exporter": "python",
334 |    "pygments_lexer": "ipython3",
335 |    "version": "3.7.3"
336 |   }
337 |  },
338 |  "nbformat": 4,
339 |  "nbformat_minor": 2
340 | }
341 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/FeatherCompression.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 4,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stdout",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "using 8 cpu cores\n"
 13 |      ]
 14 |     }
 15 |    ],
 16 |    "source": [
 17 |     "# flake8: noqa\n",
 18 |     "\n",
 19 |     "import pyarrow.feather as feather\n",
 20 |     "import pandas as pd\n",
 21 |     "import json\n",
 22 |     "import numpy as np\n",
 23 |     "import pyarrow as pa\n",
 24 |     "import pyarrow.parquet as pq\n",
 25 |     "from pandas.util.testing import rands\n",
 26 |     "import gc\n",
 27 |     "import os\n",
 28 |     "import time\n",
 29 |     "\n",
 30 |     "pa.set_cpu_count(8)\n",
 31 |     "\n",
 32 |     "print(f\"using {pa.cpu_count()} cpu cores\")\n",
 33 |     "    \n",
 34 |     "\n",
 35 |     "def get_timing(f, niter=1):  # mean seconds per call of f over niter runs\n",
 36 |     "    start = time.perf_counter()  # monotonic clock; CLOCK_REALTIME can jump and is Unix-only\n",
 37 |     "    for i in range(niter):\n",
 38 |     "        f()\n",
 39 |     "    result = (time.perf_counter() - start) / niter\n",
 40 |     "    return result\n",
 41 |     "\n",
 42 |     "\n",
 43 |     "files = {\n",
 44 |     "    'fanniemae': {\n",
 45 |     "        'base': '2016Q4',\n",
 46 |     "        'source': {\n",
 47 |     "            'path': '2016Q4.txt',\n",
 48 |     "            'sep': '|',\n",
 49 |     "            'header': None\n",
 50 |     "        }\n",
 51 |     "    },\n",
 52 |     "    'nyctaxi': {\n",
 53 |     "        'base': 'yellow_tripdata_2010-01',\n",
 54 |     "        'source': {\n",
 55 |     "            'path': 'yellow_tripdata_2010-01.csv',\n",
 56 |     "            'sep': ',',\n",
 57 |     "            'header': 0\n",
 58 |     "        }\n",
 59 |     "    }\n",
 60 |     "}\n",
 61 |     "\n",
 62 |     "\n",
 63 |     "compression_cases = [\n",
 64 |     "    (None, None),   # uncompressed\n",
 65 |     "    ('zstd', 1),    # minimal compression\n",
 66 |     "    ('zstd', 10),   # moderate\n",
 67 |     "    ('lz4', None)   # LZ4 doesn't support compression level\n",
 68 |     "]\n",
 69 |     "\n",
 70 |     "\n",
 71 |     "def write_files(files, chunksize=1<<16):\n",
 72 |     "    statistics = []\n",
 73 |     "    for name, info in files.items():\n",
 74 |     "        source = info['source']\n",
 75 |     "        print(\"reading {}\".format(source['path']))\n",
 76 |     "        df = pd.read_csv(source['path'], sep=source['sep'], \n",
 77 |     "                         header=source['header'], \n",
 78 |     "                         low_memory=False)\n",
 79 |     "        if source['header'] is None:\n",
 80 |     "            df.columns = ['f{}'.format(i) for i in range(len(df.columns))]\n",
 81 |     "\n",
 82 |     "        t = (pa.Table.from_pandas(df, preserve_index=False)\n",
 83 |     "             .replace_schema_metadata(None))\n",
 84 |     "        for compression, compression_level in compression_cases:\n",
 85 |     "            path = '{}_{}_{}.feather'.format(info['base'], \n",
 86 |     "                                             compression or 'uncompressed',\n",
 87 |     "                                             compression_level)\n",
 88 |     "            print((name, compression, compression_level))\n",
 89 |     "            tm = get_timing(lambda: \n",
 90 |     "                            feather.write_feather(df, path, compression=compression,\n",
 91 |     "                                                  compression_level=compression_level,\n",
 92 |     "                                                  chunksize=chunksize))\n",
 93 |     "            file_size = os.stat(path).st_size\n",
 94 |     "            result = name, compression, compression_level, file_size, tm\n",
 95 |     "            print(result)\n",
 96 |     "            statistics.append(result)\n",
 97 |     "    return statistics\n",
 98 |     "\n",
 99 |     "def get_read_results():\n",
100 |     "    all_results = []\n",
101 |     "    for name, info in files.items():\n",
102 |     "        for compression, compression_level in compression_cases:\n",
103 |     "            path = '{}_{}_{}.feather'.format(info['base'], \n",
104 |     "                                             compression or 'uncompressed',\n",
105 |     "                                             compression_level)\n",
106 |     "            read_time = get_timing(lambda: feather.read_table(path, memory_map=False),\n",
107 |     "                                   niter=5)\n",
108 |     "            result = name, compression, compression_level, read_time\n",
109 |     "            print(result)\n",
110 |     "            all_results.append(result) \n",
111 |     "    return all_results"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": null,
117 |    "metadata": {},
118 |    "outputs": [],
119 |    "source": []
120 |   },
121 |   {
122 |    "cell_type": "code",
123 |    "execution_count": 5,
124 |    "metadata": {},
125 |    "outputs": [
126 |     {
127 |      "name": "stdout",
128 |      "output_type": "stream",
129 |      "text": [
130 |       "1024\n",
131 |       "reading 2016Q4.txt\n",
132 |       "('fanniemae', None, None)\n",
133 |       "('fanniemae', None, None, 5084410194, 11.884642839431763)\n",
134 |       "('fanniemae', 'zstd', 1)\n",
135 |       "('fanniemae', 'zstd', 1, 501955562, 11.54361605644226)\n",
136 |       "('fanniemae', 'zstd', 10)\n",
137 |       "('fanniemae', 'zstd', 10, 439460538, 40.15117073059082)\n",
138 |       "('fanniemae', 'lz4', None)\n",
139 |       "('fanniemae', 'lz4', None, 765604482, 12.363005876541138)\n",
140 |       "reading yellow_tripdata_2010-01.csv\n",
141 |       "('nyctaxi', None, None)\n",
142 |       "('nyctaxi', None, None, 2522035242, 6.970196723937988)\n",
143 |       "('nyctaxi', 'zstd', 1)\n",
144 |       "('nyctaxi', 'zstd', 1, 878914098, 7.667033433914185)\n",
145 |       "('nyctaxi', 'zstd', 10)\n",
146 |       "('nyctaxi', 'zstd', 10, 828266042, 32.220927715301514)\n",
147 |       "('nyctaxi', 'lz4', None)\n",
148 |       "('nyctaxi', 'lz4', None, 1262344938, 7.114352226257324)\n",
149 |       "('fanniemae', None, None, 2.2620407581329345)\n",
150 |       "('fanniemae', 'zstd', 1, 3.4737910270690917)\n",
151 |       "('fanniemae', 'zstd', 10, 3.4430580615997313)\n",
152 |       "('fanniemae', 'lz4', None, 3.521429014205933)\n",
153 |       "('nyctaxi', None, None, 1.0237845420837401)\n",
154 |       "('nyctaxi', 'zstd', 1, 1.8016125202178954)\n",
155 |       "('nyctaxi', 'zstd', 10, 1.7049409389495849)\n",
156 |       "('nyctaxi', 'lz4', None, 1.3043041229248047)\n",
157 |       "2048\n",
158 |       "reading 2016Q4.txt\n",
159 |       "('fanniemae', None, None)\n",
160 |       "('fanniemae', None, None, 5063114554, 11.932640790939331)\n",
161 |       "('fanniemae', 'zstd', 1)\n",
162 |       "('fanniemae', 'zstd', 1, 468753626, 9.218210458755493)\n",
163 |       "('fanniemae', 'zstd', 10)\n",
164 |       "('fanniemae', 'zstd', 10, 401064538, 40.00880241394043)\n",
165 |       "('fanniemae', 'lz4', None)\n",
166 |       "('fanniemae', 'lz4', None, 701361578, 9.259565353393555)\n",
167 |       "reading yellow_tripdata_2010-01.csv\n",
168 |       "('nyctaxi', None, None)\n",
169 |       "('nyctaxi', None, None, 2513790386, 5.759558200836182)\n",
170 |       "('nyctaxi', 'zstd', 1)\n",
171 |       "('nyctaxi', 'zstd', 1, 851430546, 6.4997947216033936)\n",
172 |       "('nyctaxi', 'zstd', 10)\n",
173 |       "('nyctaxi', 'zstd', 10, 790773018, 33.690829277038574)\n",
174 |       "('nyctaxi', 'lz4', None)\n",
175 |       "('nyctaxi', 'lz4', None, 1223064234, 5.975880861282349)\n",
176 |       "('fanniemae', None, None, 1.610342788696289)\n",
177 |       "('fanniemae', 'zstd', 1, 1.983039951324463)\n",
178 |       "('fanniemae', 'zstd', 10, 1.9032105445861816)\n",
179 |       "('fanniemae', 'lz4', None, 1.7990120887756347)\n",
180 |       "('nyctaxi', None, None, 0.8817797660827636)\n",
181 |       "('nyctaxi', 'zstd', 1, 1.2915375709533692)\n",
182 |       "('nyctaxi', 'zstd', 10, 1.13835711479187)\n",
183 |       "('nyctaxi', 'lz4', None, 0.8505313873291016)\n",
184 |       "4096\n",
185 |       "reading 2016Q4.txt\n",
186 |       "('fanniemae', None, None)\n",
187 |       "('fanniemae', None, None, 5052804778, 10.22159743309021)\n",
188 |       "('fanniemae', 'zstd', 1)\n",
189 |       "('fanniemae', 'zstd', 1, 473501522, 8.019737958908081)\n",
190 |       "('fanniemae', 'zstd', 10)\n",
191 |       "('fanniemae', 'zstd', 10, 384761498, 13.248246908187866)\n",
192 |       "('fanniemae', 'lz4', None)\n",
193 |       "('fanniemae', 'lz4', None, 666704194, 7.61299991607666)\n",
194 |       "reading yellow_tripdata_2010-01.csv\n",
195 |       "('nyctaxi', None, None)\n",
196 |       "('nyctaxi', None, None, 2509671706, 6.375310659408569)\n",
197 |       "('nyctaxi', 'zstd', 1)\n",
198 |       "('nyctaxi', 'zstd', 1, 841720058, 5.634358882904053)\n",
199 |       "('nyctaxi', 'zstd', 10)\n",
200 |       "('nyctaxi', 'zstd', 10, 765991802, 23.161847591400146)\n",
201 |       "('nyctaxi', 'lz4', None)\n",
202 |       "('nyctaxi', 'lz4', None, 1165201354, 6.004603624343872)\n",
203 |       "('fanniemae', None, None, 1.341871976852417)\n",
204 |       "('fanniemae', 'zstd', 1, 1.2426270961761474)\n",
205 |       "('fanniemae', 'zstd', 10, 1.113413667678833)\n",
206 |       "('fanniemae', 'lz4', None, 1.0141475200653076)\n",
207 |       "('nyctaxi', None, None, 0.7986891269683838)\n",
208 |       "('nyctaxi', 'zstd', 1, 0.974519681930542)\n",
209 |       "('nyctaxi', 'zstd', 10, 0.8223378658294678)\n",
210 |       "('nyctaxi', 'lz4', None, 0.5664512634277343)\n",
211 |       "8192\n",
212 |       "reading 2016Q4.txt\n",
213 |       "('fanniemae', None, None)\n",
214 |       "('fanniemae', None, None, 5048174170, 10.155773162841797)\n",
215 |       "('fanniemae', 'zstd', 1)\n",
216 |       "('fanniemae', 'zstd', 1, 476147690, 7.544380187988281)\n",
217 |       "('fanniemae', 'zstd', 10)\n",
218 |       "('fanniemae', 'zstd', 10, 380904258, 13.942293882369995)\n",
219 |       "('fanniemae', 'lz4', None)\n",
220 |       "('fanniemae', 'lz4', None, 648217594, 7.258745193481445)\n",
221 |       "reading yellow_tripdata_2010-01.csv\n",
222 |       "('nyctaxi', None, None)\n",
223 |       "('nyctaxi', None, None, 2507611258, 6.470987319946289)\n",
224 |       "('nyctaxi', 'zstd', 1)\n",
225 |       "('nyctaxi', 'zstd', 1, 837304882, 5.931333303451538)\n",
226 |       "('nyctaxi', 'zstd', 10)\n",
227 |       "('nyctaxi', 'zstd', 10, 739310474, 29.601118326187134)\n",
228 |       "('nyctaxi', 'lz4', None)\n",
229 |       "('nyctaxi', 'lz4', None, 1144720050, 4.887480974197388)\n",
230 |       "('fanniemae', None, None, 1.3168220043182373)\n",
231 |       "('fanniemae', 'zstd', 1, 0.8572097301483155)\n",
232 |       "('fanniemae', 'zstd', 10, 0.7228690624237061)\n",
233 |       "('fanniemae', 'lz4', None, 0.6564846992492676)\n",
234 |       "('nyctaxi', None, None, 0.7386976718902588)\n",
235 |       "('nyctaxi', 'zstd', 1, 0.9264132499694824)\n",
236 |       "('nyctaxi', 'zstd', 10, 0.7089903354644775)\n",
237 |       "('nyctaxi', 'lz4', None, 0.46931772232055663)\n",
238 |       "16384\n",
239 |       "reading 2016Q4.txt\n",
240 |       "('fanniemae', None, None)\n",
241 |       "('fanniemae', None, None, 5046354402, 10.359640121459961)\n",
242 |       "('fanniemae', 'zstd', 1)\n",
243 |       "('fanniemae', 'zstd', 1, 488072882, 6.634678363800049)\n",
244 |       "('fanniemae', 'zstd', 10)\n",
245 |       "('fanniemae', 'zstd', 10, 386850010, 14.295108318328857)\n",
246 |       "('fanniemae', 'lz4', None)\n",
247 |       "('fanniemae', 'lz4', None, 644333354, 6.482739210128784)\n",
248 |       "reading yellow_tripdata_2010-01.csv\n",
249 |       "('nyctaxi', None, None)\n",
250 |       "('nyctaxi', None, None, 2506582282, 5.567317008972168)\n",
251 |       "('nyctaxi', 'zstd', 1)\n",
252 |       "('nyctaxi', 'zstd', 1, 833835922, 4.956018924713135)\n",
253 |       "('nyctaxi', 'zstd', 10)\n",
254 |       "('nyctaxi', 'zstd', 10, 709229218, 17.30007767677307)\n",
255 |       "('nyctaxi', 'lz4', None)\n",
256 |       "('nyctaxi', 'lz4', None, 1179681450, 5.5779945850372314)\n",
257 |       "('fanniemae', None, None, 1.3266838550567628)\n",
258 |       "('fanniemae', 'zstd', 1, 0.7207117557525635)\n",
259 |       "('fanniemae', 'zstd', 10, 0.5619686603546142)\n",
260 |       "('fanniemae', 'lz4', None, 0.5085867404937744)\n",
261 |       "('nyctaxi', None, None, 0.7293866634368896)\n",
262 |       "('nyctaxi', 'zstd', 1, 0.780490779876709)\n",
263 |       "('nyctaxi', 'zstd', 10, 0.6338376045227051)\n",
264 |       "('nyctaxi', 'lz4', None, 0.42446208000183105)\n",
265 |       "32768\n",
266 |       "reading 2016Q4.txt\n",
267 |       "('fanniemae', None, None)\n",
268 |       "('fanniemae', None, None, 5045772882, 11.194675922393799)\n",
269 |       "('fanniemae', 'zstd', 1)\n",
270 |       "('fanniemae', 'zstd', 1, 494361698, 6.307297229766846)\n",
271 |       "('fanniemae', 'zstd', 10)\n",
272 |       "('fanniemae', 'zstd', 10, 394216642, 16.57004427909851)\n",
273 |       "('fanniemae', 'lz4', None)\n",
274 |       "('fanniemae', 'lz4', None, 640424914, 6.438863277435303)\n",
275 |       "reading yellow_tripdata_2010-01.csv\n",
276 |       "('nyctaxi', None, None)\n",
277 |       "('nyctaxi', None, None, 2506066506, 5.98804497718811)\n",
278 |       "('nyctaxi', 'zstd', 1)\n",
279 |       "('nyctaxi', 'zstd', 1, 817758394, 4.760921478271484)\n",
280 |       "('nyctaxi', 'zstd', 10)\n",
281 |       "('nyctaxi', 'zstd', 10, 675626410, 19.773839712142944)\n",
282 |       "('nyctaxi', 'lz4', None)\n",
283 |       "('nyctaxi', 'lz4', None, 1176543226, 5.565099239349365)\n",
284 |       "('fanniemae', None, None, 1.207357358932495)\n",
285 |       "('fanniemae', 'zstd', 1, 0.6379957675933838)\n",
286 |       "('fanniemae', 'zstd', 10, 0.5131874561309815)\n",
287 |       "('fanniemae', 'lz4', None, 0.45996761322021484)\n",
288 |       "('nyctaxi', None, None, 0.6317520141601562)\n",
289 |       "('nyctaxi', 'zstd', 1, 0.7357310771942138)\n",
290 |       "('nyctaxi', 'zstd', 10, 0.5581299781799316)\n",
291 |       "('nyctaxi', 'lz4', None, 0.37372236251831054)\n",
292 |       "65536\n",
293 |       "reading 2016Q4.txt\n",
294 |       "('fanniemae', None, None)\n",
295 |       "('fanniemae', None, None, 5045771154, 11.179830074310303)\n",
296 |       "('fanniemae', 'zstd', 1)\n",
297 |       "('fanniemae', 'zstd', 1, 524046410, 6.3280885219573975)\n",
298 |       "('fanniemae', 'zstd', 10)\n",
299 |       "('fanniemae', 'zstd', 10, 395368482, 14.682528018951416)\n",
300 |       "('fanniemae', 'lz4', None)\n",
301 |       "('fanniemae', 'lz4', None, 638440418, 5.975476264953613)\n",
302 |       "reading yellow_tripdata_2010-01.csv\n",
303 |       "('nyctaxi', None, None)\n",
304 |       "('nyctaxi', None, None, 2505808570, 5.9450695514678955)\n",
305 |       "('nyctaxi', 'zstd', 1)\n",
306 |       "('nyctaxi', 'zstd', 1, 821964938, 5.244204044342041)\n",
307 |       "('nyctaxi', 'zstd', 10)\n",
308 |       "('nyctaxi', 'zstd', 10, 651798442, 19.96653389930725)\n",
309 |       "('nyctaxi', 'lz4', None)\n",
310 |       "('nyctaxi', 'lz4', None, 1174964650, 5.419882297515869)\n",
311 |       "('fanniemae', None, None, 1.0205121994018556)\n",
312 |       "('fanniemae', 'zstd', 1, 0.5739494800567627)\n",
313 |       "('fanniemae', 'zstd', 10, 0.4582984924316406)\n",
314 |       "('fanniemae', 'lz4', None, 0.41712336540222167)\n",
315 |       "('nyctaxi', None, None, 0.5486010074615478)\n",
316 |       "('nyctaxi', 'zstd', 1, 0.6663787841796875)\n",
317 |       "('nyctaxi', 'zstd', 10, 0.5117742538452148)\n",
318 |       "('nyctaxi', 'lz4', None, 0.34208340644836427)\n"
319 |      ]
320 |     }
321 |    ],
322 |    "source": [
323 |     "chunksizes = [1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15,\n",
324 |     "              1 << 16]\n",
325 |     "\n",
326 |     "results_by_chunksize = {}\n",
327 |     "for chunksize in chunksizes:\n",
328 |     "    print(chunksize)\n",
329 |     "    write_results = write_files(files, chunksize=chunksize)\n",
330 |     "    read_results = get_read_results()    \n",
331 |     "    results_by_chunksize[chunksize] = write_results, read_results"
332 |    ]
333 |   },
334 |   {
335 |    "cell_type": "code",
336 |    "execution_count": null,
337 |    "metadata": {},
338 |    "outputs": [],
339 |    "source": []
340 |   },
341 |   {
342 |    "cell_type": "code",
343 |    "execution_count": 6,
344 |    "metadata": {},
345 |    "outputs": [],
346 |    "source": [
347 |     "reads = []\n",
348 |     "writes = []\n",
349 |     "\n",
350 |     "for chunksize, (write_results, read_results) in results_by_chunksize.items():\n",
351 |     "    write_results = pd.DataFrame.from_records(\n",
352 |     "        write_results, columns=['dataset', 'codec', 'codec_level', \n",
353 |     "                                'file_size', 'write_time'])\n",
354 |     "    read_results = pd.DataFrame.from_records(\n",
355 |     "        read_results, columns=['dataset', 'codec', 'codec_level', \n",
356 |     "                               'read_time'])\n",
357 |     "    write_results['chunksize'] = chunksize\n",
358 |     "    read_results['chunksize'] = chunksize\n",
359 |     "    \n",
360 |     "    reads.append(read_results)\n",
361 |     "    writes.append(write_results)\n",
362 |     "    \n",
363 |     "reads = pd.concat(reads, ignore_index=True)\n",
364 |     "writes = pd.concat(writes, ignore_index=True)\n",
365 |     "\n",
366 |     "def munge_codecs(codec_s, codec_level_s):\n",
367 |     "    results = []\n",
368 |     "    codec_s = codec_s.fillna('uncompressed')\n",
369 |     "    for codec, codec_level in zip(codec_s, codec_level_s):\n",
370 |     "        if pd.isnull(codec_level):\n",
371 |     "            results.append(codec)\n",
372 |     "        else:\n",
373 |     "            results.append(codec + '-' + str(int(codec_level)))\n",
374 |     "    return results\n",
375 |     "\n",
376 |     "reads['codec'] = munge_codecs(reads['codec'], reads.pop('codec_level'))\n",
377 |     "writes['codec'] = munge_codecs(writes['codec'], writes.pop('codec_level'))"
378 |    ]
379 |   },
380 |   {
381 |    "cell_type": "code",
382 |    "execution_count": 13,
383 |    "metadata": {},
384 |    "outputs": [],
385 |    "source": [
386 |     "%matplotlib notebook"
387 |    ]
388 |   },
389 |   {
390 |    "cell_type": "code",
391 |    "execution_count": 7,
392 |    "metadata": {},
393 |    "outputs": [],
394 |    "source": [
395 |     "reads.to_csv('ipc_read_parallel.csv')\n",
396 |     "writes.to_csv('ipc_write_parallel.csv')"
397 |    ]
398 |   },
399 |   {
400 |    "cell_type": "code",
401 |    "execution_count": 8,
402 |    "metadata": {},
403 |    "outputs": [
404 |     {
405 |      "data": {
406 |       "text/html": [
407 |        "<div>\n",
408 |        "<style scoped>\n",
409 |        "    .dataframe tbody tr th:only-of-type {\n",
410 |        "        vertical-align: middle;\n",
411 |        "    }\n",
412 |        "\n",
413 |        "    .dataframe tbody tr th {\n",
414 |        "        vertical-align: top;\n",
415 |        "    }\n",
416 |        "\n",
417 |        "    .dataframe thead th {\n",
418 |        "        text-align: right;\n",
419 |        "    }\n",
420 |        "</style>\n",
421 |        "<table border=\"1\" class=\"dataframe\">\n",
422 |        "  <thead>\n",
423 |        "    <tr style=\"text-align: right;\">\n",
424 |        "      <th></th>\n",
425 |        "      <th>dataset</th>\n",
426 |        "      <th>codec</th>\n",
427 |        "      <th>read_time</th>\n",
428 |        "      <th>chunksize</th>\n",
429 |        "    </tr>\n",
430 |        "  </thead>\n",
431 |        "  <tbody>\n",
432 |        "    <tr>\n",
433 |        "      <th>0</th>\n",
434 |        "      <td>fanniemae</td>\n",
435 |        "      <td>uncompressed</td>\n",
436 |        "      <td>1.285094</td>\n",
437 |        "      <td>1024</td>\n",
438 |        "    </tr>\n",
439 |        "    <tr>\n",
440 |        "      <th>1</th>\n",
441 |        "      <td>fanniemae</td>\n",
442 |        "      <td>zstd-1</td>\n",
443 |        "      <td>3.586269</td>\n",
444 |        "      <td>1024</td>\n",
445 |        "    </tr>\n",
446 |        "    <tr>\n",
447 |        "      <th>2</th>\n",
448 |        "      <td>fanniemae</td>\n",
449 |        "      <td>zstd-10</td>\n",
450 |        "      <td>3.704591</td>\n",
451 |        "      <td>1024</td>\n",
452 |        "    </tr>\n",
453 |        "    <tr>\n",
454 |        "      <th>3</th>\n",
455 |        "      <td>fanniemae</td>\n",
456 |        "      <td>lz4</td>\n",
457 |        "      <td>3.590986</td>\n",
458 |        "      <td>1024</td>\n",
459 |        "    </tr>\n",
460 |        "    <tr>\n",
461 |        "      <th>4</th>\n",
462 |        "      <td>nyctaxi</td>\n",
463 |        "      <td>uncompressed</td>\n",
464 |        "      <td>0.608589</td>\n",
465 |        "      <td>1024</td>\n",
466 |        "    </tr>\n",
467 |        "    <tr>\n",
468 |        "      <th>5</th>\n",
469 |        "      <td>nyctaxi</td>\n",
470 |        "      <td>zstd-1</td>\n",
471 |        "      <td>1.891127</td>\n",
472 |        "      <td>1024</td>\n",
473 |        "    </tr>\n",
474 |        "    <tr>\n",
475 |        "      <th>6</th>\n",
476 |        "      <td>nyctaxi</td>\n",
477 |        "      <td>zstd-10</td>\n",
478 |        "      <td>1.665766</td>\n",
479 |        "      <td>1024</td>\n",
480 |        "    </tr>\n",
481 |        "    <tr>\n",
482 |        "      <th>7</th>\n",
483 |        "      <td>nyctaxi</td>\n",
484 |        "      <td>lz4</td>\n",
485 |        "      <td>1.227717</td>\n",
486 |        "      <td>1024</td>\n",
487 |        "    </tr>\n",
488 |        "    <tr>\n",
489 |        "      <th>8</th>\n",
490 |        "      <td>fanniemae</td>\n",
491 |        "      <td>uncompressed</td>\n",
492 |        "      <td>0.472191</td>\n",
493 |        "      <td>2048</td>\n",
494 |        "    </tr>\n",
495 |        "    <tr>\n",
496 |        "      <th>9</th>\n",
497 |        "      <td>fanniemae</td>\n",
498 |        "      <td>zstd-1</td>\n",
499 |        "      <td>2.057420</td>\n",
500 |        "      <td>2048</td>\n",
501 |        "    </tr>\n",
502 |        "    <tr>\n",
503 |        "      <th>10</th>\n",
504 |        "      <td>fanniemae</td>\n",
505 |        "      <td>zstd-10</td>\n",
506 |        "      <td>1.798960</td>\n",
507 |        "      <td>2048</td>\n",
508 |        "    </tr>\n",
509 |        "    <tr>\n",
510 |        "      <th>11</th>\n",
511 |        "      <td>fanniemae</td>\n",
512 |        "      <td>lz4</td>\n",
513 |        "      <td>1.634048</td>\n",
514 |        "      <td>2048</td>\n",
515 |        "    </tr>\n",
516 |        "    <tr>\n",
517 |        "      <th>12</th>\n",
518 |        "      <td>nyctaxi</td>\n",
519 |        "      <td>uncompressed</td>\n",
520 |        "      <td>0.171463</td>\n",
521 |        "      <td>2048</td>\n",
522 |        "    </tr>\n",
523 |        "    <tr>\n",
524 |        "      <th>13</th>\n",
525 |        "      <td>nyctaxi</td>\n",
526 |        "      <td>zstd-1</td>\n",
527 |        "      <td>1.287250</td>\n",
528 |        "      <td>2048</td>\n",
529 |        "    </tr>\n",
530 |        "    <tr>\n",
531 |        "      <th>14</th>\n",
532 |        "      <td>nyctaxi</td>\n",
533 |        "      <td>zstd-10</td>\n",
534 |        "      <td>1.045773</td>\n",
535 |        "      <td>2048</td>\n",
536 |        "    </tr>\n",
537 |        "    <tr>\n",
538 |        "      <th>15</th>\n",
539 |        "      <td>nyctaxi</td>\n",
540 |        "      <td>lz4</td>\n",
541 |        "      <td>0.730872</td>\n",
542 |        "      <td>2048</td>\n",
543 |        "    </tr>\n",
544 |        "    <tr>\n",
545 |        "      <th>16</th>\n",
546 |        "      <td>fanniemae</td>\n",
547 |        "      <td>uncompressed</td>\n",
548 |        "      <td>0.193799</td>\n",
549 |        "      <td>4096</td>\n",
550 |        "    </tr>\n",
551 |        "    <tr>\n",
552 |        "      <th>17</th>\n",
553 |        "      <td>fanniemae</td>\n",
554 |        "      <td>zstd-1</td>\n",
555 |        "      <td>1.314903</td>\n",
556 |        "      <td>4096</td>\n",
557 |        "    </tr>\n",
558 |        "    <tr>\n",
559 |        "      <th>18</th>\n",
560 |        "      <td>fanniemae</td>\n",
561 |        "      <td>zstd-10</td>\n",
562 |        "      <td>1.107955</td>\n",
563 |        "      <td>4096</td>\n",
564 |        "    </tr>\n",
565 |        "    <tr>\n",
566 |        "      <th>19</th>\n",
567 |        "      <td>fanniemae</td>\n",
568 |        "      <td>lz4</td>\n",
569 |        "      <td>0.983337</td>\n",
570 |        "      <td>4096</td>\n",
571 |        "    </tr>\n",
572 |        "    <tr>\n",
573 |        "      <th>20</th>\n",
574 |        "      <td>nyctaxi</td>\n",
575 |        "      <td>uncompressed</td>\n",
576 |        "      <td>0.089300</td>\n",
577 |        "      <td>4096</td>\n",
578 |        "    </tr>\n",
579 |        "    <tr>\n",
580 |        "      <th>21</th>\n",
581 |        "      <td>nyctaxi</td>\n",
582 |        "      <td>zstd-1</td>\n",
583 |        "      <td>1.009519</td>\n",
584 |        "      <td>4096</td>\n",
585 |        "    </tr>\n",
586 |        "    <tr>\n",
587 |        "      <th>22</th>\n",
588 |        "      <td>nyctaxi</td>\n",
589 |        "      <td>zstd-10</td>\n",
590 |        "      <td>0.806882</td>\n",
591 |        "      <td>4096</td>\n",
592 |        "    </tr>\n",
593 |        "    <tr>\n",
594 |        "      <th>23</th>\n",
595 |        "      <td>nyctaxi</td>\n",
596 |        "      <td>lz4</td>\n",
597 |        "      <td>0.495844</td>\n",
598 |        "      <td>4096</td>\n",
599 |        "    </tr>\n",
600 |        "    <tr>\n",
601 |        "      <th>24</th>\n",
602 |        "      <td>fanniemae</td>\n",
603 |        "      <td>uncompressed</td>\n",
604 |        "      <td>0.101595</td>\n",
605 |        "      <td>8192</td>\n",
606 |        "    </tr>\n",
607 |        "    <tr>\n",
608 |        "      <th>25</th>\n",
609 |        "      <td>fanniemae</td>\n",
610 |        "      <td>zstd-1</td>\n",
611 |        "      <td>0.897038</td>\n",
612 |        "      <td>8192</td>\n",
613 |        "    </tr>\n",
614 |        "    <tr>\n",
615 |        "      <th>26</th>\n",
616 |        "      <td>fanniemae</td>\n",
617 |        "      <td>zstd-10</td>\n",
618 |        "      <td>0.713981</td>\n",
619 |        "      <td>8192</td>\n",
620 |        "    </tr>\n",
621 |        "    <tr>\n",
622 |        "      <th>27</th>\n",
623 |        "      <td>fanniemae</td>\n",
624 |        "      <td>lz4</td>\n",
625 |        "      <td>0.635939</td>\n",
626 |        "      <td>8192</td>\n",
627 |        "    </tr>\n",
628 |        "    <tr>\n",
629 |        "      <th>28</th>\n",
630 |        "      <td>nyctaxi</td>\n",
631 |        "      <td>uncompressed</td>\n",
632 |        "      <td>0.037341</td>\n",
633 |        "      <td>8192</td>\n",
634 |        "    </tr>\n",
635 |        "    <tr>\n",
636 |        "      <th>29</th>\n",
637 |        "      <td>nyctaxi</td>\n",
638 |        "      <td>zstd-1</td>\n",
639 |        "      <td>0.734890</td>\n",
640 |        "      <td>8192</td>\n",
641 |        "    </tr>\n",
642 |        "    <tr>\n",
643 |        "      <th>30</th>\n",
644 |        "      <td>nyctaxi</td>\n",
645 |        "      <td>zstd-10</td>\n",
646 |        "      <td>0.606730</td>\n",
647 |        "      <td>8192</td>\n",
648 |        "    </tr>\n",
649 |        "    <tr>\n",
650 |        "      <th>31</th>\n",
651 |        "      <td>nyctaxi</td>\n",
652 |        "      <td>lz4</td>\n",
653 |        "      <td>0.350030</td>\n",
654 |        "      <td>8192</td>\n",
655 |        "    </tr>\n",
656 |        "    <tr>\n",
657 |        "      <th>32</th>\n",
658 |        "      <td>fanniemae</td>\n",
659 |        "      <td>uncompressed</td>\n",
660 |        "      <td>0.048579</td>\n",
661 |        "      <td>16384</td>\n",
662 |        "    </tr>\n",
663 |        "    <tr>\n",
664 |        "      <th>33</th>\n",
665 |        "      <td>fanniemae</td>\n",
666 |        "      <td>zstd-1</td>\n",
667 |        "      <td>0.760098</td>\n",
668 |        "      <td>16384</td>\n",
669 |        "    </tr>\n",
670 |        "    <tr>\n",
671 |        "      <th>34</th>\n",
672 |        "      <td>fanniemae</td>\n",
673 |        "      <td>zstd-10</td>\n",
674 |        "      <td>0.559905</td>\n",
675 |        "      <td>16384</td>\n",
676 |        "    </tr>\n",
677 |        "    <tr>\n",
678 |        "      <th>35</th>\n",
679 |        "      <td>fanniemae</td>\n",
680 |        "      <td>lz4</td>\n",
681 |        "      <td>0.481683</td>\n",
682 |        "      <td>16384</td>\n",
683 |        "    </tr>\n",
684 |        "    <tr>\n",
685 |        "      <th>36</th>\n",
686 |        "      <td>nyctaxi</td>\n",
687 |        "      <td>uncompressed</td>\n",
688 |        "      <td>0.016135</td>\n",
689 |        "      <td>16384</td>\n",
690 |        "    </tr>\n",
691 |        "    <tr>\n",
692 |        "      <th>37</th>\n",
693 |        "      <td>nyctaxi</td>\n",
694 |        "      <td>zstd-1</td>\n",
695 |        "      <td>0.697351</td>\n",
696 |        "      <td>16384</td>\n",
697 |        "    </tr>\n",
698 |        "    <tr>\n",
699 |        "      <th>38</th>\n",
700 |        "      <td>nyctaxi</td>\n",
701 |        "      <td>zstd-10</td>\n",
702 |        "      <td>0.556723</td>\n",
703 |        "      <td>16384</td>\n",
704 |        "    </tr>\n",
705 |        "    <tr>\n",
706 |        "      <th>39</th>\n",
707 |        "      <td>nyctaxi</td>\n",
708 |        "      <td>lz4</td>\n",
709 |        "      <td>0.297602</td>\n",
710 |        "      <td>16384</td>\n",
711 |        "    </tr>\n",
712 |        "    <tr>\n",
713 |        "      <th>40</th>\n",
714 |        "      <td>fanniemae</td>\n",
715 |        "      <td>uncompressed</td>\n",
716 |        "      <td>0.021817</td>\n",
717 |        "      <td>32768</td>\n",
718 |        "    </tr>\n",
719 |        "    <tr>\n",
720 |        "      <th>41</th>\n",
721 |        "      <td>fanniemae</td>\n",
722 |        "      <td>zstd-1</td>\n",
723 |        "      <td>0.638263</td>\n",
724 |        "      <td>32768</td>\n",
725 |        "    </tr>\n",
726 |        "    <tr>\n",
727 |        "      <th>42</th>\n",
728 |        "      <td>fanniemae</td>\n",
729 |        "      <td>zstd-10</td>\n",
730 |        "      <td>0.476892</td>\n",
731 |        "      <td>32768</td>\n",
732 |        "    </tr>\n",
733 |        "    <tr>\n",
734 |        "      <th>43</th>\n",
735 |        "      <td>fanniemae</td>\n",
736 |        "      <td>lz4</td>\n",
737 |        "      <td>0.418044</td>\n",
738 |        "      <td>32768</td>\n",
739 |        "    </tr>\n",
740 |        "    <tr>\n",
741 |        "      <th>44</th>\n",
742 |        "      <td>nyctaxi</td>\n",
743 |        "      <td>uncompressed</td>\n",
744 |        "      <td>0.008242</td>\n",
745 |        "      <td>32768</td>\n",
746 |        "    </tr>\n",
747 |        "    <tr>\n",
748 |        "      <th>45</th>\n",
749 |        "      <td>nyctaxi</td>\n",
750 |        "      <td>zstd-1</td>\n",
751 |        "      <td>0.712379</td>\n",
752 |        "      <td>32768</td>\n",
753 |        "    </tr>\n",
754 |        "    <tr>\n",
755 |        "      <th>46</th>\n",
756 |        "      <td>nyctaxi</td>\n",
757 |        "      <td>zstd-10</td>\n",
758 |        "      <td>0.530141</td>\n",
759 |        "      <td>32768</td>\n",
760 |        "    </tr>\n",
761 |        "    <tr>\n",
762 |        "      <th>47</th>\n",
763 |        "      <td>nyctaxi</td>\n",
764 |        "      <td>lz4</td>\n",
765 |        "      <td>0.288290</td>\n",
766 |        "      <td>32768</td>\n",
767 |        "    </tr>\n",
768 |        "    <tr>\n",
769 |        "      <th>48</th>\n",
770 |        "      <td>fanniemae</td>\n",
771 |        "      <td>uncompressed</td>\n",
772 |        "      <td>0.010546</td>\n",
773 |        "      <td>65536</td>\n",
774 |        "    </tr>\n",
775 |        "    <tr>\n",
776 |        "      <th>49</th>\n",
777 |        "      <td>fanniemae</td>\n",
778 |        "      <td>zstd-1</td>\n",
779 |        "      <td>0.595585</td>\n",
780 |        "      <td>65536</td>\n",
781 |        "    </tr>\n",
782 |        "    <tr>\n",
783 |        "      <th>50</th>\n",
784 |        "      <td>fanniemae</td>\n",
785 |        "      <td>zstd-10</td>\n",
786 |        "      <td>0.440694</td>\n",
787 |        "      <td>65536</td>\n",
788 |        "    </tr>\n",
789 |        "    <tr>\n",
790 |        "      <th>51</th>\n",
791 |        "      <td>fanniemae</td>\n",
792 |        "      <td>lz4</td>\n",
793 |        "      <td>0.395174</td>\n",
794 |        "      <td>65536</td>\n",
795 |        "    </tr>\n",
796 |        "    <tr>\n",
797 |        "      <th>52</th>\n",
798 |        "      <td>nyctaxi</td>\n",
799 |        "      <td>uncompressed</td>\n",
800 |        "      <td>0.006750</td>\n",
801 |        "      <td>65536</td>\n",
802 |        "    </tr>\n",
803 |        "    <tr>\n",
804 |        "      <th>53</th>\n",
805 |        "      <td>nyctaxi</td>\n",
806 |        "      <td>zstd-1</td>\n",
807 |        "      <td>0.593057</td>\n",
808 |        "      <td>65536</td>\n",
809 |        "    </tr>\n",
810 |        "    <tr>\n",
811 |        "      <th>54</th>\n",
812 |        "      <td>nyctaxi</td>\n",
813 |        "      <td>zstd-10</td>\n",
814 |        "      <td>0.467952</td>\n",
815 |        "      <td>65536</td>\n",
816 |        "    </tr>\n",
817 |        "    <tr>\n",
818 |        "      <th>55</th>\n",
819 |        "      <td>nyctaxi</td>\n",
820 |        "      <td>lz4</td>\n",
821 |        "      <td>0.277783</td>\n",
822 |        "      <td>65536</td>\n",
823 |        "    </tr>\n",
824 |        "  </tbody>\n",
825 |        "</table>\n",
826 |        "</div>"
827 |       ],
828 |       "text/plain": [
829 |        "      dataset         codec  read_time  chunksize\n",
830 |        "0   fanniemae  uncompressed   1.285094       1024\n",
831 |        "1   fanniemae        zstd-1   3.586269       1024\n",
832 |        "2   fanniemae       zstd-10   3.704591       1024\n",
833 |        "3   fanniemae           lz4   3.590986       1024\n",
834 |        "4     nyctaxi  uncompressed   0.608589       1024\n",
835 |        "5     nyctaxi        zstd-1   1.891127       1024\n",
836 |        "6     nyctaxi       zstd-10   1.665766       1024\n",
837 |        "7     nyctaxi           lz4   1.227717       1024\n",
838 |        "8   fanniemae  uncompressed   0.472191       2048\n",
839 |        "9   fanniemae        zstd-1   2.057420       2048\n",
840 |        "10  fanniemae       zstd-10   1.798960       2048\n",
841 |        "11  fanniemae           lz4   1.634048       2048\n",
842 |        "12    nyctaxi  uncompressed   0.171463       2048\n",
843 |        "13    nyctaxi        zstd-1   1.287250       2048\n",
844 |        "14    nyctaxi       zstd-10   1.045773       2048\n",
845 |        "15    nyctaxi           lz4   0.730872       2048\n",
846 |        "16  fanniemae  uncompressed   0.193799       4096\n",
847 |        "17  fanniemae        zstd-1   1.314903       4096\n",
848 |        "18  fanniemae       zstd-10   1.107955       4096\n",
849 |        "19  fanniemae           lz4   0.983337       4096\n",
850 |        "20    nyctaxi  uncompressed   0.089300       4096\n",
851 |        "21    nyctaxi        zstd-1   1.009519       4096\n",
852 |        "22    nyctaxi       zstd-10   0.806882       4096\n",
853 |        "23    nyctaxi           lz4   0.495844       4096\n",
854 |        "24  fanniemae  uncompressed   0.101595       8192\n",
855 |        "25  fanniemae        zstd-1   0.897038       8192\n",
856 |        "26  fanniemae       zstd-10   0.713981       8192\n",
857 |        "27  fanniemae           lz4   0.635939       8192\n",
858 |        "28    nyctaxi  uncompressed   0.037341       8192\n",
859 |        "29    nyctaxi        zstd-1   0.734890       8192\n",
860 |        "30    nyctaxi       zstd-10   0.606730       8192\n",
861 |        "31    nyctaxi           lz4   0.350030       8192\n",
862 |        "32  fanniemae  uncompressed   0.048579      16384\n",
863 |        "33  fanniemae        zstd-1   0.760098      16384\n",
864 |        "34  fanniemae       zstd-10   0.559905      16384\n",
865 |        "35  fanniemae           lz4   0.481683      16384\n",
866 |        "36    nyctaxi  uncompressed   0.016135      16384\n",
867 |        "37    nyctaxi        zstd-1   0.697351      16384\n",
868 |        "38    nyctaxi       zstd-10   0.556723      16384\n",
869 |        "39    nyctaxi           lz4   0.297602      16384\n",
870 |        "40  fanniemae  uncompressed   0.021817      32768\n",
871 |        "41  fanniemae        zstd-1   0.638263      32768\n",
872 |        "42  fanniemae       zstd-10   0.476892      32768\n",
873 |        "43  fanniemae           lz4   0.418044      32768\n",
874 |        "44    nyctaxi  uncompressed   0.008242      32768\n",
875 |        "45    nyctaxi        zstd-1   0.712379      32768\n",
876 |        "46    nyctaxi       zstd-10   0.530141      32768\n",
877 |        "47    nyctaxi           lz4   0.288290      32768\n",
878 |        "48  fanniemae  uncompressed   0.010546      65536\n",
879 |        "49  fanniemae        zstd-1   0.595585      65536\n",
880 |        "50  fanniemae       zstd-10   0.440694      65536\n",
881 |        "51  fanniemae           lz4   0.395174      65536\n",
882 |        "52    nyctaxi  uncompressed   0.006750      65536\n",
883 |        "53    nyctaxi        zstd-1   0.593057      65536\n",
884 |        "54    nyctaxi       zstd-10   0.467952      65536\n",
885 |        "55    nyctaxi           lz4   0.277783      65536"
886 |       ]
887 |      },
888 |      "execution_count": 8,
889 |      "metadata": {},
890 |      "output_type": "execute_result"
891 |     }
892 |    ],
893 |    "source": [
894 |     "reads"
895 |    ]
896 |   },
897 |   {
898 |    "cell_type": "code",
899 |    "execution_count": null,
900 |    "metadata": {},
901 |    "outputs": [],
902 |    "source": []
903 |   }
904 |  ],
905 |  "metadata": {
906 |   "kernelspec": {
907 |    "display_name": "Python 3",
908 |    "language": "python",
909 |    "name": "python3"
910 |   },
911 |   "language_info": {
912 |    "codemirror_mode": {
913 |     "name": "ipython",
914 |     "version": 3
915 |    },
916 |    "file_extension": ".py",
917 |    "mimetype": "text/x-python",
918 |    "name": "python",
919 |    "nbconvert_exporter": "python",
920 |    "pygments_lexer": "ipython3",
921 |    "version": "3.7.6"
922 |   }
923 |  },
924 |  "nbformat": 4,
925 |  "nbformat_minor": 4
926 | }
927 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/all_read_results.csv:
--------------------------------------------------------------------------------
 1 | expr,time,dataset,output_type,nthreads,language
 2 | csv_fread,17.535751806,fanniemae,R data.frame,1,R
 3 | fst (UNC),5.83356695,fanniemae,R data.frame,1,R
 4 | fst (c=50),5.875382178,fanniemae,R data.frame,1,R
 5 | feather V1,10.078519502799999,fanniemae,R data.frame,1,R
 6 | feather V2 (UNC),4.7198155451999995,fanniemae,R data.frame,1,R
 7 | feather V2 (LZ4),5.852145495199999,fanniemae,R data.frame,1,R
 8 | feather V2 (ZSTD),7.77908361,fanniemae,R data.frame,1,R
 9 | parquet (UNC),9.4933916048,fanniemae,R data.frame,1,R
10 | parquet (SNAPPY),9.911315661200002,fanniemae,R data.frame,1,R
11 | RDS (UNC),30.2670197082,fanniemae,R data.frame,1,R
12 | RDS (C),41.482849064199996,fanniemae,R data.frame,1,R
13 | csv_fread,23.370041255,nyctaxi,R data.frame,1,R
14 | fst (UNC),13.017416436,nyctaxi,R data.frame,1,R
15 | fst (c=50),12.6347099714,nyctaxi,R data.frame,1,R
16 | feather V1,13.443664009399999,nyctaxi,R data.frame,1,R
17 | feather V2 (UNC),11.3714301042,nyctaxi,R data.frame,1,R
18 | feather V2 (LZ4),13.29604463,nyctaxi,R data.frame,1,R
19 | feather V2 (ZSTD),14.5943007722,nyctaxi,R data.frame,1,R
20 | parquet (UNC),13.1586667582,nyctaxi,R data.frame,1,R
21 | parquet (SNAPPY),13.958228992,nyctaxi,R data.frame,1,R
22 | RDS (UNC),22.211784820200002,nyctaxi,R data.frame,1,R
23 | RDS (C),30.765105346200002,nyctaxi,R data.frame,1,R
24 | parquet (UNC),6.126083183288574,fanniemae,arrow Table,1,Python
25 | parquet (UNC),9.3643874168396,fanniemae,pandas,1,Python
26 | parquet (SNAPPY),6.056532478332518,fanniemae,arrow Table,1,Python
27 | parquet (SNAPPY),9.177780771255494,fanniemae,pandas,1,Python
28 | feather V2 (UNC),4.354116058349609,fanniemae,pandas,1,Python
29 | feather V2 (LZ4),4.396533584594726,fanniemae,pandas,1,Python
30 | feather V2 (ZSTD),5.775776481628418,fanniemae,pandas,1,Python
31 | feather V2 (UNC),1.0860649585723876,fanniemae,arrow Table,1,Python
32 | feather V2 (LZ4),1.0962132453918456,fanniemae,arrow Table,1,Python
33 | feather V2 (ZSTD),2.531323909759521,fanniemae,arrow Table,1,Python
34 | parquet (UNC),2.2780594348907472,nyctaxi,arrow Table,1,Python
35 | parquet (UNC),9.222453880310061,nyctaxi,pandas,1,Python
36 | parquet (SNAPPY),2.8247000694274904,nyctaxi,arrow Table,1,Python
37 | parquet (SNAPPY),9.735122680664062,nyctaxi,pandas,1,Python
38 | feather V2 (UNC),7.608278465270996,nyctaxi,pandas,1,Python
39 | feather V2 (LZ4),7.784061861038206,nyctaxi,pandas,1,Python
40 | feather V2 (ZSTD),9.633673095703122,nyctaxi,pandas,1,Python
41 | feather V2 (UNC),0.5403317451477051,nyctaxi,arrow Table,1,Python
42 | feather V2 (LZ4),0.9643253803253172,nyctaxi,arrow Table,1,Python
43 | feather V2 (ZSTD),2.7800182342529296,nyctaxi,arrow Table,1,Python
44 | csv_fread,8.036938666600001,fanniemae,R data.frame,4,R
45 | fst (UNC),6.3416014972,fanniemae,R data.frame,4,R
46 | fst (c=50),5.0547549678,fanniemae,R data.frame,4,R
47 | feather V1,9.799018014,fanniemae,R data.frame,4,R
48 | feather V2 (UNC),5.0542017474,fanniemae,R data.frame,4,R
49 | feather V2 (LZ4),4.928118181,fanniemae,R data.frame,4,R
50 | feather V2 (ZSTD),5.5355538286,fanniemae,R data.frame,4,R
51 | parquet (UNC),6.281569166600001,fanniemae,R data.frame,4,R
52 | parquet (SNAPPY),6.3922376926,fanniemae,R data.frame,4,R
53 | RDS (UNC),29.8928874914,fanniemae,R data.frame,4,R
54 | RDS (C),41.273872293800004,fanniemae,R data.frame,4,R
55 | csv_fread,18.312046954,nyctaxi,R data.frame,4,R
56 | fst (UNC),11.9693504656,nyctaxi,R data.frame,4,R
57 | fst (c=50),13.4391470686,nyctaxi,R data.frame,4,R
58 | feather V1,12.034649945,nyctaxi,R data.frame,4,R
59 | feather V2 (UNC),11.0239614322,nyctaxi,R data.frame,4,R
60 | feather V2 (LZ4),11.592801001,nyctaxi,R data.frame,4,R
61 | feather V2 (ZSTD),12.704684877,nyctaxi,R data.frame,4,R
62 | parquet (UNC),12.225668849,nyctaxi,R data.frame,4,R
63 | parquet (SNAPPY),12.0044663816,nyctaxi,R data.frame,4,R
64 | RDS (UNC),21.847153904,nyctaxi,R data.frame,4,R
65 | RDS (C),30.735937022799998,nyctaxi,R data.frame,4,R
66 | parquet (UNC),1.841284704208374,fanniemae,arrow Table,4,Python
67 | parquet (UNC),4.0880148887634284,fanniemae,pandas,4,Python
68 | parquet (SNAPPY),1.8786502361297608,fanniemae,arrow Table,4,Python
69 | parquet (SNAPPY),4.165652704238892,fanniemae,pandas,4,Python
70 | feather V2 (UNC),3.5610058307647705,fanniemae,pandas,4,Python
71 | feather V2 (LZ4),2.778682994842529,fanniemae,pandas,4,Python
72 | feather V2 (ZSTD),3.0616337299346923,fanniemae,pandas,4,Python
73 | feather V2 (UNC),1.1269856452941895,fanniemae,arrow Table,4,Python
74 | feather V2 (LZ4),0.4898182392120362,fanniemae,arrow Table,4,Python
75 | feather V2 (ZSTD),0.8093690395355224,fanniemae,arrow Table,4,Python
76 | parquet (UNC),0.6995339870452881,nyctaxi,arrow Table,4,Python
77 | parquet (UNC),7.4361457347869875,nyctaxi,pandas,4,Python
78 | parquet (SNAPPY),0.78084397315979,nyctaxi,arrow Table,4,Python
79 | parquet (SNAPPY),7.540273284912108,nyctaxi,pandas,4,Python
80 | feather V2 (UNC),7.369460582733153,nyctaxi,pandas,4,Python
81 | feather V2 (LZ4),7.119231033325195,nyctaxi,pandas,4,Python
82 | feather V2 (ZSTD),7.537483549118043,nyctaxi,pandas,4,Python
83 | feather V2 (UNC),0.6116453170776367,nyctaxi,arrow Table,4,Python
84 | feather V2 (LZ4),0.4065845012664795,nyctaxi,arrow Table,4,Python
85 | feather V2 (ZSTD),0.8925417900085449,nyctaxi,arrow Table,4,Python
86 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/all_results.csv:
--------------------------------------------------------------------------------
 1 | expr,time,dataset,output_type,nthreads,language
 2 | R csv_fread,17.8678609836,fanniemae,R data.frame,1,R
 3 | R fst,5.7596893994,fanniemae,R data.frame,1,R
 4 | feather (UNC),4.4072281468000005,fanniemae,R data.frame,1,R
 5 | feather (LZ4),6.0330426373999995,fanniemae,R data.frame,1,R
 6 | feather (ZSTD),7.526674342,fanniemae,R data.frame,1,R
 7 | parquet (SNAPPY),10.0367648462,fanniemae,R data.frame,1,R
 8 | R csv_fread,24.648797387400002,nyctaxi,R data.frame,1,R
 9 | R fst,13.142260905799999,nyctaxi,R data.frame,1,R
10 | feather (UNC),10.96529547,nyctaxi,R data.frame,1,R
11 | feather (LZ4),11.801702598,nyctaxi,R data.frame,1,R
12 | feather (ZSTD),14.2444990752,nyctaxi,R data.frame,1,R
13 | parquet (SNAPPY),13.157420057,nyctaxi,R data.frame,1,R
14 | parquet (SNAPPY),5.7931968688964846,fanniemae,arrow Table,1,Python
15 | parquet (SNAPPY),9.107409811019897,fanniemae,pandas,1,Python
16 | feather (UNC),4.035408067703247,fanniemae,pandas,1,Python
17 | feather (LZ4),4.295090818405152,fanniemae,pandas,1,Python
18 | feather (ZSTD),5.678592157363893,fanniemae,pandas,1,Python
19 | feather (UNC),1.2464978694915771,fanniemae,arrow Table,1,Python
20 | feather (LZ4),1.062558937072754,fanniemae,arrow Table,1,Python
21 | feather (ZSTD),2.471682643890381,fanniemae,arrow Table,1,Python
22 | parquet (SNAPPY),2.7657272338867194,nyctaxi,arrow Table,1,Python
23 | parquet (SNAPPY),9.840531587600706,nyctaxi,pandas,1,Python
24 | feather (UNC),7.5906150341033936,nyctaxi,pandas,1,Python
25 | feather (LZ4),7.9236814975738525,nyctaxi,pandas,1,Python
26 | feather (ZSTD),9.791791486740113,nyctaxi,pandas,1,Python
27 | feather (UNC),0.6637681007385254,nyctaxi,arrow Table,1,Python
28 | feather (LZ4),1.0227035522460937,nyctaxi,arrow Table,1,Python
29 | feather (ZSTD),2.77500696182251,nyctaxi,arrow Table,1,Python
30 | R csv_fread,8.381513095,fanniemae,R data.frame,4,R
31 | R fst,4.8154870964,fanniemae,R data.frame,4,R
32 | feather (UNC),4.8105564258,fanniemae,R data.frame,4,R
33 | feather (LZ4),5.4882766928,fanniemae,R data.frame,4,R
34 | feather (ZSTD),5.986291964,fanniemae,R data.frame,4,R
35 | parquet (SNAPPY),6.7089619354,fanniemae,R data.frame,4,R
36 | R csv_fread,19.3027468002,nyctaxi,R data.frame,4,R
37 | R fst,13.0800444294,nyctaxi,R data.frame,4,R
38 | feather (UNC),11.8721187678,nyctaxi,R data.frame,4,R
39 | feather (LZ4),12.5549529788,nyctaxi,R data.frame,4,R
40 | feather (ZSTD),12.829650966600001,nyctaxi,R data.frame,4,R
41 | parquet (SNAPPY),12.7536964852,nyctaxi,R data.frame,4,R
42 | parquet (SNAPPY),1.8717081069946289,fanniemae,arrow Table,4,Python
43 | parquet (SNAPPY),4.098778772354127,fanniemae,pandas,4,Python
44 | feather (UNC),3.539084482192993,fanniemae,pandas,4,Python
45 | feather (LZ4),2.8530110359191894,fanniemae,pandas,4,Python
46 | feather (ZSTD),3.06166353225708,fanniemae,pandas,4,Python
47 | feather (UNC),1.3176395416259763,fanniemae,arrow Table,4,Python
48 | feather (LZ4),0.4744390964508057,fanniemae,arrow Table,4,Python
49 | feather (ZSTD),0.7838622570037842,fanniemae,arrow Table,4,Python
50 | parquet (SNAPPY),0.8635732173919678,nyctaxi,arrow Table,4,Python
51 | parquet (SNAPPY),7.623702335357666,nyctaxi,pandas,4,Python
52 | feather (UNC),7.328182792663574,nyctaxi,pandas,4,Python
53 | feather (LZ4),7.2832419872283936,nyctaxi,pandas,4,Python
54 | feather (ZSTD),8.017264556884765,nyctaxi,pandas,4,Python
55 | feather (UNC),0.6738637924194336,nyctaxi,arrow Table,4,Python
56 | feather (LZ4),0.4330804347991944,nyctaxi,arrow Table,4,Python
57 | feather (ZSTD),0.9005756855010987,nyctaxi,arrow Table,4,Python
58 | R csv_fread,8.235247531,fanniemae,R data.frame,8,R
59 | R fst,4.5943393692,fanniemae,R data.frame,8,R
60 | feather (UNC),4.7164801714,fanniemae,R data.frame,8,R
61 | feather (LZ4),4.6001075036,fanniemae,R data.frame,8,R
62 | feather (ZSTD),5.166106334399999,fanniemae,R data.frame,8,R
63 | parquet (SNAPPY),5.9058646954,fanniemae,R data.frame,8,R
64 | R csv_fread,17.998316013,nyctaxi,R data.frame,8,R
65 | R fst,13.064559282,nyctaxi,R data.frame,8,R
66 | feather (UNC),11.93319899,nyctaxi,R data.frame,8,R
67 | feather (LZ4),12.5654696644,nyctaxi,R data.frame,8,R
68 | feather (ZSTD),12.1251017998,nyctaxi,R data.frame,8,R
69 | parquet (SNAPPY),11.4879469076,nyctaxi,R data.frame,8,R
70 | parquet (SNAPPY),1.3059203624725342,fanniemae,arrow Table,8,Python
71 | parquet (SNAPPY),3.710281848907471,fanniemae,pandas,8,Python
72 | feather (UNC),3.67109489440918,fanniemae,pandas,8,Python
73 | feather (LZ4),2.8483234405517583,fanniemae,pandas,8,Python
74 | feather (ZSTD),2.943112850189209,fanniemae,pandas,8,Python
75 | feather (UNC),1.3228723049163815,fanniemae,arrow Table,8,Python
76 | feather (LZ4),0.4322311401367188,fanniemae,arrow Table,8,Python
77 | feather (ZSTD),0.5514030933380127,fanniemae,arrow Table,8,Python
78 | parquet (SNAPPY),0.7145666599273681,nyctaxi,arrow Table,8,Python
79 | parquet (SNAPPY),7.5506598472595226,nyctaxi,pandas,8,Python
80 | feather (UNC),7.442094039916992,nyctaxi,pandas,8,Python
81 | feather (LZ4),7.163635158538819,nyctaxi,pandas,8,Python
82 | feather (ZSTD),7.376304483413696,nyctaxi,pandas,8,Python
83 | feather (UNC),0.638268232345581,nyctaxi,arrow Table,8,Python
84 | feather (LZ4),0.3298566818237305,nyctaxi,arrow Table,8,Python
85 | feather (ZSTD),0.576887559890747,nyctaxi,arrow Table,8,Python
86 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/all_write_results.csv:
--------------------------------------------------------------------------------
 1 | expr,time,dataset,output_type,nthreads,language
 2 | fst (UNC),7.00702392,fanniemae,R data.frame,1,R
 3 | fst (c=50),4.385196419,fanniemae,R data.frame,1,R
 4 | feather V1,8.656647228,fanniemae,R data.frame,1,R
 5 | feather V2 (UNC),10.040626659,fanniemae,R data.frame,1,R
 6 | feather V2 (LZ4),10.818098194,fanniemae,R data.frame,1,R
 7 | feather V2 (ZSTD),11.438481575,fanniemae,R data.frame,1,R
 8 | parquet (UNC),10.434816898,fanniemae,R data.frame,1,R
 9 | parquet (SNAPPY),10.800951873,fanniemae,R data.frame,1,R
10 | RDS (C),76.929230341,fanniemae,R data.frame,1,R
11 | RDS (UNC),24.216423401,fanniemae,R data.frame,1,R
12 | fst (UNC),4.08787925,nyctaxi,R data.frame,1,R
13 | fst (c=50),3.950344461,nyctaxi,R data.frame,1,R
14 | feather V1,5.97229482,nyctaxi,R data.frame,1,R
15 | feather V2 (UNC),5.888590985,nyctaxi,R data.frame,1,R
16 | feather V2 (LZ4),8.325439328,nyctaxi,R data.frame,1,R
17 | feather V2 (ZSTD),10.223231254,nyctaxi,R data.frame,1,R
18 | parquet (UNC),7.71564074,nyctaxi,R data.frame,1,R
19 | parquet (SNAPPY),8.585539352,nyctaxi,R data.frame,1,R
20 | RDS (C),104.898052261,nyctaxi,R data.frame,1,R
21 | RDS (UNC),10.739751088,nyctaxi,R data.frame,1,R
22 | parquet (UNC),6.220219850540161,fanniemae,arrow Table,1,Python
23 | parquet (UNC),12.395264983177185,fanniemae,pandas,1,Python
24 | parquet (SNAPPY),6.694774866104126,fanniemae,arrow Table,1,Python
25 | parquet (SNAPPY),13.161320447921753,fanniemae,pandas,1,Python
26 | feather V2 (UNC),12.677234172821045,fanniemae,pandas,1,Python
27 | feather V2 (UNC),6.397535443305969,fanniemae,arrow Table,1,Python
28 | feather V2 (LZ4),8.32238781452179,fanniemae,pandas,1,Python
29 | feather V2 (LZ4),2.2326916456222534,fanniemae,arrow Table,1,Python
30 | feather V2 (ZSTD),10.61594545841217,fanniemae,pandas,1,Python
31 | feather V2 (ZSTD),4.308579444885254,fanniemae,arrow Table,1,Python
32 | parquet (UNC),4.5986950397491455,nyctaxi,arrow Table,1,Python
33 | parquet (UNC),9.009780049324037,nyctaxi,pandas,1,Python
34 | parquet (SNAPPY),5.70121443271637,nyctaxi,arrow Table,1,Python
35 | parquet (SNAPPY),10.175373315811155,nyctaxi,pandas,1,Python
36 | feather V2 (UNC),7.1334041357040405,nyctaxi,pandas,1,Python
37 | feather V2 (UNC),3.112175464630127,nyctaxi,arrow Table,1,Python
38 | feather V2 (LZ4),7.4143136739730835,nyctaxi,pandas,1,Python
39 | feather V2 (LZ4),3.567118763923645,nyctaxi,arrow Table,1,Python
40 | feather V2 (ZSTD),11.283223748207092,nyctaxi,pandas,1,Python
41 | feather V2 (ZSTD),6.928452372550964,nyctaxi,arrow Table,1,Python
42 | fst (UNC),7.758567831,fanniemae,R data.frame,4,R
43 | fst (c=50),3.700873556,fanniemae,R data.frame,4,R
44 | feather V1,7.08059183,fanniemae,R data.frame,4,R
45 | feather V2 (UNC),10.413025112,fanniemae,R data.frame,4,R
46 | feather V2 (LZ4),10.818213516,fanniemae,R data.frame,4,R
47 | feather V2 (ZSTD),11.563816777,fanniemae,R data.frame,4,R
48 | parquet (UNC),10.814584911,fanniemae,R data.frame,4,R
49 | parquet (SNAPPY),11.152511189,fanniemae,R data.frame,4,R
50 | RDS (C),78.42714811,fanniemae,R data.frame,4,R
51 | RDS (UNC),24.919762665,fanniemae,R data.frame,4,R
52 | fst (UNC),4.399914353,nyctaxi,R data.frame,4,R
53 | fst (c=50),3.305661431,nyctaxi,R data.frame,4,R
54 | feather V1,5.47744372,nyctaxi,R data.frame,4,R
55 | feather V2 (UNC),5.864371601,nyctaxi,R data.frame,4,R
56 | feather V2 (LZ4),8.494803995,nyctaxi,R data.frame,4,R
57 | feather V2 (ZSTD),10.073068744,nyctaxi,R data.frame,4,R
58 | parquet (UNC),7.675560036,nyctaxi,R data.frame,4,R
59 | parquet (SNAPPY),8.428579617,nyctaxi,R data.frame,4,R
60 | RDS (C),108.234060692,nyctaxi,R data.frame,4,R
61 | RDS (UNC),10.717121094,nyctaxi,R data.frame,4,R
62 | parquet (UNC),6.162686586380005,fanniemae,arrow Table,4,Python
63 | parquet (UNC),11.565850496292114,fanniemae,pandas,4,Python
64 | parquet (SNAPPY),6.410535216331482,fanniemae,arrow Table,4,Python
65 | parquet (SNAPPY),11.6298109292984,fanniemae,pandas,4,Python
66 | feather V2 (UNC),11.104193806648254,fanniemae,pandas,4,Python
67 | feather V2 (UNC),5.889622092247009,fanniemae,arrow Table,4,Python
68 | feather V2 (LZ4),6.612253308296204,fanniemae,pandas,4,Python
69 | feather V2 (LZ4),1.306950330734253,fanniemae,arrow Table,4,Python
70 | feather V2 (ZSTD),7.202290296554565,fanniemae,pandas,4,Python
71 | feather V2 (ZSTD),1.8320761919021609,fanniemae,arrow Table,4,Python
72 | parquet (UNC),4.338123440742494,nyctaxi,arrow Table,4,Python
73 | parquet (UNC),8.028993129730225,nyctaxi,pandas,4,Python
74 | parquet (SNAPPY),5.622675895690918,nyctaxi,arrow Table,4,Python
75 | parquet (SNAPPY),9.33586835861206,nyctaxi,pandas,4,Python
76 | feather V2 (UNC),6.233096599578857,nyctaxi,pandas,4,Python
77 | feather V2 (UNC),2.994387269020081,nyctaxi,arrow Table,4,Python
78 | feather V2 (LZ4),5.67785370349884,nyctaxi,pandas,4,Python
79 | feather V2 (LZ4),2.289505124092102,nyctaxi,arrow Table,4,Python
80 | feather V2 (ZSTD),6.161942005157472,nyctaxi,pandas,4,Python
81 | feather V2 (ZSTD),2.7954366207122803,nyctaxi,arrow Table,4,Python
82 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/benchmark.R:
--------------------------------------------------------------------------------
  1 | library(fst)
  2 | library(microbenchmark)
  3 | library(data.table)
  4 | library(arrow)
  5 | library(feather)
  6 | library(stringr)
  7 | library(dplyr)
  8 | 
# Base names (no directory, no extension) of the benchmark datasets; the
# functions below append suffixes such as "_snappy.parquet" to them.
files <- c("2016Q4", "yellow_tripdata_2010-01")
# Dataset labels written to the `dataset` column of the results,
# index-aligned with `files`.
names <- c("fanniemae", "nyctaxi")
# Field separator passed to fread() for each source CSV, index-aligned with
# `files`.
seps <- c("|", ",")
 12 | 
 13 | create_files <- function(base) {
 14 |   df <- arrow::read_parquet(str_c(base, "_snappy.parquet"))
 15 |   feather::write_feather(df, str_c(base, "_v1.feather"))
 16 |   fst::write_fst(df, str_c(base, "_0.fst"), compress=0)
 17 |   fst::write_fst(df, str_c(base, "_50.fst"), compress=50)
 18 |   saveRDS(df, str_c(base, "_compressed.rds"), compress=TRUE)
 19 |   saveRDS(df, str_c(base, "_uncompressed.rds"), compress=FALSE)
 20 | }
 21 | 
 22 | do_benchmark <- function(index) {
 23 |   base <- files[index]
 24 |   sep <- seps[index]
 25 | 
 26 |   csv_path <- str_c("data/", base, ".csv")
 27 |   feather_v1_path <- str_c(base, "_v1.feather")
 28 |   feather_unc_path <- str_c(base, "_uncompressed.feather")
 29 |   feather_lz4_path <- str_c(base, "_lz4.feather")
 30 |   feather_zstd_path <- str_c(base, "_zstd.feather")
 31 |   fst_0_path <- str_c(base, "_0.fst")
 32 |   fst_50_path <- str_c(base, "_50.fst")
 33 |   parquet_unc_path <- str_c(base, "_uncompressed.parquet")
 34 |   parquet_snappy_path <- str_c(base, "_snappy.parquet")
 35 |   rds_unc_path <- str_c(base, "_uncompressed.rds")
 36 |   rds_compressed_path <- str_c(base, "_compressed.rds")
 37 | 
 38 |   mbm <- microbenchmark(
 39 |      csv_fread=data.table::fread(csv_path, sep=sep, header=FALSE),
 40 |      fst_unc=fst::read_fst(fst_0_path),
 41 |      fst_50=fst::read_fst(fst_50_path),
 42 |      feather_v1=feather::read_feather(feather_v1_path),
 43 |      feather_unc=arrow::read_feather(feather_unc_path),
 44 |      feather_lz4=arrow::read_feather(feather_lz4_path),
 45 |      feather_zstd=arrow::read_feather(feather_zstd_path),
 46 |      parquet_unc=arrow::read_parquet(parquet_unc_path),
 47 |      parquet_snappy=arrow::read_parquet(parquet_snappy_path),
 48 |      rds_unc=readRDS(rds_unc_path),
 49 |      rds_compressed=readRDS(rds_compressed_path),
 50 |      times=5
 51 |   )
 52 |   mbm <- data.frame(mbm) %>% dplyr::group_by(expr) %>% dplyr::summarize(time=mean(time))
 53 |   mbm$dataset <- names[index]
 54 |   mbm
 55 | }
 56 | 
 57 | do_write_benchmark <- function(index) {
 58 |   base <- files[index]
 59 |   sep <- seps[index]
 60 | 
 61 |   df <- arrow::read_parquet(str_c(base, "_snappy.parquet"))
 62 | 
 63 |   mbm <- microbenchmark(
 64 |      fst_unc=fst::write_fst(df, str_c(base, "_0.fst"), compress=0),
 65 |      fst_50=fst::write_fst(df, str_c(base, "_50.fst"), compress=50),
 66 |      feather_v1=feather::write_feather(df, str_c(base, "_v1.feather")),
 67 |      feather_unc=arrow::write_feather(df, str_c(base, "_unc_r.feather"),
 68 |           compression="uncompressed"),
 69 |      feather_lz4=arrow::write_parquet(df, str_c(base, "_lz4_r.feather"),
 70 |           compression="lz4"),
 71 |      feather_zstd=arrow::write_parquet(df, str_c(base, "_zstd_r.feather"),
 72 |           compression="zstd"),
 73 |      parquet_unc=arrow::write_parquet(df, str_c(base, "_unc_r.parquet"),
 74 |           compression="uncompressed"),
 75 |      parquet_snappy=arrow::write_parquet(df, str_c(base, "_snappy_r.parquet"),
 76 |           compression="snappy"),
 77 |      rds_compressed=saveRDS(df, str_c(base, "_compressed.rds"), compress=TRUE),
 78 |      rds_unc=saveRDS(df, str_c(base, "_uncompressed.rds"), compress=FALSE),
 79 |      times=1
 80 |   )
 81 |   mbm <- data.frame(mbm) %>% dplyr::group_by(expr) %>% dplyr::summarize(time=mean(time))
 82 |   mbm$dataset <- names[index]
 83 |   mbm
 84 | }
 85 | 
 86 | generate_files <- function() {
 87 |   for (base in files) {
 88 |     create_files(base)
 89 |   }
 90 | }
 91 | 
 92 | # generate_files()
 93 | 
 94 | print(str_c("Using ", arrow::cpu_count(), " threads"))
 95 | 
 96 | results <- dplyr::bind_rows(do_benchmark(1), do_benchmark(2))
 97 | print(results)
 98 | write.csv(results, str_c("r_read_results_", arrow::cpu_count(), ".csv"))
 99 | 
100 | write_results <- dplyr::bind_rows(do_write_benchmark(1), do_write_benchmark(2))
101 | print(write_results)
102 | write.csv(write_results, str_c("r_write_results_", arrow::cpu_count(), ".csv"))
103 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/benchmark.py:
--------------------------------------------------------------------------------
  1 | # flake8: noqa
  2 | 
  3 | import pyarrow.feather as feather
  4 | import pandas as pd
  5 | import json
  6 | import numpy as np
  7 | import pyarrow as pa
  8 | import pyarrow.parquet as pq
  9 | from pandas.util.testing import rands
 10 | import gc
 11 | import time
 12 | 
 13 | 
 14 | def get_timing(f, niter):
 15 |     start = time.clock_gettime(time.CLOCK_REALTIME)
 16 |     for i in range(niter):
 17 |         f()
 18 |     result = (time.clock_gettime(time.CLOCK_REALTIME) - start) / niter
 19 |     return result
 20 | 
 21 | 
 22 | class Benchmarker:
 23 | 
 24 |     def __init__(self, file_info):
 25 |         self.base = file_info['base']
 26 |         (self.csv_path,
 27 |          self.sep,
 28 |          self.header) = unpack(file_info['source'], 'path', 'sep', 'header')
 29 | 
 30 |         self.parquet_unc_path = '{}_uncompressed.parquet'.format(self.base)
 31 |         self.parquet_snappy_path = '{}_snappy.parquet'.format(self.base)
 32 |         self.feather_unc_path = '{}_uncompressed.feather'.format(self.base)
 33 |         self.feather_lz4_path = '{}_lz4.feather'.format(self.base)
 34 |         self.feather_zstd_path = '{}_zstd.feather'.format(self.base)
 35 | 
 36 |     def bench_read(self, niter=5):
 37 |         cases = [
 38 |             ('parquet (UNC)', 'arrow Table',
 39 |              lambda: pq.read_table(self.parquet_unc_path, memory_map=False)),
 40 |             ('parquet (UNC)', 'pandas',
 41 |              lambda: (pq.read_table(self.parquet_unc_path, memory_map=False)
 42 |                       .to_pandas())),
 43 |             ('parquet (SNAPPY)', 'arrow Table',
 44 |              lambda: pq.read_table(self.parquet_snappy_path,
 45 |                                    memory_map=False)),
 46 |             ('parquet (SNAPPY)', 'pandas',
 47 |              lambda: (pq.read_table(self.parquet_snappy_path, memory_map=False)
 48 |                       .to_pandas())),
 49 |             ('feather V2 (UNC)', 'pandas',
 50 |              lambda: feather.read_feather(self.feather_unc_path,
 51 |                                           memory_map=False)),
 52 |             ('feather V2 (LZ4)', 'pandas',
 53 |              lambda: feather.read_feather(self.feather_lz4_path,
 54 |                                           memory_map=False)),
 55 |             ('feather V2 (ZSTD)', 'pandas',
 56 |              lambda: feather.read_feather(self.feather_zstd_path,
 57 |                                           memory_map=False)),
 58 |             ('feather V2 (UNC)', 'arrow Table',
 59 |              lambda: feather.read_table(self.feather_unc_path,
 60 |                                         memory_map=False)),
 61 |             ('feather V2 (LZ4)', 'arrow Table',
 62 |              lambda: feather.read_table(self.feather_lz4_path,
 63 |                                         memory_map=False)),
 64 |             ('feather V2 (ZSTD)', 'arrow Table',
 65 |              lambda: feather.read_table(self.feather_zstd_path,
 66 |                                         memory_map=False)),
 67 |         ]
 68 | 
 69 |         return self._bench_cases(cases, niter)
 70 | 
 71 |     def bench_write(self, niter=2):
 72 |         print("Reading text file: {}".format(self.csv_path))
 73 |         df = pd.read_csv(self.csv_path, sep=self.sep, header=self.header,
 74 |                          low_memory=False)
 75 |         if self.header is None:
 76 |             df.columns = ['f{}'.format(i) for i in range(len(df.columns))]
 77 | 
 78 |         def _get_table(df):
 79 |             return (pa.Table.from_pandas(df, preserve_index=False)
 80 |                     .replace_schema_metadata(None))
 81 | 
 82 |         t = _get_table(df)
 83 | 
 84 |         cases = [
 85 |             ('parquet (UNC)', 'arrow Table',
 86 |              lambda: pq.write_table(t, self.parquet_unc_path,
 87 |                                     compression='NONE')),
 88 |             ('parquet (UNC)', 'pandas',
 89 |              lambda: pq.write_table(_get_table(df), self.parquet_unc_path,
 90 |                                     compression='NONE')),
 91 |             ('parquet (SNAPPY)', 'arrow Table',
 92 |              lambda: pq.write_table(t, self.parquet_snappy_path)),
 93 |             ('parquet (SNAPPY)', 'pandas',
 94 |              lambda: pq.write_table(_get_table(df), self.parquet_snappy_path)),
 95 |             ('feather V2 (UNC)', 'pandas',
 96 |              lambda: feather.write_feather(df, self.feather_unc_path,
 97 |                                            compression='uncompressed')),
 98 |             ('feather V2 (UNC)', 'arrow Table',
 99 |              lambda: feather.write_feather(t, self.feather_unc_path,
100 |                                            compression='uncompressed')),
101 |             ('feather V2 (LZ4)', 'pandas',
102 |              lambda: feather.write_feather(df, self.feather_lz4_path,
103 |                                            compression='lz4')),
104 |             ('feather V2 (LZ4)', 'arrow Table',
105 |              lambda: feather.write_feather(t, self.feather_lz4_path,
106 |                                            compression='lz4')),
107 |             ('feather V2 (ZSTD)', 'pandas',
108 |              lambda: feather.write_feather(df, self.feather_zstd_path,
109 |                                            compression='zstd')),
110 |             ('feather V2 (ZSTD)', 'arrow Table',
111 |              lambda: feather.write_feather(t, self.feather_zstd_path,
112 |                                            compression='zstd'))
113 |         ]
114 | 
115 |         return self._bench_cases(cases, niter)
116 | 
117 |     def _bench_cases(self, cases, niter):
118 |         results = []
119 |         for name, output_type, f in cases:
120 |             print(name)
121 |             result = (name, output_type, get_timing(f, niter))
122 |             print(result)
123 |             results.append(result)
124 |         return pd.DataFrame.from_records(results,
125 |                                          columns=['expr', 'output_type',
126 |                                                   'mean'])
127 | 
128 | 
def unpack(d, *fields):
    """Lazily yield ``d[name]`` for each name in ``fields``, in order.

    Intended for tuple-unpacking several entries of a mapping at once; a
    missing key raises ``KeyError`` when the result is consumed.
    """
    for name in fields:
        yield d[name]
131 | 
132 | 
133 | 
# Benchmark inputs keyed by dataset name. 'base' is the stem of every file the
# benchmarks read or write; 'source' holds the pandas.read_csv settings for the
# delimited text file the data originates from.
files = dict(
    fanniemae=dict(
        base='2016Q4',
        source=dict(
            path='data/2016Q4.csv',
            sep='|',
            header=None,
        ),
    ),
    nyctaxi=dict(
        base='yellow_tripdata_2010-01',
        source=dict(
            path='data/yellow_tripdata_2010-01.csv',
            sep=',',
            header=0,
        ),
    ),
)
152 | 
153 | 
def run_benchmarks(num_threads, what='read'):
    """Run the read or write benchmarks for every dataset in ``files``.

    The Arrow CPU count is set to ``num_threads`` first. Returns one
    DataFrame holding the per-case timings of all datasets, tagged with a
    ``dataset`` column. Raises ``ValueError`` when ``what`` is neither
    ``'read'`` nor ``'write'``.
    """
    pa.set_cpu_count(num_threads)

    per_dataset = []
    for dataset, info in files.items():
        benchmarker = Benchmarker(info)
        if what not in ('read', 'write'):
            raise ValueError(what)

        if what == 'read':
            print("Benchmarking reads")
            timings = benchmarker.bench_read()
        else:
            print("Benchmarking writes")
            timings = benchmarker.bench_write()

        timings['dataset'] = dataset
        per_dataset.append(timings)

    print(per_dataset)
    return pd.concat(per_dataset, ignore_index=True)
173 | 
174 | 
175 | 
# Disabled read loop (presumably a warm-up or ad-hoc check -- TODO confirm):
# for i in range(5):
#     pq.read_table('yellow_tripdata_2010-01.parquet').to_pandas()

# Disabled call; write_files is not defined in this file.
# write_files(files)

# Arrow CPU counts to benchmark; a write CSV and a read CSV are produced for
# each entry.
num_threads_cases = [1, 4]

for nthreads in num_threads_cases:
    # Writes must run before reads: bench_write creates the Parquet/Feather
    # files that bench_read subsequently loads.
    write_results = run_benchmarks(nthreads, what='write')
    write_results.to_csv('py_write_results_{}.csv'.format(nthreads))

    read_results = run_benchmarks(nthreads, what='read')
    read_results.to_csv('py_read_results_{}.csv'.format(nthreads))

# Disabled earlier driver loop:
# for nthreads in num_threads_cases:
#     run_benchmarks(nthreads)

# Timings pasted from some earlier run (NOTE(review): provenance unknown):
# ('pyarrow.parquet', 1.5470361709594727)
# ('pyarrow.parquet-pandas', 2.925654172897339)
# ('pyarrow.feather', 1.6384665012359618)
196 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/file_sizes.csv:
--------------------------------------------------------------------------------
 1 | dataset,file_type,size
 2 | fanniemae,feather V1,4812.5457763671875
 3 | fanniemae,feather V2 (UNC),4812.015695571899
 4 | fanniemae,feather V2 (LZ4),608.8575687408447
 5 | fanniemae,feather V2 (ZSTD),499.76035499572754
 6 | fanniemae,parquet (UNC),372.0420684814453
 7 | fanniemae,parquet (SNAPPY),136.54194450378418
 8 | fanniemae,fst (UNC),5033.68958568573
 9 | fanniemae,fst (C=50),766.0145416259766
10 | fanniemae,RDS (C),114.32603359222412
11 | fanniemae,RDS (UNC),5682.447074890137
12 | nyctaxi,feather V1,2389.4743881225586
13 | nyctaxi,feather V2 (UNC),2389.72052192688
14 | nyctaxi,feather V2 (LZ4),1120.5288562774658
15 | nyctaxi,feather V2 (ZSTD),783.8803653717041
16 | nyctaxi,parquet (UNC),1188.3576135635376
17 | nyctaxi,parquet (SNAPPY),719.5741958618164
18 | nyctaxi,fst (UNC),2412.6597032546997
19 | nyctaxi,fst (C=50),1199.6639070510864
20 | nyctaxi,RDS (C),541.6701745986938
21 | nyctaxi,RDS (UNC),2671.2057056427
22 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/generate_results.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Make sure we're using performance CPU governor
 4 | sudo cpufreq-set -g performance
 5 | 
 6 | python benchmark.py
 7 | 
 8 | OMP_NUM_THREADS=1 Rscript benchmark.R
 9 | OMP_NUM_THREADS=4 Rscript benchmark.R
10 | 
11 | python glue_results.py
12 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/glue_results.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import pandas as pd
 3 | 
 4 | 
 5 | def munge_results(kind='read'):
 6 |     pieces = []
 7 |     for num_threads in (1, 4):
 8 |         expr_rename = {
 9 |             'parquet_unc': 'parquet (UNC)',
10 |             'parquet_snappy': 'parquet (SNAPPY)',
11 |             'feather_v1': 'feather V1',
12 |             'feather_unc': 'feather V2 (UNC)',
13 |             'feather_lz4': 'feather V2 (LZ4)',
14 |             'feather_zstd': 'feather V2 (ZSTD)',
15 |             'fst_unc': 'fst (UNC)',
16 |             'fst_50': 'fst (c=50)',
17 |             'rds_unc': 'RDS (UNC)',
18 |             'rds_compressed': 'RDS (C)',
19 |             'pyarrow.parquet': 'parquet (SNAPPY)',
20 |             'pyarrow.feather (UNC)': 'feather V2 (UNC)',
21 |             'pyarrow.feather (LZ4)': 'feather V2 (LZ4)',
22 |             'pyarrow.feather (ZSTD)': 'feather V2 (ZSTD)',
23 |         }
24 | 
25 |         r_results = pd.read_csv('r_{}_results_{}.csv'.format(kind,
26 |                                                              num_threads))
27 |         r_results = r_results[['expr', 'time', 'dataset']]
28 |         r_results['output_type'] = "R data.frame"
29 |         r_results['expr'] = r_results['expr']
30 |         r_results['time'] /= 1e9
31 |         r_results['nthreads'] = num_threads
32 |         r_results['language'] = 'R'
33 | 
34 |         r_results.expr = r_results.expr.map(lambda x: expr_rename.get(x, x))
35 | 
36 |         py_results = pd.read_csv('py_{}_results_{}.csv'.format(kind,
37 |                                                                num_threads))
38 |         py_results = py_results[['expr', 'output_type', 'mean', 'dataset']]
39 |         py_results['time'] = py_results.pop('mean')
40 |         py_results['nthreads'] = num_threads
41 |         py_results['language'] = 'Python'
42 | 
43 |         py_results.expr = py_results.expr.map(lambda x: expr_rename.get(x, x))
44 | 
45 |         renamings = {
46 |             'pyarrow.Table': 'arrow Table',
47 |         }
48 | 
49 |         py_results.output_type = py_results.output_type.map(
50 |             lambda x: renamings.get(x, x))
51 | 
52 |         pieces.extend([r_results, py_results])
53 |     return pd.concat(pieces, ignore_index=True, sort=False)
54 | 
55 | 
# Merge the per-language timing files into one CSV per benchmark kind.
read_results = munge_results('read')
read_results.to_csv('all_read_results.csv', index=False)

write_results = munge_results('write')
write_results.to_csv('all_write_results.csv', index=False)


# (dataset name, file-name stem) of every benchmarked dataset.
files = [('fanniemae', '2016Q4'),
         ('nyctaxi', 'yellow_tripdata_2010-01')]

# (storage label, file-name suffix) of every artifact whose size is recorded.
cases = [
    ('feather V1', '_v1.feather'),
    ('feather V2 (UNC)', '_uncompressed.feather'),
    ('feather V2 (LZ4)', '_lz4.feather'),
    ('feather V2 (ZSTD)', '_zstd.feather'),
    ('parquet (UNC)', '_uncompressed.parquet'),
    ('parquet (SNAPPY)', '_snappy.parquet'),
    ('fst (UNC)', '_0.fst'),
    ('fst (C=50)', '_50.fst'),
    ('RDS (C)', '_compressed.rds'),
    ('RDS (UNC)', '_uncompressed.rds')
]

size_records = []

for logical_name, file_base in files:
    for storage, ending in cases:
        size_in_bytes = os.path.getsize(file_base + ending)
        # Sizes are stored in MiB (bytes / 2**20).
        record = (logical_name, storage, size_in_bytes / 2 ** 20)
        print(record)
        size_records.append(record)

file_sizes = pd.DataFrame.from_records(
    size_records, columns=['dataset', 'file_type', 'size'])

file_sizes.to_csv('file_sizes.csv', index=False)
94 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-1/all_results.csv:
--------------------------------------------------------------------------------
 1 | expr,time,dataset,output_type
 2 | R rds,24.258071124,fanniemae,R data.frame
 3 | R csv_fread,11.890215172200001,fanniemae,R data.frame
 4 | R feather_old,5.2771154078,fanniemae,R data.frame
 5 | R fst,4.2202414134,fanniemae,R data.frame
 6 | R feather_arrow,2.9834087618000003,fanniemae,R data.frame
 7 | R parquet,7.969573458,fanniemae,R data.frame
 8 | R rds,21.7943077156,nyctaxi,R data.frame
 9 | R csv_fread,21.743098532599998,nyctaxi,R data.frame
10 | R feather_old,13.1421169332,nyctaxi,R data.frame
11 | R fst,13.226063631799999,nyctaxi,R data.frame
12 | R feather_arrow,11.358103880200002,nyctaxi,R data.frame
13 | R parquet,13.9190224234,nyctaxi,R data.frame
14 | pyarrow.parquet,5.198975515365602,fanniemae,arrow Table
15 | pyarrow.parquet-pandas,7.051469707489014,fanniemae,pandas
16 | pyarrow.feather,1.979597759246826,fanniemae,pandas
17 | pyarrow.parquet,2.888606691360473,nyctaxi,arrow Table
18 | pyarrow.parquet-pandas,9.884848737716675,nyctaxi,pandas
19 | pyarrow.feather,6.670159721374513,nyctaxi,pandas
20 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-1/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ursa-labs/notebooks/fef623e363546b76333a58d31dd58805248a596d/20190919file_benchmarks/i9-9880H-1/plot.png


--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-1/py_results.csv:
--------------------------------------------------------------------------------
1 | ,expr,output_type,mean,dataset
2 | 0,pyarrow.parquet,arrow Table,5.198975515365601,fanniemae
3 | 1,pyarrow.parquet-pandas,pandas,7.051469707489014,fanniemae
4 | 2,pyarrow.feather,pandas,1.9795977592468261,fanniemae
5 | 3,pyarrow.parquet,arrow Table,2.8886066913604735,nyctaxi
6 | 4,pyarrow.parquet-pandas,pandas,9.884848737716675,nyctaxi
7 | 5,pyarrow.feather,pandas,6.670159721374512,nyctaxi
8 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-1/r_results.csv:
--------------------------------------------------------------------------------
 1 | "","expr","time","dataset"
 2 | "1","rds",24258071124,"fanniemae"
 3 | "2","csv_fread",11890215172.2,"fanniemae"
 4 | "3","feather_old",5277115407.8,"fanniemae"
 5 | "4","fst",4220241413.4,"fanniemae"
 6 | "5","feather_arrow",2983408761.8,"fanniemae"
 7 | "6","parquet",7969573458,"fanniemae"
 8 | "7","rds",21794307715.6,"nyctaxi"
 9 | "8","csv_fread",21743098532.6,"nyctaxi"
10 | "9","feather_old",13142116933.2,"nyctaxi"
11 | "10","fst",13226063631.8,"nyctaxi"
12 | "11","feather_arrow",11358103880.2,"nyctaxi"
13 | "12","parquet",13919022423.4,"nyctaxi"
14 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-4/all_results.csv:
--------------------------------------------------------------------------------
 1 | expr,time,dataset,output_type
 2 | R rds,24.7440853608,fanniemae,R data.frame
 3 | R csv_fread,5.4296275238,fanniemae,R data.frame
 4 | R feather_old,5.2222283298,fanniemae,R data.frame
 5 | R fst,3.3813570264,fanniemae,R data.frame
 6 | R feather_arrow,2.9662292186,fanniemae,R data.frame
 7 | R parquet,4.6544630666,fanniemae,R data.frame
 8 | R rds,22.135398477200003,nyctaxi,R data.frame
 9 | R csv_fread,17.687647606,nyctaxi,R data.frame
10 | R feather_old,11.989569364200001,nyctaxi,R data.frame
11 | R fst,12.0112101424,nyctaxi,R data.frame
12 | R feather_arrow,11.617949409200001,nyctaxi,R data.frame
13 | R parquet,13.0886089094,nyctaxi,R data.frame
14 | pyarrow.parquet,2.1267578125,fanniemae,arrow Table
15 | pyarrow.parquet-pandas,3.518295383453369,fanniemae,pandas
16 | pyarrow.feather,1.6831360816955567,fanniemae,pandas
17 | pyarrow.parquet,1.1050359725952148,nyctaxi,arrow Table
18 | pyarrow.parquet-pandas,7.481458854675293,nyctaxi,pandas
19 | pyarrow.feather,6.5046766757965075,nyctaxi,pandas
20 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-4/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ursa-labs/notebooks/fef623e363546b76333a58d31dd58805248a596d/20190919file_benchmarks/i9-9880H-4/plot.png


--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-4/py_results.csv:
--------------------------------------------------------------------------------
1 | ,expr,output_type,mean,dataset
2 | 0,pyarrow.parquet,arrow Table,2.1267578125,fanniemae
3 | 1,pyarrow.parquet-pandas,pandas,3.5182953834533692,fanniemae
4 | 2,pyarrow.feather,pandas,1.6831360816955567,fanniemae
5 | 3,pyarrow.parquet,arrow Table,1.1050359725952148,nyctaxi
6 | 4,pyarrow.parquet-pandas,pandas,7.481458854675293,nyctaxi
7 | 5,pyarrow.feather,pandas,6.504676675796508,nyctaxi
8 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-4/r_results.csv:
--------------------------------------------------------------------------------
 1 | "","expr","time","dataset"
 2 | "1","rds",24744085360.8,"fanniemae"
 3 | "2","csv_fread",5429627523.8,"fanniemae"
 4 | "3","feather_old",5222228329.8,"fanniemae"
 5 | "4","fst",3381357026.4,"fanniemae"
 6 | "5","feather_arrow",2966229218.6,"fanniemae"
 7 | "6","parquet",4654463066.6,"fanniemae"
 8 | "7","rds",22135398477.2,"nyctaxi"
 9 | "8","csv_fread",17687647606,"nyctaxi"
10 | "9","feather_old",11989569364.2,"nyctaxi"
11 | "10","fst",12011210142.4,"nyctaxi"
12 | "11","feather_arrow",11617949409.2,"nyctaxi"
13 | "12","parquet",13088608909.4,"nyctaxi"
14 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-8/all_results.csv:
--------------------------------------------------------------------------------
 1 | expr,time,dataset,output_type
 2 | R rds,25.963613643200002,fanniemae,R data.frame
 3 | R csv_fread,5.125688092600001,fanniemae,R data.frame
 4 | R feather_old,5.8978863578,fanniemae,R data.frame
 5 | R fst,3.6207146728,fanniemae,R data.frame
 6 | R feather_arrow,3.285127359,fanniemae,R data.frame
 7 | R parquet,4.608878230399999,fanniemae,R data.frame
 8 | R rds,22.5701864218,nyctaxi,R data.frame
 9 | R csv_fread,17.681116847200002,nyctaxi,R data.frame
10 | R feather_old,13.7390440426,nyctaxi,R data.frame
11 | R fst,13.188127108200002,nyctaxi,R data.frame
12 | R feather_arrow,12.2201220736,nyctaxi,R data.frame
13 | R parquet,12.6165632024,nyctaxi,R data.frame
14 | pyarrow.parquet,1.6406285285949709,fanniemae,arrow Table
15 | pyarrow.parquet-pandas,3.035256814956665,fanniemae,pandas
16 | pyarrow.feather,1.6025235176086423,fanniemae,pandas
17 | pyarrow.parquet,0.9039567470550536,nyctaxi,arrow Table
18 | pyarrow.parquet-pandas,6.945300436019897,nyctaxi,pandas
19 | pyarrow.feather,6.311788606643678,nyctaxi,pandas
20 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-8/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ursa-labs/notebooks/fef623e363546b76333a58d31dd58805248a596d/20190919file_benchmarks/i9-9880H-8/plot.png


--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-8/py_results.csv:
--------------------------------------------------------------------------------
1 | ,expr,output_type,mean,dataset
2 | 0,pyarrow.parquet,arrow Table,1.6406285285949707,fanniemae
3 | 1,pyarrow.parquet-pandas,pandas,3.035256814956665,fanniemae
4 | 2,pyarrow.feather,pandas,1.6025235176086425,fanniemae
5 | 3,pyarrow.parquet,arrow Table,0.9039567470550537,nyctaxi
6 | 4,pyarrow.parquet-pandas,pandas,6.945300436019897,nyctaxi
7 | 5,pyarrow.feather,pandas,6.311788606643677,nyctaxi
8 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-8/r_results.csv:
--------------------------------------------------------------------------------
 1 | "","expr","time","dataset"
 2 | "1","rds",25963613643.2,"fanniemae"
 3 | "2","csv_fread",5125688092.6,"fanniemae"
 4 | "3","feather_old",5897886357.8,"fanniemae"
 5 | "4","fst",3620714672.8,"fanniemae"
 6 | "5","feather_arrow",3285127359,"fanniemae"
 7 | "6","parquet",4608878230.4,"fanniemae"
 8 | "7","rds",22570186421.8,"nyctaxi"
 9 | "8","csv_fread",17681116847.2,"nyctaxi"
10 | "9","feather_old",13739044042.6,"nyctaxi"
11 | "10","fst",13188127108.2,"nyctaxi"
12 | "11","feather_arrow",12220122073.6,"nyctaxi"
13 | "12","parquet",12616563202.4,"nyctaxi"
14 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/make_feather_plots.R:
--------------------------------------------------------------------------------
 1 | library(ggplot2)
 2 | 
 3 | # install.packages("stringi")
 4 | 
# NOTE(review): hard-coded to one checkout location; adjust before running
# from anywhere else.
setwd("~/code/notebooks/20190919file_benchmarks/")

# Inputs. The plots below use the dataset, codec and chunksize columns plus
# file_size / write_time (writes) and read_time (reads).
reads <- read.csv("ipc_read_parallel.csv")
writes <- read.csv("ipc_write_parallel.csv")

# Bare expression: displays the write results when the script is run.
writes

# Each chart below has one panel per dataset and horizontal bars (coord_flip)
# grouped by codec, with one bar per chunk size. ggsave() is called without a
# plot argument, so it saves the plot created immediately before it.

# file size
ggplot(writes, aes(fill=factor(chunksize), y=file_size, x=codec)) +
  facet_grid(rows=vars(dataset)) +
  geom_bar(position="dodge", stat="identity") +
  coord_flip()

ggsave("ipc_file_size.png", width=10, height=4)

# write time
ggplot(writes, aes(fill=factor(chunksize), y=write_time, x=codec)) +
  facet_grid(rows=vars(dataset)) +
  geom_bar(position="dodge", stat="identity") +
  coord_flip()

ggsave("ipc_write_time.png", width=10, height=4)

# read time
ggplot(reads, aes(fill=factor(chunksize), y=read_time, x=codec)) +
  facet_grid(rows=vars(dataset)) +
  geom_bar(position="dodge", stat="identity") +
  coord_flip()

ggsave("ipc_read_time.png", width=10, height=4)


--------------------------------------------------------------------------------
/20190919file_benchmarks/make_plots.R:
--------------------------------------------------------------------------------
  1 | library(ggplot2)
  2 | library(dplyr)
  3 | 
  4 | 
  5 | read_results <- read.csv("all_read_results.csv")
  6 | write_results <- read.csv("all_write_results.csv")
  7 | # Add a row for the Fannie Mae CSV file size, as reported in the previous post
  8 | file_sizes <- rbind(
  9 |   read.csv("file_sizes.csv", stringsAsFactors = FALSE),
 10 |   data.frame(
 11 |     dataset = "fanniemae",
 12 |     file_type = "CSV",
 13 |     size = 1.52*1024,
 14 |     stringsAsFactors = FALSE
 15 |   )
 16 | )
 17 | 
 18 | # Color mapping
 19 | cols <- c(
 20 |   "feather V1" = "steelblue",
 21 |   "feather V2 (UNC)" = "steelblue",
 22 |   "feather V2 (LZ4)" = "steelblue",
 23 |   "feather V2 (ZSTD)" = "steelblue",
 24 |   "parquet (SNAPPY)" = "steelblue1",
 25 |   "parquet (UNC)" = "steelblue1",
 26 |   "fst (C=50)" = "wheat4",
 27 |   "fst (UNC)" = "wheat4",
 28 |   "RDS (C)" = "gray",
 29 |   "RDS (UNC)" = "gray",
 30 |   "csv_fread" = "wheat3",
 31 |   "CSV" = "wheat3"
 32 | )
 33 | 
 34 | # This is ugly but it makes the graph labels prettier
 35 | munge_labels <- function (x) {
 36 |   sub("csv_fread", "CSV (data.table::fread)",
 37 |     sub("UNC", "Uncompressed",
 38 |       sub("feather", "Feather",
 39 |         sub("parquet", "Parquet",
 40 |           sub("[Cc]=", "ZSTD, ",
 41 |             sub("ZSTD", "ZSTD, 1",
 42 |               sub("\\(C\\)", "(GZIP)",
 43 |                 sub("V1", "V1 (Uncompressed)",
 44 |                   x))))))))
 45 | }
 46 | names(cols) <- munge_labels(names(cols))
 47 | 
 48 | fix_formats <- function(x) {
 49 |   # This applies the pretty names and reorders the factor levels so that
 50 |   # they print in the order we want
 51 |   levels(x) <- munge_labels(levels(x))
 52 |   factor(x, levels = rev(c(
 53 |       "Feather V1 (Uncompressed)",
 54 |       "Feather V2 (Uncompressed)",
 55 |       "Feather V2 (LZ4)",
 56 |       "Feather V2 (ZSTD, 1)",
 57 |       "Parquet (Uncompressed)",
 58 |       "Parquet (SNAPPY)",
 59 |       "RDS (Uncompressed)",
 60 |       "RDS (GZIP)",
 61 |       "CSV",
 62 |       "CSV (data.table::fread)",
 63 |       "fst (Uncompressed)",
 64 |       "fst (ZSTD, 50)"
 65 |     ))
 66 |   )
 67 | }
 68 | 
 69 | benchmark_plot <- function(data) {
 70 |   # Since we do the same thing for most of the graphs, collect plotting logic here
 71 |   ggplot(data, aes(y=time, fill=expr, x=expr)) +
 72 |     facet_wrap(vars(output_type), ncol=1) +
 73 |     geom_col(position="dodge") +
 74 |     theme_minimal() +
 75 |     scale_fill_manual(values = cols) +
 76 |     coord_flip() +
 77 |     theme(
 78 |       legend.position = "none",
 79 |       panel.grid.major.y = element_blank()
 80 |     )
 81 | }
 82 | 
 83 | 
# Every ggsave() below is called without a plot argument, so it saves the
# plot created immediately before it; keep each plot/ggsave pair together.

### Reading
read_results$expr <- fix_formats(read_results$expr)
read_results$Threads <- factor(read_results$nthreads)
# All
# Grid of output type (rows) by dataset (columns), bars coloured by threads.
ggplot(read_results, aes(fill=Threads, y=time, x=expr)) +
  facet_grid(rows=vars(output_type), col=vars(dataset)) +
  geom_bar(position="dodge", stat="identity") +
  coord_flip() +
  theme_minimal() +
  theme(legend.position = "right") +
  labs(x = "Format", y = "Time to read (s)", title = "")
ggsave("20200414_read_full.png", width=10, height=6)

# Python and Arrow only
# Restricted to the 4-thread Fannie Mae runs; axis fixed at 0-10 s.
read_results %>%
  filter(nthreads == 4 & dataset == "fanniemae" & language == "Python") %>%
  benchmark_plot() +
  scale_y_continuous(breaks = seq(0, 10, 2), limits = c(0, 10)) +
  labs(x = "", y = "Time to read (s)", title = "")
ggsave("20200414_read_py.png", width=10, height=3)

# R (and drop RDS because it is out of range)
read_results %>%
  filter(nthreads == 4 & dataset == "fanniemae" & language == "R" & !grepl("^RDS", expr)) %>%
  benchmark_plot() +
  scale_y_continuous(breaks = seq(0, 10, 2), limits = c(0, 10)) +
  labs(x = "", y = "Time to read (s)", title = "")
ggsave("20200414_read_r.png", width=10, height=3)

### Writing
write_results$expr <- fix_formats(write_results$expr)
write_results$Threads <- factor(write_results$nthreads)
# All
ggplot(write_results, aes(fill=Threads, y=time, x=expr)) +
  facet_grid(rows=vars(output_type), col=vars(dataset)) +
  geom_bar(position="dodge", stat="identity") +
  coord_flip() +
  labs(x = "Format", y = "Time (s)", title = "Write speeds")
ggsave("20200414_write_full.png", width=10, height=6)

# Python and Arrow only
# Same filter as the read chart; axis fixed at 0-12 s.
write_results %>%
  filter(nthreads == 4 & dataset == "fanniemae" & language == "Python") %>%
  benchmark_plot() +
  scale_y_continuous(breaks = seq(0, 12, 2), limits = c(0, 12)) +
  labs(x = "", y = "Time to write (s)", title = "")
ggsave("20200414_write_py.png", width=10, height=3)

# R
# RDS rows are excluded here as well.
write_results %>%
  filter(nthreads == 4 & dataset == "fanniemae" & language == "R" & !grepl("^RDS", expr)) %>%
  benchmark_plot() +
  scale_y_continuous(breaks = seq(0, 12, 2), limits = c(0, 12)) +
  labs(x = "", y = "Time to write (s)", title = "")
ggsave("20200414_write_r.png", width=10, height=3)

### File sizes
# file_type was read as character (stringsAsFactors = FALSE), hence as.factor().
file_sizes$file_type <- fix_formats(as.factor(file_sizes$file_type))
# Fannie Mae only; size is divided by 1024 to match the GB axis label.
ggplot(file_sizes[file_sizes$dataset == "fanniemae",], aes(y=size/1024, file_type, fill = file_type)) +
  geom_col(position="dodge") +
  theme_minimal() +
  scale_fill_manual(values = cols) +
  coord_flip() +
  theme(
    legend.position = "none",
    panel.grid.major.y = element_blank()
  ) +
  labs(y = "File size (GB)", x = "", title = "")
ggsave("20200414_file_sizes.png", width=10, height=3)
153 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/py_read_results_1.csv:
--------------------------------------------------------------------------------
 1 | ,expr,output_type,mean,dataset
 2 | 0,parquet (UNC),arrow Table,6.126083183288574,fanniemae
 3 | 1,parquet (UNC),pandas,9.3643874168396,fanniemae
 4 | 2,parquet (SNAPPY),arrow Table,6.056532478332519,fanniemae
 5 | 3,parquet (SNAPPY),pandas,9.177780771255494,fanniemae
 6 | 4,feather V2 (UNC),pandas,4.354116058349609,fanniemae
 7 | 5,feather V2 (LZ4),pandas,4.396533584594726,fanniemae
 8 | 6,feather V2 (ZSTD),pandas,5.775776481628418,fanniemae
 9 | 7,feather V2 (UNC),arrow Table,1.0860649585723876,fanniemae
10 | 8,feather V2 (LZ4),arrow Table,1.0962132453918456,fanniemae
11 | 9,feather V2 (ZSTD),arrow Table,2.5313239097595215,fanniemae
12 | 10,parquet (UNC),arrow Table,2.2780594348907472,nyctaxi
13 | 11,parquet (UNC),pandas,9.22245388031006,nyctaxi
14 | 12,parquet (SNAPPY),arrow Table,2.8247000694274904,nyctaxi
15 | 13,parquet (SNAPPY),pandas,9.735122680664062,nyctaxi
16 | 14,feather V2 (UNC),pandas,7.608278465270996,nyctaxi
17 | 15,feather V2 (LZ4),pandas,7.784061861038208,nyctaxi
18 | 16,feather V2 (ZSTD),pandas,9.633673095703125,nyctaxi
19 | 17,feather V2 (UNC),arrow Table,0.5403317451477051,nyctaxi
20 | 18,feather V2 (LZ4),arrow Table,0.9643253803253173,nyctaxi
21 | 19,feather V2 (ZSTD),arrow Table,2.7800182342529296,nyctaxi
22 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/py_read_results_4.csv:
--------------------------------------------------------------------------------
 1 | ,expr,output_type,mean,dataset
 2 | 0,parquet (UNC),arrow Table,1.841284704208374,fanniemae
 3 | 1,parquet (UNC),pandas,4.088014888763428,fanniemae
 4 | 2,parquet (SNAPPY),arrow Table,1.8786502361297608,fanniemae
 5 | 3,parquet (SNAPPY),pandas,4.165652704238892,fanniemae
 6 | 4,feather V2 (UNC),pandas,3.5610058307647705,fanniemae
 7 | 5,feather V2 (LZ4),pandas,2.778682994842529,fanniemae
 8 | 6,feather V2 (ZSTD),pandas,3.0616337299346923,fanniemae
 9 | 7,feather V2 (UNC),arrow Table,1.1269856452941895,fanniemae
10 | 8,feather V2 (LZ4),arrow Table,0.48981823921203616,fanniemae
11 | 9,feather V2 (ZSTD),arrow Table,0.8093690395355224,fanniemae
12 | 10,parquet (UNC),arrow Table,0.6995339870452881,nyctaxi
13 | 11,parquet (UNC),pandas,7.4361457347869875,nyctaxi
14 | 12,parquet (SNAPPY),arrow Table,0.78084397315979,nyctaxi
15 | 13,parquet (SNAPPY),pandas,7.540273284912109,nyctaxi
16 | 14,feather V2 (UNC),pandas,7.369460582733154,nyctaxi
17 | 15,feather V2 (LZ4),pandas,7.119231033325195,nyctaxi
18 | 16,feather V2 (ZSTD),pandas,7.537483549118042,nyctaxi
19 | 17,feather V2 (UNC),arrow Table,0.6116453170776367,nyctaxi
20 | 18,feather V2 (LZ4),arrow Table,0.4065845012664795,nyctaxi
21 | 19,feather V2 (ZSTD),arrow Table,0.8925417900085449,nyctaxi
22 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/py_write_results_1.csv:
--------------------------------------------------------------------------------
 1 | ,expr,output_type,mean,dataset
 2 | 0,parquet (UNC),arrow Table,6.220219850540161,fanniemae
 3 | 1,parquet (UNC),pandas,12.395264983177185,fanniemae
 4 | 2,parquet (SNAPPY),arrow Table,6.694774866104126,fanniemae
 5 | 3,parquet (SNAPPY),pandas,13.161320447921753,fanniemae
 6 | 4,feather V2 (UNC),pandas,12.677234172821045,fanniemae
 7 | 5,feather V2 (UNC),arrow Table,6.397535443305969,fanniemae
 8 | 6,feather V2 (LZ4),pandas,8.32238781452179,fanniemae
 9 | 7,feather V2 (LZ4),arrow Table,2.2326916456222534,fanniemae
10 | 8,feather V2 (ZSTD),pandas,10.61594545841217,fanniemae
11 | 9,feather V2 (ZSTD),arrow Table,4.308579444885254,fanniemae
12 | 10,parquet (UNC),arrow Table,4.5986950397491455,nyctaxi
13 | 11,parquet (UNC),pandas,9.009780049324036,nyctaxi
14 | 12,parquet (SNAPPY),arrow Table,5.70121443271637,nyctaxi
15 | 13,parquet (SNAPPY),pandas,10.175373315811157,nyctaxi
16 | 14,feather V2 (UNC),pandas,7.1334041357040405,nyctaxi
17 | 15,feather V2 (UNC),arrow Table,3.112175464630127,nyctaxi
18 | 16,feather V2 (LZ4),pandas,7.4143136739730835,nyctaxi
19 | 17,feather V2 (LZ4),arrow Table,3.567118763923645,nyctaxi
20 | 18,feather V2 (ZSTD),pandas,11.283223748207092,nyctaxi
21 | 19,feather V2 (ZSTD),arrow Table,6.928452372550964,nyctaxi
22 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/py_write_results_4.csv:
--------------------------------------------------------------------------------
 1 | ,expr,output_type,mean,dataset
 2 | 0,parquet (UNC),arrow Table,6.162686586380005,fanniemae
 3 | 1,parquet (UNC),pandas,11.565850496292114,fanniemae
 4 | 2,parquet (SNAPPY),arrow Table,6.410535216331482,fanniemae
 5 | 3,parquet (SNAPPY),pandas,11.6298109292984,fanniemae
 6 | 4,feather V2 (UNC),pandas,11.104193806648254,fanniemae
 7 | 5,feather V2 (UNC),arrow Table,5.889622092247009,fanniemae
 8 | 6,feather V2 (LZ4),pandas,6.612253308296204,fanniemae
 9 | 7,feather V2 (LZ4),arrow Table,1.306950330734253,fanniemae
10 | 8,feather V2 (ZSTD),pandas,7.202290296554565,fanniemae
11 | 9,feather V2 (ZSTD),arrow Table,1.8320761919021606,fanniemae
12 | 10,parquet (UNC),arrow Table,4.338123440742493,nyctaxi
13 | 11,parquet (UNC),pandas,8.028993129730225,nyctaxi
14 | 12,parquet (SNAPPY),arrow Table,5.622675895690918,nyctaxi
15 | 13,parquet (SNAPPY),pandas,9.33586835861206,nyctaxi
16 | 14,feather V2 (UNC),pandas,6.233096599578857,nyctaxi
17 | 15,feather V2 (UNC),arrow Table,2.9943872690200806,nyctaxi
18 | 16,feather V2 (LZ4),pandas,5.67785370349884,nyctaxi
19 | 17,feather V2 (LZ4),arrow Table,2.289505124092102,nyctaxi
20 | 18,feather V2 (ZSTD),pandas,6.161942005157471,nyctaxi
21 | 19,feather V2 (ZSTD),arrow Table,2.7954366207122803,nyctaxi
22 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/r_read_results_1.csv:
--------------------------------------------------------------------------------
 1 | "","expr","time","dataset"
 2 | "1","csv_fread",17535751806,"fanniemae"
 3 | "2","fst_unc",5833566950,"fanniemae"
 4 | "3","fst_50",5875382178,"fanniemae"
 5 | "4","feather_v1",10078519502.8,"fanniemae"
 6 | "5","feather_unc",4719815545.2,"fanniemae"
 7 | "6","feather_lz4",5852145495.2,"fanniemae"
 8 | "7","feather_zstd",7779083610,"fanniemae"
 9 | "8","parquet_unc",9493391604.8,"fanniemae"
10 | "9","parquet_snappy",9911315661.2,"fanniemae"
11 | "10","rds_unc",30267019708.2,"fanniemae"
12 | "11","rds_compressed",41482849064.2,"fanniemae"
13 | "12","csv_fread",23370041255,"nyctaxi"
14 | "13","fst_unc",13017416436,"nyctaxi"
15 | "14","fst_50",12634709971.4,"nyctaxi"
16 | "15","feather_v1",13443664009.4,"nyctaxi"
17 | "16","feather_unc",11371430104.2,"nyctaxi"
18 | "17","feather_lz4",13296044630,"nyctaxi"
19 | "18","feather_zstd",14594300772.2,"nyctaxi"
20 | "19","parquet_unc",13158666758.2,"nyctaxi"
21 | "20","parquet_snappy",13958228992,"nyctaxi"
22 | "21","rds_unc",22211784820.2,"nyctaxi"
23 | "22","rds_compressed",30765105346.2,"nyctaxi"
24 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/r_read_results_4.csv:
--------------------------------------------------------------------------------
 1 | "","expr","time","dataset"
 2 | "1","csv_fread",8036938666.6,"fanniemae"
 3 | "2","fst_unc",6341601497.2,"fanniemae"
 4 | "3","fst_50",5054754967.8,"fanniemae"
 5 | "4","feather_v1",9799018014,"fanniemae"
 6 | "5","feather_unc",5054201747.4,"fanniemae"
 7 | "6","feather_lz4",4928118181,"fanniemae"
 8 | "7","feather_zstd",5535553828.6,"fanniemae"
 9 | "8","parquet_unc",6281569166.6,"fanniemae"
10 | "9","parquet_snappy",6392237692.6,"fanniemae"
11 | "10","rds_unc",29892887491.4,"fanniemae"
12 | "11","rds_compressed",41273872293.8,"fanniemae"
13 | "12","csv_fread",18312046954,"nyctaxi"
14 | "13","fst_unc",11969350465.6,"nyctaxi"
15 | "14","fst_50",13439147068.6,"nyctaxi"
16 | "15","feather_v1",12034649945,"nyctaxi"
17 | "16","feather_unc",11023961432.2,"nyctaxi"
18 | "17","feather_lz4",11592801001,"nyctaxi"
19 | "18","feather_zstd",12704684877,"nyctaxi"
20 | "19","parquet_unc",12225668849,"nyctaxi"
21 | "20","parquet_snappy",12004466381.6,"nyctaxi"
22 | "21","rds_unc",21847153904,"nyctaxi"
23 | "22","rds_compressed",30735937022.8,"nyctaxi"
24 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/r_write_results_1.csv:
--------------------------------------------------------------------------------
 1 | "","expr","time","dataset"
 2 | "1","fst_unc",7007023920,"fanniemae"
 3 | "2","fst_50",4385196419,"fanniemae"
 4 | "3","feather_v1",8656647228,"fanniemae"
 5 | "4","feather_unc",10040626659,"fanniemae"
 6 | "5","feather_lz4",10818098194,"fanniemae"
 7 | "6","feather_zstd",11438481575,"fanniemae"
 8 | "7","parquet_unc",10434816898,"fanniemae"
 9 | "8","parquet_snappy",10800951873,"fanniemae"
10 | "9","rds_compressed",76929230341,"fanniemae"
11 | "10","rds_unc",24216423401,"fanniemae"
12 | "11","fst_unc",4087879250,"nyctaxi"
13 | "12","fst_50",3950344461,"nyctaxi"
14 | "13","feather_v1",5972294820,"nyctaxi"
15 | "14","feather_unc",5888590985,"nyctaxi"
16 | "15","feather_lz4",8325439328,"nyctaxi"
17 | "16","feather_zstd",10223231254,"nyctaxi"
18 | "17","parquet_unc",7715640740,"nyctaxi"
19 | "18","parquet_snappy",8585539352,"nyctaxi"
20 | "19","rds_compressed",104898052261,"nyctaxi"
21 | "20","rds_unc",10739751088,"nyctaxi"
22 | 


--------------------------------------------------------------------------------
/20190919file_benchmarks/r_write_results_4.csv:
--------------------------------------------------------------------------------
 1 | "","expr","time","dataset"
 2 | "1","fst_unc",7758567831,"fanniemae"
 3 | "2","fst_50",3700873556,"fanniemae"
 4 | "3","feather_v1",7080591830,"fanniemae"
 5 | "4","feather_unc",10413025112,"fanniemae"
 6 | "5","feather_lz4",10818213516,"fanniemae"
 7 | "6","feather_zstd",11563816777,"fanniemae"
 8 | "7","parquet_unc",10814584911,"fanniemae"
 9 | "8","parquet_snappy",11152511189,"fanniemae"
10 | "9","rds_compressed",78427148110,"fanniemae"
11 | "10","rds_unc",24919762665,"fanniemae"
12 | "11","fst_unc",4399914353,"nyctaxi"
13 | "12","fst_50",3305661431,"nyctaxi"
14 | "13","feather_v1",5477443720,"nyctaxi"
15 | "14","feather_unc",5864371601,"nyctaxi"
16 | "15","feather_lz4",8494803995,"nyctaxi"
17 | "16","feather_zstd",10073068744,"nyctaxi"
18 | "17","parquet_unc",7675560036,"nyctaxi"
19 | "18","parquet_snappy",8428579617,"nyctaxi"
20 | "19","rds_compressed",108234060692,"nyctaxi"
21 | "20","rds_unc",10717121094,"nyctaxi"
22 | 


--------------------------------------------------------------------------------
/20200402pandas_load/.ipynb_checkpoints/Untitled-checkpoint.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pandas as pd\n",
 10 |     "import pyarrow as pa\n",
 11 |     "import numpy as np\n",
 12 |     "\n",
 13 |     "num_rows = 1_000_000\n",
 14 |     "num_columns = 100\n",
 15 |     "arr = np.random.randn(num_rows)\n",
 16 |     "dict_of_numpy_arrays = {\n",
 17 |     "    'f{}'.format(i): arr\n",
 18 |     "    for i in range(num_columns)\n",
 19 |     "}"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 3,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": []
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": 4,
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": []
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 8,
 39 |    "metadata": {},
 40 |    "outputs": [
 41 |     {
 42 |      "name": "stdout",
 43 |      "output_type": "stream",
 44 |      "text": [
 45 |       "85.7 ms ± 4.55 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
 46 |      ]
 47 |     }
 48 |    ],
 49 |    "source": [
 50 |     "%timeit df = pd.DataFrame(dict_of_numpy_arrays)"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 11,
 56 |    "metadata": {},
 57 |    "outputs": [
 58 |     {
 59 |      "name": "stdout",
 60 |      "output_type": "stream",
 61 |      "text": [
 62 |       "49.7 ms ± 4.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
 63 |      ]
 64 |     }
 65 |    ],
 66 |    "source": [
 67 |     "timeit df = pa.table(dict_of_numpy_arrays).to_pandas()"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": 12,
 73 |    "metadata": {},
 74 |    "outputs": [
 75 |     {
 76 |      "name": "stdout",
 77 |      "output_type": "stream",
 78 |      "text": [
 79 |       "66.9 ms ± 4.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
 80 |      ]
 81 |     }
 82 |    ],
 83 |    "source": [
 84 |     "timeit df = pa.table(dict_of_numpy_arrays).to_pandas(use_threads=False)"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": null,
 90 |    "metadata": {},
 91 |    "outputs": [],
 92 |    "source": []
 93 |   }
 94 |  ],
 95 |  "metadata": {
 96 |   "kernelspec": {
 97 |    "display_name": "Python 3",
 98 |    "language": "python",
 99 |    "name": "python3"
100 |   },
101 |   "language_info": {
102 |    "codemirror_mode": {
103 |     "name": "ipython",
104 |     "version": 3
105 |    },
106 |    "file_extension": ".py",
107 |    "mimetype": "text/x-python",
108 |    "name": "python",
109 |    "nbconvert_exporter": "python",
110 |    "pygments_lexer": "ipython3",
111 |    "version": "3.7.6"
112 |   }
113 |  },
114 |  "nbformat": 4,
115 |  "nbformat_minor": 4
116 | }
117 | 


--------------------------------------------------------------------------------
/20200402pandas_load/Untitled.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pandas as pd\n",
 10 |     "import pyarrow as pa\n",
 11 |     "import numpy as np\n",
 12 |     "\n",
 13 |     "num_rows = 1_000_000\n",
 14 |     "num_columns = 100\n",
 15 |     "arr = np.random.randn(num_rows)\n",
 16 |     "dict_of_numpy_arrays = {\n",
 17 |     "    'f{}'.format(i): arr\n",
 18 |     "    for i in range(num_columns)\n",
 19 |     "}"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": 3,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": []
 28 |   },
 29 |   {
 30 |    "cell_type": "code",
 31 |    "execution_count": 4,
 32 |    "metadata": {},
 33 |    "outputs": [],
 34 |    "source": []
 35 |   },
 36 |   {
 37 |    "cell_type": "code",
 38 |    "execution_count": 8,
 39 |    "metadata": {},
 40 |    "outputs": [
 41 |     {
 42 |      "name": "stdout",
 43 |      "output_type": "stream",
 44 |      "text": [
 45 |       "85.7 ms ± 4.55 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
 46 |      ]
 47 |     }
 48 |    ],
 49 |    "source": [
 50 |     "%timeit df = pd.DataFrame(dict_of_numpy_arrays)"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 11,
 56 |    "metadata": {},
 57 |    "outputs": [
 58 |     {
 59 |      "name": "stdout",
 60 |      "output_type": "stream",
 61 |      "text": [
 62 |       "49.7 ms ± 4.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
 63 |      ]
 64 |     }
 65 |    ],
 66 |    "source": [
 67 |     "timeit df = pa.table(dict_of_numpy_arrays).to_pandas()"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": 12,
 73 |    "metadata": {},
 74 |    "outputs": [
 75 |     {
 76 |      "name": "stdout",
 77 |      "output_type": "stream",
 78 |      "text": [
 79 |       "66.9 ms ± 4.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
 80 |      ]
 81 |     }
 82 |    ],
 83 |    "source": [
 84 |     "timeit df = pa.table(dict_of_numpy_arrays).to_pandas(use_threads=False)"
 85 |    ]
 86 |   },
 87 |   {
 88 |    "cell_type": "code",
 89 |    "execution_count": null,
 90 |    "metadata": {},
 91 |    "outputs": [],
 92 |    "source": []
 93 |   }
 94 |  ],
 95 |  "metadata": {
 96 |   "kernelspec": {
 97 |    "display_name": "Python 3",
 98 |    "language": "python",
 99 |    "name": "python3"
100 |   },
101 |   "language_info": {
102 |    "codemirror_mode": {
103 |     "name": "ipython",
104 |     "version": 3
105 |    },
106 |    "file_extension": ".py",
107 |    "mimetype": "text/x-python",
108 |    "name": "python",
109 |    "nbconvert_exporter": "python",
110 |    "pygments_lexer": "ipython3",
111 |    "version": "3.7.6"
112 |   }
113 |  },
114 |  "nbformat": 4,
115 |  "nbformat_minor": 4
116 | }
117 | 


--------------------------------------------------------------------------------
/20200509wideparquet/WideParquet.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import pyarrow as pa\n",
 10 |     "import pyarrow.parquet as pq\n",
 11 |     "import numpy as np\n",
 12 |     "import os\n",
 13 |     "import pandas as pd\n",
 14 |     "import time\n",
 15 |     "\n",
 16 |     "pa.set_cpu_count(8)\n",
 17 |     "\n",
 18 |     "def get_timing(f, niter):\n",
 19 |     "    start = time.clock_gettime(time.CLOCK_REALTIME)\n",
 20 |     "    for i in range(niter):\n",
 21 |     "        f()\n",
 22 |     "    result = (time.clock_gettime(time.CLOCK_REALTIME) - start) / niter\n",
 23 |     "    return result"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 2,
 29 |    "metadata": {},
 30 |    "outputs": [],
 31 |    "source": [
 32 |     "total_num_values = 100_000_000"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "code",
 37 |    "execution_count": 4,
 38 |    "metadata": {},
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "def make_example_table(num_cols):\n",
 42 |     "    num_rows = total_num_values // num_cols\n",
 43 |     "    \n",
 44 |     "    values = np.arange(num_rows)\n",
 45 |     "    \n",
 46 |     "    return pa.table([values] * num_cols, \n",
 47 |     "                    names=['f{}'.format(i) for i in range(num_cols)])\n",
 48 |     "    "
 49 |    ]
 50 |   },
 51 |   {
 52 |    "cell_type": "code",
 53 |    "execution_count": 5,
 54 |    "metadata": {},
 55 |    "outputs": [],
 56 |    "source": [
 57 |     "t1 = make_example_table(100)\n",
 58 |     "t2 = make_example_table(100000)"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 6,
 64 |    "metadata": {},
 65 |    "outputs": [],
 66 |    "source": [
 67 |     "import pyarrow.feather as fth"
 68 |    ]
 69 |   },
 70 |   {
 71 |    "cell_type": "code",
 72 |    "execution_count": 17,
 73 |    "metadata": {},
 74 |    "outputs": [],
 75 |    "source": [
 76 |     "fth.write_feather(t1, 't1.arrow', compression=None)\n",
 77 |     "fth.write_feather(t2, 't2.arrow', compression=None)"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 18,
 83 |    "metadata": {},
 84 |    "outputs": [
 85 |     {
 86 |      "name": "stdout",
 87 |      "output_type": "stream",
 88 |      "text": [
 89 |       "total 2519376\r\n",
 90 |       "-rw------- 1 wesm wesm 800088522 May  9 16:44 t1.arrow\r\n",
 91 |       "-rw------- 1 wesm wesm 815199522 May  9 16:44 t2.arrow\r\n",
 92 |       "-rw------- 1 wesm wesm 964513790 May  9 16:23 test.parquet\r\n",
 93 |       "-rw------- 1 wesm wesm     17815 May  9 16:42 WideParquet.ipynb\r\n"
 94 |      ]
 95 |     }
 96 |    ],
 97 |    "source": [
 98 |     "!ls -l"
 99 |    ]
100 |   },
101 |   {
102 |    "cell_type": "code",
103 |    "execution_count": 15,
104 |    "metadata": {},
105 |    "outputs": [
106 |     {
107 |      "name": "stdout",
108 |      "output_type": "stream",
109 |      "text": [
110 |       "118 ms ± 1.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
111 |      ]
112 |     }
113 |    ],
114 |    "source": [
115 |     "%timeit fth.read_table('t1.arrow')"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "code",
120 |    "execution_count": 16,
121 |    "metadata": {},
122 |    "outputs": [
123 |     {
124 |      "name": "stdout",
125 |      "output_type": "stream",
126 |      "text": [
127 |       "398 ms ± 10.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
128 |      ]
129 |     }
130 |    ],
131 |    "source": [
132 |     "%timeit fth.read_table('t2.arrow')"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 13,
138 |    "metadata": {},
139 |    "outputs": [],
140 |    "source": [
141 |     "fth.write_feather?"
142 |    ]
143 |   },
144 |   {
145 |    "cell_type": "code",
146 |    "execution_count": 27,
147 |    "metadata": {},
148 |    "outputs": [
149 |     {
150 |      "name": "stdout",
151 |      "output_type": "stream",
152 |      "text": [
153 |       "10\n",
154 |       "100\n",
155 |       "1000\n",
156 |       "10000\n",
157 |       "100000\n"
158 |      ]
159 |     }
160 |    ],
161 |    "source": [
162 |     "num_cols_cases = [10, 100, 1000, 10000, 100000]\n",
163 |     "\n",
164 |     "file_path = 'test.parquet'\n",
165 |     "\n",
166 |     "file_sizes = {}\n",
167 |     "read_times = {}\n",
168 |     "\n",
169 |     "for num_cols in num_cols_cases:\n",
170 |     "    print(num_cols)\n",
171 |     "\n",
172 |     "    table = make_example_table(num_cols)\n",
173 |     "    \n",
174 |     "    pq.write_table(table, file_path, compression='NONE')\n",
175 |     "    \n",
176 |     "    file_sizes[num_cols] = os.stat(file_path).st_size\n",
177 |     "    read_times[num_cols] = get_timing(lambda: pq.read_table(file_path), 10)"
178 |    ]
179 |   },
180 |   {
181 |    "cell_type": "code",
182 |    "execution_count": 28,
183 |    "metadata": {},
184 |    "outputs": [
185 |     {
186 |      "data": {
187 |       "text/plain": [
188 |        "{10: 802850724,\n",
189 |        " 100: 827973796,\n",
190 |        " 1000: 1013094899,\n",
191 |        " 10000: 979191400,\n",
192 |        " 100000: 964513790}"
193 |       ]
194 |      },
195 |      "execution_count": 28,
196 |      "metadata": {},
197 |      "output_type": "execute_result"
198 |     }
199 |    ],
200 |    "source": [
201 |     "file_sizes"
202 |    ]
203 |   },
204 |   {
205 |    "cell_type": "code",
206 |    "execution_count": 29,
207 |    "metadata": {},
208 |    "outputs": [
209 |     {
210 |      "data": {
211 |       "text/plain": [
212 |        "{10: 0.10510084629058838,\n",
213 |        " 100: 0.07333686351776122,\n",
214 |        " 1000: 0.12494065761566162,\n",
215 |        " 10000: 0.5032524585723877,\n",
216 |        " 100000: 3.8229554891586304}"
217 |       ]
218 |      },
219 |      "execution_count": 29,
220 |      "metadata": {},
221 |      "output_type": "execute_result"
222 |     }
223 |    ],
224 |    "source": [
225 |     "read_times"
226 |    ]
227 |   },
228 |   {
229 |    "cell_type": "code",
230 |    "execution_count": 30,
231 |    "metadata": {},
232 |    "outputs": [
233 |     {
234 |      "data": {
235 |       "text/plain": [
236 |        "<matplotlib.axes._subplots.AxesSubplot at 0x7f738ff84358>"
237 |       ]
238 |      },
239 |      "execution_count": 30,
240 |      "metadata": {},
241 |      "output_type": "execute_result"
242 |     },
243 |     {
244 |      "data": {
245 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEgCAYAAACkfIiyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAQ40lEQVR4nO3dbYxcZ3nG8f9Vm6BSAqGxScEvOCoO4A8EgUlQ1aqhCHBAqkspJQE1TRRkRRBKPyAwX9pShAillSjKi2WBCfnQutCi4oIbI9EGWkEgNoUUJ3VihbxsTInDS1CLaORw98NMMuNhvDs24z27z/x/0spznvNk9s7R6tpnn7nnTKoKSdLy9wtdFyBJmg4DXZIaYaBLUiMMdElqhIEuSY0w0CWpEZ0GepJdSR5K8q0J5j4nyReS3J7kliRrF6NGSVouul6h3whsmXDuXwI3VdULgT8HPnC6ipKk5ajTQK+qLwHfHx5L8qtJbk5yIMm/JXl+/9Qm4Av9x/8KbF3EUiVpyet6hT7OTuDtVfUS4J3A9f3xbwKv7z9+HXBmkrM7qE+SlqSVXRcwLMlTgV8DPpXk8eEn9/99J3BtksuBLwEPAscWu0ZJWqqWVKDT+4vhh1X1otETVXUE+F14IvhfX1WPLHJ9krRkLaktl6r6EfDtJG8ASM/5/cerkjxe73uAXR2VKUlLUtdti38LfAV4XpK5JFcCbwauTPJN4CCDFz8vAg4luQs4B3h/ByVL0pIVb58rSW1YUlsukqRTZ6BLUiM663JZtWpVbdiwoatvL0nL0oEDBx6uqtXjznUW6Bs2bGD//v1dfXtJWpaS3Heic265SFIjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUiAUDfaHP/ezfEfEjSQ73P+/zxdMvU5K0kEneWHQjcC1w0wnOXwxs7H9dCNzQ/1fqxIbtn+u6BO695rVdl6AZtOAKfdznfo7YSu/Dm6uqbgXOSvKsaRUoSZrMNPbQ1wAPDB3P9cckSYtoGoGeMWNjb7KeZFuS/Un2Hz16dArfWpL0uGkE+hywbuh4LXBk3MSq2llVm6tq8+rVY28WJkk6RdMI9D3AZf1ul5cBj1TVd6bwvJKkk7Bgl0v/cz8vAlYlmQP+FHgSQFXtAPYCrwEOAz8GrjhdxUqSTmzBQK+qSxc4X8DbplaRpKmxhXO2+E5RSWqEgS5JjejsI+gkaTHNwvaTK3RJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREruy5A07Fh++e6LoF7r3lt1yVIM80VuiQ1YqJAT7IlyaEkh5NsH3P+6Un+Kck3kxxMcsX0S5UkzWfBQE+yArgOuBjYBFyaZNPItLcBd1TV+cBFwF8lOWPKtUqS5jHJCv0C4HBV3VNVjwK7ga0jcwo4M0mApwLfB45NtVJJ0rwmeVF0DfDA0PEccOHInGuBPcAR4EzgjVX106lUOA9fCJSkgUlW6BkzViPHrwa+ATwbeBFwbZKn/cwTJduS7E+y/+jRoyddrCTpxCYJ9Dlg3dDxWnor8WFXAJ+unsPAt4Hnjz5RVe2sqs1VtXn16tWnWrMkaYxJAv02YGOSc/svdF5Cb3tl2P3AKwCSnAM8D7hnmoVKkua34B56VR1LcjWwD1gB7Kqqg0mu6p/fAbwPuDHJf9Lbonl3VT18GuuWJI2Y6J2iVbUX2DsytmPo8RHgVdMtTZJ0MnynqCQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRh
joktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakREwV6ki1JDiU5nGT7CeZclOQbSQ4m+eJ0y5QkLWTlQhOSrACuA14JzAG3JdlTVXcMzTkLuB7YUlX3J3nm6SpYkjTeJCv0C4DDVXVPVT0K7Aa2jsx5E/DpqrofoKoemm6ZkqSFTBLoa4AHho7n+mPDzgOekeSWJAeSXDatAiVJk1lwywXImLEa8zwvAV4B/CLwlSS3VtVdxz1Rsg3YBrB+/fqTr1aSdEKTrNDngHVDx2uBI2Pm3FxV/1tVDwNfAs4ffaKq2llVm6tq8+rVq0+1ZknSGJME+m3AxiTnJjkDuATYMzLnM8BvJFmZ5CnAhcCd0y1VkjSfBbdcqupYkquBfcAKYFdVHUxyVf/8jqq6M8nNwO3AT4GPVtW3TmfhkqTjTbKHTlXtBfaOjO0YOf4Q8KHplSZJOhm+U1SSGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUiIkCPcmWJIeSHE6yfZ55L03yWJLfm16JkqRJLBjoSVYA1wEXA5uAS5NsOsG8DwL7pl2kJGlhk6zQLwAOV9U9VfUosBvYOmbe24F/AB6aYn2SpAlNEuhrgAeGjuf6Y09IsgZ4HbBjeqVJkk7GJIGeMWM1cvxh4N1V9di8T5RsS7I/yf6jR49OWqMkaQIrJ5gzB6wbOl4LHBmZsxnYnQRgFfCaJMeq6h+HJ1XVTmAnwObNm0d/KUiSfg6TBPptwMYk5wIPApcAbxqeUFXnPv44yY3AZ0fDXJJ0ei0Y6FV1LMnV9LpXVgC7qupgkqv65903l6QlYJIVOlW1F9g7MjY2yKvq8p+/LEnSyfKdopLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGTBToSbYkOZTkcJLtY86/Ocnt/a8vJzl/+qVKkuazYKAnWQFcB1wMbAIuTbJpZNq3gd+sqhcC7wN2TrtQSdL8JlmhXwAcrqp7qupRYDewdXhCVX25qn7QP7wVWDvdMiVJC5kk0NcADwwdz/XHTuRK4J9/nqIkSSdv5QRzMmasxk5MXk4v0H/9BOe3AdsA1q9fP2GJkqRJTLJCnwPWDR2vBY6MTkryQuCjwNaq+t64J6qqnVW1uao2r169+lTqlSSdwCSBfhuwMcm5Sc4ALgH2DE9Ish74NPAHVXXX9MuUJC1kwS2XqjqW5GpgH7AC2FVVB5Nc1T+/A/gT4Gzg+iQAx6pq8+krW5I0apI9dKpqL7B3ZGzH0OO3AG+ZbmmSpJPhO0UlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjZgo0JNsSXIoyeEk28ecT5KP9M/fnuTF0y9VkjSfBQM9yQrgOuBiYBNwaZJNI9MuBjb2v7YBN0y5TknSAiZZoV8AHK6qe6rqUWA3sHVkzlbgpuq5FTgrybOmXKskaR6TBPoa4IGh47n+2MnOkSSdRisnmJMxY3UKc0iyjd6WDM
D/JDk0wfc/3VYBD5/qf5wPTrGS7nktBrwWA16LgaVwLZ5zohOTBPocsG7oeC1w5BTmUFU7gZ0TfM9Fk2R/VW3uuo6lwGsx4LUY8FoMLPVrMcmWy23AxiTnJjkDuATYMzJnD3BZv9vlZcAjVfWdKdcqSZrHgiv0qjqW5GpgH7AC2FVVB5Nc1T+/A9gLvAY4DPwYuOL0lSxJGmeSLReqai+90B4e2zH0uIC3Tbe0RbOktoA65rUY8FoMeC0GlvS1SC+LJUnLnW/9l6RGGOiS1AgDXZIaYaBLUiMm6nJpRZKnA+8BfgdY3R9+CPgMcE1V/bCr2rqQJPTu1bOG3jt7jwBfqxl8pTzJq+n9XAxfi89U1c2dFtahJL9Mr4ntB13X0pV+Zmzh+J+LfUs1K2Zthf5J4AfARVV1dlWdDby8P/apTitbZEleBdwN/Bm99xC8FngvcHf/3MxI8mHgHcAXgb8APtR//EdJ/rrL2hZbkvVJdic5CnwVuC3JQ/2xDd1Wt7iSXAZ8HbgIeArwS/Ty4kD/3JIzU22LSQ5V1fNO9lyLktwJXFxV946MnwvsraoXdFJYB5LcVVXnjRkPcFdVbeygrE4k+QrwYeDvq+qx/tgK4A3AH1fVy7qsbzH17zV14ehqPMkzgK+O+5np2qyt0O9L8q4k5zw+kOScJO/m+LtFzoKV9O7BM+pB4EmLXEvXfpLkgjHjLwV+stjFdGxVVf3d42EOUFWPVdVu4OwO6+pCGHOTQeCnjL8hYedmag8deCOwHfhikmf2x75L7140b+isqm7sovfn9G4Gv8zW0btXz8c6q6oblwM3JDmTwS+5dcCP+udmyYEk1wOf4Pifiz8E/qOzqrrxfuDrST7P4FqsB14JvK+zquYxU1su80lyRVV9vOs6FlP/k6d+m94LPqEXZnuq6o5OC+tIkl9h6FpU1X93XNKi69+A70p6H1pz3M8F8LGq+r8Oy1t0/e2VV3P8tdi3VF8oNtD7ktxfVeu7rkPdsONH81kuHT8zteWS5PYTnQLOOcG5JtnCOdDv6rmeXtfPg/3htcBzk7y1qj7fWXEdsIWzJ8l6el1PvwU80hvK04B/AbaPNhQsBTO1Qk/yXXp/Po3+lg3w5ap69uJX1Y0k++j9YH7i8a2F/pbD5cArquqVHZa3qOz4Gei3cJ4H3MTg9YS1wGXA3VX1jq5qW2zLseNn1gL9Y8DHq+rfx5z7m6p6UwdldcIWzoEkdwMvqKpjI+NnAHdU1XO7qWzx2cI5kOTuE/3/zneuSzO15VJVV85zbmbCvO++JO+it0L/LvRaOOmt0GethdOOn4GfJLmgqr42Mj6LLZzLruNnplboGui/er+dXjfDaAvnNUv9xZ9ps+OnJ8mLgRuAcS2cb62qA13VttiWY8ePga6fMYstnDqeLZzLk4GunzFrLZx2/BzPFs6B5dbxY6DPqAVaOM+rqicvZj1dsuNnYL4WTnpbLjPTwrkcO34M9BllC+eAHT8DtnAOLMeOn1m7OZcGPgs8taruG/m6F7il29IWnTdtG/CmbQPL7qZtrtA18+z4GUjyHuD3gXEtnJ+sqg90VdtiW44dPwa6NI9Z7PixhfN4y6njx0CX5jFrHT863nLr+Jmpd4pK43jTtgFbOAeW403bXKFr5tnxM2AL58By7PhxhS4NOn6+MXoiyS2LX06nNlTVB4cH+sF+TZIrOqqpK8uu48dA18zzpm3H8aZtA8vupm1uuUh6gi2cx1tuHT8GuqSJzGIL53LjO0UlTeq9XRewmJI8Pck1Sf4ryff6X3f2x87qur5x3EOX9ARbOI/zSXodPxeN6fj5FLDkOn7ccpH0BFs4B5bjTdtcoUsaZgvnwLLr+HGFLkljLMeOHwNdkk7SUu34MdAl6SQt1Zu2uYcuSWMsx44fA12SxjuHeTp+Fr+chRnokjTesuv4cQ9dkhrhW/8lqREGuiQ1wkCXpEYY6JLUCANdkhrx/4TufEphIn9cAAAAAElFTk
SuQmCC\n",
246 |       "text/plain": [
247 |        "<Figure size 432x288 with 1 Axes>"
248 |       ]
249 |      },
250 |      "metadata": {
251 |       "needs_background": "light"
252 |      },
253 |      "output_type": "display_data"
254 |     }
255 |    ],
256 |    "source": [
257 |     "pd.Series(file_sizes).plot.bar()"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "code",
262 |    "execution_count": 21,
263 |    "metadata": {},
264 |    "outputs": [
265 |     {
266 |      "data": {
267 |       "text/plain": [
268 |        "<matplotlib.axes._subplots.AxesSubplot at 0x7f738f05aac8>"
269 |       ]
270 |      },
271 |      "execution_count": 21,
272 |      "metadata": {},
273 |      "output_type": "execute_result"
274 |     },
275 |     {
276 |      "data": {
277 |       "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEVCAYAAADwyx6sAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAARcUlEQVR4nO3dbYxcZ3nG8f9VYwotlLR4S9LYjpEwFS9SIXJNEF9cCiUJUdMP0AYkUiIkizcVJCQa+ABFVdXQSojSQCxLCRCJkoaCQgROA1J5i9oEHNcJJAZiIWhMUmICOFi8yfTuhzlh1pPZndl4do/3mf9PGuXMc56duXO0uvb4mfucSVUhSVr/fq3vAiRJs2GgS1IjDHRJaoSBLkmNMNAlqREGuiQ14jF9vfGmTZtq27Ztfb29JK1Lt99++/eramHcvt4Cfdu2bezfv7+vt5ekdSnJd5ba55KLJDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqRG9XVgkSWtp2+Wf7rsEvn3FS1f19T1Dl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRkwM9CSPS/LlJHckuSvJu8bM2ZXkWJKD3eMdq1OuJGkp01z6/3PghVV1PMlG4JYkN1XVrSPzvlRVF82+REnSNCYGelUVcLx7urF71GoWJUlauanW0JNsSHIQeAD4bFXdNmba87tlmZuSPGumVUqSJpoq0Kvql1X1HGAzsDPJs0emHADOqao/AP4ZuGHc6yTZnWR/kv1Hjx49lbolSSNW1OVSVT8CPg+cPzL+UFUd77b3ARuTbBrz83urakdV7VhYWHj0VUuSHmGaLpeFJGd0248HXgR8fWTOmUnSbe/sXvfB2ZcrSVrKNF0uZwEfTrKBQVBfX1WfSvJagKraA7wMeF2SE8BPgUu6D1MlSWtkmi6XO4Hnjhnfs2j7SuDK2ZYmSVoJrxSVpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGjEx0JM8LsmXk9yR5K4k7xozJ0nel+RwkjuTnLs65UqSljLxS6KBnwMvrKrjSTYCtyS5qapuXTTnAmB793gecFX3X0nSGpl4hl4Dx7unG7tHjUy7GLi2m3srcEaSs2ZbqiRpOVOtoSfZkOQg8ADw2aq6bWTK2cC9i54f6cYkSWtkqkCvql9W1XOAzcDOJM8emZJxPzY6kGR3kv1J9h89enTl1UqSlrSiLpeq+hHweeD8kV1HgC2Lnm8G7hvz83urakdV7VhYWFhhqZKk5UzT5bKQ5Ixu+/HAi4Cvj0y7Ebi063Y5DzhWVffPvFpJ0pKm6XI5C/hwkg0M/gBcX1WfSvJagKraA+wDLgQOAz8BLluleiVJS5gY6FV1J/DcMeN7Fm0X8IbZliZJWgmvFJWkRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1IhpviR6S5LPJTmU5K4kbxozZ1eSY0kOdo93rE65kqSlTPMl0SeAt1TVgSRPBG5P8tmquntk3peq6qLZlyhJmsbEM/Squr+qDnTbPwYOAWevdmGSpJVZ0Rp6km3Ac4Hbxux+fpI7ktyU5FkzqE2StALTLLkAkOQJwMeBN1fVQyO7DwDnVNXxJBcCNwDbx7zGbmA3wNatWx910ZKkR5rqDD3JRgZh/pGq+sTo/qp6qKqOd9v7gI1JNo2Zt7eqdlTVjoWFhVMsXZK02DRdLgGuBg5V1XuWmHNmN48kO7vXfXCWhUqSljfNkssLgFcBX01ysBt7O7AVoKr2AC8DXpfkBPBT4JKqqlWoV5
K0hImBXlW3AJkw50rgylkVJUlaOa8UlaRGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhoxMdCTbEnyuSSHktyV5E1j5iTJ+5IcTnJnknNXp1xJ0lImfkk0cAJ4S1UdSPJE4PYkn62quxfNuQDY3j2eB1zV/VeStEYmnqFX1f1VdaDb/jFwCDh7ZNrFwLU1cCtwRpKzZl6tJGlJK1pDT7INeC5w28ius4F7Fz0/wiNDX5K0iqYO9CRPAD4OvLmqHhrdPeZHasxr7E6yP8n+o0ePrqxSSdKypgr0JBsZhPlHquoTY6YcAbYser4ZuG90UlXtraodVbVjYWHh0dQrSVrCNF0uAa4GDlXVe5aYdiNwadftch5wrKrun2GdkqQJpulyeQHwKuCrSQ52Y28HtgJU1R5gH3AhcBj4CXDZ7EuVJC1nYqBX1S2MXyNfPKeAN8yqKEnSynmlqCQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktSIiYGe5JokDyT52hL7dyU5luRg93jH7MuUJE0y8UuigQ8BVwLXLjPnS1V10UwqkiQ9KhPP0Kvqi8AP1qAWSdIpmNUa+vOT3JHkpiTPmtFrSpJWYJoll0kOAOdU1fEkFwI3ANvHTUyyG9gNsHXr1hm8tSTpYad8hl5VD1XV8W57H7AxyaYl5u6tqh1VtWNhYeFU31qStMgpB3qSM5Ok297ZveaDp/q6kqSVmbjkkuSjwC5gU5IjwDuBjQBVtQd4GfC6JCeAnwKXVFWtWsWSpLEmBnpVvWLC/isZtDVKknrklaKS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhoxMdCTXJPkgSRfW2J/krwvyeEkdyY5d/ZlSpImmeYM/UPA+cvsvwDY3j12A1edelmSpJWaGOhV9UXgB8tMuRi4tgZuBc5IctasCpQkTWcWa+hnA/cuen6kG5MkraFZBHrGjNXYicnuJPuT7D969OgM3lqS9LBZBPoRYMui55uB+8ZNrKq9VbWjqnYsLCzM4K0lSQ+bRaDfCFzadbucBxyrqvtn8LqSpBV4zKQJST4K7AI2JTkCvBPYCFBVe4B9wIXAYeAnwGWrVawkaWkTA72qXjFhfwFvmFlFkqRHxStFJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEZMFehJzk/yjSSHk1w+Zv+uJMeSHOwe75h9qZKk5Uz8kugkG4D3Ay8GjgBfSXJjVd09MvVLVXXRKtQoSZrCNGfoO4HDVfWtqvoFcB1w8eqWJUlaqWkC/Wzg3kXPj3Rjo56f5I4kNyV51kyqkyRNbeKSC5AxYzXy/ABwTlUdT3IhcAOw/REvlOwGdgNs3bp1haVKkpYzzRn6EWDLouebgfsWT6iqh6rqeLe9D9iYZNPoC1XV3qraUVU7FhYWTqFsSdKoac7QvwJsT/JU4LvAJcArF09IcibwvaqqJDsZ/KF4cNbFSlqZbZd/uu8S+PYVL+27hLkxMdCr6kSSNwI3AxuAa6rqriSv7fbvAV4GvC7JCeCnwCVVNbosI0laRdOcoT+8jLJvZGzPou0rgStnW5okaSW8UlSSGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGTHU/dGk98Vt6NK88Q5ekRqzrM3TPxC
RpaF0Huob84yZpqiWXJOcn+UaSw0kuH7M/Sd7X7b8zybmzL1WStJyJgZ5kA/B+4ALgmcArkjxzZNoFwPbusRu4asZ1SpImmOYMfSdwuKq+VVW/AK4DLh6ZczFwbQ3cCpyR5KwZ1ypJWsY0gX42cO+i50e6sZXOkSStomk+FM2YsXoUc0iym8GSDMDxJN+Y4v1X2ybg+4/2h/PuGVbSP4/FkMdiyGMxdDoci3OW2jFNoB8Btix6vhm471HMoar2AnuneM81k2R/Ve3ou47TgcdiyGMx5LEYOt2PxTRLLl8Btid5apLHApcAN47MuRG4tOt2OQ84VlX3z7hWSdIyJp6hV9WJJG8EbgY2ANdU1V1JXtvt3wPsAy4EDgM/AS5bvZIlSeNMdWFRVe1jENqLx/Ys2i7gDbMtbc2cVktAPfNYDHkshjwWQ6f1scggiyVJ650355KkRhjoktQIA12SGmGgS1Ij5ur2uUmeBLwN+DNgoRt+APgkcEVV/aiv2vqQJAzu1XM2gyt77wO+XHP4SXmSlzD4vVh8LD5ZVf/ea2E9SvI7DJrYfth3LX3pMuN8Tv69uPl0zYp5O0O/HvghsKuqnlxVTwb+qBv7WK+VrbEkfwLcA/wNg2sIXgq8C7in2zc3krwXeBPwBeAfgH/stv8qyT/1WdtaS7I1yXVJjgK3AV9J8kA3tq3f6tZWkkuBA8Au4DeA32SQF7d3+047c9W2mOQbVfX7K93XoiSHgAuq6tsj408F9lXVM3oprAdJvllVTx8zHuCbVbW9h7J6keS/gPcC/1ZVv+zGNgAvB95cVef1Wd9a6u419bzRs/Ekvw3cNu53pm/zdob+nSRvTfKUhweSPCXJX3Py3SLnwWMY3INn1HeBjWtcS99+lmTnmPE/BH621sX0bFNV/evDYQ5QVb+squuAJ/dYVx/CmJsMAv/H+BsS9m6u1tCBvwAuB76Q5He7se8xuBfNy3urqh/XMPjn9HUM/5htYXCvnqt7q6ofrwauSvJEhn/ktgAPdfvmye1JPgB8mJN/L/4S+O/equrH3wEHknyG4bHYCrwY+NveqlrGXC25LCfJZVX1wb7rWEvdN0/9KYMPfMIgzG6sqrt7LawnSc5k0bGoqv/tuaQ1192A7zUMvrTmpN8L4Oqq+nmP5a25bnnlJZx8LG4+XT8oNtA7Sf6nqrb2XYf6YcePlrNeOn7masklyZ1L7QKessS+JtnCOdR19XyAQdfPd7vhzcDTkry+qj7TW3E9sIVzIMlWBl1PLwSODYbyW8B/AJePNhScDubqDD3J9xj882n0r2yA/6yq31v7qvqR5GYGv5gffnhpoVtyeDXwx1X14h7LW1N2/Ax1LZxPB65l+HnCZuBS4J6qelNfta219djxM2+BfjXwwaq6Zcy+f6mqV/ZQVi9s4RxKcg/wjKo6MTL+WODuqnpaP5WtPVs4h5Lcs9T/73L7+jRXSy5V9Zpl9s1NmHe+k+StDM7QvweDFk4GZ+jz1sJpx8/Qz5LsrKovj4zPYwvnuuv4maszdA11n95fzqCbYbSF84rT/cOfWbPjZyDJucBVwLgWztdX1e191bbW1mPHj4GuR5jHFk6dzBbO9clA1yPMWwunHT8ns4VzaL11/Bjoc2pCC+fTq+rX17KePtnxM7RcCyeDJZe5aeFcjx0/BvqcsoVzyI6fIVs4h9Zjx8+83ZxLQ58CnlBV3xl5fBv4fL+lrTlv2jbkTduG1t1N2zxD19yz42coyduAPwfGtXBeX1V/31dta209dvwY6NIy5rHjxxbOk62njh8DXVrGvHX86GTrreNnrq4Ulcbxpm1DtnAOrcebtnmGrrlnx8+QLZxD67HjxzN0adjxc3B0R5LPr305vdpWVe9ePNAF+xVJLuuppr6su44fA11zz5u2ncSbtg2tu5u2ueQi6Vds4TzZeuv4MdAlTWUeWzjXG68UlTStd/VdwFpK8qQkVyT5epIHu8ehbuyMvusbxzV0Sb9iC+
dJrmfQ8bNrTMfPx4DTruPHJRdJv2IL59B6vGmbZ+iSFrOFc2jddfx4hi5JY6zHjh8DXZJW6HTt+DHQJWmFTtebtrmGLkljrMeOHwNdksZ7Cst0/Kx9OZMZ6JI03rrr+HENXZIa4aX/ktQIA12SGmGgS1IjDHRJaoSBLkmN+H+NiAucWRIu8wAAAABJRU5ErkJggg==\n",
278 |       "text/plain": [
279 |        "<Figure size 432x288 with 1 Axes>"
280 |       ]
281 |      },
282 |      "metadata": {
283 |       "needs_background": "light"
284 |      },
285 |      "output_type": "display_data"
286 |     }
287 |    ],
288 |    "source": [
289 |     "pd.Series(read_times).plot.bar()"
290 |    ]
291 |   },
292 |   {
293 |    "cell_type": "code",
294 |    "execution_count": null,
295 |    "metadata": {},
296 |    "outputs": [],
297 |    "source": []
298 |   }
299 |  ],
300 |  "metadata": {
301 |   "kernelspec": {
302 |    "display_name": "Python 3",
303 |    "language": "python",
304 |    "name": "python3"
305 |   },
306 |   "language_info": {
307 |    "codemirror_mode": {
308 |     "name": "ipython",
309 |     "version": 3
310 |    },
311 |    "file_extension": ".py",
312 |    "mimetype": "text/x-python",
313 |    "name": "python",
314 |    "nbconvert_exporter": "python",
315 |    "pygments_lexer": "ipython3",
316 |    "version": "3.7.3"
317 |   }
318 |  },
319 |  "nbformat": 4,
320 |  "nbformat_minor": 2
321 | }
322 | 


--------------------------------------------------------------------------------
/peak_use.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | from pandas.util.testing import rands
 3 | 
 4 | import pyarrow as pa
 5 | import pyarrow.parquet as pq
 6 | 
 7 | import gc
 8 | 
 9 | GB = 1 << 30
10 | 
11 | class memory_use:
12 | 
13 |     def __init__(self):
14 |         self.start_use = pa.total_allocated_bytes()
15 |         self.pool = pa.default_memory_pool()
16 |         self.start_peak_use = self.pool.max_memory()
17 | 
18 |     def __enter__(self):
19 |         return
20 | 
21 |     def __exit__(self, type, value, traceback):
22 |         gc.collect()
23 |         print("Change in memory use: {}"
24 |               .format((pa.total_allocated_bytes() - self.start_use) / GB))
25 |         print("Change in peak use: {}"
26 |               .format((self.pool.max_memory() - self.start_peak_use) / GB))
27 | 
28 | 
# Read one Parquet file inside memory_use() so the change in Arrow's
# allocation counters caused by the read is printed when the block exits.
with memory_use():
    table = pq.read_table('/tmp/test.parquet')
31 | 


--------------------------------------------------------------------------------
/scripts/20190903_parquet_benchmark.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import numpy as np
  3 | import pyarrow as pa
  4 | import pyarrow.parquet as pq
  5 | from pandas.util.testing import rands
  6 | import gc
  7 | import time
  8 | 
  9 | 
 10 | class memory_use:
 11 | 
 12 |     def __init__(self):
 13 |         self.start_use = pa.total_allocated_bytes()
 14 |         self.pool = pa.default_memory_pool()
 15 |         self.start_peak_use = self.pool.max_memory()
 16 | 
 17 |     def __enter__(self):
 18 |         return
 19 | 
 20 |     def __exit__(self, type, value, traceback):
 21 |         gc.collect()
 22 |         print("Change in memory use: {}"
 23 |               .format(pa.total_allocated_bytes() - self.start_use))
 24 |         print("Change in peak use: {}"
 25 |               .format(self.pool.max_memory() - self.start_peak_use))
 26 | 
 27 | 
 28 | def generate_strings(string_size, nunique, length, random_order=True):
 29 |     uniques = np.array([rands(string_size) for i in range(nunique)], dtype='O')
 30 |     if random_order:
 31 |         indices = np.random.randint(0, nunique, size=length).astype('i4')
 32 |         return uniques.take(indices)
 33 |     else:
 34 |         return uniques.repeat(length // nunique)
 35 | 
 36 | 
 37 | def generate_dict_strings(string_size, nunique, length, random_order=True):
 38 |     uniques = np.array([rands(string_size) for i in range(nunique)], dtype='O')
 39 |     if random_order:
 40 |         indices = np.random.randint(0, nunique, size=length).astype('i4')
 41 |     else:
 42 |         indices = np.arange(nunique).astype('i4').repeat(length // nunique)
 43 |     return pa.DictionaryArray.from_arrays(indices, uniques)
 44 | 
 45 | 
 46 | STRING_SIZE = 32
 47 | LENGTH = 3_000_000
 48 | NITER = 5
 49 | 
 50 | 
 51 | def generate_table(nunique, num_cols=10, random_order=True):
 52 |     data = generate_strings(STRING_SIZE, nunique, LENGTH,
 53 |                             random_order=random_order)
 54 |     return pa.Table.from_arrays([
 55 |         pa.array(data) for i in range(num_cols)
 56 |     ], names=['f{}'.format(i) for i in range(num_cols)])
 57 | 
 58 | 
 59 | def generate_dict_table(nunique, num_cols=10, random_order=True):
 60 |     data = generate_dict_strings(STRING_SIZE, nunique, LENGTH,
 61 |                                  random_order=random_order)
 62 |     return pa.Table.from_arrays([
 63 |         data for i in range(num_cols)
 64 |     ], names=['f{}'.format(i) for i in range(num_cols)])
 65 | 
 66 | 
 67 | def get_timing(f, niter):
 68 |     start = time.clock_gettime(time.CLOCK_REALTIME)
 69 |     gc.disable()
 70 |     for i in range(niter):
 71 |         f()
 72 |     result = (time.clock_gettime(time.CLOCK_REALTIME) - start) / niter
 73 |     gc.enable()
 74 |     gc.collect()
 75 |     return result
 76 | 
 77 | 
 78 | def write_table(t):
 79 |     out = pa.BufferOutputStream()
 80 |     pq.write_table(t, out)
 81 |     return out.getvalue()
 82 | 
 83 | 
 84 | def read_table(source):
 85 |     return pq.read_table(source)
 86 | 
 87 | 
 88 | def get_write_read_results(table, case_name):
 89 |     buf = write_table(table)
 90 |     results = [({'case': f'write-{case_name}'},
 91 |                 get_timing(lambda: write_table(table), 1)),
 92 |                ({'case': f'read-{case_name}'},
 93 |                 get_timing(lambda: read_table(buf), NITER)),
 94 |                ({'case': f'read-{case_name}-single-thread'},
 95 |                 get_timing(lambda: pq.read_table(buf, use_threads=False),
 96 |                            NITER))]
 97 |     for item in results:
 98 |         print(item)
 99 |     return results
100 | 
101 | 
def get_cases(nunique):
    """Return the four benchmark tables for ``nunique`` distinct values.

    The dict maps a case name to its table: plain string columns ("dense")
    and dictionary-encoded columns ("dict"), each in random and sequential
    order.
    """
    builders = (('dense', generate_table), ('dict', generate_dict_table))
    cases = {}
    for prefix, build in builders:
        cases[prefix + '-random'] = build(nunique)
        cases[prefix + '-sequential'] = build(nunique, random_order=False)
    return cases
109 | 
110 | 
def run_benchmarks():
    """Run every benchmark case for each cardinality and collect the timings.

    Returns:
        A dict mapping each ``nunique`` value to the concatenated list of
        results returned by ``get_write_read_results`` for its cases.
    """
    # Cardinalities to benchmark; trim to (100000,) for a shorter run.
    cardinalities = (1000, 100000)

    results = {}
    for nunique in cardinalities:
        timings = []
        for case_name, table in get_cases(nunique).items():
            print(case_name, nunique)
            timings += get_write_read_results(table, case_name)
        results[nunique] = timings
    return results
127 | 
128 | 
129 | # cases = get_cases(100000)
130 | 
131 | # buf = write_table(cases['dict-random'])
132 | # with memory_use():
133 | #     result = pq.read_table(buf)
134 | 
135 | 
136 | print(json.dumps(run_benchmarks()))
137 | 


--------------------------------------------------------------------------------
/scripts/arrow7305.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import pandas as pd
 3 | import pyarrow as pa
 4 | import pyarrow.parquet as pq
 5 | import time
 6 | 
 7 | import gc
 8 | import psutil
 9 | 
10 | 
# psutil handle created without a pid.
# NOTE(review): psutil documents this as the current process -- confirm.
PROC = psutil.Process()


def get_rss():
    """Return the ``rss`` field of ``PROC.memory_info()``."""
    info = PROC.memory_info()
    return info.rss
16 | 
17 | 
def print_rss():
    """Print the current RSS reading, prefixed with ``RSS: ``."""
    current = get_rss()
    print("RSS: {}".format(current))
20 | 
21 | 
# One RSS reading is appended here each time a memory_use block exits.
RSS_TELEMETRY = []


class memory_use:
    """Context manager that prints and records the process RSS on exit.

    On exit it prints the current ``get_rss()`` value together with its
    change since the ``with`` block was entered, then appends the reading to
    ``RSS_TELEMETRY``.  Arrow's allocation counters are recorded as
    baselines too, but only the disabled diagnostics at the end of
    ``__exit__`` use them.

    The instance itself is returned from ``__enter__`` so that
    ``with memory_use() as m`` gives access to the recorded baselines.
    """

    def __init__(self):
        self.pool = pa.default_memory_pool()
        # Baselines are also taken here so the attributes exist as soon as
        # the object is constructed.
        self._take_baseline()

    def _take_baseline(self):
        """Record the current Arrow counters and process RSS."""
        self.start_use = pa.total_allocated_bytes()
        self.start_rss = get_rss()
        self.start_peak_use = self.pool.max_memory()

    def __enter__(self):
        # Refresh the baselines at entry so an instance that was created
        # earlier, or is reused, only measures the body of this block.
        self._take_baseline()
        return self

    def __exit__(self, type, value, traceback):
        # Collect garbage first so memory held only by unreachable Python
        # objects is released before RSS is read.
        gc.collect()
        rss = get_rss()
        print("RSS: {}, change: {}"
              .format(rss, rss - self.start_rss))
        RSS_TELEMETRY.append(rss)
        # Arrow-side diagnostics, currently disabled:
        # print("Change in Arrow allocations: {}"
        #       .format(pa.total_allocated_bytes() - self.start_use))
        # print("Change in peak use: {}"
        #       .format(self.pool.max_memory() - self.start_peak_use))
46 | 
47 | 
def log_(msg):
    """Print ``msg`` followed by the current RSS reading."""
    print("{} RSS: {}".format(msg, get_rss()))
50 | 
51 | 
# Path to a Parquet file; nothing below reads it.
path = '/home/wesm/Downloads/big.snappy.parquet'

# Gzipped CSV that each iteration of the first loop loads.
CSV_PATH = '/home/wesm/Downloads/50mb.csv.gz'

# NOTE(review): a decay time of 0 presumably asks jemalloc to hand freed
# pages back to the OS immediately, so RSS follows live allocations --
# confirm against the pyarrow documentation.
pa.jemalloc_set_decay_ms(0)

log_("Starting")

# Repeat the CSV -> DataFrame -> Parquet round trip, logging RSS after each
# step so growth across iterations is visible.
for i in range(10):
    df = pd.read_csv(CSV_PATH)
    log_("Read CSV")

    df.to_parquet('out.parquet')
    log_("Wrote Parquet")

    time.sleep(1)
    log_("Waited 1 second")

    # for i in range(10):
    #     time.sleep(0.1)
    #     elapsed = "%.2f" % (0.1 * (i + 1))
    #     log_(f"{elapsed} seconds elapsed")


# Keep sampling RSS once per second after the work is done, to see whether
# memory is released while the process is idle.
for i in range(10):
    time.sleep(1)
    log_("Waited 1 second")
79 | 


--------------------------------------------------------------------------------