├── 20190510-RStats-NYC.ipynb
├── 20190725-Parquet-Dict-Benchmark.ipynb
├── 20190726-Parquet-Float-Encoding.ipynb
├── 20190802-Parquet-Dict-Decoding-Benchmark.ipynb
├── 20190803-ARROW-6060.ipynb
├── 20190815-Parquet-Direct-Dictionary-Write.ipynb
├── 20190830-VLDB-FlightDemo.ipynb
├── 20190919file_benchmarks
    ├── FeatherCompression.ipynb
    ├── all_read_results.csv
    ├── all_results.csv
    ├── all_write_results.csv
    ├── benchmark.R
    ├── benchmark.py
    ├── file_sizes.csv
    ├── generate_results.sh
    ├── glue_results.py
    ├── i9-9880H-1
    │   ├── all_results.csv
    │   ├── plot.png
    │   ├── py_results.csv
    │   └── r_results.csv
    ├── i9-9880H-4
    │   ├── all_results.csv
    │   ├── plot.png
    │   ├── py_results.csv
    │   └── r_results.csv
    ├── i9-9880H-8
    │   ├── all_results.csv
    │   ├── plot.png
    │   ├── py_results.csv
    │   └── r_results.csv
    ├── make_feather_plots.R
    ├── make_plots.R
    ├── py_read_results_1.csv
    ├── py_read_results_4.csv
    ├── py_write_results_1.csv
    ├── py_write_results_4.csv
    ├── r_read_results_1.csv
    ├── r_read_results_4.csv
    ├── r_write_results_1.csv
    └── r_write_results_4.csv
├── 20200402pandas_load
    ├── .ipynb_checkpoints
    │   └── Untitled-checkpoint.ipynb
    └── Untitled.ipynb
├── 20200509wideparquet
    └── WideParquet.ipynb
├── peak_use.py
└── scripts
    ├── 20190903_parquet_benchmark.py
    └── arrow7305.py
/20190725-Parquet-Dict-Benchmark.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": []
9 | }
10 | ],
11 | "metadata": {
12 | "kernelspec": {
13 | "display_name": "Python 3",
14 | "language": "python",
15 | "name": "python3"
16 | },
17 | "language_info": {
18 | "codemirror_mode": {
19 | "name": "ipython",
20 | "version": 3
21 | },
22 | "file_extension": ".py",
23 | "mimetype": "text/x-python",
24 | "name": "python",
25 | "nbconvert_exporter": "python",
26 | "pygments_lexer": "ipython3",
27 | "version": "3.7.3"
28 | }
29 | },
30 | "nbformat": 4,
31 | "nbformat_minor": 2
32 | }
33 |
--------------------------------------------------------------------------------
/20190726-Parquet-Float-Encoding.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": []
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import pyarrow as pa\n",
17 | "import pyarrow.parquet as pq\n",
18 | "import numpy as np\n",
19 | "import pandas as pd\n",
20 | "arr = pa.array([np.nan] * 10000000)\n",
21 | "t = pa.Table.from_arrays([arr], names=['f0'])"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 2,
27 | "metadata": {},
28 | "outputs": [
29 | {
30 | "name": "stdout",
31 | "output_type": "stream",
32 | "text": [
33 | "88.1 ms ± 1.68 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
34 | ]
35 | }
36 | ],
37 | "source": [
38 | "%timeit pq.write_table(t, '/home/wesm/tmp/nans.parquet')"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": null,
44 | "metadata": {},
45 | "outputs": [],
46 | "source": []
47 | }
48 | ],
49 | "metadata": {
50 | "kernelspec": {
51 | "display_name": "Python 3",
52 | "language": "python",
53 | "name": "python3"
54 | },
55 | "language_info": {
56 | "codemirror_mode": {
57 | "name": "ipython",
58 | "version": 3
59 | },
60 | "file_extension": ".py",
61 | "mimetype": "text/x-python",
62 | "name": "python",
63 | "nbconvert_exporter": "python",
64 | "pygments_lexer": "ipython3",
65 | "version": "3.7.3"
66 | }
67 | },
68 | "nbformat": 4,
69 | "nbformat_minor": 2
70 | }
71 |
--------------------------------------------------------------------------------
/20190802-Parquet-Dict-Decoding-Benchmark.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 5,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import pyarrow as pa\n",
11 | "import pyarrow.parquet as pq\n",
12 | "import pandas as pd\n",
13 | "from pandas.util.testing import rands\n",
14 | " \n",
15 | "NUNIQUE = 1000\n",
16 | "STRING_SIZE = 50\n",
17 | "LENGTH = 10_000_000\n",
18 | "REPEATS = LENGTH // NUNIQUE\n",
19 | "\n",
20 | "data = [rands(STRING_SIZE) for i in range(NUNIQUE)] * REPEATS\n",
21 | "table = pa.table([data], names=['f0'])\n",
22 | "\n",
23 | "out_stream = pa.BufferOutputStream()\n",
24 | "pq.write_table(table, out_stream)\n",
25 | "contents = out_stream.getvalue()"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 6,
31 | "metadata": {},
32 | "outputs": [
33 | {
34 | "data": {
35 | "text/plain": [
36 | "1129939"
37 | ]
38 | },
39 | "execution_count": 6,
40 | "metadata": {},
41 | "output_type": "execute_result"
42 | }
43 | ],
44 | "source": [
45 | "len(contents)"
46 | ]
47 | },
48 | {
49 | "cell_type": "code",
50 | "execution_count": 12,
51 | "metadata": {},
52 | "outputs": [
53 | {
54 | "name": "stdout",
55 | "output_type": "stream",
56 | "text": [
57 | "0\n"
58 | ]
59 | }
60 | ],
61 | "source": [
62 | "import gc\n",
63 | "class memory_use:\n",
64 | " \n",
65 | " def __init__(self):\n",
66 | " self.start_use = pa.total_allocated_bytes()\n",
67 | " \n",
68 | " def __enter__(self):\n",
69 | " return\n",
70 | " \n",
71 | " def __exit__(self, type, value, traceback):\n",
72 | " gc.collect()\n",
73 | " print(pa.total_allocated_bytes() - self.start_use)"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 13,
79 | "metadata": {},
80 | "outputs": [
81 | {
82 | "name": "stdout",
83 | "output_type": "stream",
84 | "text": [
85 | "541250112\n"
86 | ]
87 | }
88 | ],
89 | "source": [
90 | "with memory_use():\n",
91 | " memory_use_no_dict = pq.read_table(pa.BufferReader(contents))"
92 | ]
93 | },
94 | {
95 | "cell_type": "code",
96 | "execution_count": 15,
97 | "metadata": {},
98 | "outputs": [
99 | {
100 | "name": "stdout",
101 | "output_type": "stream",
102 | "text": [
103 | "41304128\n"
104 | ]
105 | }
106 | ],
107 | "source": [
108 | "with memory_use():\n",
109 | " memory_use_dict = pq.read_table(pa.BufferReader(contents), read_dictionary=['f0'])"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 16,
115 | "metadata": {},
116 | "outputs": [
117 | {
118 | "name": "stdout",
119 | "output_type": "stream",
120 | "text": [
121 | "1.79 s ± 7.86 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
122 | ]
123 | }
124 | ],
125 | "source": [
126 | "%timeit memory_use_no_dict = pq.read_table(pa.BufferReader(contents))"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 17,
132 | "metadata": {},
133 | "outputs": [
134 | {
135 | "name": "stdout",
136 | "output_type": "stream",
137 | "text": [
138 | "106 ms ± 11.2 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
139 | ]
140 | }
141 | ],
142 | "source": [
143 | "%timeit memory_use_dict = pq.read_table(pa.BufferReader(contents), read_dictionary=['f0'])"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 18,
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "data": {
153 | "text/plain": [
154 | "516.1763305664062"
155 | ]
156 | },
157 | "execution_count": 18,
158 | "metadata": {},
159 | "output_type": "execute_result"
160 | }
161 | ],
162 | "source": [
163 | "541250112 / (1 << 20)"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 19,
169 | "metadata": {},
170 | "outputs": [
171 | {
172 | "data": {
173 | "text/plain": [
174 | "39.39068603515625"
175 | ]
176 | },
177 | "execution_count": 19,
178 | "metadata": {},
179 | "output_type": "execute_result"
180 | }
181 | ],
182 | "source": [
183 | "41304128 / (1 << 20)"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": null,
189 | "metadata": {},
190 | "outputs": [],
191 | "source": []
192 | }
193 | ],
194 | "metadata": {
195 | "kernelspec": {
196 | "display_name": "Python 3",
197 | "language": "python",
198 | "name": "python3"
199 | },
200 | "language_info": {
201 | "codemirror_mode": {
202 | "name": "ipython",
203 | "version": 3
204 | },
205 | "file_extension": ".py",
206 | "mimetype": "text/x-python",
207 | "name": "python",
208 | "nbconvert_exporter": "python",
209 | "pygments_lexer": "ipython3",
210 | "version": "3.7.3"
211 | }
212 | },
213 | "nbformat": 4,
214 | "nbformat_minor": 2
215 | }
216 |
--------------------------------------------------------------------------------
/20190803-ARROW-6060.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "Change in memory use: 76502016\n",
13 | "Change in peak use: 5843859776\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "import pandas as pd\n",
19 | "from pandas.util.testing import rands\n",
20 | "\n",
21 | "import pyarrow as pa\n",
22 | "import pyarrow.parquet as pq\n",
23 | "\n",
24 | "import gc\n",
25 | "class memory_use:\n",
26 | " \n",
27 | " def __init__(self):\n",
28 | " self.start_use = pa.total_allocated_bytes() \n",
29 | " self.pool = pa.default_memory_pool()\n",
30 | " self.start_peak_use = self.pool.max_memory()\n",
31 | " \n",
32 | " def __enter__(self):\n",
33 | " return\n",
34 | " \n",
35 | " def __exit__(self, type, value, traceback):\n",
36 | " gc.collect()\n",
37 | " print(\"Change in memory use: {}\"\n",
38 | " .format(pa.total_allocated_bytes() - self.start_use))\n",
39 | " print(\"Change in peak use: {}\"\n",
40 | " .format(self.pool.max_memory() - self.start_peak_use))\n",
41 | "\n",
42 | "with memory_use():\n",
43 | " table = pq.read_table('/home/wesm/Downloads/demofile.parquet') "
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "name": "stdout",
53 | "output_type": "stream",
54 | "text": [
55 | "Change in memory use: 34499968\n",
56 | "Change in peak use: 5801857728\n"
57 | ]
58 | }
59 | ],
60 | "source": [
61 | "with memory_use():\n",
62 | " table = pq.read_table('/home/wesm/Downloads/demofile.parquet', columns=['body'])"
63 | ]
64 | },
65 | {
66 | "cell_type": "code",
67 | "execution_count": 5,
68 | "metadata": {},
69 | "outputs": [
70 | {
71 | "data": {
72 | "text/plain": [
73 | "pyarrow.Table\n",
74 | "archived: bool\n",
75 | "author: string\n",
76 | "author_flair_css_class: string\n",
77 | "author_flair_text: string\n",
78 | "body: string\n",
79 | "controversiality: int64\n",
80 | "created_utc: string\n",
81 | "distinguished: string\n",
82 | "downs: int64\n",
83 | "edited: string\n",
84 | "gilded: int64\n",
85 | "id: string\n",
86 | "link_id: string\n",
87 | "name: string\n",
88 | "parent_id: string\n",
89 | "retrieved_on: int64\n",
90 | "score: int64\n",
91 | "score_hidden: bool\n",
92 | "subreddit: string\n",
93 | "subreddit_id: string\n",
94 | "ups: int64"
95 | ]
96 | },
97 | "execution_count": 5,
98 | "metadata": {},
99 | "output_type": "execute_result"
100 | }
101 | ],
102 | "source": [
103 | "table"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "metadata": {},
110 | "outputs": [],
111 | "source": [
112 | "def generate_strings(length, nunique, string_length=10):\n",
113 | " unique_values = [rands(string_length) for i in range(nunique)]\n",
114 | " values = unique_values * (length // nunique)\n",
115 | " return values\n",
116 | "\n",
117 | "df = pd.DataFrame()\n",
118 | "df['a'] = generate_strings(100000000, 10000)\n",
119 | "df['b'] = generate_strings(100000000, 10000)\n",
120 | "df.to_parquet('/tmp/test.parquet')"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 2,
126 | "metadata": {},
127 | "outputs": [
128 | {
129 | "name": "stdout",
130 | "output_type": "stream",
131 | "text": [
132 | "Change in memory use: 825560448\n",
133 | "Change in peak use: 1484772224\n"
134 | ]
135 | }
136 | ],
137 | "source": [
138 | "with memory_use():\n",
139 | " table = pq.read_table('/tmp/test.parquet', read_dictionary=['a', 'b'])"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 2,
145 | "metadata": {},
146 | "outputs": [],
147 | "source": [
148 | "pool = pa.default_memory_pool()"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 3,
154 | "metadata": {},
155 | "outputs": [
156 | {
157 | "data": {
158 | "text/plain": [
159 | "0"
160 | ]
161 | },
162 | "execution_count": 3,
163 | "metadata": {},
164 | "output_type": "execute_result"
165 | }
166 | ],
167 | "source": [
168 | "pool.max_memory()"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": 4,
174 | "metadata": {},
175 | "outputs": [],
176 | "source": []
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 7,
181 | "metadata": {},
182 | "outputs": [
183 | {
184 | "name": "stdout",
185 | "output_type": "stream",
186 | "text": [
187 | "-rw-r--r-- 1 wesm wesm 274263652 Aug 3 14:19 /tmp/test.parquet\r\n"
188 | ]
189 | }
190 | ],
191 | "source": [
192 | "!ls -l /tmp/*.parquet"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 8,
198 | "metadata": {},
199 | "outputs": [
200 | {
201 | "name": "stdout",
202 | "output_type": "stream",
203 | "text": [
204 | "Change in memory use: 2825000192\n",
205 | "Change in peak use: 3827684608\n"
206 | ]
207 | }
208 | ],
209 | "source": []
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 10,
214 | "metadata": {},
215 | "outputs": [
216 | {
217 | "data": {
218 | "text/plain": [
219 | "20585786752"
220 | ]
221 | },
222 | "execution_count": 10,
223 | "metadata": {},
224 | "output_type": "execute_result"
225 | }
226 | ],
227 | "source": [
228 | "pool.max_memory()"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": null,
234 | "metadata": {},
235 | "outputs": [],
236 | "source": []
237 | }
238 | ],
239 | "metadata": {
240 | "kernelspec": {
241 | "display_name": "Python 3",
242 | "language": "python",
243 | "name": "python3"
244 | },
245 | "language_info": {
246 | "codemirror_mode": {
247 | "name": "ipython",
248 | "version": 3
249 | },
250 | "file_extension": ".py",
251 | "mimetype": "text/x-python",
252 | "name": "python",
253 | "nbconvert_exporter": "python",
254 | "pygments_lexer": "ipython3",
255 | "version": "3.7.3"
256 | }
257 | },
258 | "nbformat": 4,
259 | "nbformat_minor": 2
260 | }
261 |
--------------------------------------------------------------------------------
/20190815-Parquet-Direct-Dictionary-Write.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import numpy as np\n",
10 | "import pyarrow as pa\n",
11 | "import pyarrow.parquet as pq\n",
12 | "import pandas as pd\n",
13 | "from pandas.util.testing import rands\n",
14 | " \n",
15 | "NUNIQUE = 1000\n",
16 | "STRING_SIZE = 50\n",
17 | "LENGTH = 10_000_000\n",
18 | "REPEATS = LENGTH // NUNIQUE\n",
19 | "\n",
20 | "uniques = np.array([rands(STRING_SIZE) for i in range(NUNIQUE)], dtype='O')\n",
21 | "indices = np.random.randint(0, NUNIQUE, size=LENGTH).astype('i4') \n",
22 | "data = uniques.take(indices)"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "metadata": {},
29 | "outputs": [],
30 | "source": [
31 | "import gc\n",
32 | "class memory_use:\n",
33 | " \n",
34 | " def __init__(self):\n",
35 | " self.start_use = pa.total_allocated_bytes() \n",
36 | " self.pool = pa.default_memory_pool()\n",
37 | " self.start_peak_use = self.pool.max_memory()\n",
38 | " \n",
39 | " def __enter__(self):\n",
40 | " return\n",
41 | " \n",
42 | " def __exit__(self, type, value, traceback):\n",
43 | " gc.collect()\n",
44 | " print(\"Change in memory use: {}\"\n",
45 | " .format(pa.total_allocated_bytes() - self.start_use))\n",
46 | " print(\"Change in peak use: {}\"\n",
47 | " .format(self.pool.max_memory() - self.start_peak_use))"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 3,
53 | "metadata": {},
54 | "outputs": [],
55 | "source": [
56 | "dict_data = pa.DictionaryArray.from_arrays(indices, uniques)"
57 | ]
58 | },
59 | {
60 | "cell_type": "code",
61 | "execution_count": 4,
62 | "metadata": {},
63 | "outputs": [
64 | {
65 | "data": {
66 | "text/plain": [
67 | "72320"
68 | ]
69 | },
70 | "execution_count": 4,
71 | "metadata": {},
72 | "output_type": "execute_result"
73 | }
74 | ],
75 | "source": [
76 | "pa.default_memory_pool().max_memory()"
77 | ]
78 | },
79 | {
80 | "cell_type": "code",
81 | "execution_count": 5,
82 | "metadata": {},
83 | "outputs": [
84 | {
85 | "name": "stdout",
86 | "output_type": "stream",
87 | "text": [
88 | "Change in memory use: 16777216\n",
89 | "Change in peak use: 753475648\n"
90 | ]
91 | }
92 | ],
93 | "source": [
94 | "table = pa.table([dict_data], names=['f0'])\n",
95 | "with memory_use():\n",
96 | " out_stream = pa.BufferOutputStream()\n",
97 | " pq.write_table(table, out_stream)\n",
98 | " contents = out_stream.getvalue()"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 6,
104 | "metadata": {},
105 | "outputs": [
106 | {
107 | "name": "stdout",
108 | "output_type": "stream",
109 | "text": [
110 | "820 ms ± 11.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
111 | ]
112 | }
113 | ],
114 | "source": [
115 | "%%timeit\n",
116 | "out_stream = pa.BufferOutputStream()\n",
117 | "pq.write_table(table, out_stream)\n",
118 | "contents = out_stream.getvalue()"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 7,
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "data": {
128 | "text/plain": [
129 | "12576182"
130 | ]
131 | },
132 | "execution_count": 7,
133 | "metadata": {},
134 | "output_type": "execute_result"
135 | }
136 | ],
137 | "source": [
138 | "len(contents)"
139 | ]
140 | },
141 | {
142 | "cell_type": "code",
143 | "execution_count": 8,
144 | "metadata": {},
145 | "outputs": [
146 | {
147 | "name": "stdout",
148 | "output_type": "stream",
149 | "text": [
150 | "495 ms ± 8.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
151 | ]
152 | }
153 | ],
154 | "source": [
155 | "%timeit returned_table = pq.read_table(contents)"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 9,
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "name": "stdout",
165 | "output_type": "stream",
166 | "text": [
167 | "93.1 ms ± 3.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
168 | ]
169 | }
170 | ],
171 | "source": [
172 | "%timeit returned_table = pq.read_table(contents, read_dictionary=['f0'])"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 10,
178 | "metadata": {},
179 | "outputs": [],
180 | "source": [
181 | "dense_data = dict_data.cast(pa.utf8())\n",
182 | "table = pa.table([dense_data], names=['f0'])"
183 | ]
184 | },
185 | {
186 | "cell_type": "code",
187 | "execution_count": 11,
188 | "metadata": {},
189 | "outputs": [
190 | {
191 | "name": "stdout",
192 | "output_type": "stream",
193 | "text": [
194 | "405 ms ± 27.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
195 | ]
196 | }
197 | ],
198 | "source": [
199 | "%%timeit\n",
200 | "out_stream = pa.BufferOutputStream()\n",
201 | "pq.write_table(table, out_stream)\n",
202 | "contents = out_stream.getvalue()"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 12,
208 | "metadata": {},
209 | "outputs": [],
210 | "source": [
211 | "out_stream = pa.BufferOutputStream()\n",
212 | "pq.write_table(table, out_stream)\n",
213 | "contents = out_stream.getvalue()"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 13,
219 | "metadata": {},
220 | "outputs": [
221 | {
222 | "name": "stdout",
223 | "output_type": "stream",
224 | "text": [
225 | "430 ms ± 8.12 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
226 | ]
227 | }
228 | ],
229 | "source": [
230 | "%%timeit\n",
231 | "returned_table = pq.read_table(contents)"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 14,
237 | "metadata": {},
238 | "outputs": [
239 | {
240 | "data": {
241 | "text/plain": [
242 | "pyarrow.Table\n",
243 | "f0: string"
244 | ]
245 | },
246 | "execution_count": 14,
247 | "metadata": {},
248 | "output_type": "execute_result"
249 | }
250 | ],
251 | "source": [
252 | "pq.read_table(contents)"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": null,
258 | "metadata": {},
259 | "outputs": [],
260 | "source": []
261 | }
262 | ],
263 | "metadata": {
264 | "kernelspec": {
265 | "display_name": "Python 3",
266 | "language": "python",
267 | "name": "python3"
268 | },
269 | "language_info": {
270 | "codemirror_mode": {
271 | "name": "ipython",
272 | "version": 3
273 | },
274 | "file_extension": ".py",
275 | "mimetype": "text/x-python",
276 | "name": "python",
277 | "nbconvert_exporter": "python",
278 | "pygments_lexer": "ipython3",
279 | "version": "3.7.3"
280 | }
281 | },
282 | "nbformat": 4,
283 | "nbformat_minor": 2
284 | }
285 |
--------------------------------------------------------------------------------
/20190830-VLDB-FlightDemo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pyarrow as pa\n",
10 | "import pyarrow.parquet as pq\n",
11 | "import pyarrow.flight as flight\n",
12 | "import numpy as np\n",
13 | "import pandas as pd\n",
14 | "import time\n",
15 | "import threading"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": null,
21 | "metadata": {},
22 | "outputs": [],
23 | "source": [
24 | "class DemoServer(flight.FlightServerBase):\n",
25 | " \n",
26 | " def __init__(self):\n",
27 | " self._cache = {}\n",
28 | " \n",
29 | " def list_actions(self, context):\n",
30 | " return [flight.ActionType('list-tables', 'List stored tables'),\n",
31 | " flight.ActionType('drop-table', 'Drop a stored table')]\n",
32 | "\n",
33 | " # -----------------------------------------------------------------\n",
34 | " # Implement actions\n",
35 | " \n",
36 | " def do_action(self, context, action):\n",
37 | " handlers = {\n",
38 | " 'list-tables': self._list_tables,\n",
39 | " 'drop-table': self._drop_table\n",
40 | " } \n",
41 | " handler = handlers.get(action.type)\n",
42 | " if not handler:\n",
43 | " raise NotImplementedError \n",
44 | " return handlers[action.type](action)\n",
45 | " \n",
46 | " def _drop_table(self, action):\n",
47 | " del self._cache[action.body]\n",
48 | " \n",
49 | " def _list_tables(self, action):\n",
50 | " return iter([flight.Result(cache_key) \n",
51 | " for cache_key in sorted(self._cache.keys())])\n",
52 | "\n",
53 | " # -----------------------------------------------------------------\n",
54 | " # Implement puts\n",
55 | " \n",
56 | " def do_put(self, context, descriptor, reader, writer):\n",
57 | " self._cache[descriptor.command] = reader.read_all()\n",
58 | " \n",
59 | " # -----------------------------------------------------------------\n",
60 | " # Implement gets\n",
61 | "\n",
62 | " def do_get(self, context, ticket):\n",
63 | " table = self._cache[ticket.ticket]\n",
64 | " return flight.RecordBatchStream(table)"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "import contextlib\n",
74 | "import socket\n",
75 | "def find_free_port():\n",
76 | " # Find a free port\n",
77 | " sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)\n",
78 | " with contextlib.closing(sock) as sock:\n",
79 | " sock.bind(('', 0))\n",
80 | " sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)\n",
81 | " port = sock.getsockname()[1]\n",
82 | " return port\n",
83 | "\n",
84 | "def wait_for_available(client):\n",
85 | " deadline = time.time() + 5.0\n",
86 | " while True:\n",
87 | " try:\n",
88 | " list(client.list_flights())\n",
89 | " except Exception as e:\n",
90 | " if 'Connect Failed' in str(e):\n",
91 | " if time.time() < deadline:\n",
92 | " time.sleep(0.025)\n",
93 | " continue\n",
94 | " else:\n",
95 | " raise\n",
96 | " break"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {},
103 | "outputs": [],
104 | "source": [
105 | "port = 1337\n",
106 | "location = flight.Location.for_grpc_tcp(\"localhost\", find_free_port())\n",
107 | "location"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {},
114 | "outputs": [],
115 | "source": [
116 | "server = DemoServer()\n",
117 | "server.init(location)\n",
118 | "\n",
119 | "thread = threading.Thread(target=lambda: server.run(), daemon=True)\n",
120 | "thread.start()\n",
121 | "\n",
122 | "client = flight.FlightClient.connect(location)\n",
123 | "wait_for_available(client)"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": null,
129 | "metadata": {},
130 | "outputs": [],
131 | "source": [
132 | "client.list_actions()"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": null,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "def list_tables(client):\n",
142 | " action = flight.Action('list-tables', b'')\n",
143 | " return [x.body.to_pybytes().decode('utf8') for x in client.do_action(action)] \n",
144 | "\n",
145 | "# def drop_table(client):\n",
146 | "\n",
147 | "list_tables(client)"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": null,
153 | "metadata": {},
154 | "outputs": [],
155 | "source": [
156 | "def cache_table_in_server(name, table):\n",
157 | " desc = flight.FlightDescriptor.for_command(name.encode('utf8'))\n",
158 | " put_writer, put_meta_reader = client.do_put(desc, table.schema)\n",
159 | " put_writer.write(table)\n",
160 | " put_writer.close()\n",
161 | " \n",
162 | " \n",
163 | "def get_table(name):\n",
164 | " reader = client.do_get(flight.Ticket(name.encode('utf8')))\n",
165 | " return reader.read_all()"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": null,
171 | "metadata": {},
172 | "outputs": [],
173 | "source": [
174 | "table = pa.table([pa.array([1,2,3,4,5])], names=['f0'])\n",
175 | "cache_table_in_server('table1', table)"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": null,
181 | "metadata": {},
182 | "outputs": [],
183 | "source": [
184 | "list_tables(client)"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "cache_table_in_server('table2', table)\n",
194 | "cache_table_in_server('table3', table)\n",
195 | "cache_table_in_server('table4', table)"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": null,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "list_tables(client)"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "get_table('table1')"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": null,
219 | "metadata": {},
220 | "outputs": [],
221 | "source": [
222 | "import pandas as pd\n",
223 | "fec = pd.read_csv('/home/wesm/code/pydata-book/datasets/fec/P00000001-ALL.csv')\n",
224 | "fec.head()\n",
225 | "def coerce_int(x):\n",
226 | " try:\n",
227 | " return int(x)\n",
228 | " except:\n",
229 | " return -1\n",
230 | "\n",
231 | "fec['contbr_zip'] = fec['contbr_zip'].map(coerce_int).astype(np.int64)"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": 17,
237 | "metadata": {},
238 | "outputs": [],
239 | "source": [
240 | "fec_table = pa.table(fec)"
241 | ]
242 | },
243 | {
244 | "cell_type": "code",
245 | "execution_count": 18,
246 | "metadata": {},
247 | "outputs": [],
248 | "source": [
249 | "fec_table = pa.concat_tables([fec_table] * 10)"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 19,
255 | "metadata": {},
256 | "outputs": [
257 | {
258 | "name": "stdout",
259 | "output_type": "stream",
260 | "text": [
261 | "CPU times: user 425 ms, sys: 1.13 s, total: 1.56 s\n",
262 | "Wall time: 1.16 s\n"
263 | ]
264 | }
265 | ],
266 | "source": [
267 | "%%time\n",
268 | "cache_table_in_server('fec_table', fec_table)"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": 20,
274 | "metadata": {},
275 | "outputs": [
276 | {
277 | "data": {
278 | "text/plain": [
279 | "['fec_table', 'table1', 'table2', 'table3', 'table4']"
280 | ]
281 | },
282 | "execution_count": 20,
283 | "metadata": {},
284 | "output_type": "execute_result"
285 | }
286 | ],
287 | "source": [
288 | "list_tables(client)"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": 21,
294 | "metadata": {},
295 | "outputs": [
296 | {
297 | "name": "stdout",
298 | "output_type": "stream",
299 | "text": [
300 | "CPU times: user 404 ms, sys: 995 ms, total: 1.4 s\n",
301 | "Wall time: 1.1 s\n"
302 | ]
303 | }
304 | ],
305 | "source": [
306 | "%%time \n",
307 | "\n",
308 | "fec_table_received = get_table('fec_table')"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": []
317 | }
318 | ],
319 | "metadata": {
320 | "kernelspec": {
321 | "display_name": "Python 3",
322 | "language": "python",
323 | "name": "python3"
324 | },
325 | "language_info": {
326 | "codemirror_mode": {
327 | "name": "ipython",
328 | "version": 3
329 | },
330 | "file_extension": ".py",
331 | "mimetype": "text/x-python",
332 | "name": "python",
333 | "nbconvert_exporter": "python",
334 | "pygments_lexer": "ipython3",
335 | "version": "3.7.3"
336 | }
337 | },
338 | "nbformat": 4,
339 | "nbformat_minor": 2
340 | }
341 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/FeatherCompression.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 4,
6 | "metadata": {},
7 | "outputs": [
8 | {
9 | "name": "stdout",
10 | "output_type": "stream",
11 | "text": [
12 | "using 8 cpu cores\n"
13 | ]
14 | }
15 | ],
16 | "source": [
17 | "# flake8: noqa\n",
18 | "\n",
19 | "import pyarrow.feather as feather\n",
20 | "import pandas as pd\n",
21 | "import json\n",
22 | "import numpy as np\n",
23 | "import pyarrow as pa\n",
24 | "import pyarrow.parquet as pq\n",
25 | "from pandas.util.testing import rands\n",
26 | "import gc\n",
27 | "import os\n",
28 | "import time\n",
29 | "\n",
30 | "pa.set_cpu_count(8)\n",
31 | "\n",
32 | "print(f\"using {pa.cpu_count()} cpu cores\")\n",
33 | " \n",
34 | "\n",
35 | "def get_timing(f, niter=1):\n",
36 | " start = time.clock_gettime(time.CLOCK_REALTIME)\n",
37 | " for i in range(niter):\n",
38 | " f()\n",
39 | " result = (time.clock_gettime(time.CLOCK_REALTIME) - start) / niter\n",
40 | " return result\n",
41 | "\n",
42 | "\n",
43 | "files = {\n",
44 | " 'fanniemae': {\n",
45 | " 'base': '2016Q4',\n",
46 | " 'source': {\n",
47 | " 'path': '2016Q4.txt',\n",
48 | " 'sep': '|',\n",
49 | " 'header': None\n",
50 | " }\n",
51 | " },\n",
52 | " 'nyctaxi': {\n",
53 | " 'base': 'yellow_tripdata_2010-01',\n",
54 | " 'source': {\n",
55 | " 'path': 'yellow_tripdata_2010-01.csv',\n",
56 | " 'sep': ',',\n",
57 | " 'header': 0\n",
58 | " }\n",
59 | " }\n",
60 | "}\n",
61 | "\n",
62 | "\n",
63 | "compression_cases = [\n",
64 | " (None, None), # uncompressed\n",
65 | " ('zstd', 1), # minimal compression\n",
66 | " ('zstd', 10), # moderate\n",
67 | " ('lz4', None) # LZ4 doesn't support compression level\n",
68 | "]\n",
69 | "\n",
70 | "\n",
71 | "def write_files(files, chunksize=1<<16):\n",
72 | " statistics = []\n",
73 | " for name, info in files.items():\n",
74 | " source = info['source']\n",
75 | " print(\"reading {}\".format(source['path']))\n",
76 | " df = pd.read_csv(source['path'], sep=source['sep'], \n",
77 | " header=source['header'], \n",
78 | " low_memory=False)\n",
79 | " if source['header'] is None:\n",
80 | " df.columns = ['f{}'.format(i) for i in range(len(df.columns))]\n",
81 | "\n",
82 | " t = (pa.Table.from_pandas(df, preserve_index=False)\n",
83 | " .replace_schema_metadata(None))\n",
84 | " for compression, compression_level in compression_cases:\n",
85 | " path = '{}_{}_{}.feather'.format(info['base'], \n",
86 | " compression or 'uncompressed',\n",
87 | " compression_level)\n",
88 | " print((name, compression, compression_level))\n",
89 | " tm = get_timing(lambda: \n",
90 | " feather.write_feather(df, path, compression=compression,\n",
91 | " compression_level=compression_level,\n",
92 | " chunksize=chunksize))\n",
93 | " file_size = os.stat(path).st_size\n",
94 | " result = name, compression, compression_level, file_size, tm\n",
95 | " print(result)\n",
96 | " statistics.append(result)\n",
97 | " return statistics\n",
98 | "\n",
99 | "def get_read_results():\n",
100 | " all_results = []\n",
101 | " for name, info in files.items():\n",
102 | " for compression, compression_level in compression_cases:\n",
103 | " path = '{}_{}_{}.feather'.format(info['base'], \n",
104 | " compression or 'uncompressed',\n",
105 | " compression_level)\n",
106 | " read_time = get_timing(lambda: feather.read_table(path, memory_map=False),\n",
107 | " niter=5)\n",
108 | " result = name, compression, compression_level, read_time\n",
109 | " print(result)\n",
110 | " all_results.append(result) \n",
111 | " return all_results"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": null,
117 | "metadata": {},
118 | "outputs": [],
119 | "source": []
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 5,
124 | "metadata": {},
125 | "outputs": [
126 | {
127 | "name": "stdout",
128 | "output_type": "stream",
129 | "text": [
130 | "1024\n",
131 | "reading 2016Q4.txt\n",
132 | "('fanniemae', None, None)\n",
133 | "('fanniemae', None, None, 5084410194, 11.884642839431763)\n",
134 | "('fanniemae', 'zstd', 1)\n",
135 | "('fanniemae', 'zstd', 1, 501955562, 11.54361605644226)\n",
136 | "('fanniemae', 'zstd', 10)\n",
137 | "('fanniemae', 'zstd', 10, 439460538, 40.15117073059082)\n",
138 | "('fanniemae', 'lz4', None)\n",
139 | "('fanniemae', 'lz4', None, 765604482, 12.363005876541138)\n",
140 | "reading yellow_tripdata_2010-01.csv\n",
141 | "('nyctaxi', None, None)\n",
142 | "('nyctaxi', None, None, 2522035242, 6.970196723937988)\n",
143 | "('nyctaxi', 'zstd', 1)\n",
144 | "('nyctaxi', 'zstd', 1, 878914098, 7.667033433914185)\n",
145 | "('nyctaxi', 'zstd', 10)\n",
146 | "('nyctaxi', 'zstd', 10, 828266042, 32.220927715301514)\n",
147 | "('nyctaxi', 'lz4', None)\n",
148 | "('nyctaxi', 'lz4', None, 1262344938, 7.114352226257324)\n",
149 | "('fanniemae', None, None, 2.2620407581329345)\n",
150 | "('fanniemae', 'zstd', 1, 3.4737910270690917)\n",
151 | "('fanniemae', 'zstd', 10, 3.4430580615997313)\n",
152 | "('fanniemae', 'lz4', None, 3.521429014205933)\n",
153 | "('nyctaxi', None, None, 1.0237845420837401)\n",
154 | "('nyctaxi', 'zstd', 1, 1.8016125202178954)\n",
155 | "('nyctaxi', 'zstd', 10, 1.7049409389495849)\n",
156 | "('nyctaxi', 'lz4', None, 1.3043041229248047)\n",
157 | "2048\n",
158 | "reading 2016Q4.txt\n",
159 | "('fanniemae', None, None)\n",
160 | "('fanniemae', None, None, 5063114554, 11.932640790939331)\n",
161 | "('fanniemae', 'zstd', 1)\n",
162 | "('fanniemae', 'zstd', 1, 468753626, 9.218210458755493)\n",
163 | "('fanniemae', 'zstd', 10)\n",
164 | "('fanniemae', 'zstd', 10, 401064538, 40.00880241394043)\n",
165 | "('fanniemae', 'lz4', None)\n",
166 | "('fanniemae', 'lz4', None, 701361578, 9.259565353393555)\n",
167 | "reading yellow_tripdata_2010-01.csv\n",
168 | "('nyctaxi', None, None)\n",
169 | "('nyctaxi', None, None, 2513790386, 5.759558200836182)\n",
170 | "('nyctaxi', 'zstd', 1)\n",
171 | "('nyctaxi', 'zstd', 1, 851430546, 6.4997947216033936)\n",
172 | "('nyctaxi', 'zstd', 10)\n",
173 | "('nyctaxi', 'zstd', 10, 790773018, 33.690829277038574)\n",
174 | "('nyctaxi', 'lz4', None)\n",
175 | "('nyctaxi', 'lz4', None, 1223064234, 5.975880861282349)\n",
176 | "('fanniemae', None, None, 1.610342788696289)\n",
177 | "('fanniemae', 'zstd', 1, 1.983039951324463)\n",
178 | "('fanniemae', 'zstd', 10, 1.9032105445861816)\n",
179 | "('fanniemae', 'lz4', None, 1.7990120887756347)\n",
180 | "('nyctaxi', None, None, 0.8817797660827636)\n",
181 | "('nyctaxi', 'zstd', 1, 1.2915375709533692)\n",
182 | "('nyctaxi', 'zstd', 10, 1.13835711479187)\n",
183 | "('nyctaxi', 'lz4', None, 0.8505313873291016)\n",
184 | "4096\n",
185 | "reading 2016Q4.txt\n",
186 | "('fanniemae', None, None)\n",
187 | "('fanniemae', None, None, 5052804778, 10.22159743309021)\n",
188 | "('fanniemae', 'zstd', 1)\n",
189 | "('fanniemae', 'zstd', 1, 473501522, 8.019737958908081)\n",
190 | "('fanniemae', 'zstd', 10)\n",
191 | "('fanniemae', 'zstd', 10, 384761498, 13.248246908187866)\n",
192 | "('fanniemae', 'lz4', None)\n",
193 | "('fanniemae', 'lz4', None, 666704194, 7.61299991607666)\n",
194 | "reading yellow_tripdata_2010-01.csv\n",
195 | "('nyctaxi', None, None)\n",
196 | "('nyctaxi', None, None, 2509671706, 6.375310659408569)\n",
197 | "('nyctaxi', 'zstd', 1)\n",
198 | "('nyctaxi', 'zstd', 1, 841720058, 5.634358882904053)\n",
199 | "('nyctaxi', 'zstd', 10)\n",
200 | "('nyctaxi', 'zstd', 10, 765991802, 23.161847591400146)\n",
201 | "('nyctaxi', 'lz4', None)\n",
202 | "('nyctaxi', 'lz4', None, 1165201354, 6.004603624343872)\n",
203 | "('fanniemae', None, None, 1.341871976852417)\n",
204 | "('fanniemae', 'zstd', 1, 1.2426270961761474)\n",
205 | "('fanniemae', 'zstd', 10, 1.113413667678833)\n",
206 | "('fanniemae', 'lz4', None, 1.0141475200653076)\n",
207 | "('nyctaxi', None, None, 0.7986891269683838)\n",
208 | "('nyctaxi', 'zstd', 1, 0.974519681930542)\n",
209 | "('nyctaxi', 'zstd', 10, 0.8223378658294678)\n",
210 | "('nyctaxi', 'lz4', None, 0.5664512634277343)\n",
211 | "8192\n",
212 | "reading 2016Q4.txt\n",
213 | "('fanniemae', None, None)\n",
214 | "('fanniemae', None, None, 5048174170, 10.155773162841797)\n",
215 | "('fanniemae', 'zstd', 1)\n",
216 | "('fanniemae', 'zstd', 1, 476147690, 7.544380187988281)\n",
217 | "('fanniemae', 'zstd', 10)\n",
218 | "('fanniemae', 'zstd', 10, 380904258, 13.942293882369995)\n",
219 | "('fanniemae', 'lz4', None)\n",
220 | "('fanniemae', 'lz4', None, 648217594, 7.258745193481445)\n",
221 | "reading yellow_tripdata_2010-01.csv\n",
222 | "('nyctaxi', None, None)\n",
223 | "('nyctaxi', None, None, 2507611258, 6.470987319946289)\n",
224 | "('nyctaxi', 'zstd', 1)\n",
225 | "('nyctaxi', 'zstd', 1, 837304882, 5.931333303451538)\n",
226 | "('nyctaxi', 'zstd', 10)\n",
227 | "('nyctaxi', 'zstd', 10, 739310474, 29.601118326187134)\n",
228 | "('nyctaxi', 'lz4', None)\n",
229 | "('nyctaxi', 'lz4', None, 1144720050, 4.887480974197388)\n",
230 | "('fanniemae', None, None, 1.3168220043182373)\n",
231 | "('fanniemae', 'zstd', 1, 0.8572097301483155)\n",
232 | "('fanniemae', 'zstd', 10, 0.7228690624237061)\n",
233 | "('fanniemae', 'lz4', None, 0.6564846992492676)\n",
234 | "('nyctaxi', None, None, 0.7386976718902588)\n",
235 | "('nyctaxi', 'zstd', 1, 0.9264132499694824)\n",
236 | "('nyctaxi', 'zstd', 10, 0.7089903354644775)\n",
237 | "('nyctaxi', 'lz4', None, 0.46931772232055663)\n",
238 | "16384\n",
239 | "reading 2016Q4.txt\n",
240 | "('fanniemae', None, None)\n",
241 | "('fanniemae', None, None, 5046354402, 10.359640121459961)\n",
242 | "('fanniemae', 'zstd', 1)\n",
243 | "('fanniemae', 'zstd', 1, 488072882, 6.634678363800049)\n",
244 | "('fanniemae', 'zstd', 10)\n",
245 | "('fanniemae', 'zstd', 10, 386850010, 14.295108318328857)\n",
246 | "('fanniemae', 'lz4', None)\n",
247 | "('fanniemae', 'lz4', None, 644333354, 6.482739210128784)\n",
248 | "reading yellow_tripdata_2010-01.csv\n",
249 | "('nyctaxi', None, None)\n",
250 | "('nyctaxi', None, None, 2506582282, 5.567317008972168)\n",
251 | "('nyctaxi', 'zstd', 1)\n",
252 | "('nyctaxi', 'zstd', 1, 833835922, 4.956018924713135)\n",
253 | "('nyctaxi', 'zstd', 10)\n",
254 | "('nyctaxi', 'zstd', 10, 709229218, 17.30007767677307)\n",
255 | "('nyctaxi', 'lz4', None)\n",
256 | "('nyctaxi', 'lz4', None, 1179681450, 5.5779945850372314)\n",
257 | "('fanniemae', None, None, 1.3266838550567628)\n",
258 | "('fanniemae', 'zstd', 1, 0.7207117557525635)\n",
259 | "('fanniemae', 'zstd', 10, 0.5619686603546142)\n",
260 | "('fanniemae', 'lz4', None, 0.5085867404937744)\n",
261 | "('nyctaxi', None, None, 0.7293866634368896)\n",
262 | "('nyctaxi', 'zstd', 1, 0.780490779876709)\n",
263 | "('nyctaxi', 'zstd', 10, 0.6338376045227051)\n",
264 | "('nyctaxi', 'lz4', None, 0.42446208000183105)\n",
265 | "32768\n",
266 | "reading 2016Q4.txt\n",
267 | "('fanniemae', None, None)\n",
268 | "('fanniemae', None, None, 5045772882, 11.194675922393799)\n",
269 | "('fanniemae', 'zstd', 1)\n",
270 | "('fanniemae', 'zstd', 1, 494361698, 6.307297229766846)\n",
271 | "('fanniemae', 'zstd', 10)\n",
272 | "('fanniemae', 'zstd', 10, 394216642, 16.57004427909851)\n",
273 | "('fanniemae', 'lz4', None)\n",
274 | "('fanniemae', 'lz4', None, 640424914, 6.438863277435303)\n",
275 | "reading yellow_tripdata_2010-01.csv\n",
276 | "('nyctaxi', None, None)\n",
277 | "('nyctaxi', None, None, 2506066506, 5.98804497718811)\n",
278 | "('nyctaxi', 'zstd', 1)\n",
279 | "('nyctaxi', 'zstd', 1, 817758394, 4.760921478271484)\n",
280 | "('nyctaxi', 'zstd', 10)\n",
281 | "('nyctaxi', 'zstd', 10, 675626410, 19.773839712142944)\n",
282 | "('nyctaxi', 'lz4', None)\n",
283 | "('nyctaxi', 'lz4', None, 1176543226, 5.565099239349365)\n",
284 | "('fanniemae', None, None, 1.207357358932495)\n",
285 | "('fanniemae', 'zstd', 1, 0.6379957675933838)\n",
286 | "('fanniemae', 'zstd', 10, 0.5131874561309815)\n",
287 | "('fanniemae', 'lz4', None, 0.45996761322021484)\n",
288 | "('nyctaxi', None, None, 0.6317520141601562)\n",
289 | "('nyctaxi', 'zstd', 1, 0.7357310771942138)\n",
290 | "('nyctaxi', 'zstd', 10, 0.5581299781799316)\n",
291 | "('nyctaxi', 'lz4', None, 0.37372236251831054)\n",
292 | "65536\n",
293 | "reading 2016Q4.txt\n",
294 | "('fanniemae', None, None)\n",
295 | "('fanniemae', None, None, 5045771154, 11.179830074310303)\n",
296 | "('fanniemae', 'zstd', 1)\n",
297 | "('fanniemae', 'zstd', 1, 524046410, 6.3280885219573975)\n",
298 | "('fanniemae', 'zstd', 10)\n",
299 | "('fanniemae', 'zstd', 10, 395368482, 14.682528018951416)\n",
300 | "('fanniemae', 'lz4', None)\n",
301 | "('fanniemae', 'lz4', None, 638440418, 5.975476264953613)\n",
302 | "reading yellow_tripdata_2010-01.csv\n",
303 | "('nyctaxi', None, None)\n",
304 | "('nyctaxi', None, None, 2505808570, 5.9450695514678955)\n",
305 | "('nyctaxi', 'zstd', 1)\n",
306 | "('nyctaxi', 'zstd', 1, 821964938, 5.244204044342041)\n",
307 | "('nyctaxi', 'zstd', 10)\n",
308 | "('nyctaxi', 'zstd', 10, 651798442, 19.96653389930725)\n",
309 | "('nyctaxi', 'lz4', None)\n",
310 | "('nyctaxi', 'lz4', None, 1174964650, 5.419882297515869)\n",
311 | "('fanniemae', None, None, 1.0205121994018556)\n",
312 | "('fanniemae', 'zstd', 1, 0.5739494800567627)\n",
313 | "('fanniemae', 'zstd', 10, 0.4582984924316406)\n",
314 | "('fanniemae', 'lz4', None, 0.41712336540222167)\n",
315 | "('nyctaxi', None, None, 0.5486010074615478)\n",
316 | "('nyctaxi', 'zstd', 1, 0.6663787841796875)\n",
317 | "('nyctaxi', 'zstd', 10, 0.5117742538452148)\n",
318 | "('nyctaxi', 'lz4', None, 0.34208340644836427)\n"
319 | ]
320 | }
321 | ],
322 | "source": [
323 | "chunksizes = [1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15,\n",
324 | " 1 << 16]\n",
325 | "\n",
326 | "results_by_chunksize = {}\n",
327 | "for chunksize in chunksizes:\n",
328 | " print(chunksize)\n",
329 | " write_results = write_files(files, chunksize=chunksize)\n",
330 | " read_results = get_read_results() \n",
331 | " results_by_chunksize[chunksize] = write_results, read_results"
332 | ]
333 | },
334 | {
335 | "cell_type": "code",
336 | "execution_count": null,
337 | "metadata": {},
338 | "outputs": [],
339 | "source": []
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 6,
344 | "metadata": {},
345 | "outputs": [],
346 | "source": [
347 | "reads = []\n",
348 | "writes = []\n",
349 | "\n",
350 | "for chunksize, (write_results, read_results) in results_by_chunksize.items():\n",
351 | " write_results = pd.DataFrame.from_records(\n",
352 | " write_results, columns=['dataset', 'codec', 'codec_level', \n",
353 | " 'file_size', 'write_time'])\n",
354 | " read_results = pd.DataFrame.from_records(\n",
355 | " read_results, columns=['dataset', 'codec', 'codec_level', \n",
356 | " 'read_time'])\n",
357 | " write_results['chunksize'] = chunksize\n",
358 | " read_results['chunksize'] = chunksize\n",
359 | " \n",
360 | " reads.append(read_results)\n",
361 | " writes.append(write_results)\n",
362 | " \n",
363 | "reads = pd.concat(reads, ignore_index=True)\n",
364 | "writes = pd.concat(writes, ignore_index=True)\n",
365 | "\n",
366 | "def munge_codecs(codec_s, codec_level_s):\n",
367 | " results = []\n",
368 | " codec_s = codec_s.fillna('uncompressed')\n",
369 | " for codec, codec_level in zip(codec_s, codec_level_s):\n",
370 | " if pd.isnull(codec_level):\n",
371 | " results.append(codec)\n",
372 | " else:\n",
373 | " results.append(codec + '-' + str(int(codec_level)))\n",
374 | " return results\n",
375 | "\n",
376 | "reads['codec'] = munge_codecs(reads['codec'], reads.pop('codec_level'))\n",
377 | "writes['codec'] = munge_codecs(writes['codec'], writes.pop('codec_level'))"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 13,
383 | "metadata": {},
384 | "outputs": [],
385 | "source": [
386 | "%matplotlib notebook"
387 | ]
388 | },
389 | {
390 | "cell_type": "code",
391 | "execution_count": 7,
392 | "metadata": {},
393 | "outputs": [],
394 | "source": [
395 | "reads.to_csv('ipc_read_parallel.csv')\n",
396 | "writes.to_csv('ipc_write_parallel.csv')"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": 8,
402 | "metadata": {},
403 | "outputs": [
404 | {
405 | "data": {
406 | "text/html": [
407 | "
\n",
408 | "\n",
421 | "
\n",
422 | " \n",
423 | " \n",
424 | " | \n",
425 | " dataset | \n",
426 | " codec | \n",
427 | " read_time | \n",
428 | " chunksize | \n",
429 | "
\n",
430 | " \n",
431 | " \n",
432 | " \n",
433 | " 0 | \n",
434 | " fanniemae | \n",
435 | " uncompressed | \n",
436 | " 1.285094 | \n",
437 | " 1024 | \n",
438 | "
\n",
439 | " \n",
440 | " 1 | \n",
441 | " fanniemae | \n",
442 | " zstd-1 | \n",
443 | " 3.586269 | \n",
444 | " 1024 | \n",
445 | "
\n",
446 | " \n",
447 | " 2 | \n",
448 | " fanniemae | \n",
449 | " zstd-10 | \n",
450 | " 3.704591 | \n",
451 | " 1024 | \n",
452 | "
\n",
453 | " \n",
454 | " 3 | \n",
455 | " fanniemae | \n",
456 | " lz4 | \n",
457 | " 3.590986 | \n",
458 | " 1024 | \n",
459 | "
\n",
460 | " \n",
461 | " 4 | \n",
462 | " nyctaxi | \n",
463 | " uncompressed | \n",
464 | " 0.608589 | \n",
465 | " 1024 | \n",
466 | "
\n",
467 | " \n",
468 | " 5 | \n",
469 | " nyctaxi | \n",
470 | " zstd-1 | \n",
471 | " 1.891127 | \n",
472 | " 1024 | \n",
473 | "
\n",
474 | " \n",
475 | " 6 | \n",
476 | " nyctaxi | \n",
477 | " zstd-10 | \n",
478 | " 1.665766 | \n",
479 | " 1024 | \n",
480 | "
\n",
481 | " \n",
482 | " 7 | \n",
483 | " nyctaxi | \n",
484 | " lz4 | \n",
485 | " 1.227717 | \n",
486 | " 1024 | \n",
487 | "
\n",
488 | " \n",
489 | " 8 | \n",
490 | " fanniemae | \n",
491 | " uncompressed | \n",
492 | " 0.472191 | \n",
493 | " 2048 | \n",
494 | "
\n",
495 | " \n",
496 | " 9 | \n",
497 | " fanniemae | \n",
498 | " zstd-1 | \n",
499 | " 2.057420 | \n",
500 | " 2048 | \n",
501 | "
\n",
502 | " \n",
503 | " 10 | \n",
504 | " fanniemae | \n",
505 | " zstd-10 | \n",
506 | " 1.798960 | \n",
507 | " 2048 | \n",
508 | "
\n",
509 | " \n",
510 | " 11 | \n",
511 | " fanniemae | \n",
512 | " lz4 | \n",
513 | " 1.634048 | \n",
514 | " 2048 | \n",
515 | "
\n",
516 | " \n",
517 | " 12 | \n",
518 | " nyctaxi | \n",
519 | " uncompressed | \n",
520 | " 0.171463 | \n",
521 | " 2048 | \n",
522 | "
\n",
523 | " \n",
524 | " 13 | \n",
525 | " nyctaxi | \n",
526 | " zstd-1 | \n",
527 | " 1.287250 | \n",
528 | " 2048 | \n",
529 | "
\n",
530 | " \n",
531 | " 14 | \n",
532 | " nyctaxi | \n",
533 | " zstd-10 | \n",
534 | " 1.045773 | \n",
535 | " 2048 | \n",
536 | "
\n",
537 | " \n",
538 | " 15 | \n",
539 | " nyctaxi | \n",
540 | " lz4 | \n",
541 | " 0.730872 | \n",
542 | " 2048 | \n",
543 | "
\n",
544 | " \n",
545 | " 16 | \n",
546 | " fanniemae | \n",
547 | " uncompressed | \n",
548 | " 0.193799 | \n",
549 | " 4096 | \n",
550 | "
\n",
551 | " \n",
552 | " 17 | \n",
553 | " fanniemae | \n",
554 | " zstd-1 | \n",
555 | " 1.314903 | \n",
556 | " 4096 | \n",
557 | "
\n",
558 | " \n",
559 | " 18 | \n",
560 | " fanniemae | \n",
561 | " zstd-10 | \n",
562 | " 1.107955 | \n",
563 | " 4096 | \n",
564 | "
\n",
565 | " \n",
566 | " 19 | \n",
567 | " fanniemae | \n",
568 | " lz4 | \n",
569 | " 0.983337 | \n",
570 | " 4096 | \n",
571 | "
\n",
572 | " \n",
573 | " 20 | \n",
574 | " nyctaxi | \n",
575 | " uncompressed | \n",
576 | " 0.089300 | \n",
577 | " 4096 | \n",
578 | "
\n",
579 | " \n",
580 | " 21 | \n",
581 | " nyctaxi | \n",
582 | " zstd-1 | \n",
583 | " 1.009519 | \n",
584 | " 4096 | \n",
585 | "
\n",
586 | " \n",
587 | " 22 | \n",
588 | " nyctaxi | \n",
589 | " zstd-10 | \n",
590 | " 0.806882 | \n",
591 | " 4096 | \n",
592 | "
\n",
593 | " \n",
594 | " 23 | \n",
595 | " nyctaxi | \n",
596 | " lz4 | \n",
597 | " 0.495844 | \n",
598 | " 4096 | \n",
599 | "
\n",
600 | " \n",
601 | " 24 | \n",
602 | " fanniemae | \n",
603 | " uncompressed | \n",
604 | " 0.101595 | \n",
605 | " 8192 | \n",
606 | "
\n",
607 | " \n",
608 | " 25 | \n",
609 | " fanniemae | \n",
610 | " zstd-1 | \n",
611 | " 0.897038 | \n",
612 | " 8192 | \n",
613 | "
\n",
614 | " \n",
615 | " 26 | \n",
616 | " fanniemae | \n",
617 | " zstd-10 | \n",
618 | " 0.713981 | \n",
619 | " 8192 | \n",
620 | "
\n",
621 | " \n",
622 | " 27 | \n",
623 | " fanniemae | \n",
624 | " lz4 | \n",
625 | " 0.635939 | \n",
626 | " 8192 | \n",
627 | "
\n",
628 | " \n",
629 | " 28 | \n",
630 | " nyctaxi | \n",
631 | " uncompressed | \n",
632 | " 0.037341 | \n",
633 | " 8192 | \n",
634 | "
\n",
635 | " \n",
636 | " 29 | \n",
637 | " nyctaxi | \n",
638 | " zstd-1 | \n",
639 | " 0.734890 | \n",
640 | " 8192 | \n",
641 | "
\n",
642 | " \n",
643 | " 30 | \n",
644 | " nyctaxi | \n",
645 | " zstd-10 | \n",
646 | " 0.606730 | \n",
647 | " 8192 | \n",
648 | "
\n",
649 | " \n",
650 | " 31 | \n",
651 | " nyctaxi | \n",
652 | " lz4 | \n",
653 | " 0.350030 | \n",
654 | " 8192 | \n",
655 | "
\n",
656 | " \n",
657 | " 32 | \n",
658 | " fanniemae | \n",
659 | " uncompressed | \n",
660 | " 0.048579 | \n",
661 | " 16384 | \n",
662 | "
\n",
663 | " \n",
664 | " 33 | \n",
665 | " fanniemae | \n",
666 | " zstd-1 | \n",
667 | " 0.760098 | \n",
668 | " 16384 | \n",
669 | "
\n",
670 | " \n",
671 | " 34 | \n",
672 | " fanniemae | \n",
673 | " zstd-10 | \n",
674 | " 0.559905 | \n",
675 | " 16384 | \n",
676 | "
\n",
677 | " \n",
678 | " 35 | \n",
679 | " fanniemae | \n",
680 | " lz4 | \n",
681 | " 0.481683 | \n",
682 | " 16384 | \n",
683 | "
\n",
684 | " \n",
685 | " 36 | \n",
686 | " nyctaxi | \n",
687 | " uncompressed | \n",
688 | " 0.016135 | \n",
689 | " 16384 | \n",
690 | "
\n",
691 | " \n",
692 | " 37 | \n",
693 | " nyctaxi | \n",
694 | " zstd-1 | \n",
695 | " 0.697351 | \n",
696 | " 16384 | \n",
697 | "
\n",
698 | " \n",
699 | " 38 | \n",
700 | " nyctaxi | \n",
701 | " zstd-10 | \n",
702 | " 0.556723 | \n",
703 | " 16384 | \n",
704 | "
\n",
705 | " \n",
706 | " 39 | \n",
707 | " nyctaxi | \n",
708 | " lz4 | \n",
709 | " 0.297602 | \n",
710 | " 16384 | \n",
711 | "
\n",
712 | " \n",
713 | " 40 | \n",
714 | " fanniemae | \n",
715 | " uncompressed | \n",
716 | " 0.021817 | \n",
717 | " 32768 | \n",
718 | "
\n",
719 | " \n",
720 | " 41 | \n",
721 | " fanniemae | \n",
722 | " zstd-1 | \n",
723 | " 0.638263 | \n",
724 | " 32768 | \n",
725 | "
\n",
726 | " \n",
727 | " 42 | \n",
728 | " fanniemae | \n",
729 | " zstd-10 | \n",
730 | " 0.476892 | \n",
731 | " 32768 | \n",
732 | "
\n",
733 | " \n",
734 | " 43 | \n",
735 | " fanniemae | \n",
736 | " lz4 | \n",
737 | " 0.418044 | \n",
738 | " 32768 | \n",
739 | "
\n",
740 | " \n",
741 | " 44 | \n",
742 | " nyctaxi | \n",
743 | " uncompressed | \n",
744 | " 0.008242 | \n",
745 | " 32768 | \n",
746 | "
\n",
747 | " \n",
748 | " 45 | \n",
749 | " nyctaxi | \n",
750 | " zstd-1 | \n",
751 | " 0.712379 | \n",
752 | " 32768 | \n",
753 | "
\n",
754 | " \n",
755 | " 46 | \n",
756 | " nyctaxi | \n",
757 | " zstd-10 | \n",
758 | " 0.530141 | \n",
759 | " 32768 | \n",
760 | "
\n",
761 | " \n",
762 | " 47 | \n",
763 | " nyctaxi | \n",
764 | " lz4 | \n",
765 | " 0.288290 | \n",
766 | " 32768 | \n",
767 | "
\n",
768 | " \n",
769 | " 48 | \n",
770 | " fanniemae | \n",
771 | " uncompressed | \n",
772 | " 0.010546 | \n",
773 | " 65536 | \n",
774 | "
\n",
775 | " \n",
776 | " 49 | \n",
777 | " fanniemae | \n",
778 | " zstd-1 | \n",
779 | " 0.595585 | \n",
780 | " 65536 | \n",
781 | "
\n",
782 | " \n",
783 | " 50 | \n",
784 | " fanniemae | \n",
785 | " zstd-10 | \n",
786 | " 0.440694 | \n",
787 | " 65536 | \n",
788 | "
\n",
789 | " \n",
790 | " 51 | \n",
791 | " fanniemae | \n",
792 | " lz4 | \n",
793 | " 0.395174 | \n",
794 | " 65536 | \n",
795 | "
\n",
796 | " \n",
797 | " 52 | \n",
798 | " nyctaxi | \n",
799 | " uncompressed | \n",
800 | " 0.006750 | \n",
801 | " 65536 | \n",
802 | "
\n",
803 | " \n",
804 | " 53 | \n",
805 | " nyctaxi | \n",
806 | " zstd-1 | \n",
807 | " 0.593057 | \n",
808 | " 65536 | \n",
809 | "
\n",
810 | " \n",
811 | " 54 | \n",
812 | " nyctaxi | \n",
813 | " zstd-10 | \n",
814 | " 0.467952 | \n",
815 | " 65536 | \n",
816 | "
\n",
817 | " \n",
818 | " 55 | \n",
819 | " nyctaxi | \n",
820 | " lz4 | \n",
821 | " 0.277783 | \n",
822 | " 65536 | \n",
823 | "
\n",
824 | " \n",
825 | "
\n",
826 | "
"
827 | ],
828 | "text/plain": [
829 | " dataset codec read_time chunksize\n",
830 | "0 fanniemae uncompressed 1.285094 1024\n",
831 | "1 fanniemae zstd-1 3.586269 1024\n",
832 | "2 fanniemae zstd-10 3.704591 1024\n",
833 | "3 fanniemae lz4 3.590986 1024\n",
834 | "4 nyctaxi uncompressed 0.608589 1024\n",
835 | "5 nyctaxi zstd-1 1.891127 1024\n",
836 | "6 nyctaxi zstd-10 1.665766 1024\n",
837 | "7 nyctaxi lz4 1.227717 1024\n",
838 | "8 fanniemae uncompressed 0.472191 2048\n",
839 | "9 fanniemae zstd-1 2.057420 2048\n",
840 | "10 fanniemae zstd-10 1.798960 2048\n",
841 | "11 fanniemae lz4 1.634048 2048\n",
842 | "12 nyctaxi uncompressed 0.171463 2048\n",
843 | "13 nyctaxi zstd-1 1.287250 2048\n",
844 | "14 nyctaxi zstd-10 1.045773 2048\n",
845 | "15 nyctaxi lz4 0.730872 2048\n",
846 | "16 fanniemae uncompressed 0.193799 4096\n",
847 | "17 fanniemae zstd-1 1.314903 4096\n",
848 | "18 fanniemae zstd-10 1.107955 4096\n",
849 | "19 fanniemae lz4 0.983337 4096\n",
850 | "20 nyctaxi uncompressed 0.089300 4096\n",
851 | "21 nyctaxi zstd-1 1.009519 4096\n",
852 | "22 nyctaxi zstd-10 0.806882 4096\n",
853 | "23 nyctaxi lz4 0.495844 4096\n",
854 | "24 fanniemae uncompressed 0.101595 8192\n",
855 | "25 fanniemae zstd-1 0.897038 8192\n",
856 | "26 fanniemae zstd-10 0.713981 8192\n",
857 | "27 fanniemae lz4 0.635939 8192\n",
858 | "28 nyctaxi uncompressed 0.037341 8192\n",
859 | "29 nyctaxi zstd-1 0.734890 8192\n",
860 | "30 nyctaxi zstd-10 0.606730 8192\n",
861 | "31 nyctaxi lz4 0.350030 8192\n",
862 | "32 fanniemae uncompressed 0.048579 16384\n",
863 | "33 fanniemae zstd-1 0.760098 16384\n",
864 | "34 fanniemae zstd-10 0.559905 16384\n",
865 | "35 fanniemae lz4 0.481683 16384\n",
866 | "36 nyctaxi uncompressed 0.016135 16384\n",
867 | "37 nyctaxi zstd-1 0.697351 16384\n",
868 | "38 nyctaxi zstd-10 0.556723 16384\n",
869 | "39 nyctaxi lz4 0.297602 16384\n",
870 | "40 fanniemae uncompressed 0.021817 32768\n",
871 | "41 fanniemae zstd-1 0.638263 32768\n",
872 | "42 fanniemae zstd-10 0.476892 32768\n",
873 | "43 fanniemae lz4 0.418044 32768\n",
874 | "44 nyctaxi uncompressed 0.008242 32768\n",
875 | "45 nyctaxi zstd-1 0.712379 32768\n",
876 | "46 nyctaxi zstd-10 0.530141 32768\n",
877 | "47 nyctaxi lz4 0.288290 32768\n",
878 | "48 fanniemae uncompressed 0.010546 65536\n",
879 | "49 fanniemae zstd-1 0.595585 65536\n",
880 | "50 fanniemae zstd-10 0.440694 65536\n",
881 | "51 fanniemae lz4 0.395174 65536\n",
882 | "52 nyctaxi uncompressed 0.006750 65536\n",
883 | "53 nyctaxi zstd-1 0.593057 65536\n",
884 | "54 nyctaxi zstd-10 0.467952 65536\n",
885 | "55 nyctaxi lz4 0.277783 65536"
886 | ]
887 | },
888 | "execution_count": 8,
889 | "metadata": {},
890 | "output_type": "execute_result"
891 | }
892 | ],
893 | "source": [
894 | "reads"
895 | ]
896 | },
897 | {
898 | "cell_type": "code",
899 | "execution_count": null,
900 | "metadata": {},
901 | "outputs": [],
902 | "source": []
903 | }
904 | ],
905 | "metadata": {
906 | "kernelspec": {
907 | "display_name": "Python 3",
908 | "language": "python",
909 | "name": "python3"
910 | },
911 | "language_info": {
912 | "codemirror_mode": {
913 | "name": "ipython",
914 | "version": 3
915 | },
916 | "file_extension": ".py",
917 | "mimetype": "text/x-python",
918 | "name": "python",
919 | "nbconvert_exporter": "python",
920 | "pygments_lexer": "ipython3",
921 | "version": "3.7.6"
922 | }
923 | },
924 | "nbformat": 4,
925 | "nbformat_minor": 4
926 | }
927 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/all_read_results.csv:
--------------------------------------------------------------------------------
1 | expr,time,dataset,output_type,nthreads,language
2 | csv_fread,17.535751806,fanniemae,R data.frame,1,R
3 | fst (UNC),5.83356695,fanniemae,R data.frame,1,R
4 | fst (c=50),5.875382178,fanniemae,R data.frame,1,R
5 | feather V1,10.078519502799999,fanniemae,R data.frame,1,R
6 | feather V2 (UNC),4.7198155451999995,fanniemae,R data.frame,1,R
7 | feather V2 (LZ4),5.852145495199999,fanniemae,R data.frame,1,R
8 | feather V2 (ZSTD),7.77908361,fanniemae,R data.frame,1,R
9 | parquet (UNC),9.4933916048,fanniemae,R data.frame,1,R
10 | parquet (SNAPPY),9.911315661200002,fanniemae,R data.frame,1,R
11 | RDS (UNC),30.2670197082,fanniemae,R data.frame,1,R
12 | RDS (C),41.482849064199996,fanniemae,R data.frame,1,R
13 | csv_fread,23.370041255,nyctaxi,R data.frame,1,R
14 | fst (UNC),13.017416436,nyctaxi,R data.frame,1,R
15 | fst (c=50),12.6347099714,nyctaxi,R data.frame,1,R
16 | feather V1,13.443664009399999,nyctaxi,R data.frame,1,R
17 | feather V2 (UNC),11.3714301042,nyctaxi,R data.frame,1,R
18 | feather V2 (LZ4),13.29604463,nyctaxi,R data.frame,1,R
19 | feather V2 (ZSTD),14.5943007722,nyctaxi,R data.frame,1,R
20 | parquet (UNC),13.1586667582,nyctaxi,R data.frame,1,R
21 | parquet (SNAPPY),13.958228992,nyctaxi,R data.frame,1,R
22 | RDS (UNC),22.211784820200002,nyctaxi,R data.frame,1,R
23 | RDS (C),30.765105346200002,nyctaxi,R data.frame,1,R
24 | parquet (UNC),6.126083183288574,fanniemae,arrow Table,1,Python
25 | parquet (UNC),9.3643874168396,fanniemae,pandas,1,Python
26 | parquet (SNAPPY),6.056532478332518,fanniemae,arrow Table,1,Python
27 | parquet (SNAPPY),9.177780771255494,fanniemae,pandas,1,Python
28 | feather V2 (UNC),4.354116058349609,fanniemae,pandas,1,Python
29 | feather V2 (LZ4),4.396533584594726,fanniemae,pandas,1,Python
30 | feather V2 (ZSTD),5.775776481628418,fanniemae,pandas,1,Python
31 | feather V2 (UNC),1.0860649585723876,fanniemae,arrow Table,1,Python
32 | feather V2 (LZ4),1.0962132453918456,fanniemae,arrow Table,1,Python
33 | feather V2 (ZSTD),2.531323909759521,fanniemae,arrow Table,1,Python
34 | parquet (UNC),2.2780594348907472,nyctaxi,arrow Table,1,Python
35 | parquet (UNC),9.222453880310061,nyctaxi,pandas,1,Python
36 | parquet (SNAPPY),2.8247000694274904,nyctaxi,arrow Table,1,Python
37 | parquet (SNAPPY),9.735122680664062,nyctaxi,pandas,1,Python
38 | feather V2 (UNC),7.608278465270996,nyctaxi,pandas,1,Python
39 | feather V2 (LZ4),7.784061861038206,nyctaxi,pandas,1,Python
40 | feather V2 (ZSTD),9.633673095703122,nyctaxi,pandas,1,Python
41 | feather V2 (UNC),0.5403317451477051,nyctaxi,arrow Table,1,Python
42 | feather V2 (LZ4),0.9643253803253172,nyctaxi,arrow Table,1,Python
43 | feather V2 (ZSTD),2.7800182342529296,nyctaxi,arrow Table,1,Python
44 | csv_fread,8.036938666600001,fanniemae,R data.frame,4,R
45 | fst (UNC),6.3416014972,fanniemae,R data.frame,4,R
46 | fst (c=50),5.0547549678,fanniemae,R data.frame,4,R
47 | feather V1,9.799018014,fanniemae,R data.frame,4,R
48 | feather V2 (UNC),5.0542017474,fanniemae,R data.frame,4,R
49 | feather V2 (LZ4),4.928118181,fanniemae,R data.frame,4,R
50 | feather V2 (ZSTD),5.5355538286,fanniemae,R data.frame,4,R
51 | parquet (UNC),6.281569166600001,fanniemae,R data.frame,4,R
52 | parquet (SNAPPY),6.3922376926,fanniemae,R data.frame,4,R
53 | RDS (UNC),29.8928874914,fanniemae,R data.frame,4,R
54 | RDS (C),41.273872293800004,fanniemae,R data.frame,4,R
55 | csv_fread,18.312046954,nyctaxi,R data.frame,4,R
56 | fst (UNC),11.9693504656,nyctaxi,R data.frame,4,R
57 | fst (c=50),13.4391470686,nyctaxi,R data.frame,4,R
58 | feather V1,12.034649945,nyctaxi,R data.frame,4,R
59 | feather V2 (UNC),11.0239614322,nyctaxi,R data.frame,4,R
60 | feather V2 (LZ4),11.592801001,nyctaxi,R data.frame,4,R
61 | feather V2 (ZSTD),12.704684877,nyctaxi,R data.frame,4,R
62 | parquet (UNC),12.225668849,nyctaxi,R data.frame,4,R
63 | parquet (SNAPPY),12.0044663816,nyctaxi,R data.frame,4,R
64 | RDS (UNC),21.847153904,nyctaxi,R data.frame,4,R
65 | RDS (C),30.735937022799998,nyctaxi,R data.frame,4,R
66 | parquet (UNC),1.841284704208374,fanniemae,arrow Table,4,Python
67 | parquet (UNC),4.0880148887634284,fanniemae,pandas,4,Python
68 | parquet (SNAPPY),1.8786502361297608,fanniemae,arrow Table,4,Python
69 | parquet (SNAPPY),4.165652704238892,fanniemae,pandas,4,Python
70 | feather V2 (UNC),3.5610058307647705,fanniemae,pandas,4,Python
71 | feather V2 (LZ4),2.778682994842529,fanniemae,pandas,4,Python
72 | feather V2 (ZSTD),3.0616337299346923,fanniemae,pandas,4,Python
73 | feather V2 (UNC),1.1269856452941895,fanniemae,arrow Table,4,Python
74 | feather V2 (LZ4),0.4898182392120362,fanniemae,arrow Table,4,Python
75 | feather V2 (ZSTD),0.8093690395355224,fanniemae,arrow Table,4,Python
76 | parquet (UNC),0.6995339870452881,nyctaxi,arrow Table,4,Python
77 | parquet (UNC),7.4361457347869875,nyctaxi,pandas,4,Python
78 | parquet (SNAPPY),0.78084397315979,nyctaxi,arrow Table,4,Python
79 | parquet (SNAPPY),7.540273284912108,nyctaxi,pandas,4,Python
80 | feather V2 (UNC),7.369460582733153,nyctaxi,pandas,4,Python
81 | feather V2 (LZ4),7.119231033325195,nyctaxi,pandas,4,Python
82 | feather V2 (ZSTD),7.537483549118043,nyctaxi,pandas,4,Python
83 | feather V2 (UNC),0.6116453170776367,nyctaxi,arrow Table,4,Python
84 | feather V2 (LZ4),0.4065845012664795,nyctaxi,arrow Table,4,Python
85 | feather V2 (ZSTD),0.8925417900085449,nyctaxi,arrow Table,4,Python
86 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/all_results.csv:
--------------------------------------------------------------------------------
1 | expr,time,dataset,output_type,nthreads,language
2 | R csv_fread,17.8678609836,fanniemae,R data.frame,1,R
3 | R fst,5.7596893994,fanniemae,R data.frame,1,R
4 | feather (UNC),4.4072281468000005,fanniemae,R data.frame,1,R
5 | feather (LZ4),6.0330426373999995,fanniemae,R data.frame,1,R
6 | feather (ZSTD),7.526674342,fanniemae,R data.frame,1,R
7 | parquet (SNAPPY),10.0367648462,fanniemae,R data.frame,1,R
8 | R csv_fread,24.648797387400002,nyctaxi,R data.frame,1,R
9 | R fst,13.142260905799999,nyctaxi,R data.frame,1,R
10 | feather (UNC),10.96529547,nyctaxi,R data.frame,1,R
11 | feather (LZ4),11.801702598,nyctaxi,R data.frame,1,R
12 | feather (ZSTD),14.2444990752,nyctaxi,R data.frame,1,R
13 | parquet (SNAPPY),13.157420057,nyctaxi,R data.frame,1,R
14 | parquet (SNAPPY),5.7931968688964846,fanniemae,arrow Table,1,Python
15 | parquet (SNAPPY),9.107409811019897,fanniemae,pandas,1,Python
16 | feather (UNC),4.035408067703247,fanniemae,pandas,1,Python
17 | feather (LZ4),4.295090818405152,fanniemae,pandas,1,Python
18 | feather (ZSTD),5.678592157363893,fanniemae,pandas,1,Python
19 | feather (UNC),1.2464978694915771,fanniemae,arrow Table,1,Python
20 | feather (LZ4),1.062558937072754,fanniemae,arrow Table,1,Python
21 | feather (ZSTD),2.471682643890381,fanniemae,arrow Table,1,Python
22 | parquet (SNAPPY),2.7657272338867194,nyctaxi,arrow Table,1,Python
23 | parquet (SNAPPY),9.840531587600706,nyctaxi,pandas,1,Python
24 | feather (UNC),7.5906150341033936,nyctaxi,pandas,1,Python
25 | feather (LZ4),7.9236814975738525,nyctaxi,pandas,1,Python
26 | feather (ZSTD),9.791791486740113,nyctaxi,pandas,1,Python
27 | feather (UNC),0.6637681007385254,nyctaxi,arrow Table,1,Python
28 | feather (LZ4),1.0227035522460937,nyctaxi,arrow Table,1,Python
29 | feather (ZSTD),2.77500696182251,nyctaxi,arrow Table,1,Python
30 | R csv_fread,8.381513095,fanniemae,R data.frame,4,R
31 | R fst,4.8154870964,fanniemae,R data.frame,4,R
32 | feather (UNC),4.8105564258,fanniemae,R data.frame,4,R
33 | feather (LZ4),5.4882766928,fanniemae,R data.frame,4,R
34 | feather (ZSTD),5.986291964,fanniemae,R data.frame,4,R
35 | parquet (SNAPPY),6.7089619354,fanniemae,R data.frame,4,R
36 | R csv_fread,19.3027468002,nyctaxi,R data.frame,4,R
37 | R fst,13.0800444294,nyctaxi,R data.frame,4,R
38 | feather (UNC),11.8721187678,nyctaxi,R data.frame,4,R
39 | feather (LZ4),12.5549529788,nyctaxi,R data.frame,4,R
40 | feather (ZSTD),12.829650966600001,nyctaxi,R data.frame,4,R
41 | parquet (SNAPPY),12.7536964852,nyctaxi,R data.frame,4,R
42 | parquet (SNAPPY),1.8717081069946289,fanniemae,arrow Table,4,Python
43 | parquet (SNAPPY),4.098778772354127,fanniemae,pandas,4,Python
44 | feather (UNC),3.539084482192993,fanniemae,pandas,4,Python
45 | feather (LZ4),2.8530110359191894,fanniemae,pandas,4,Python
46 | feather (ZSTD),3.06166353225708,fanniemae,pandas,4,Python
47 | feather (UNC),1.3176395416259763,fanniemae,arrow Table,4,Python
48 | feather (LZ4),0.4744390964508057,fanniemae,arrow Table,4,Python
49 | feather (ZSTD),0.7838622570037842,fanniemae,arrow Table,4,Python
50 | parquet (SNAPPY),0.8635732173919678,nyctaxi,arrow Table,4,Python
51 | parquet (SNAPPY),7.623702335357666,nyctaxi,pandas,4,Python
52 | feather (UNC),7.328182792663574,nyctaxi,pandas,4,Python
53 | feather (LZ4),7.2832419872283936,nyctaxi,pandas,4,Python
54 | feather (ZSTD),8.017264556884765,nyctaxi,pandas,4,Python
55 | feather (UNC),0.6738637924194336,nyctaxi,arrow Table,4,Python
56 | feather (LZ4),0.4330804347991944,nyctaxi,arrow Table,4,Python
57 | feather (ZSTD),0.9005756855010987,nyctaxi,arrow Table,4,Python
58 | R csv_fread,8.235247531,fanniemae,R data.frame,8,R
59 | R fst,4.5943393692,fanniemae,R data.frame,8,R
60 | feather (UNC),4.7164801714,fanniemae,R data.frame,8,R
61 | feather (LZ4),4.6001075036,fanniemae,R data.frame,8,R
62 | feather (ZSTD),5.166106334399999,fanniemae,R data.frame,8,R
63 | parquet (SNAPPY),5.9058646954,fanniemae,R data.frame,8,R
64 | R csv_fread,17.998316013,nyctaxi,R data.frame,8,R
65 | R fst,13.064559282,nyctaxi,R data.frame,8,R
66 | feather (UNC),11.93319899,nyctaxi,R data.frame,8,R
67 | feather (LZ4),12.5654696644,nyctaxi,R data.frame,8,R
68 | feather (ZSTD),12.1251017998,nyctaxi,R data.frame,8,R
69 | parquet (SNAPPY),11.4879469076,nyctaxi,R data.frame,8,R
70 | parquet (SNAPPY),1.3059203624725342,fanniemae,arrow Table,8,Python
71 | parquet (SNAPPY),3.710281848907471,fanniemae,pandas,8,Python
72 | feather (UNC),3.67109489440918,fanniemae,pandas,8,Python
73 | feather (LZ4),2.8483234405517583,fanniemae,pandas,8,Python
74 | feather (ZSTD),2.943112850189209,fanniemae,pandas,8,Python
75 | feather (UNC),1.3228723049163815,fanniemae,arrow Table,8,Python
76 | feather (LZ4),0.4322311401367188,fanniemae,arrow Table,8,Python
77 | feather (ZSTD),0.5514030933380127,fanniemae,arrow Table,8,Python
78 | parquet (SNAPPY),0.7145666599273681,nyctaxi,arrow Table,8,Python
79 | parquet (SNAPPY),7.5506598472595226,nyctaxi,pandas,8,Python
80 | feather (UNC),7.442094039916992,nyctaxi,pandas,8,Python
81 | feather (LZ4),7.163635158538819,nyctaxi,pandas,8,Python
82 | feather (ZSTD),7.376304483413696,nyctaxi,pandas,8,Python
83 | feather (UNC),0.638268232345581,nyctaxi,arrow Table,8,Python
84 | feather (LZ4),0.3298566818237305,nyctaxi,arrow Table,8,Python
85 | feather (ZSTD),0.576887559890747,nyctaxi,arrow Table,8,Python
86 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/all_write_results.csv:
--------------------------------------------------------------------------------
1 | expr,time,dataset,output_type,nthreads,language
2 | fst (UNC),7.00702392,fanniemae,R data.frame,1,R
3 | fst (c=50),4.385196419,fanniemae,R data.frame,1,R
4 | feather V1,8.656647228,fanniemae,R data.frame,1,R
5 | feather V2 (UNC),10.040626659,fanniemae,R data.frame,1,R
6 | feather V2 (LZ4),10.818098194,fanniemae,R data.frame,1,R
7 | feather V2 (ZSTD),11.438481575,fanniemae,R data.frame,1,R
8 | parquet (UNC),10.434816898,fanniemae,R data.frame,1,R
9 | parquet (SNAPPY),10.800951873,fanniemae,R data.frame,1,R
10 | RDS (C),76.929230341,fanniemae,R data.frame,1,R
11 | RDS (UNC),24.216423401,fanniemae,R data.frame,1,R
12 | fst (UNC),4.08787925,nyctaxi,R data.frame,1,R
13 | fst (c=50),3.950344461,nyctaxi,R data.frame,1,R
14 | feather V1,5.97229482,nyctaxi,R data.frame,1,R
15 | feather V2 (UNC),5.888590985,nyctaxi,R data.frame,1,R
16 | feather V2 (LZ4),8.325439328,nyctaxi,R data.frame,1,R
17 | feather V2 (ZSTD),10.223231254,nyctaxi,R data.frame,1,R
18 | parquet (UNC),7.71564074,nyctaxi,R data.frame,1,R
19 | parquet (SNAPPY),8.585539352,nyctaxi,R data.frame,1,R
20 | RDS (C),104.898052261,nyctaxi,R data.frame,1,R
21 | RDS (UNC),10.739751088,nyctaxi,R data.frame,1,R
22 | parquet (UNC),6.220219850540161,fanniemae,arrow Table,1,Python
23 | parquet (UNC),12.395264983177185,fanniemae,pandas,1,Python
24 | parquet (SNAPPY),6.694774866104126,fanniemae,arrow Table,1,Python
25 | parquet (SNAPPY),13.161320447921753,fanniemae,pandas,1,Python
26 | feather V2 (UNC),12.677234172821045,fanniemae,pandas,1,Python
27 | feather V2 (UNC),6.397535443305969,fanniemae,arrow Table,1,Python
28 | feather V2 (LZ4),8.32238781452179,fanniemae,pandas,1,Python
29 | feather V2 (LZ4),2.2326916456222534,fanniemae,arrow Table,1,Python
30 | feather V2 (ZSTD),10.61594545841217,fanniemae,pandas,1,Python
31 | feather V2 (ZSTD),4.308579444885254,fanniemae,arrow Table,1,Python
32 | parquet (UNC),4.5986950397491455,nyctaxi,arrow Table,1,Python
33 | parquet (UNC),9.009780049324037,nyctaxi,pandas,1,Python
34 | parquet (SNAPPY),5.70121443271637,nyctaxi,arrow Table,1,Python
35 | parquet (SNAPPY),10.175373315811155,nyctaxi,pandas,1,Python
36 | feather V2 (UNC),7.1334041357040405,nyctaxi,pandas,1,Python
37 | feather V2 (UNC),3.112175464630127,nyctaxi,arrow Table,1,Python
38 | feather V2 (LZ4),7.4143136739730835,nyctaxi,pandas,1,Python
39 | feather V2 (LZ4),3.567118763923645,nyctaxi,arrow Table,1,Python
40 | feather V2 (ZSTD),11.283223748207092,nyctaxi,pandas,1,Python
41 | feather V2 (ZSTD),6.928452372550964,nyctaxi,arrow Table,1,Python
42 | fst (UNC),7.758567831,fanniemae,R data.frame,4,R
43 | fst (c=50),3.700873556,fanniemae,R data.frame,4,R
44 | feather V1,7.08059183,fanniemae,R data.frame,4,R
45 | feather V2 (UNC),10.413025112,fanniemae,R data.frame,4,R
46 | feather V2 (LZ4),10.818213516,fanniemae,R data.frame,4,R
47 | feather V2 (ZSTD),11.563816777,fanniemae,R data.frame,4,R
48 | parquet (UNC),10.814584911,fanniemae,R data.frame,4,R
49 | parquet (SNAPPY),11.152511189,fanniemae,R data.frame,4,R
50 | RDS (C),78.42714811,fanniemae,R data.frame,4,R
51 | RDS (UNC),24.919762665,fanniemae,R data.frame,4,R
52 | fst (UNC),4.399914353,nyctaxi,R data.frame,4,R
53 | fst (c=50),3.305661431,nyctaxi,R data.frame,4,R
54 | feather V1,5.47744372,nyctaxi,R data.frame,4,R
55 | feather V2 (UNC),5.864371601,nyctaxi,R data.frame,4,R
56 | feather V2 (LZ4),8.494803995,nyctaxi,R data.frame,4,R
57 | feather V2 (ZSTD),10.073068744,nyctaxi,R data.frame,4,R
58 | parquet (UNC),7.675560036,nyctaxi,R data.frame,4,R
59 | parquet (SNAPPY),8.428579617,nyctaxi,R data.frame,4,R
60 | RDS (C),108.234060692,nyctaxi,R data.frame,4,R
61 | RDS (UNC),10.717121094,nyctaxi,R data.frame,4,R
62 | parquet (UNC),6.162686586380005,fanniemae,arrow Table,4,Python
63 | parquet (UNC),11.565850496292114,fanniemae,pandas,4,Python
64 | parquet (SNAPPY),6.410535216331482,fanniemae,arrow Table,4,Python
65 | parquet (SNAPPY),11.6298109292984,fanniemae,pandas,4,Python
66 | feather V2 (UNC),11.104193806648254,fanniemae,pandas,4,Python
67 | feather V2 (UNC),5.889622092247009,fanniemae,arrow Table,4,Python
68 | feather V2 (LZ4),6.612253308296204,fanniemae,pandas,4,Python
69 | feather V2 (LZ4),1.306950330734253,fanniemae,arrow Table,4,Python
70 | feather V2 (ZSTD),7.202290296554565,fanniemae,pandas,4,Python
71 | feather V2 (ZSTD),1.8320761919021609,fanniemae,arrow Table,4,Python
72 | parquet (UNC),4.338123440742494,nyctaxi,arrow Table,4,Python
73 | parquet (UNC),8.028993129730225,nyctaxi,pandas,4,Python
74 | parquet (SNAPPY),5.622675895690918,nyctaxi,arrow Table,4,Python
75 | parquet (SNAPPY),9.33586835861206,nyctaxi,pandas,4,Python
76 | feather V2 (UNC),6.233096599578857,nyctaxi,pandas,4,Python
77 | feather V2 (UNC),2.994387269020081,nyctaxi,arrow Table,4,Python
78 | feather V2 (LZ4),5.67785370349884,nyctaxi,pandas,4,Python
79 | feather V2 (LZ4),2.289505124092102,nyctaxi,arrow Table,4,Python
80 | feather V2 (ZSTD),6.161942005157472,nyctaxi,pandas,4,Python
81 | feather V2 (ZSTD),2.7954366207122803,nyctaxi,arrow Table,4,Python
82 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/benchmark.R:
--------------------------------------------------------------------------------
1 | library(fst)
2 | library(microbenchmark)
3 | library(data.table)
4 | library(arrow)
5 | library(feather)
6 | library(stringr)
7 | library(dplyr)
8 |
9 | files <- c("2016Q4", "yellow_tripdata_2010-01")
10 | names <- c("fanniemae", "nyctaxi")
11 | seps <- c("|", ",")
12 |
create_files <- function(base) {
  # Derive the Feather V1, fst and RDS variants of one dataset from its
  # Snappy-compressed Parquet file. `base` is the file-name prefix shared by
  # every variant; each output is written as <base><suffix>.
  variant <- function(suffix) str_c(base, suffix)

  source_df <- arrow::read_parquet(variant("_snappy.parquet"))

  feather::write_feather(source_df, variant("_v1.feather"))

  # fst at compression levels 0 and 50
  fst::write_fst(source_df, variant("_0.fst"), compress=0)
  fst::write_fst(source_df, variant("_50.fst"), compress=50)

  # RDS with and without compression
  saveRDS(source_df, variant("_compressed.rds"), compress=TRUE)
  saveRDS(source_df, variant("_uncompressed.rds"), compress=FALSE)
}
21 |
do_benchmark <- function(index) {
  # Time every reader on the dataset at position `index` of the global
  # `files` / `names` / `seps` vectors. Each reader runs 5 times; the result
  # has one row per reader with its mean time and the dataset name.
  base <- files[index]
  sep <- seps[index]

  # All pre-generated inputs are named <base><suffix>; only the CSV lives
  # under data/.
  variant <- function(suffix) str_c(base, suffix)

  csv_path <- str_c("data/", base, ".csv")
  fst_0_path <- variant("_0.fst")
  fst_50_path <- variant("_50.fst")
  feather_v1_path <- variant("_v1.feather")
  feather_unc_path <- variant("_uncompressed.feather")
  feather_lz4_path <- variant("_lz4.feather")
  feather_zstd_path <- variant("_zstd.feather")
  parquet_unc_path <- variant("_uncompressed.parquet")
  parquet_snappy_path <- variant("_snappy.parquet")
  rds_unc_path <- variant("_uncompressed.rds")
  rds_compressed_path <- variant("_compressed.rds")

  timings <- microbenchmark(
    csv_fread=data.table::fread(csv_path, sep=sep, header=FALSE),
    fst_unc=fst::read_fst(fst_0_path),
    fst_50=fst::read_fst(fst_50_path),
    feather_v1=feather::read_feather(feather_v1_path),
    feather_unc=arrow::read_feather(feather_unc_path),
    feather_lz4=arrow::read_feather(feather_lz4_path),
    feather_zstd=arrow::read_feather(feather_zstd_path),
    parquet_unc=arrow::read_parquet(parquet_unc_path),
    parquet_snappy=arrow::read_parquet(parquet_snappy_path),
    rds_unc=readRDS(rds_unc_path),
    rds_compressed=readRDS(rds_compressed_path),
    times=5
  )

  # Collapse the individual runs into one mean per expression.
  by_expr <- dplyr::group_by(data.frame(timings), expr)
  mean_times <- dplyr::summarize(by_expr, time=mean(time))
  mean_times$dataset <- names[index]
  mean_times
}
56 |
do_write_benchmark <- function(index) {
  # Time every writer on the dataset at position `index` of the global
  # `files` / `names` vectors. The data frame is loaded once from the
  # Snappy Parquet file, each writer runs once (times=1), and the result has
  # one row per writer with its time and the dataset name.
  base <- files[index]

  df <- arrow::read_parquet(str_c(base, "_snappy.parquet"))

  mbm <- microbenchmark(
    fst_unc=fst::write_fst(df, str_c(base, "_0.fst"), compress=0),
    fst_50=fst::write_fst(df, str_c(base, "_50.fst"), compress=50),
    feather_v1=feather::write_feather(df, str_c(base, "_v1.feather")),
    feather_unc=arrow::write_feather(df, str_c(base, "_unc_r.feather"),
                                     compression="uncompressed"),
    # The compressed Feather cases must use arrow::write_feather; calling
    # write_parquet here would time Parquet writes under a Feather label.
    feather_lz4=arrow::write_feather(df, str_c(base, "_lz4_r.feather"),
                                     compression="lz4"),
    feather_zstd=arrow::write_feather(df, str_c(base, "_zstd_r.feather"),
                                      compression="zstd"),
    parquet_unc=arrow::write_parquet(df, str_c(base, "_unc_r.parquet"),
                                     compression="uncompressed"),
    parquet_snappy=arrow::write_parquet(df, str_c(base, "_snappy_r.parquet"),
                                        compression="snappy"),
    rds_compressed=saveRDS(df, str_c(base, "_compressed.rds"), compress=TRUE),
    rds_unc=saveRDS(df, str_c(base, "_uncompressed.rds"), compress=FALSE),
    times=1
  )
  mbm <- data.frame(mbm) %>% dplyr::group_by(expr) %>% dplyr::summarize(time=mean(time))
  mbm$dataset <- names[index]
  mbm
}
85 |
generate_files <- function() {
  # Run create_files() for every dataset prefix listed in the global `files`.
  for (i in seq_along(files)) {
    create_files(files[[i]])
  }
}
91 |
92 | # generate_files()
93 |
# The Arrow thread count is reported and also used as the suffix of the
# result file names, so read it once.
n_threads <- arrow::cpu_count()
print(str_c("Using ", n_threads, " threads"))

# Read benchmarks for both datasets
results <- dplyr::bind_rows(do_benchmark(1), do_benchmark(2))
print(results)
write.csv(results, str_c("r_read_results_", n_threads, ".csv"))

# Write benchmarks for both datasets
write_results <- dplyr::bind_rows(do_write_benchmark(1), do_write_benchmark(2))
print(write_results)
write.csv(write_results, str_c("r_write_results_", n_threads, ".csv"))
103 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/benchmark.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | import pyarrow.feather as feather
4 | import pandas as pd
5 | import json
6 | import numpy as np
7 | import pyarrow as pa
8 | import pyarrow.parquet as pq
9 | from pandas.util.testing import rands
10 | import gc
11 | import time
12 |
13 |
def get_timing(f, niter):
    """Return the mean elapsed seconds per call of ``f`` over ``niter`` calls.

    ``f`` is called with no arguments ``niter`` times in a row and the total
    elapsed time is divided by ``niter``. ``niter`` must be non-zero, since
    the total is divided by it.

    Timing uses ``time.perf_counter()``: it is monotonic, so the measurement
    cannot be skewed by system clock adjustments the way a ``CLOCK_REALTIME``
    reading can, and it is available on every platform.
    """
    start = time.perf_counter()
    for _ in range(niter):
        f()
    return (time.perf_counter() - start) / niter
20 |
21 |
class Benchmarker:
    """Time Parquet and Feather V2 reads and writes for one dataset.

    ``file_info`` is a mapping with a ``'base'`` entry (the prefix of every
    benchmarked file name) and a ``'source'`` mapping holding the ``'path'``,
    ``'sep'`` and ``'header'`` used to load the original text file.
    """

    def __init__(self, file_info):
        self.base = file_info['base']

        source = file_info['source']
        self.csv_path, self.sep, self.header = unpack(
            source, 'path', 'sep', 'header')

        # Every benchmarked file is named <base><suffix>.
        def variant(suffix):
            return '{}{}'.format(self.base, suffix)

        self.parquet_unc_path = variant('_uncompressed.parquet')
        self.parquet_snappy_path = variant('_snappy.parquet')
        self.feather_unc_path = variant('_uncompressed.feather')
        self.feather_lz4_path = variant('_lz4.feather')
        self.feather_zstd_path = variant('_zstd.feather')

    def bench_read(self, niter=5):
        """Time each reader ``niter`` times; return one row per case."""

        # All readers are called with memory_map=False.
        def parquet_table(path):
            return pq.read_table(path, memory_map=False)

        def parquet_pandas(path):
            return parquet_table(path).to_pandas()

        def feather_pandas(path):
            return feather.read_feather(path, memory_map=False)

        def feather_table(path):
            return feather.read_table(path, memory_map=False)

        cases = [
            ('parquet (UNC)', 'arrow Table',
             lambda: parquet_table(self.parquet_unc_path)),
            ('parquet (UNC)', 'pandas',
             lambda: parquet_pandas(self.parquet_unc_path)),
            ('parquet (SNAPPY)', 'arrow Table',
             lambda: parquet_table(self.parquet_snappy_path)),
            ('parquet (SNAPPY)', 'pandas',
             lambda: parquet_pandas(self.parquet_snappy_path)),
            ('feather V2 (UNC)', 'pandas',
             lambda: feather_pandas(self.feather_unc_path)),
            ('feather V2 (LZ4)', 'pandas',
             lambda: feather_pandas(self.feather_lz4_path)),
            ('feather V2 (ZSTD)', 'pandas',
             lambda: feather_pandas(self.feather_zstd_path)),
            ('feather V2 (UNC)', 'arrow Table',
             lambda: feather_table(self.feather_unc_path)),
            ('feather V2 (LZ4)', 'arrow Table',
             lambda: feather_table(self.feather_lz4_path)),
            ('feather V2 (ZSTD)', 'arrow Table',
             lambda: feather_table(self.feather_zstd_path)),
        ]

        return self._bench_cases(cases, niter)

    def bench_write(self, niter=2):
        """Load the source text file, then time each writer ``niter`` times.

        The files written here use the same paths that ``bench_read`` reads.
        """
        print("Reading text file: {}".format(self.csv_path))
        frame = pd.read_csv(self.csv_path, sep=self.sep, header=self.header,
                            low_memory=False)
        if self.header is None:
            # Header-less input: name the columns f0, f1, ...
            frame.columns = ['f{}'.format(i)
                             for i in range(len(frame.columns))]

        def to_table(data):
            converted = pa.Table.from_pandas(data, preserve_index=False)
            return converted.replace_schema_metadata(None)

        table = to_table(frame)

        def write_parquet(data, path, **options):
            return pq.write_table(data, path, **options)

        def write_feather(data, path, codec):
            return feather.write_feather(data, path, compression=codec)

        # The 'pandas' Parquet cases convert the DataFrame inside the timed
        # callable, so the conversion cost is part of their measurement.
        cases = [
            ('parquet (UNC)', 'arrow Table',
             lambda: write_parquet(table, self.parquet_unc_path,
                                   compression='NONE')),
            ('parquet (UNC)', 'pandas',
             lambda: write_parquet(to_table(frame), self.parquet_unc_path,
                                   compression='NONE')),
            ('parquet (SNAPPY)', 'arrow Table',
             lambda: write_parquet(table, self.parquet_snappy_path)),
            ('parquet (SNAPPY)', 'pandas',
             lambda: write_parquet(to_table(frame),
                                   self.parquet_snappy_path)),
            ('feather V2 (UNC)', 'pandas',
             lambda: write_feather(frame, self.feather_unc_path,
                                   'uncompressed')),
            ('feather V2 (UNC)', 'arrow Table',
             lambda: write_feather(table, self.feather_unc_path,
                                   'uncompressed')),
            ('feather V2 (LZ4)', 'pandas',
             lambda: write_feather(frame, self.feather_lz4_path, 'lz4')),
            ('feather V2 (LZ4)', 'arrow Table',
             lambda: write_feather(table, self.feather_lz4_path, 'lz4')),
            ('feather V2 (ZSTD)', 'pandas',
             lambda: write_feather(frame, self.feather_zstd_path, 'zstd')),
            ('feather V2 (ZSTD)', 'arrow Table',
             lambda: write_feather(table, self.feather_zstd_path, 'zstd')),
        ]

        return self._bench_cases(cases, niter)

    def _bench_cases(self, cases, niter):
        """Time every ``(name, output_type, callable)`` case.

        Returns a DataFrame with columns ``expr``, ``output_type`` and
        ``mean`` (the value returned by ``get_timing``), one row per case.
        """
        records = []
        for label, output_type, func in cases:
            print(label)
            record = (label, output_type, get_timing(func, niter))
            print(record)
            records.append(record)

        columns = ['expr', 'output_type', 'mean']
        return pd.DataFrame.from_records(records, columns=columns)
127 |
128 |
def unpack(d, *fields):
    """Lazily yield ``d[field]`` for each name in ``fields``, in order."""
    for field in fields:
        yield d[field]
131 |
132 |
133 |
# Datasets to benchmark, keyed by the name stored in the 'dataset' column of
# the results. 'base' is the prefix of every benchmarked file name; 'source'
# describes how to load the original text file (path, separator, header row).
files = {
    'fanniemae': dict(
        base='2016Q4',
        source=dict(path='data/2016Q4.csv', sep='|', header=None),
    ),
    'nyctaxi': dict(
        base='yellow_tripdata_2010-01',
        source=dict(path='data/yellow_tripdata_2010-01.csv', sep=',',
                    header=0),
    ),
}
152 |
153 |
def run_benchmarks(num_threads, what='read'):
    """Run the read or write benchmarks for every dataset in ``files``.

    Sets the Arrow CPU count to ``num_threads`` first. ``what`` must be
    ``'read'`` or ``'write'``; any other value raises ``ValueError`` once the
    first dataset is reached. Returns the per-dataset result frames
    concatenated, each tagged with a ``dataset`` column.
    """
    pa.set_cpu_count(num_threads)

    frames = []
    for dataset, info in files.items():
        bench = Benchmarker(info)
        if what not in ('read', 'write'):
            raise ValueError(what)

        if what == 'read':
            print("Benchmarking reads")
            frame = bench.bench_read()
        else:
            print("Benchmarking writes")
            frame = bench.bench_write()

        frame['dataset'] = dataset
        frames.append(frame)

    print(frames)
    return pd.concat(frames, ignore_index=True)
173 |
174 |
175 |
176 | # for i in range(5):
177 | # pq.read_table('yellow_tripdata_2010-01.parquet').to_pandas()
178 |
179 | # write_files(files)
180 |
# Thread counts to benchmark; each one yields its own pair of result CSVs.
num_threads_cases = [1, 4]

for nthreads in num_threads_cases:
    write_csv = 'py_write_results_{}.csv'.format(nthreads)
    read_csv = 'py_read_results_{}.csv'.format(nthreads)

    # Writes run first: they produce the files the read benchmark loads.
    write_results = run_benchmarks(nthreads, what='write')
    write_results.to_csv(write_csv)

    read_results = run_benchmarks(nthreads, what='read')
    read_results.to_csv(read_csv)
189 |
190 | # for nthreads in num_threads_cases:
191 | # run_benchmarks(nthreads)
192 |
193 | # ('pyarrow.parquet', 1.5470361709594727)
194 | # ('pyarrow.parquet-pandas', 2.925654172897339)
195 | # ('pyarrow.feather', 1.6384665012359618)
196 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/file_sizes.csv:
--------------------------------------------------------------------------------
1 | dataset,file_type,size
2 | fanniemae,feather V1,4812.5457763671875
3 | fanniemae,feather V2 (UNC),4812.015695571899
4 | fanniemae,feather V2 (LZ4),608.8575687408447
5 | fanniemae,feather V2 (ZSTD),499.76035499572754
6 | fanniemae,parquet (UNC),372.0420684814453
7 | fanniemae,parquet (SNAPPY),136.54194450378418
8 | fanniemae,fst (UNC),5033.68958568573
9 | fanniemae,fst (C=50),766.0145416259766
10 | fanniemae,RDS (C),114.32603359222412
11 | fanniemae,RDS (UNC),5682.447074890137
12 | nyctaxi,feather V1,2389.4743881225586
13 | nyctaxi,feather V2 (UNC),2389.72052192688
14 | nyctaxi,feather V2 (LZ4),1120.5288562774658
15 | nyctaxi,feather V2 (ZSTD),783.8803653717041
16 | nyctaxi,parquet (UNC),1188.3576135635376
17 | nyctaxi,parquet (SNAPPY),719.5741958618164
18 | nyctaxi,fst (UNC),2412.6597032546997
19 | nyctaxi,fst (C=50),1199.6639070510864
20 | nyctaxi,RDS (C),541.6701745986938
21 | nyctaxi,RDS (UNC),2671.2057056427
22 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/generate_results.sh:
--------------------------------------------------------------------------------
#!/bin/bash

# Run the Python and R benchmarks, then glue their result CSVs together.
# Stop at the first failing step so glue_results.py never combines stale or
# partial per-language results.
set -euo pipefail

# Make sure we're using performance CPU governor
sudo cpufreq-set -g performance

python benchmark.py

# One R run per thread count. glue_results.py expects r_*_results_1.csv and
# r_*_results_4.csv, which benchmark.R names after arrow::cpu_count().
# NOTE(review): this assumes OMP_NUM_THREADS drives arrow::cpu_count() - confirm.
OMP_NUM_THREADS=1 Rscript benchmark.R
OMP_NUM_THREADS=4 Rscript benchmark.R

python glue_results.py
12 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/glue_results.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 |
4 |
def munge_results(kind='read', num_threads_cases=(1, 4)):
    """Combine the per-language benchmark CSVs into one DataFrame.

    For each thread count ``n`` this reads ``r_<kind>_results_<n>.csv`` and
    ``py_<kind>_results_<n>.csv`` from the current directory, brings both to
    the same columns and labels, and stacks them (R rows first).

    Parameters
    ----------
    kind : str
        Middle part of the input file names, e.g. ``'read'`` or ``'write'``.
    num_threads_cases : iterable of int
        Thread counts whose result files are combined. Defaults to the
        ``(1, 4)`` pair the function always used.

    Returns
    -------
    pandas.DataFrame
        Columns ``expr``, ``time``, ``dataset``, ``output_type``,
        ``nthreads`` and ``language``.

    Raises
    ------
    ValueError
        From ``pandas.concat`` when ``num_threads_cases`` is empty.
    """
    # Both lookup tables are loop-invariant, so build them once. Labels not
    # listed here pass through unchanged.
    expr_rename = {
        'parquet_unc': 'parquet (UNC)',
        'parquet_snappy': 'parquet (SNAPPY)',
        'feather_v1': 'feather V1',
        'feather_unc': 'feather V2 (UNC)',
        'feather_lz4': 'feather V2 (LZ4)',
        'feather_zstd': 'feather V2 (ZSTD)',
        'fst_unc': 'fst (UNC)',
        'fst_50': 'fst (c=50)',
        'rds_unc': 'RDS (UNC)',
        'rds_compressed': 'RDS (C)',
        'pyarrow.parquet': 'parquet (SNAPPY)',
        'pyarrow.feather (UNC)': 'feather V2 (UNC)',
        'pyarrow.feather (LZ4)': 'feather V2 (LZ4)',
        'pyarrow.feather (ZSTD)': 'feather V2 (ZSTD)',
    }
    output_type_rename = {
        'pyarrow.Table': 'arrow Table',
    }

    pieces = []
    for num_threads in num_threads_cases:
        r_results = pd.read_csv(f'r_{kind}_results_{num_threads}.csv')
        # .copy() makes the selection an independent frame, so the column
        # assignments below never target a view of the frame that was read.
        r_results = r_results[['expr', 'time', 'dataset']].copy()
        r_results['output_type'] = "R data.frame"
        # Scale R times by 1e-9 so they match the Python times.
        # NOTE(review): assumes R times are nanoseconds, Python's seconds.
        r_results['time'] = r_results['time'] / 1e9
        r_results['nthreads'] = num_threads
        r_results['language'] = 'R'
        r_results['expr'] = r_results['expr'].map(
            lambda x: expr_rename.get(x, x))

        py_results = pd.read_csv(f'py_{kind}_results_{num_threads}.csv')
        py_results = py_results[
            ['expr', 'output_type', 'mean', 'dataset']].copy()
        py_results['time'] = py_results.pop('mean')
        py_results['nthreads'] = num_threads
        py_results['language'] = 'Python'
        py_results['expr'] = py_results['expr'].map(
            lambda x: expr_rename.get(x, x))
        py_results['output_type'] = py_results['output_type'].map(
            lambda x: output_type_rename.get(x, x))

        pieces.extend([r_results, py_results])

    return pd.concat(pieces, ignore_index=True, sort=False)
54 |
55 |
# Combined read and write timings across languages and thread counts.
read_results = munge_results('read')
read_results.to_csv('all_read_results.csv', index=False)

write_results = munge_results('write')
write_results.to_csv('all_write_results.csv', index=False)


# (dataset name, file-name prefix) for every benchmarked dataset.
files = [
    ('fanniemae', '2016Q4'),
    ('nyctaxi', 'yellow_tripdata_2010-01'),
]

# (storage label, file-name suffix) for every on-disk variant to measure.
cases = [
    ('feather V1', '_v1.feather'),
    ('feather V2 (UNC)', '_uncompressed.feather'),
    ('feather V2 (LZ4)', '_lz4.feather'),
    ('feather V2 (ZSTD)', '_zstd.feather'),
    ('parquet (UNC)', '_uncompressed.parquet'),
    ('parquet (SNAPPY)', '_snappy.parquet'),
    ('fst (UNC)', '_0.fst'),
    ('fst (C=50)', '_50.fst'),
    ('RDS (C)', '_compressed.rds'),
    ('RDS (UNC)', '_uncompressed.rds'),
]

BYTES_PER_MIB = 1 << 20

size_records = []
for logical_name, file_base in files:
    for storage, ending in cases:
        full_path = f'{file_base}{ending}'
        # Record the on-disk size in MiB.
        size_in_bytes = os.path.getsize(full_path)
        result = (logical_name, storage, size_in_bytes / BYTES_PER_MIB)
        print(result)
        size_records.append(result)

file_sizes = pd.DataFrame.from_records(
    size_records, columns=['dataset', 'file_type', 'size'])

file_sizes.to_csv('file_sizes.csv', index=False)
94 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-1/all_results.csv:
--------------------------------------------------------------------------------
1 | expr,time,dataset,output_type
2 | R rds,24.258071124,fanniemae,R data.frame
3 | R csv_fread,11.890215172200001,fanniemae,R data.frame
4 | R feather_old,5.2771154078,fanniemae,R data.frame
5 | R fst,4.2202414134,fanniemae,R data.frame
6 | R feather_arrow,2.9834087618000003,fanniemae,R data.frame
7 | R parquet,7.969573458,fanniemae,R data.frame
8 | R rds,21.7943077156,nyctaxi,R data.frame
9 | R csv_fread,21.743098532599998,nyctaxi,R data.frame
10 | R feather_old,13.1421169332,nyctaxi,R data.frame
11 | R fst,13.226063631799999,nyctaxi,R data.frame
12 | R feather_arrow,11.358103880200002,nyctaxi,R data.frame
13 | R parquet,13.9190224234,nyctaxi,R data.frame
14 | pyarrow.parquet,5.198975515365602,fanniemae,arrow Table
15 | pyarrow.parquet-pandas,7.051469707489014,fanniemae,pandas
16 | pyarrow.feather,1.979597759246826,fanniemae,pandas
17 | pyarrow.parquet,2.888606691360473,nyctaxi,arrow Table
18 | pyarrow.parquet-pandas,9.884848737716675,nyctaxi,pandas
19 | pyarrow.feather,6.670159721374513,nyctaxi,pandas
20 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-1/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ursa-labs/notebooks/fef623e363546b76333a58d31dd58805248a596d/20190919file_benchmarks/i9-9880H-1/plot.png
--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-1/py_results.csv:
--------------------------------------------------------------------------------
1 | ,expr,output_type,mean,dataset
2 | 0,pyarrow.parquet,arrow Table,5.198975515365601,fanniemae
3 | 1,pyarrow.parquet-pandas,pandas,7.051469707489014,fanniemae
4 | 2,pyarrow.feather,pandas,1.9795977592468261,fanniemae
5 | 3,pyarrow.parquet,arrow Table,2.8886066913604735,nyctaxi
6 | 4,pyarrow.parquet-pandas,pandas,9.884848737716675,nyctaxi
7 | 5,pyarrow.feather,pandas,6.670159721374512,nyctaxi
8 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-1/r_results.csv:
--------------------------------------------------------------------------------
1 | "","expr","time","dataset"
2 | "1","rds",24258071124,"fanniemae"
3 | "2","csv_fread",11890215172.2,"fanniemae"
4 | "3","feather_old",5277115407.8,"fanniemae"
5 | "4","fst",4220241413.4,"fanniemae"
6 | "5","feather_arrow",2983408761.8,"fanniemae"
7 | "6","parquet",7969573458,"fanniemae"
8 | "7","rds",21794307715.6,"nyctaxi"
9 | "8","csv_fread",21743098532.6,"nyctaxi"
10 | "9","feather_old",13142116933.2,"nyctaxi"
11 | "10","fst",13226063631.8,"nyctaxi"
12 | "11","feather_arrow",11358103880.2,"nyctaxi"
13 | "12","parquet",13919022423.4,"nyctaxi"
14 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-4/all_results.csv:
--------------------------------------------------------------------------------
1 | expr,time,dataset,output_type
2 | R rds,24.7440853608,fanniemae,R data.frame
3 | R csv_fread,5.4296275238,fanniemae,R data.frame
4 | R feather_old,5.2222283298,fanniemae,R data.frame
5 | R fst,3.3813570264,fanniemae,R data.frame
6 | R feather_arrow,2.9662292186,fanniemae,R data.frame
7 | R parquet,4.6544630666,fanniemae,R data.frame
8 | R rds,22.135398477200003,nyctaxi,R data.frame
9 | R csv_fread,17.687647606,nyctaxi,R data.frame
10 | R feather_old,11.989569364200001,nyctaxi,R data.frame
11 | R fst,12.0112101424,nyctaxi,R data.frame
12 | R feather_arrow,11.617949409200001,nyctaxi,R data.frame
13 | R parquet,13.0886089094,nyctaxi,R data.frame
14 | pyarrow.parquet,2.1267578125,fanniemae,arrow Table
15 | pyarrow.parquet-pandas,3.518295383453369,fanniemae,pandas
16 | pyarrow.feather,1.6831360816955567,fanniemae,pandas
17 | pyarrow.parquet,1.1050359725952148,nyctaxi,arrow Table
18 | pyarrow.parquet-pandas,7.481458854675293,nyctaxi,pandas
19 | pyarrow.feather,6.5046766757965075,nyctaxi,pandas
20 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-4/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ursa-labs/notebooks/fef623e363546b76333a58d31dd58805248a596d/20190919file_benchmarks/i9-9880H-4/plot.png
--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-4/py_results.csv:
--------------------------------------------------------------------------------
1 | ,expr,output_type,mean,dataset
2 | 0,pyarrow.parquet,arrow Table,2.1267578125,fanniemae
3 | 1,pyarrow.parquet-pandas,pandas,3.5182953834533692,fanniemae
4 | 2,pyarrow.feather,pandas,1.6831360816955567,fanniemae
5 | 3,pyarrow.parquet,arrow Table,1.1050359725952148,nyctaxi
6 | 4,pyarrow.parquet-pandas,pandas,7.481458854675293,nyctaxi
7 | 5,pyarrow.feather,pandas,6.504676675796508,nyctaxi
8 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-4/r_results.csv:
--------------------------------------------------------------------------------
1 | "","expr","time","dataset"
2 | "1","rds",24744085360.8,"fanniemae"
3 | "2","csv_fread",5429627523.8,"fanniemae"
4 | "3","feather_old",5222228329.8,"fanniemae"
5 | "4","fst",3381357026.4,"fanniemae"
6 | "5","feather_arrow",2966229218.6,"fanniemae"
7 | "6","parquet",4654463066.6,"fanniemae"
8 | "7","rds",22135398477.2,"nyctaxi"
9 | "8","csv_fread",17687647606,"nyctaxi"
10 | "9","feather_old",11989569364.2,"nyctaxi"
11 | "10","fst",12011210142.4,"nyctaxi"
12 | "11","feather_arrow",11617949409.2,"nyctaxi"
13 | "12","parquet",13088608909.4,"nyctaxi"
14 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-8/all_results.csv:
--------------------------------------------------------------------------------
1 | expr,time,dataset,output_type
2 | R rds,25.963613643200002,fanniemae,R data.frame
3 | R csv_fread,5.125688092600001,fanniemae,R data.frame
4 | R feather_old,5.8978863578,fanniemae,R data.frame
5 | R fst,3.6207146728,fanniemae,R data.frame
6 | R feather_arrow,3.285127359,fanniemae,R data.frame
7 | R parquet,4.608878230399999,fanniemae,R data.frame
8 | R rds,22.5701864218,nyctaxi,R data.frame
9 | R csv_fread,17.681116847200002,nyctaxi,R data.frame
10 | R feather_old,13.7390440426,nyctaxi,R data.frame
11 | R fst,13.188127108200002,nyctaxi,R data.frame
12 | R feather_arrow,12.2201220736,nyctaxi,R data.frame
13 | R parquet,12.6165632024,nyctaxi,R data.frame
14 | pyarrow.parquet,1.6406285285949709,fanniemae,arrow Table
15 | pyarrow.parquet-pandas,3.035256814956665,fanniemae,pandas
16 | pyarrow.feather,1.6025235176086423,fanniemae,pandas
17 | pyarrow.parquet,0.9039567470550536,nyctaxi,arrow Table
18 | pyarrow.parquet-pandas,6.945300436019897,nyctaxi,pandas
19 | pyarrow.feather,6.311788606643678,nyctaxi,pandas
20 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-8/plot.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ursa-labs/notebooks/fef623e363546b76333a58d31dd58805248a596d/20190919file_benchmarks/i9-9880H-8/plot.png
--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-8/py_results.csv:
--------------------------------------------------------------------------------
1 | ,expr,output_type,mean,dataset
2 | 0,pyarrow.parquet,arrow Table,1.6406285285949707,fanniemae
3 | 1,pyarrow.parquet-pandas,pandas,3.035256814956665,fanniemae
4 | 2,pyarrow.feather,pandas,1.6025235176086425,fanniemae
5 | 3,pyarrow.parquet,arrow Table,0.9039567470550537,nyctaxi
6 | 4,pyarrow.parquet-pandas,pandas,6.945300436019897,nyctaxi
7 | 5,pyarrow.feather,pandas,6.311788606643677,nyctaxi
8 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/i9-9880H-8/r_results.csv:
--------------------------------------------------------------------------------
1 | "","expr","time","dataset"
2 | "1","rds",25963613643.2,"fanniemae"
3 | "2","csv_fread",5125688092.6,"fanniemae"
4 | "3","feather_old",5897886357.8,"fanniemae"
5 | "4","fst",3620714672.8,"fanniemae"
6 | "5","feather_arrow",3285127359,"fanniemae"
7 | "6","parquet",4608878230.4,"fanniemae"
8 | "7","rds",22570186421.8,"nyctaxi"
9 | "8","csv_fread",17681116847.2,"nyctaxi"
10 | "9","feather_old",13739044042.6,"nyctaxi"
11 | "10","fst",13188127108.2,"nyctaxi"
12 | "11","feather_arrow",12220122073.6,"nyctaxi"
13 | "12","parquet",12616563202.4,"nyctaxi"
14 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/make_feather_plots.R:
--------------------------------------------------------------------------------
1 | library(ggplot2)
2 |
3 | # install.packages("stringi")
4 |
# Draw the Feather/IPC benchmark charts (file size, write time, read time)
# and save one PNG per metric into the benchmark directory.
setwd("~/code/notebooks/20190919file_benchmarks/")

reads <- read.csv("ipc_read_parallel.csv")
writes <- read.csv("ipc_write_parallel.csv")

# Top-level expression: shows the write results wherever auto-printing is active.
writes

# Components shared by all three charts: one facet row per dataset, dodged
# bars whose heights are taken directly from the y aesthetic
# (stat = "identity"), and flipped axes so the codecs run down the side.
codec_bars <- list(
  facet_grid(rows = vars(dataset)),
  geom_bar(position = "dodge", stat = "identity"),
  coord_flip()
)

# file size
ggplot(writes, aes(x = codec, y = file_size, fill = factor(chunksize))) +
  codec_bars

# ggsave() with no plot argument writes the most recent plot, i.e. the one above.
ggsave("ipc_file_size.png", width = 10, height = 4)

# write time
ggplot(writes, aes(x = codec, y = write_time, fill = factor(chunksize))) +
  codec_bars

ggsave("ipc_write_time.png", width = 10, height = 4)

# read time
ggplot(reads, aes(x = codec, y = read_time, fill = factor(chunksize))) +
  codec_bars

ggsave("ipc_read_time.png", width = 10, height = 4)
--------------------------------------------------------------------------------
/20190919file_benchmarks/make_plots.R:
--------------------------------------------------------------------------------
1 | library(ggplot2)
2 | library(dplyr)
3 |
4 |
# Timing results; both files are expected in the current working directory.
read_results <- read.csv("all_read_results.csv")
write_results <- read.csv("all_write_results.csv")

# File sizes, plus one extra row for the Fannie Mae CSV file size, as reported
# in the previous post.
file_sizes <- local({
  measured <- read.csv("file_sizes.csv", stringsAsFactors = FALSE)
  fanniemae_csv <- data.frame(
    dataset = "fanniemae",
    file_type = "CSV",
    # NOTE(review): presumably 1.52 GB expressed in MB -- the file-size chart
    # divides `size` by 1024 and labels the axis GB; confirm against
    # file_sizes.csv.
    size = 1.52 * 1024,
    stringsAsFactors = FALSE
  )
  rbind(measured, fanniemae_csv)
})
17 |
# Color mapping: raw format label -> fill colour. Formats belonging to the
# same family share one colour, so the vector is assembled family by family.
cols <- c(
  setNames(
    rep("steelblue", 4),
    c("feather V1", "feather V2 (UNC)", "feather V2 (LZ4)", "feather V2 (ZSTD)")
  ),
  setNames(rep("steelblue1", 2), c("parquet (SNAPPY)", "parquet (UNC)")),
  setNames(rep("wheat4", 2), c("fst (C=50)", "fst (UNC)")),
  setNames(rep("gray", 2), c("RDS (C)", "RDS (UNC)")),
  setNames(rep("wheat3", 2), c("csv_fread", "CSV"))
)
33 |
# Turn raw benchmark labels (e.g. "feather V2 (UNC)") into the prettier names
# shown on the graphs (e.g. "Feather V2 (Uncompressed)").
#
# x: character vector (or anything sub() can coerce to character).
# Returns a character vector of the same length.
munge_labels <- function (x) {
  # Each rule is c(pattern, replacement). Rules run top to bottom through
  # sub(), so only the first match of a pattern in each element is rewritten.
  # The order matters: "ZSTD" receives its ", 1" suffix before "C=" is turned
  # into "ZSTD, ", which keeps "fst (C=50)" from ending up with both.
  rules <- list(
    c("V1", "V1 (Uncompressed)"),
    c("\\(C\\)", "(GZIP)"),
    c("ZSTD", "ZSTD, 1"),
    c("[Cc]=", "ZSTD, "),
    c("parquet", "Parquet"),
    c("feather", "Feather"),
    c("UNC", "Uncompressed"),
    c("csv_fread", "CSV (data.table::fread)")
  )
  for (rule in rules) {
    x <- sub(rule[1], rule[2], x)
  }
  x
}
# Re-key the colour map by display label so that its names line up with the
# relabelled factor levels the plots pass to scale_fill_manual().
cols <- setNames(cols, munge_labels(names(cols)))
47 |
# Apply the pretty display names to a column of format labels and reorder the
# factor levels so that they print in the order we want.
#
# x: factor or character vector of raw format labels (e.g. "feather V2 (UNC)").
# Returns a factor whose levels are the display names listed below, in the
# reverse of the listed order. Values whose display name is not in the list
# become NA.
fix_formats <- function(x) {
  # read.csv() returns character columns by default since R 4.0.0. Without
  # this coercion levels(x) is NULL, no label gets renamed, and nearly every
  # value turns into NA in the factor() call below. No-op for factor input.
  x <- as.factor(x)
  levels(x) <- munge_labels(levels(x))

  display_order <- c(
    "Feather V1 (Uncompressed)",
    "Feather V2 (Uncompressed)",
    "Feather V2 (LZ4)",
    "Feather V2 (ZSTD, 1)",
    "Parquet (Uncompressed)",
    "Parquet (SNAPPY)",
    "RDS (Uncompressed)",
    "RDS (GZIP)",
    "CSV",
    "CSV (data.table::fread)",
    "fst (Uncompressed)",
    "fst (ZSTD, 50)"
  )
  # NOTE(review): reversed presumably because the charts call coord_flip(),
  # which would otherwise put the first level at the bottom -- confirm.
  factor(x, levels = rev(display_order))
}
68 |
# Common recipe for the single-dataset timing charts, kept in one place
# because most of the graphs are built the same way.
#
# data: data frame with columns `expr`, `time` and `output_type`.
# Returns a ggplot object: one horizontal bar per `expr`, filled from the
# global `cols` palette, with one facet row per `output_type` and no legend.
benchmark_plot <- function(data) {
  # Applied after theme_minimal() so these tweaks are not overwritten by it.
  tweaks <- theme(
    legend.position = "none",
    panel.grid.major.y = element_blank()
  )

  ggplot(data, aes(x = expr, y = time, fill = expr)) +
    geom_col(position = "dodge") +
    facet_wrap(vars(output_type), ncol = 1) +
    scale_fill_manual(values = cols) +
    coord_flip() +
    theme_minimal() +
    tweaks
}
82 |
83 |
### Reading
# Pretty format labels, plus the thread count as a factor so that it can be
# used as a discrete fill.
read_results <- read_results %>%
  mutate(expr = fix_formats(expr), Threads = factor(nthreads))

# All: every format, faceted by output type (rows) and dataset (columns)
ggplot(read_results, aes(x = expr, y = time, fill = Threads)) +
  geom_col(position = "dodge") +
  facet_grid(rows = vars(output_type), cols = vars(dataset)) +
  coord_flip() +
  theme_minimal() +
  theme(legend.position = "right") +
  labs(x = "Format", y = "Time to read (s)", title = "")
ggsave("20200414_read_full.png", width = 10, height = 6)

# Python and Arrow only
read_results %>%
  filter(nthreads == 4, dataset == "fanniemae", language == "Python") %>%
  benchmark_plot() +
  scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, by = 2)) +
  labs(x = "", y = "Time to read (s)", title = "")
ggsave("20200414_read_py.png", width = 10, height = 3)

# R (and drop RDS because it is out of range)
read_results %>%
  filter(
    nthreads == 4,
    dataset == "fanniemae",
    language == "R",
    !grepl("^RDS", expr)
  ) %>%
  benchmark_plot() +
  scale_y_continuous(limits = c(0, 10), breaks = seq(0, 10, by = 2)) +
  labs(x = "", y = "Time to read (s)", title = "")
ggsave("20200414_read_r.png", width = 10, height = 3)
112 |
### Writing
# Pretty format labels, plus the thread count as a factor so that it can be
# used as a discrete fill.
write_results <- write_results %>%
  mutate(expr = fix_formats(expr), Threads = factor(nthreads))

# All: every format, faceted by output type (rows) and dataset (columns)
ggplot(write_results, aes(x = expr, y = time, fill = Threads)) +
  geom_col(position = "dodge") +
  facet_grid(rows = vars(output_type), cols = vars(dataset)) +
  coord_flip() +
  labs(x = "Format", y = "Time (s)", title = "Write speeds")
ggsave("20200414_write_full.png", width = 10, height = 6)

# Python and Arrow only
write_results %>%
  filter(nthreads == 4, dataset == "fanniemae", language == "Python") %>%
  benchmark_plot() +
  scale_y_continuous(limits = c(0, 12), breaks = seq(0, 12, by = 2)) +
  labs(x = "", y = "Time to write (s)", title = "")
ggsave("20200414_write_py.png", width = 10, height = 3)

# R (RDS rows are dropped here as well)
write_results %>%
  filter(
    nthreads == 4,
    dataset == "fanniemae",
    language == "R",
    !grepl("^RDS", expr)
  ) %>%
  benchmark_plot() +
  scale_y_continuous(limits = c(0, 12), breaks = seq(0, 12, by = 2)) +
  labs(x = "", y = "Time to write (s)", title = "")
ggsave("20200414_write_r.png", width = 10, height = 3)
139 |
### File sizes
# file_type is read as character (stringsAsFactors = FALSE), so it is turned
# into a factor before the display names and level order are applied.
file_sizes <- file_sizes %>%
  mutate(file_type = fix_formats(as.factor(file_type)))

# Fannie Mae only; size is divided by 1024 to match the "GB" axis label.
ggplot(
  file_sizes[file_sizes$dataset == "fanniemae", ],
  aes(x = file_type, y = size / 1024, fill = file_type)
) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = cols) +
  coord_flip() +
  theme_minimal() +
  theme(
    legend.position = "none",
    panel.grid.major.y = element_blank()
  ) +
  labs(x = "", y = "File size (GB)", title = "")
ggsave("20200414_file_sizes.png", width = 10, height = 3)
153 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/py_read_results_1.csv:
--------------------------------------------------------------------------------
1 | ,expr,output_type,mean,dataset
2 | 0,parquet (UNC),arrow Table,6.126083183288574,fanniemae
3 | 1,parquet (UNC),pandas,9.3643874168396,fanniemae
4 | 2,parquet (SNAPPY),arrow Table,6.056532478332519,fanniemae
5 | 3,parquet (SNAPPY),pandas,9.177780771255494,fanniemae
6 | 4,feather V2 (UNC),pandas,4.354116058349609,fanniemae
7 | 5,feather V2 (LZ4),pandas,4.396533584594726,fanniemae
8 | 6,feather V2 (ZSTD),pandas,5.775776481628418,fanniemae
9 | 7,feather V2 (UNC),arrow Table,1.0860649585723876,fanniemae
10 | 8,feather V2 (LZ4),arrow Table,1.0962132453918456,fanniemae
11 | 9,feather V2 (ZSTD),arrow Table,2.5313239097595215,fanniemae
12 | 10,parquet (UNC),arrow Table,2.2780594348907472,nyctaxi
13 | 11,parquet (UNC),pandas,9.22245388031006,nyctaxi
14 | 12,parquet (SNAPPY),arrow Table,2.8247000694274904,nyctaxi
15 | 13,parquet (SNAPPY),pandas,9.735122680664062,nyctaxi
16 | 14,feather V2 (UNC),pandas,7.608278465270996,nyctaxi
17 | 15,feather V2 (LZ4),pandas,7.784061861038208,nyctaxi
18 | 16,feather V2 (ZSTD),pandas,9.633673095703125,nyctaxi
19 | 17,feather V2 (UNC),arrow Table,0.5403317451477051,nyctaxi
20 | 18,feather V2 (LZ4),arrow Table,0.9643253803253173,nyctaxi
21 | 19,feather V2 (ZSTD),arrow Table,2.7800182342529296,nyctaxi
22 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/py_read_results_4.csv:
--------------------------------------------------------------------------------
1 | ,expr,output_type,mean,dataset
2 | 0,parquet (UNC),arrow Table,1.841284704208374,fanniemae
3 | 1,parquet (UNC),pandas,4.088014888763428,fanniemae
4 | 2,parquet (SNAPPY),arrow Table,1.8786502361297608,fanniemae
5 | 3,parquet (SNAPPY),pandas,4.165652704238892,fanniemae
6 | 4,feather V2 (UNC),pandas,3.5610058307647705,fanniemae
7 | 5,feather V2 (LZ4),pandas,2.778682994842529,fanniemae
8 | 6,feather V2 (ZSTD),pandas,3.0616337299346923,fanniemae
9 | 7,feather V2 (UNC),arrow Table,1.1269856452941895,fanniemae
10 | 8,feather V2 (LZ4),arrow Table,0.48981823921203616,fanniemae
11 | 9,feather V2 (ZSTD),arrow Table,0.8093690395355224,fanniemae
12 | 10,parquet (UNC),arrow Table,0.6995339870452881,nyctaxi
13 | 11,parquet (UNC),pandas,7.4361457347869875,nyctaxi
14 | 12,parquet (SNAPPY),arrow Table,0.78084397315979,nyctaxi
15 | 13,parquet (SNAPPY),pandas,7.540273284912109,nyctaxi
16 | 14,feather V2 (UNC),pandas,7.369460582733154,nyctaxi
17 | 15,feather V2 (LZ4),pandas,7.119231033325195,nyctaxi
18 | 16,feather V2 (ZSTD),pandas,7.537483549118042,nyctaxi
19 | 17,feather V2 (UNC),arrow Table,0.6116453170776367,nyctaxi
20 | 18,feather V2 (LZ4),arrow Table,0.4065845012664795,nyctaxi
21 | 19,feather V2 (ZSTD),arrow Table,0.8925417900085449,nyctaxi
22 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/py_write_results_1.csv:
--------------------------------------------------------------------------------
1 | ,expr,output_type,mean,dataset
2 | 0,parquet (UNC),arrow Table,6.220219850540161,fanniemae
3 | 1,parquet (UNC),pandas,12.395264983177185,fanniemae
4 | 2,parquet (SNAPPY),arrow Table,6.694774866104126,fanniemae
5 | 3,parquet (SNAPPY),pandas,13.161320447921753,fanniemae
6 | 4,feather V2 (UNC),pandas,12.677234172821045,fanniemae
7 | 5,feather V2 (UNC),arrow Table,6.397535443305969,fanniemae
8 | 6,feather V2 (LZ4),pandas,8.32238781452179,fanniemae
9 | 7,feather V2 (LZ4),arrow Table,2.2326916456222534,fanniemae
10 | 8,feather V2 (ZSTD),pandas,10.61594545841217,fanniemae
11 | 9,feather V2 (ZSTD),arrow Table,4.308579444885254,fanniemae
12 | 10,parquet (UNC),arrow Table,4.5986950397491455,nyctaxi
13 | 11,parquet (UNC),pandas,9.009780049324036,nyctaxi
14 | 12,parquet (SNAPPY),arrow Table,5.70121443271637,nyctaxi
15 | 13,parquet (SNAPPY),pandas,10.175373315811157,nyctaxi
16 | 14,feather V2 (UNC),pandas,7.1334041357040405,nyctaxi
17 | 15,feather V2 (UNC),arrow Table,3.112175464630127,nyctaxi
18 | 16,feather V2 (LZ4),pandas,7.4143136739730835,nyctaxi
19 | 17,feather V2 (LZ4),arrow Table,3.567118763923645,nyctaxi
20 | 18,feather V2 (ZSTD),pandas,11.283223748207092,nyctaxi
21 | 19,feather V2 (ZSTD),arrow Table,6.928452372550964,nyctaxi
22 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/py_write_results_4.csv:
--------------------------------------------------------------------------------
1 | ,expr,output_type,mean,dataset
2 | 0,parquet (UNC),arrow Table,6.162686586380005,fanniemae
3 | 1,parquet (UNC),pandas,11.565850496292114,fanniemae
4 | 2,parquet (SNAPPY),arrow Table,6.410535216331482,fanniemae
5 | 3,parquet (SNAPPY),pandas,11.6298109292984,fanniemae
6 | 4,feather V2 (UNC),pandas,11.104193806648254,fanniemae
7 | 5,feather V2 (UNC),arrow Table,5.889622092247009,fanniemae
8 | 6,feather V2 (LZ4),pandas,6.612253308296204,fanniemae
9 | 7,feather V2 (LZ4),arrow Table,1.306950330734253,fanniemae
10 | 8,feather V2 (ZSTD),pandas,7.202290296554565,fanniemae
11 | 9,feather V2 (ZSTD),arrow Table,1.8320761919021606,fanniemae
12 | 10,parquet (UNC),arrow Table,4.338123440742493,nyctaxi
13 | 11,parquet (UNC),pandas,8.028993129730225,nyctaxi
14 | 12,parquet (SNAPPY),arrow Table,5.622675895690918,nyctaxi
15 | 13,parquet (SNAPPY),pandas,9.33586835861206,nyctaxi
16 | 14,feather V2 (UNC),pandas,6.233096599578857,nyctaxi
17 | 15,feather V2 (UNC),arrow Table,2.9943872690200806,nyctaxi
18 | 16,feather V2 (LZ4),pandas,5.67785370349884,nyctaxi
19 | 17,feather V2 (LZ4),arrow Table,2.289505124092102,nyctaxi
20 | 18,feather V2 (ZSTD),pandas,6.161942005157471,nyctaxi
21 | 19,feather V2 (ZSTD),arrow Table,2.7954366207122803,nyctaxi
22 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/r_read_results_1.csv:
--------------------------------------------------------------------------------
1 | "","expr","time","dataset"
2 | "1","csv_fread",17535751806,"fanniemae"
3 | "2","fst_unc",5833566950,"fanniemae"
4 | "3","fst_50",5875382178,"fanniemae"
5 | "4","feather_v1",10078519502.8,"fanniemae"
6 | "5","feather_unc",4719815545.2,"fanniemae"
7 | "6","feather_lz4",5852145495.2,"fanniemae"
8 | "7","feather_zstd",7779083610,"fanniemae"
9 | "8","parquet_unc",9493391604.8,"fanniemae"
10 | "9","parquet_snappy",9911315661.2,"fanniemae"
11 | "10","rds_unc",30267019708.2,"fanniemae"
12 | "11","rds_compressed",41482849064.2,"fanniemae"
13 | "12","csv_fread",23370041255,"nyctaxi"
14 | "13","fst_unc",13017416436,"nyctaxi"
15 | "14","fst_50",12634709971.4,"nyctaxi"
16 | "15","feather_v1",13443664009.4,"nyctaxi"
17 | "16","feather_unc",11371430104.2,"nyctaxi"
18 | "17","feather_lz4",13296044630,"nyctaxi"
19 | "18","feather_zstd",14594300772.2,"nyctaxi"
20 | "19","parquet_unc",13158666758.2,"nyctaxi"
21 | "20","parquet_snappy",13958228992,"nyctaxi"
22 | "21","rds_unc",22211784820.2,"nyctaxi"
23 | "22","rds_compressed",30765105346.2,"nyctaxi"
24 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/r_read_results_4.csv:
--------------------------------------------------------------------------------
1 | "","expr","time","dataset"
2 | "1","csv_fread",8036938666.6,"fanniemae"
3 | "2","fst_unc",6341601497.2,"fanniemae"
4 | "3","fst_50",5054754967.8,"fanniemae"
5 | "4","feather_v1",9799018014,"fanniemae"
6 | "5","feather_unc",5054201747.4,"fanniemae"
7 | "6","feather_lz4",4928118181,"fanniemae"
8 | "7","feather_zstd",5535553828.6,"fanniemae"
9 | "8","parquet_unc",6281569166.6,"fanniemae"
10 | "9","parquet_snappy",6392237692.6,"fanniemae"
11 | "10","rds_unc",29892887491.4,"fanniemae"
12 | "11","rds_compressed",41273872293.8,"fanniemae"
13 | "12","csv_fread",18312046954,"nyctaxi"
14 | "13","fst_unc",11969350465.6,"nyctaxi"
15 | "14","fst_50",13439147068.6,"nyctaxi"
16 | "15","feather_v1",12034649945,"nyctaxi"
17 | "16","feather_unc",11023961432.2,"nyctaxi"
18 | "17","feather_lz4",11592801001,"nyctaxi"
19 | "18","feather_zstd",12704684877,"nyctaxi"
20 | "19","parquet_unc",12225668849,"nyctaxi"
21 | "20","parquet_snappy",12004466381.6,"nyctaxi"
22 | "21","rds_unc",21847153904,"nyctaxi"
23 | "22","rds_compressed",30735937022.8,"nyctaxi"
24 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/r_write_results_1.csv:
--------------------------------------------------------------------------------
1 | "","expr","time","dataset"
2 | "1","fst_unc",7007023920,"fanniemae"
3 | "2","fst_50",4385196419,"fanniemae"
4 | "3","feather_v1",8656647228,"fanniemae"
5 | "4","feather_unc",10040626659,"fanniemae"
6 | "5","feather_lz4",10818098194,"fanniemae"
7 | "6","feather_zstd",11438481575,"fanniemae"
8 | "7","parquet_unc",10434816898,"fanniemae"
9 | "8","parquet_snappy",10800951873,"fanniemae"
10 | "9","rds_compressed",76929230341,"fanniemae"
11 | "10","rds_unc",24216423401,"fanniemae"
12 | "11","fst_unc",4087879250,"nyctaxi"
13 | "12","fst_50",3950344461,"nyctaxi"
14 | "13","feather_v1",5972294820,"nyctaxi"
15 | "14","feather_unc",5888590985,"nyctaxi"
16 | "15","feather_lz4",8325439328,"nyctaxi"
17 | "16","feather_zstd",10223231254,"nyctaxi"
18 | "17","parquet_unc",7715640740,"nyctaxi"
19 | "18","parquet_snappy",8585539352,"nyctaxi"
20 | "19","rds_compressed",104898052261,"nyctaxi"
21 | "20","rds_unc",10739751088,"nyctaxi"
22 |
--------------------------------------------------------------------------------
/20190919file_benchmarks/r_write_results_4.csv:
--------------------------------------------------------------------------------
1 | "","expr","time","dataset"
2 | "1","fst_unc",7758567831,"fanniemae"
3 | "2","fst_50",3700873556,"fanniemae"
4 | "3","feather_v1",7080591830,"fanniemae"
5 | "4","feather_unc",10413025112,"fanniemae"
6 | "5","feather_lz4",10818213516,"fanniemae"
7 | "6","feather_zstd",11563816777,"fanniemae"
8 | "7","parquet_unc",10814584911,"fanniemae"
9 | "8","parquet_snappy",11152511189,"fanniemae"
10 | "9","rds_compressed",78427148110,"fanniemae"
11 | "10","rds_unc",24919762665,"fanniemae"
12 | "11","fst_unc",4399914353,"nyctaxi"
13 | "12","fst_50",3305661431,"nyctaxi"
14 | "13","feather_v1",5477443720,"nyctaxi"
15 | "14","feather_unc",5864371601,"nyctaxi"
16 | "15","feather_lz4",8494803995,"nyctaxi"
17 | "16","feather_zstd",10073068744,"nyctaxi"
18 | "17","parquet_unc",7675560036,"nyctaxi"
19 | "18","parquet_snappy",8428579617,"nyctaxi"
20 | "19","rds_compressed",108234060692,"nyctaxi"
21 | "20","rds_unc",10717121094,"nyctaxi"
22 |
--------------------------------------------------------------------------------
/20200402pandas_load/.ipynb_checkpoints/Untitled-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import pyarrow as pa\n",
11 | "import numpy as np\n",
12 | "\n",
13 | "num_rows = 1_000_000\n",
14 | "num_columns = 100\n",
15 | "arr = np.random.randn(num_rows)\n",
16 | "dict_of_numpy_arrays = {\n",
17 | " 'f{}'.format(i): arr\n",
18 | " for i in range(num_columns)\n",
19 | "}"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 3,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": []
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 4,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": []
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 8,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "name": "stdout",
43 | "output_type": "stream",
44 | "text": [
45 | "85.7 ms ± 4.55 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
46 | ]
47 | }
48 | ],
49 | "source": [
50 | "%timeit df = pd.DataFrame(dict_of_numpy_arrays)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 11,
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "49.7 ms ± 4.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
63 | ]
64 | }
65 | ],
66 | "source": [
67 | "timeit df = pa.table(dict_of_numpy_arrays).to_pandas()"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 12,
73 | "metadata": {},
74 | "outputs": [
75 | {
76 | "name": "stdout",
77 | "output_type": "stream",
78 | "text": [
79 | "66.9 ms ± 4.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
80 | ]
81 | }
82 | ],
83 | "source": [
84 | "timeit df = pa.table(dict_of_numpy_arrays).to_pandas(use_threads=False)"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": []
93 | }
94 | ],
95 | "metadata": {
96 | "kernelspec": {
97 | "display_name": "Python 3",
98 | "language": "python",
99 | "name": "python3"
100 | },
101 | "language_info": {
102 | "codemirror_mode": {
103 | "name": "ipython",
104 | "version": 3
105 | },
106 | "file_extension": ".py",
107 | "mimetype": "text/x-python",
108 | "name": "python",
109 | "nbconvert_exporter": "python",
110 | "pygments_lexer": "ipython3",
111 | "version": "3.7.6"
112 | }
113 | },
114 | "nbformat": 4,
115 | "nbformat_minor": 4
116 | }
117 |
--------------------------------------------------------------------------------
/20200402pandas_load/Untitled.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pandas as pd\n",
10 | "import pyarrow as pa\n",
11 | "import numpy as np\n",
12 | "\n",
13 | "num_rows = 1_000_000\n",
14 | "num_columns = 100\n",
15 | "arr = np.random.randn(num_rows)\n",
16 | "dict_of_numpy_arrays = {\n",
17 | " 'f{}'.format(i): arr\n",
18 | " for i in range(num_columns)\n",
19 | "}"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 3,
25 | "metadata": {},
26 | "outputs": [],
27 | "source": []
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 4,
32 | "metadata": {},
33 | "outputs": [],
34 | "source": []
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 8,
39 | "metadata": {},
40 | "outputs": [
41 | {
42 | "name": "stdout",
43 | "output_type": "stream",
44 | "text": [
45 | "85.7 ms ± 4.55 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
46 | ]
47 | }
48 | ],
49 | "source": [
50 | "%timeit df = pd.DataFrame(dict_of_numpy_arrays)"
51 | ]
52 | },
53 | {
54 | "cell_type": "code",
55 | "execution_count": 11,
56 | "metadata": {},
57 | "outputs": [
58 | {
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "49.7 ms ± 4.12 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
63 | ]
64 | }
65 | ],
66 | "source": [
67 | "timeit df = pa.table(dict_of_numpy_arrays).to_pandas()"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 12,
73 | "metadata": {},
74 | "outputs": [
75 | {
76 | "name": "stdout",
77 | "output_type": "stream",
78 | "text": [
79 | "66.9 ms ± 4.6 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
80 | ]
81 | }
82 | ],
83 | "source": [
84 | "timeit df = pa.table(dict_of_numpy_arrays).to_pandas(use_threads=False)"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": null,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": []
93 | }
94 | ],
95 | "metadata": {
96 | "kernelspec": {
97 | "display_name": "Python 3",
98 | "language": "python",
99 | "name": "python3"
100 | },
101 | "language_info": {
102 | "codemirror_mode": {
103 | "name": "ipython",
104 | "version": 3
105 | },
106 | "file_extension": ".py",
107 | "mimetype": "text/x-python",
108 | "name": "python",
109 | "nbconvert_exporter": "python",
110 | "pygments_lexer": "ipython3",
111 | "version": "3.7.6"
112 | }
113 | },
114 | "nbformat": 4,
115 | "nbformat_minor": 4
116 | }
117 |
--------------------------------------------------------------------------------
/20200509wideparquet/WideParquet.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pyarrow as pa\n",
10 | "import pyarrow.parquet as pq\n",
11 | "import numpy as np\n",
12 | "import os\n",
13 | "import pandas as pd\n",
14 | "import time\n",
15 | "\n",
16 | "pa.set_cpu_count(8)\n",
17 | "\n",
18 | "def get_timing(f, niter):\n",
19 | " start = time.clock_gettime(time.CLOCK_REALTIME)\n",
20 | " for i in range(niter):\n",
21 | " f()\n",
22 | " result = (time.clock_gettime(time.CLOCK_REALTIME) - start) / niter\n",
23 | " return result"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 2,
29 | "metadata": {},
30 | "outputs": [],
31 | "source": [
32 | "total_num_values = 100_000_000"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 4,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "def make_example_table(num_cols):\n",
42 | " num_rows = total_num_values // num_cols\n",
43 | " \n",
44 | " values = np.arange(num_rows)\n",
45 | " \n",
46 | " return pa.table([values] * num_cols, \n",
47 | " names=['f{}'.format(i) for i in range(num_cols)])\n",
48 | " "
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 5,
54 | "metadata": {},
55 | "outputs": [],
56 | "source": [
57 | "t1 = make_example_table(100)\n",
58 | "t2 = make_example_table(100000)"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 6,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "import pyarrow.feather as fth"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 17,
73 | "metadata": {},
74 | "outputs": [],
75 | "source": [
76 | "fth.write_feather(t1, 't1.arrow', compression=None)\n",
77 | "fth.write_feather(t2, 't2.arrow', compression=None)"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 18,
83 | "metadata": {},
84 | "outputs": [
85 | {
86 | "name": "stdout",
87 | "output_type": "stream",
88 | "text": [
89 | "total 2519376\r\n",
90 | "-rw------- 1 wesm wesm 800088522 May 9 16:44 t1.arrow\r\n",
91 | "-rw------- 1 wesm wesm 815199522 May 9 16:44 t2.arrow\r\n",
92 | "-rw------- 1 wesm wesm 964513790 May 9 16:23 test.parquet\r\n",
93 | "-rw------- 1 wesm wesm 17815 May 9 16:42 WideParquet.ipynb\r\n"
94 | ]
95 | }
96 | ],
97 | "source": [
98 | "!ls -l"
99 | ]
100 | },
101 | {
102 | "cell_type": "code",
103 | "execution_count": 15,
104 | "metadata": {},
105 | "outputs": [
106 | {
107 | "name": "stdout",
108 | "output_type": "stream",
109 | "text": [
110 | "118 ms ± 1.28 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
111 | ]
112 | }
113 | ],
114 | "source": [
115 | "%timeit fth.read_table('t1.arrow')"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 16,
121 | "metadata": {},
122 | "outputs": [
123 | {
124 | "name": "stdout",
125 | "output_type": "stream",
126 | "text": [
127 | "398 ms ± 10.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
128 | ]
129 | }
130 | ],
131 | "source": [
132 | "%timeit fth.read_table('t2.arrow')"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 13,
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "fth.write_feather?"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 27,
147 | "metadata": {},
148 | "outputs": [
149 | {
150 | "name": "stdout",
151 | "output_type": "stream",
152 | "text": [
153 | "10\n",
154 | "100\n",
155 | "1000\n",
156 | "10000\n",
157 | "100000\n"
158 | ]
159 | }
160 | ],
161 | "source": [
162 | "num_cols_cases = [10, 100, 1000, 10000, 100000]\n",
163 | "\n",
164 | "file_path = 'test.parquet'\n",
165 | "\n",
166 | "file_sizes = {}\n",
167 | "read_times = {}\n",
168 | "\n",
169 | "for num_cols in num_cols_cases:\n",
170 | " print(num_cols)\n",
171 | "\n",
172 | " table = make_example_table(num_cols)\n",
173 | " \n",
174 | " pq.write_table(table, file_path, compression='NONE')\n",
175 | " \n",
176 | " file_sizes[num_cols] = os.stat(file_path).st_size\n",
177 | " read_times[num_cols] = get_timing(lambda: pq.read_table(file_path), 10)"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 28,
183 | "metadata": {},
184 | "outputs": [
185 | {
186 | "data": {
187 | "text/plain": [
188 | "{10: 802850724,\n",
189 | " 100: 827973796,\n",
190 | " 1000: 1013094899,\n",
191 | " 10000: 979191400,\n",
192 | " 100000: 964513790}"
193 | ]
194 | },
195 | "execution_count": 28,
196 | "metadata": {},
197 | "output_type": "execute_result"
198 | }
199 | ],
200 | "source": [
201 | "file_sizes"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": 29,
207 | "metadata": {},
208 | "outputs": [
209 | {
210 | "data": {
211 | "text/plain": [
212 | "{10: 0.10510084629058838,\n",
213 | " 100: 0.07333686351776122,\n",
214 | " 1000: 0.12494065761566162,\n",
215 | " 10000: 0.5032524585723877,\n",
216 | " 100000: 3.8229554891586304}"
217 | ]
218 | },
219 | "execution_count": 29,
220 | "metadata": {},
221 | "output_type": "execute_result"
222 | }
223 | ],
224 | "source": [
225 | "read_times"
226 | ]
227 | },
228 | {
229 | "cell_type": "code",
230 | "execution_count": 30,
231 | "metadata": {},
232 | "outputs": [
233 | {
234 | "data": {
235 | "text/plain": [
236 | ""
237 | ]
238 | },
239 | "execution_count": 30,
240 | "metadata": {},
241 | "output_type": "execute_result"
242 | },
243 | {
244 | "data": {
245 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEgCAYAAACkfIiyAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAQ40lEQVR4nO3dbYxcZ3nG8f9Vm6BSAqGxScEvOCoO4A8EgUlQ1aqhCHBAqkspJQE1TRRkRRBKPyAwX9pShAillSjKi2WBCfnQutCi4oIbI9EGWkEgNoUUJ3VihbxsTInDS1CLaORw98NMMuNhvDs24z27z/x/0spznvNk9s7R6tpnn7nnTKoKSdLy9wtdFyBJmg4DXZIaYaBLUiMMdElqhIEuSY0w0CWpEZ0GepJdSR5K8q0J5j4nyReS3J7kliRrF6NGSVouul6h3whsmXDuXwI3VdULgT8HPnC6ipKk5ajTQK+qLwHfHx5L8qtJbk5yIMm/JXl+/9Qm4Av9x/8KbF3EUiVpyet6hT7OTuDtVfUS4J3A9f3xbwKv7z9+HXBmkrM7qE+SlqSVXRcwLMlTgV8DPpXk8eEn9/99J3BtksuBLwEPAscWu0ZJWqqWVKDT+4vhh1X1otETVXUE+F14IvhfX1WPLHJ9krRkLaktl6r6EfDtJG8ASM/5/cerkjxe73uAXR2VKUlLUtdti38LfAV4XpK5JFcCbwauTPJN4CCDFz8vAg4luQs4B3h/ByVL0pIVb58rSW1YUlsukqRTZ6BLUiM663JZtWpVbdiwoatvL0nL0oEDBx6uqtXjznUW6Bs2bGD//v1dfXtJWpaS3Heic265SFIjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUiAUDfaHP/ezfEfEjSQ73P+/zxdMvU5K0kEneWHQjcC1w0wnOXwxs7H9dCNzQ/1fqxIbtn+u6BO695rVdl6AZtOAKfdznfo7YSu/Dm6uqbgXOSvKsaRUoSZrMNPbQ1wAPDB3P9cckSYtoGoGeMWNjb7KeZFuS/Un2Hz16dArfWpL0uGkE+hywbuh4LXBk3MSq2llVm6tq8+rVY28WJkk6RdMI9D3AZf1ul5cBj1TVd6bwvJKkk7Bgl0v/cz8vAlYlmQP+FHgSQFXtAPYCrwEOAz8GrjhdxUqSTmzBQK+qSxc4X8DbplaRpKmxhXO2+E5RSWqEgS5JjejsI+gkaTHNwvaTK3RJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREruy5A07Fh++e6LoF7r3lt1yVIM80VuiQ1YqJAT7IlyaEkh5NsH3P+6Un+Kck3kxxMcsX0S5UkzWfBQE+yArgOuBjYBFyaZNPItLcBd1TV+cBFwF8lOWPKtUqS5jHJCv0C4HBV3VNVjwK7ga0jcwo4M0mApwLfB45NtVJJ0rwmeVF0DfDA0PEccOHInGuBPcAR4EzgjVX106lUOA9fCJSkgUlW6BkzViPHrwa+ATwbeBFwbZKn/cwTJduS7E+y/+jRoyddrCTpxCYJ9Dlg3dDxWnor8WFXAJ+unsPAt4Hnjz5RVe2sqs1VtXn16tWnWrMkaYxJAv02YGOSc/svdF5Cb3tl2P3AKwCSnAM8D7hnmoVKkua34B56VR1LcjWwD1gB7Kqqg0mu6p/fAbwPuDHJf9Lbonl3VT18GuuWJI2Y6J2iVbUX2DsytmPo8RHgVdMtTZJ0MnynqCQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQI
A12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakREwV6ki1JDiU5nGT7CeZclOQbSQ4m+eJ0y5QkLWTlQhOSrACuA14JzAG3JdlTVXcMzTkLuB7YUlX3J3nm6SpYkjTeJCv0C4DDVXVPVT0K7Aa2jsx5E/DpqrofoKoemm6ZkqSFTBLoa4AHho7n+mPDzgOekeSWJAeSXDatAiVJk1lwywXImLEa8zwvAV4B/CLwlSS3VtVdxz1Rsg3YBrB+/fqTr1aSdEKTrNDngHVDx2uBI2Pm3FxV/1tVDwNfAs4ffaKq2llVm6tq8+rVq0+1ZknSGJME+m3AxiTnJjkDuATYMzLnM8BvJFmZ5CnAhcCd0y1VkjSfBbdcqupYkquBfcAKYFdVHUxyVf/8jqq6M8nNwO3AT4GPVtW3TmfhkqTjTbKHTlXtBfaOjO0YOf4Q8KHplSZJOhm+U1SSGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUiIkCPcmWJIeSHE6yfZ55L03yWJLfm16JkqRJLBjoSVYA1wEXA5uAS5NsOsG8DwL7pl2kJGlhk6zQLwAOV9U9VfUosBvYOmbe24F/AB6aYn2SpAlNEuhrgAeGjuf6Y09IsgZ4HbBjeqVJkk7GJIGeMWM1cvxh4N1V9di8T5RsS7I/yf6jR49OWqMkaQIrJ5gzB6wbOl4LHBmZsxnYnQRgFfCaJMeq6h+HJ1XVTmAnwObNm0d/KUiSfg6TBPptwMYk5wIPApcAbxqeUFXnPv44yY3AZ0fDXJJ0ei0Y6FV1LMnV9LpXVgC7qupgkqv65903l6QlYJIVOlW1F9g7MjY2yKvq8p+/LEnSyfKdopLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGTBToSbYkOZTkcJLtY86/Ocnt/a8vJzl/+qVKkuazYKAnWQFcB1wMbAIuTbJpZNq3gd+sqhcC7wN2TrtQSdL8JlmhXwAcrqp7qupRYDewdXhCVX25qn7QP7wVWDvdMiVJC5kk0NcADwwdz/XHTuRK4J9/nqIkSSdv5QRzMmasxk5MXk4v0H/9BOe3AdsA1q9fP2GJkqRJTLJCnwPWDR2vBY6MTkryQuCjwNaq+t64J6qqnVW1uao2r169+lTqlSSdwCSBfhuwMcm5Sc4ALgH2DE9Ish74NPAHVXXX9MuUJC1kwS2XqjqW5GpgH7AC2FVVB5Nc1T+/A/gT4Gzg+iQAx6pq8+krW5I0apI9dKpqL7B3ZGzH0OO3AG+ZbmmSpJPhO0UlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjZgo0JNsSXIoyeEk28ecT5KP9M/fnuTF0y9VkjSfBQM9yQrgOuBiYBNwaZJNI9MuBjb2v7YBN0y5TknSAiZZoV8AHK6qe6rqUWA3sHVkzlbgpuq5FTgrybOmXKskaR6TBPoa4IGh47n+2MnOkSSdRisnmJMxY3UKc0iyjd6WDMD/JDk0
wfc/3VYBD5/qf5wPTrGS7nktBrwWA16LgaVwLZ5zohOTBPocsG7oeC1w5BTmUFU7gZ0TfM9Fk2R/VW3uuo6lwGsx4LUY8FoMLPVrMcmWy23AxiTnJjkDuATYMzJnD3BZv9vlZcAjVfWdKdcqSZrHgiv0qjqW5GpgH7AC2FVVB5Nc1T+/A9gLvAY4DPwYuOL0lSxJGmeSLReqai+90B4e2zH0uIC3Tbe0RbOktoA65rUY8FoMeC0GlvS1SC+LJUnLnW/9l6RGGOiS1AgDXZIaYaBLUiMm6nJpRZKnA+8BfgdY3R9+CPgMcE1V/bCr2rqQJPTu1bOG3jt7jwBfqxl8pTzJq+n9XAxfi89U1c2dFtahJL9Mr4ntB13X0pV+Zmzh+J+LfUs1K2Zthf5J4AfARVV1dlWdDby8P/apTitbZEleBdwN/Bm99xC8FngvcHf/3MxI8mHgHcAXgb8APtR//EdJ/rrL2hZbkvVJdic5CnwVuC3JQ/2xDd1Wt7iSXAZ8HbgIeArwS/Ty4kD/3JIzU22LSQ5V1fNO9lyLktwJXFxV946MnwvsraoXdFJYB5LcVVXnjRkPcFdVbeygrE4k+QrwYeDvq+qx/tgK4A3AH1fVy7qsbzH17zV14ehqPMkzgK+O+5np2qyt0O9L8q4k5zw+kOScJO/m+LtFzoKV9O7BM+pB4EmLXEvXfpLkgjHjLwV+stjFdGxVVf3d42EOUFWPVdVu4OwO6+pCGHOTQeCnjL8hYedmag8deCOwHfhikmf2x75L7140b+isqm7sovfn9G4Gv8zW0btXz8c6q6oblwM3JDmTwS+5dcCP+udmyYEk1wOf4Pifiz8E/qOzqrrxfuDrST7P4FqsB14JvK+zquYxU1su80lyRVV9vOs6FlP/k6d+m94LPqEXZnuq6o5OC+tIkl9h6FpU1X93XNKi69+A70p6H1pz3M8F8LGq+r8Oy1t0/e2VV3P8tdi3VF8oNtD7ktxfVeu7rkPdsONH81kuHT8zteWS5PYTnQLOOcG5JtnCOdDv6rmeXtfPg/3htcBzk7y1qj7fWXEdsIWzJ8l6el1PvwU80hvK04B/AbaPNhQsBTO1Qk/yXXp/Po3+lg3w5ap69uJX1Y0k++j9YH7i8a2F/pbD5cArquqVHZa3qOz4Gei3cJ4H3MTg9YS1wGXA3VX1jq5qW2zLseNn1gL9Y8DHq+rfx5z7m6p6UwdldcIWzoEkdwMvqKpjI+NnAHdU1XO7qWzx2cI5kOTuE/3/zneuSzO15VJVV85zbmbCvO++JO+it0L/LvRaOOmt0GethdOOn4GfJLmgqr42Mj6LLZzLruNnplboGui/er+dXjfDaAvnNUv9xZ9ps+OnJ8mLgRuAcS2cb62qA13VttiWY8ePga6fMYstnDqeLZzLk4GunzFrLZx2/BzPFs6B5dbxY6DPqAVaOM+rqicvZj1dsuNnYL4WTnpbLjPTwrkcO34M9BllC+eAHT8DtnAOLMeOn1m7OZcGPgs8taruG/m6F7il29IWnTdtG/CmbQPL7qZtrtA18+z4GUjyHuD3gXEtnJ+sqg90VdtiW44dPwa6NI9Z7PixhfN4y6njx0CX5jFrHT863nLr+Jmpd4pK43jTtgFbOAeW403bXKFr5tnxM2AL58By7PhxhS4NOn6+MXoiyS2LX06nNlTVB4cH+sF+TZIrOqqpK8uu48dA18zzpm3H8aZtA8vupm1uuUh6gi2cx1tuHT8GuqSJzGIL53LjO0UlTeq9XRewmJI8Pck1Sf4ryff6X3f2x87qur5x3EOX9ARbOI/zSXodPxeN6fj5FLDkOn7ccpH0BFs4B5bjTdtcoUsaZgvnwLLr+HGFLkljLMeOHwNdkk7SUu34MdAl6SQt1Zu2uYcuSWMsx44fA12SxjuHeTp+Fr+chRnokjTesuv4cQ9dkhrhW/8lqREGuiQ1wkCXpEYY6JLUCANdkhrx/4TufEphIn9cAAAAAElFTkSuQmCC
\n",
246 | "text/plain": [
247 | ""
248 | ]
249 | },
250 | "metadata": {
251 | "needs_background": "light"
252 | },
253 | "output_type": "display_data"
254 | }
255 | ],
256 | "source": [
257 | "pd.Series(file_sizes).plot.bar()"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 21,
263 | "metadata": {},
264 | "outputs": [
265 | {
266 | "data": {
267 | "text/plain": [
268 | ""
269 | ]
270 | },
271 | "execution_count": 21,
272 | "metadata": {},
273 | "output_type": "execute_result"
274 | },
275 | {
276 | "data": {
277 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEVCAYAAADwyx6sAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAARcUlEQVR4nO3dbYxcZ3nG8f9VYwotlLR4S9LYjpEwFS9SIXJNEF9cCiUJUdMP0AYkUiIkizcVJCQa+ABFVdXQSojSQCxLCRCJkoaCQgROA1J5i9oEHNcJJAZiIWhMUmICOFi8yfTuhzlh1pPZndl4do/3mf9PGuXMc56duXO0uvb4mfucSVUhSVr/fq3vAiRJs2GgS1IjDHRJaoSBLkmNMNAlqREGuiQ14jF9vfGmTZtq27Ztfb29JK1Lt99++/eramHcvt4Cfdu2bezfv7+vt5ekdSnJd5ba55KLJDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqRG9XVgkSWtp2+Wf7rsEvn3FS1f19T1Dl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRkwM9CSPS/LlJHckuSvJu8bM2ZXkWJKD3eMdq1OuJGkp01z6/3PghVV1PMlG4JYkN1XVrSPzvlRVF82+REnSNCYGelUVcLx7urF71GoWJUlauanW0JNsSHIQeAD4bFXdNmba87tlmZuSPGumVUqSJpoq0Kvql1X1HGAzsDPJs0emHADOqao/AP4ZuGHc6yTZnWR/kv1Hjx49lbolSSNW1OVSVT8CPg+cPzL+UFUd77b3ARuTbBrz83urakdV7VhYWHj0VUuSHmGaLpeFJGd0248HXgR8fWTOmUnSbe/sXvfB2ZcrSVrKNF0uZwEfTrKBQVBfX1WfSvJagKraA7wMeF2SE8BPgUu6D1MlSWtkmi6XO4Hnjhnfs2j7SuDK2ZYmSVoJrxSVpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGjEx0JM8LsmXk9yR5K4k7xozJ0nel+RwkjuTnLs65UqSljLxS6KBnwMvrKrjSTYCtyS5qapuXTTnAmB793gecFX3X0nSGpl4hl4Dx7unG7tHjUy7GLi2m3srcEaSs2ZbqiRpOVOtoSfZkOQg8ADw2aq6bWTK2cC9i54f6cYkSWtkqkCvql9W1XOAzcDOJM8emZJxPzY6kGR3kv1J9h89enTl1UqSlrSiLpeq+hHweeD8kV1HgC2Lnm8G7hvz83urakdV7VhYWFhhqZKk5UzT5bKQ5Ixu+/HAi4Cvj0y7Ebi063Y5DzhWVffPvFpJ0pKm6XI5C/hwkg0M/gBcX1WfSvJagKraA+wDLgQOAz8BLluleiVJS5gY6FV1J/DcMeN7Fm0X8IbZliZJWgmvFJWkRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1IhpviR6S5LPJTmU5K4kbxozZ1eSY0kOdo93rE65kqSlTPMl0SeAt1TVgSRPBG5P8tmquntk3peq6qLZlyhJmsbEM/Squr+qDnTbPwYOAWevdmGSpJVZ0Rp6km3Ac4Hbxux+fpI7ktyU5FkzqE2StALTLLkAkOQJwMeBN1fVQyO7DwDnVNXxJBcCNwDbx7zGbmA3wNatWx910ZKkR5rqDD3JRgZh/pGq+sTo/qp6qKqOd9v7gI1JNo2Zt7eqdlTVjoWFhVMsXZK02DRdLgGuBg5V1XuWmHNmN48kO7vXfXCWhUqSljfNkssLgFcBX01ysBt7O7AVoKr2AC8DXpfkBPBT4JKqqlWoV5K0hImB
XlW3AJkw50rgylkVJUlaOa8UlaRGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhoxMdCTbEnyuSSHktyV5E1j5iTJ+5IcTnJnknNXp1xJ0lImfkk0cAJ4S1UdSPJE4PYkn62quxfNuQDY3j2eB1zV/VeStEYmnqFX1f1VdaDb/jFwCDh7ZNrFwLU1cCtwRpKzZl6tJGlJK1pDT7INeC5w28ius4F7Fz0/wiNDX5K0iqYO9CRPAD4OvLmqHhrdPeZHasxr7E6yP8n+o0ePrqxSSdKypgr0JBsZhPlHquoTY6YcAbYser4ZuG90UlXtraodVbVjYWHh0dQrSVrCNF0uAa4GDlXVe5aYdiNwadftch5wrKrun2GdkqQJpulyeQHwKuCrSQ52Y28HtgJU1R5gH3AhcBj4CXDZ7EuVJC1nYqBX1S2MXyNfPKeAN8yqKEnSynmlqCQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktSIiYGe5JokDyT52hL7dyU5luRg93jH7MuUJE0y8UuigQ8BVwLXLjPnS1V10UwqkiQ9KhPP0Kvqi8AP1qAWSdIpmNUa+vOT3JHkpiTPmtFrSpJWYJoll0kOAOdU1fEkFwI3ANvHTUyyG9gNsHXr1hm8tSTpYad8hl5VD1XV8W57H7AxyaYl5u6tqh1VtWNhYeFU31qStMgpB3qSM5Ok297ZveaDp/q6kqSVmbjkkuSjwC5gU5IjwDuBjQBVtQd4GfC6JCeAnwKXVFWtWsWSpLEmBnpVvWLC/isZtDVKknrklaKS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhoxMdCTXJPkgSRfW2J/krwvyeEkdyY5d/ZlSpImmeYM/UPA+cvsvwDY3j12A1edelmSpJWaGOhV9UXgB8tMuRi4tgZuBc5IctasCpQkTWcWa+hnA/cuen6kG5MkraFZBHrGjNXYicnuJPuT7D969OgM3lqS9LBZBPoRYMui55uB+8ZNrKq9VbWjqnYsLCzM4K0lSQ+bRaDfCFzadbucBxyrqvtn8LqSpBV4zKQJST4K7AI2JTkCvBPYCFBVe4B9wIXAYeAnwGWrVawkaWkTA72qXjFhfwFvmFlFkqRHxStFJakRBrokNcJAl6RGGOiS1AgDXZIaYaBLUiMMdElqhIEuSY0w0CWpEQa6JDXCQJekRhjoktQIA12SGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEZMFehJzk/yjSSHk1w+Zv+uJMeSHOwe75h9qZKk5Uz8kugkG4D3Ay8GjgBfSXJjVd09MvVLVXXRKtQoSZrCNGfoO4HDVfWtqvoFcB1w8eqWJUlaqWkC/Wzg3kXPj3Rjo56f5I4kNyV51kyqkyRNbeKSC5AxYzXy/ABwTlUdT3IhcAOw/REvlOwGdgNs3bp1haVKkpYzzRn6EWDLouebgfsWT6iqh6rqeLe9D9iYZNPoC1XV3qraUVU7FhYWTqFsSdKoac7QvwJsT/JU4LvAJcArF09IcibwvaqqJDsZ/KF4cNbFSlqZbZd/uu8S+PYVL+27hLkxMdCr6kSSNwI3AxuAa6rqriSv7fbvAV4GvC7JCeCnwCVVNbosI0laRdOcoT+8jLJvZGzPou0rgStnW5okaSW8UlSSGmGgS1IjDHRJaoSBLkmNMNAlqREGuiQ1wkCXpEYY6JLUCANdkhphoEtSIwx0SWqEgS5JjTDQJakRBrokNcJAl6RGTHU/dGk98Vt6NK88Q5ekRqzrM3TPxCRpaF0H
uob84yZpqiWXJOcn+UaSw0kuH7M/Sd7X7b8zybmzL1WStJyJgZ5kA/B+4ALgmcArkjxzZNoFwPbusRu4asZ1SpImmOYMfSdwuKq+VVW/AK4DLh6ZczFwbQ3cCpyR5KwZ1ypJWsY0gX42cO+i50e6sZXOkSStomk+FM2YsXoUc0iym8GSDMDxJN+Y4v1X2ybg+4/2h/PuGVbSP4/FkMdiyGMxdDoci3OW2jFNoB8Btix6vhm471HMoar2AnuneM81k2R/Ve3ou47TgcdiyGMx5LEYOt2PxTRLLl8Btid5apLHApcAN47MuRG4tOt2OQ84VlX3z7hWSdIyJp6hV9WJJG8EbgY2ANdU1V1JXtvt3wPsAy4EDgM/AS5bvZIlSeNMdWFRVe1jENqLx/Ys2i7gDbMtbc2cVktAPfNYDHkshjwWQ6f1scggiyVJ650355KkRhjoktQIA12SGmGgS1Ij5ur2uUmeBLwN+DNgoRt+APgkcEVV/aiv2vqQJAzu1XM2gyt77wO+XHP4SXmSlzD4vVh8LD5ZVf/ea2E9SvI7DJrYfth3LX3pMuN8Tv69uPl0zYp5O0O/HvghsKuqnlxVTwb+qBv7WK+VrbEkfwLcA/wNg2sIXgq8C7in2zc3krwXeBPwBeAfgH/stv8qyT/1WdtaS7I1yXVJjgK3AV9J8kA3tq3f6tZWkkuBA8Au4DeA32SQF7d3+047c9W2mOQbVfX7K93XoiSHgAuq6tsj408F9lXVM3oprAdJvllVTx8zHuCbVbW9h7J6keS/gPcC/1ZVv+zGNgAvB95cVef1Wd9a6u419bzRs/Ekvw3cNu53pm/zdob+nSRvTfKUhweSPCXJX3Py3SLnwWMY3INn1HeBjWtcS99+lmTnmPE/BH621sX0bFNV/evDYQ5QVb+squuAJ/dYVx/CmJsMAv/H+BsS9m6u1tCBvwAuB76Q5He7se8xuBfNy3urqh/XMPjn9HUM/5htYXCvnqt7q6ofrwauSvJEhn/ktgAPdfvmye1JPgB8mJN/L/4S+O/equrH3wEHknyG4bHYCrwY+NveqlrGXC25LCfJZVX1wb7rWEvdN0/9KYMPfMIgzG6sqrt7LawnSc5k0bGoqv/tuaQ1192A7zUMvrTmpN8L4Oqq+nmP5a25bnnlJZx8LG4+XT8oNtA7Sf6nqrb2XYf6YcePlrNeOn7masklyZ1L7QKessS+JtnCOdR19XyAQdfPd7vhzcDTkry+qj7TW3E9sIVzIMlWBl1PLwSODYbyW8B/AJePNhScDubqDD3J9xj882n0r2yA/6yq31v7qvqR5GYGv5gffnhpoVtyeDXwx1X14h7LW1N2/Ax1LZxPB65l+HnCZuBS4J6qelNfta219djxM2+BfjXwwaq6Zcy+f6mqV/ZQVi9s4RxKcg/wjKo6MTL+WODuqnpaP5WtPVs4h5Lcs9T/73L7+jRXSy5V9Zpl9s1NmHe+k+StDM7QvweDFk4GZ+jz1sJpx8/Qz5LsrKovj4zPYwvnuuv4maszdA11n95fzqCbYbSF84rT/cOfWbPjZyDJucBVwLgWztdX1e191bbW1mPHj4GuR5jHFk6dzBbO9clA1yPMWwunHT8ns4VzaL11/Bjoc2pCC+fTq+rX17KePtnxM7RcCyeDJZe5aeFcjx0/BvqcsoVzyI6fIVs4h9Zjx8+83ZxLQ58CnlBV3xl5fBv4fL+lrTlv2jbkTduG1t1N2zxD19yz42coyduAPwfGtXBeX1V/31dta209dvwY6NIy5rHjxxbOk62njh8DXVrGvHX86GTrreNnrq4Ulcbxpm1DtnAOrcebtnmGrrlnx8+QLZxD67HjxzN0adjxc3B0R5LPr305vdpWVe9ePNAF+xVJLuuppr6su44fA11zz5u2ncSbtg2tu5u2ueQi6Vds4TzZeuv4MdAlTWUeWzjXG68UlTStd/VdwFpK8qQkVyT5epIHu8ehbuyMvusbxzV0Sb9iC+dJrmfQ
8bNrTMfPx4DTruPHJRdJv2IL59B6vGmbZ+iSFrOFc2jddfx4hi5JY6zHjh8DXZJW6HTt+DHQJWmFTtebtrmGLkljrMeOHwNdksZ7Cst0/Kx9OZMZ6JI03rrr+HENXZIa4aX/ktQIA12SGmGgS1IjDHRJaoSBLkmN+H+NiAucWRIu8wAAAABJRU5ErkJggg==\n",
278 | "text/plain": [
279 | ""
280 | ]
281 | },
282 | "metadata": {
283 | "needs_background": "light"
284 | },
285 | "output_type": "display_data"
286 | }
287 | ],
288 | "source": [
289 | "pd.Series(read_times).plot.bar()"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {},
296 | "outputs": [],
297 | "source": []
298 | }
299 | ],
300 | "metadata": {
301 | "kernelspec": {
302 | "display_name": "Python 3",
303 | "language": "python",
304 | "name": "python3"
305 | },
306 | "language_info": {
307 | "codemirror_mode": {
308 | "name": "ipython",
309 | "version": 3
310 | },
311 | "file_extension": ".py",
312 | "mimetype": "text/x-python",
313 | "name": "python",
314 | "nbconvert_exporter": "python",
315 | "pygments_lexer": "ipython3",
316 | "version": "3.7.3"
317 | }
318 | },
319 | "nbformat": 4,
320 | "nbformat_minor": 2
321 | }
322 |
--------------------------------------------------------------------------------
/peak_use.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from pandas.util.testing import rands
3 |
4 | import pyarrow as pa
5 | import pyarrow.parquet as pq
6 |
7 | import gc
8 |
# Number of bytes in one GiB (2**30). memory_use divides its byte deltas by
# this value, so the figures it prints are in GiB.
GB = 1 << 30
10 |
class memory_use:
    """Context manager that prints how Arrow memory counters changed.

    The baselines are sampled when the object is constructed: the value of
    ``pa.total_allocated_bytes()`` and the ``max_memory()`` reading of
    ``pa.default_memory_pool()``. On exit a garbage collection is forced and
    the difference of each counter from its baseline is printed, divided by
    ``GB``.
    """

    def __init__(self):
        self.start_use = pa.total_allocated_bytes()
        self.pool = pa.default_memory_pool()
        self.start_peak_use = self.pool.max_memory()

    def __enter__(self):
        # Hand back the manager itself so ``with memory_use() as m`` binds a
        # usable object instead of None.
        return self

    def __exit__(self, type, value, traceback):
        # Collect first so objects that are merely unreachable do not count
        # towards the reported "memory use" delta.
        gc.collect()
        print("Change in memory use: {}"
              .format((pa.total_allocated_bytes() - self.start_use) / GB))
        print("Change in peak use: {}"
              .format((self.pool.max_memory() - self.start_peak_use) / GB))
27 |
28 |
# Load the test Parquet file inside memory_use so the change in Arrow memory
# counters caused by the read is printed when the block exits.
with memory_use():
    table = pq.read_table('/tmp/test.parquet')
31 |
--------------------------------------------------------------------------------
/scripts/20190903_parquet_benchmark.py:
--------------------------------------------------------------------------------
1 | import json
2 | import numpy as np
3 | import pyarrow as pa
4 | import pyarrow.parquet as pq
5 | from pandas.util.testing import rands
6 | import gc
7 | import time
8 |
9 |
class memory_use:
    """Context manager that prints how Arrow memory counters changed.

    The baselines are sampled when the object is constructed: the value of
    ``pa.total_allocated_bytes()`` and the ``max_memory()`` reading of
    ``pa.default_memory_pool()``. On exit a garbage collection is forced and
    the raw difference of each counter from its baseline is printed.
    """

    def __init__(self):
        self.start_use = pa.total_allocated_bytes()
        self.pool = pa.default_memory_pool()
        self.start_peak_use = self.pool.max_memory()

    def __enter__(self):
        # Hand back the manager itself so ``with memory_use() as m`` binds a
        # usable object instead of None.
        return self

    def __exit__(self, type, value, traceback):
        # Collect first so objects that are merely unreachable do not count
        # towards the reported "memory use" delta.
        gc.collect()
        print("Change in memory use: {}"
              .format(pa.total_allocated_bytes() - self.start_use))
        print("Change in peak use: {}"
              .format(self.pool.max_memory() - self.start_peak_use))
26 |
27 |
def generate_strings(string_size, nunique, length, random_order=True):
    """Build an object array of ``length`` strings with ``nunique`` values.

    Parameters
    ----------
    string_size : int
        Passed to ``rands`` for every unique value that is generated.
    nunique : int
        Number of distinct strings to generate.
    length : int
        Number of elements in the returned array.
    random_order : bool, default True
        If true, each element is picked uniformly at random from the unique
        values. Otherwise the unique values appear in generation order, each
        as one contiguous run.

    Returns
    -------
    numpy.ndarray
        One-dimensional array with ``dtype=object``.
    """
    uniques = np.array([rands(string_size) for _ in range(nunique)],
                       dtype='O')
    if random_order:
        indices = np.random.randint(0, nunique, size=length).astype('i4')
        return uniques.take(indices)

    # Give every unique value ``length // nunique`` repeats and hand the
    # remainder out one extra repeat at a time to the leading values, so the
    # result always has exactly ``length`` elements. When ``length`` is a
    # multiple of ``nunique`` this equals a plain scalar repeat.
    base, extra = divmod(length, nunique)
    counts = np.full(nunique, base, dtype=np.intp)
    counts[:extra] += 1
    return uniques.repeat(counts)
35 |
36 |
def generate_dict_strings(string_size, nunique, length, random_order=True):
    """Build a dictionary-encoded array of ``length`` strings.

    Parameters
    ----------
    string_size : int
        Passed to ``rands`` for every dictionary value that is generated.
    nunique : int
        Number of dictionary values to generate.
    length : int
        Number of indices (elements) in the returned array.
    random_order : bool, default True
        If true, indices are drawn uniformly at random from
        ``[0, nunique)``. Otherwise the indices are non-decreasing, each
        dictionary entry appearing as one contiguous run.

    Returns
    -------
    The result of ``pa.DictionaryArray.from_arrays`` applied to the int32
    indices and the object array of unique strings.
    """
    uniques = np.array([rands(string_size) for _ in range(nunique)],
                       dtype='O')
    if random_order:
        indices = np.random.randint(0, nunique, size=length).astype('i4')
    else:
        # Give every entry ``length // nunique`` repeats and hand the
        # remainder out one extra repeat at a time to the leading entries, so
        # exactly ``length`` indices are produced. When ``length`` is a
        # multiple of ``nunique`` this equals a plain scalar repeat.
        base, extra = divmod(length, nunique)
        counts = np.full(nunique, base, dtype=np.intp)
        counts[:extra] += 1
        indices = np.arange(nunique).astype('i4').repeat(counts)
    return pa.DictionaryArray.from_arrays(indices, uniques)
44 |
45 |
# Workload parameters shared by the table generators and the timing code.
STRING_SIZE = 32  # handed to the generators as string_size (goes to rands)
LENGTH = 3_000_000  # number of values requested for each generated column
NITER = 5  # iterations averaged when timing the read cases
49 |
50 |
def generate_table(nunique, num_cols=10, random_order=True):
    """Return a table of ``num_cols`` columns built from one string array.

    The strings come from a single ``generate_strings`` call using
    ``STRING_SIZE`` and ``LENGTH``; the columns are named ``f0``, ``f1``, ...
    """
    values = generate_strings(STRING_SIZE, nunique, LENGTH,
                              random_order=random_order)
    columns = []
    for _ in range(num_cols):
        # Every column gets its own pa.array() conversion of the same values.
        columns.append(pa.array(values))
    names = ['f{}'.format(col) for col in range(num_cols)]
    return pa.Table.from_arrays(columns, names=names)
57 |
58 |
def generate_dict_table(nunique, num_cols=10, random_order=True):
    """Return a table whose ``num_cols`` columns share one dictionary array.

    The array comes from a single ``generate_dict_strings`` call using
    ``STRING_SIZE`` and ``LENGTH``; the columns are named ``f0``, ``f1``, ...
    """
    dict_column = generate_dict_strings(STRING_SIZE, nunique, LENGTH,
                                        random_order=random_order)
    # The very same array object is used for every column.
    columns = [dict_column] * num_cols
    names = ['f{}'.format(col) for col in range(num_cols)]
    return pa.Table.from_arrays(columns, names=names)
65 |
66 |
def get_timing(f, niter):
    """Return the mean number of seconds one call of ``f`` takes.

    Parameters
    ----------
    f : callable
        Called with no arguments, ``niter`` times in a row.
    niter : int
        Number of calls to average over. Zero raises ``ZeroDivisionError``.

    Returns
    -------
    float
        Total elapsed time divided by ``niter``.

    The garbage collector is switched off while the calls run and is put back
    into its previous state afterwards, even when ``f`` raises. A full
    collection is run after a successful measurement.
    """
    gc_was_enabled = gc.isenabled()
    gc.disable()
    try:
        # perf_counter is monotonic and high resolution, so the measurement
        # cannot be skewed by wall-clock adjustments and works on every
        # platform (clock_gettime is Unix-only).
        start = time.perf_counter()
        for _ in range(niter):
            f()
        elapsed = time.perf_counter() - start
    finally:
        if gc_was_enabled:
            gc.enable()
    gc.collect()
    return elapsed / niter
76 |
77 |
def write_table(t):
    """Write table ``t`` as Parquet into memory and return the buffer."""
    sink = pa.BufferOutputStream()
    pq.write_table(t, sink)
    written = sink.getvalue()
    return written
82 |
83 |
def read_table(source):
    """Read ``source`` with ``pq.read_table`` using its default options."""
    table = pq.read_table(source)
    return table
86 |
87 |
def get_write_read_results(table, case_name):
    """Time writing and reading ``table`` and return the labelled timings.

    The table is first written once to obtain the buffer that the read cases
    use. Three measurements follow, in this order: one write (a single
    iteration), a default read and a read with ``use_threads=False`` (both
    ``NITER`` iterations). Each ``({'case': label}, seconds)`` pair is printed
    and the list of pairs is returned.
    """
    buf = write_table(table)

    write_secs = get_timing(lambda: write_table(table), 1)
    read_secs = get_timing(lambda: read_table(buf), NITER)
    serial_read_secs = get_timing(
        lambda: pq.read_table(buf, use_threads=False), NITER)

    results = [
        ({'case': f'write-{case_name}'}, write_secs),
        ({'case': f'read-{case_name}'}, read_secs),
        ({'case': f'read-{case_name}-single-thread'}, serial_read_secs),
    ]
    for entry in results:
        print(entry)
    return results
100 |
101 |
def get_cases(nunique):
    """Return the four benchmark tables for ``nunique``, keyed by case name.

    Plain ("dense") and dictionary-encoded tables are each generated once in
    random order and once in sequential order.
    """
    cases = {}
    cases['dense-random'] = generate_table(nunique)
    cases['dense-sequential'] = generate_table(nunique, random_order=False)
    cases['dict-random'] = generate_dict_table(nunique)
    cases['dict-sequential'] = generate_dict_table(nunique,
                                                   random_order=False)
    return cases
109 |
110 |
def run_benchmarks():
    """Run every case for each unique-value count and collect the timings.

    Returns a dict that maps each unique-value count to the concatenated
    result lists of ``get_write_read_results`` for its cases. The case name
    and count are printed before each case runs.
    """
    unique_counts = [1000, 100000]
    results = {}
    for nunique in unique_counts:
        timings = []
        cases = get_cases(nunique)
        for case_name, table in cases.items():
            print(case_name, nunique)
            timings += get_write_read_results(table, case_name)
        results[nunique] = timings
    return results
127 |
128 |
129 | # cases = get_cases(100000)
130 |
131 | # buf = write_table(cases['dict-random'])
132 | # with memory_use():
133 | # result = pq.read_table(buf)
134 |
135 |
136 | print(json.dumps(run_benchmarks()))
137 |
--------------------------------------------------------------------------------
/scripts/arrow7305.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import pyarrow as pa
4 | import pyarrow.parquet as pq
5 | import time
6 |
7 | import gc
8 | import psutil
9 |
10 |
# Process handle that get_rss() queries for memory information.
# NOTE(review): created without a pid, so presumably this is the current
# process - confirm against the psutil documentation.
PROC = psutil.Process()
12 |
13 |
def get_rss():
    """Return the ``rss`` field of ``PROC.memory_info()``."""
    mem = PROC.memory_info()
    return mem.rss
16 |
17 |
def print_rss():
    """Print the current ``get_rss()`` reading, prefixed with ``RSS: ``."""
    current = get_rss()
    print(f"RSS: {current}")
20 |
21 |
# RSS readings collected by memory_use.__exit__, one per exited block, in
# the order the blocks finished.
RSS_TELEMETRY = []
23 |
24 |
class memory_use:
    """Context manager that reports the process RSS when its block exits.

    The baselines are sampled when the object is constructed. On exit a
    garbage collection is forced, the current ``get_rss()`` reading and its
    difference from the baseline are printed, and the reading is appended to
    ``RSS_TELEMETRY``.
    """

    def __init__(self):
        # The Arrow-side baselines (start_use, pool, start_peak_use) are only
        # read by the disabled report at the end of __exit__; they remain
        # available as attributes.
        self.start_use = pa.total_allocated_bytes()
        self.start_rss = get_rss()
        self.pool = pa.default_memory_pool()
        self.start_peak_use = self.pool.max_memory()

    def __enter__(self):
        # Hand back the manager itself so ``with memory_use() as m`` binds a
        # usable object instead of None.
        return self

    def __exit__(self, type, value, traceback):
        # Collect first so unreachable objects are released before sampling.
        gc.collect()
        rss = get_rss()
        print("RSS: {}, change: {}"
              .format(rss, rss - self.start_rss))
        RSS_TELEMETRY.append(rss)
        # print("Change in Arrow allocations: {}"
        #       .format(pa.total_allocated_bytes() - self.start_use))
        # print("Change in peak use: {}"
        #       .format(self.pool.max_memory() - self.start_peak_use))
46 |
47 |
def log_(msg):
    """Print ``msg`` followed by the current ``get_rss()`` reading."""
    current = get_rss()
    print(f"{msg} RSS: {current}")
50 |
51 |
# NOTE(review): ``path`` is not referenced anywhere in the visible script.
path = '/home/wesm/Downloads/big.snappy.parquet'

CSV_PATH = '/home/wesm/Downloads/50mb.csv.gz'

# NOTE(review): presumably makes jemalloc hand freed memory back right away
# so the RSS readings below follow live allocations - confirm against the
# pyarrow documentation.
pa.jemalloc_set_decay_ms(0)

log_("Starting")

# Ten rounds of CSV -> DataFrame -> Parquet, logging RSS after every step and
# again after a one second pause.
for _ in range(10):
    df = pd.read_csv(CSV_PATH)
    log_("Read CSV")

    df.to_parquet('out.parquet')
    log_("Wrote Parquet")

    time.sleep(1)
    log_("Waited 1 second")

    # for i in range(10):
    #     time.sleep(0.1)
    #     elapsed = "%.2f" % (0.1 * (i + 1))
    #     log_(f"{elapsed} seconds elapsed")


# Keep sampling RSS once a second for ten more seconds with no work running.
for _ in range(10):
    time.sleep(1)
    log_("Waited 1 second")
79 |
--------------------------------------------------------------------------------