├── LICENSE
├── Manifest.toml
├── Project.toml
├── README.md
├── WDI2009.dta
├── cars.RData
├── juliacon2023_part1_intro.ipynb
├── juliacon2023_part2_largedata.ipynb
├── juliacon2023_part3_issues.ipynb
└── logo.png
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Bogumił Kamiński
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Manifest.toml:
--------------------------------------------------------------------------------
1 | # This file is machine-generated - editing it directly is not advised
2 |
3 | julia_version = "1.9.2"
4 | manifest_format = "2.0"
5 | project_hash = "3e8dcfe1bfca6cff0aed9017c4b73bc43ca70ef3"
6 |
7 | [[deps.AbstractTrees]]
8 | git-tree-sha1 = "faa260e4cb5aba097a73fab382dd4b5819d8ec8c"
9 | uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
10 | version = "0.4.4"
11 |
12 | [[deps.Adapt]]
13 | deps = ["LinearAlgebra", "Requires"]
14 | git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24"
15 | uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
16 | version = "3.6.2"
17 | weakdeps = ["StaticArrays"]
18 |
19 | [deps.Adapt.extensions]
20 | AdaptStaticArraysExt = "StaticArrays"
21 |
22 | [[deps.ArgCheck]]
23 | git-tree-sha1 = "a3a402a35a2f7e0b87828ccabbd5ebfbebe356b4"
24 | uuid = "dce04be8-c92d-5529-be00-80e4d2c0e197"
25 | version = "2.3.0"
26 |
27 | [[deps.ArgTools]]
28 | uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
29 | version = "1.1.1"
30 |
31 | [[deps.ArrayLayouts]]
32 | deps = ["FillArrays", "LinearAlgebra", "SparseArrays"]
33 | git-tree-sha1 = "06fb6abc448771b8eac175fd675c2e4453c4e7bd"
34 | uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a"
35 | version = "1.0.13"
36 |
37 | [[deps.Artifacts]]
38 | uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
39 |
40 | [[deps.BangBang]]
41 | deps = ["Compat", "ConstructionBase", "InitialValues", "LinearAlgebra", "Requires", "Setfield", "Tables"]
42 | git-tree-sha1 = "e28912ce94077686443433c2800104b061a827ed"
43 | uuid = "198e06fe-97b7-11e9-32a5-e1d131e6ad66"
44 | version = "0.3.39"
45 |
46 | [deps.BangBang.extensions]
47 | BangBangChainRulesCoreExt = "ChainRulesCore"
48 | BangBangDataFramesExt = "DataFrames"
49 | BangBangStaticArraysExt = "StaticArrays"
50 | BangBangStructArraysExt = "StructArrays"
51 | BangBangTypedTablesExt = "TypedTables"
52 |
53 | [deps.BangBang.weakdeps]
54 | ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
55 | DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
56 | StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
57 | StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
58 | TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9"
59 |
60 | [[deps.Base64]]
61 | uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
62 |
63 | [[deps.Baselet]]
64 | git-tree-sha1 = "aebf55e6d7795e02ca500a689d326ac979aaf89e"
65 | uuid = "9718e550-a3fa-408a-8086-8db961cd8217"
66 | version = "0.1.1"
67 |
68 | [[deps.BitIntegers]]
69 | deps = ["Random"]
70 | git-tree-sha1 = "fc54d5837033a170f3bad307f993e156eefc345f"
71 | uuid = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1"
72 | version = "0.2.7"
73 |
74 | [[deps.CEnum]]
75 | git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90"
76 | uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
77 | version = "0.4.2"
78 |
79 | [[deps.CategoricalArrays]]
80 | deps = ["DataAPI", "Future", "Missings", "Printf", "Requires", "Statistics", "Unicode"]
81 | git-tree-sha1 = "1568b28f91293458345dabba6a5ea3f183250a61"
82 | uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597"
83 | version = "0.10.8"
84 | weakdeps = ["JSON", "RecipesBase", "SentinelArrays", "StructTypes"]
85 |
86 | [deps.CategoricalArrays.extensions]
87 | CategoricalArraysJSONExt = "JSON"
88 | CategoricalArraysRecipesBaseExt = "RecipesBase"
89 | CategoricalArraysSentinelArraysExt = "SentinelArrays"
90 | CategoricalArraysStructTypesExt = "StructTypes"
91 |
92 | [[deps.CodecLz4]]
93 | deps = ["Lz4_jll", "TranscodingStreams"]
94 | git-tree-sha1 = "59fe0cb37784288d6b9f1baebddbf75457395d40"
95 | uuid = "5ba52731-8f18-5e0d-9241-30f10d1ec561"
96 | version = "0.4.0"
97 |
98 | [[deps.CodecXz]]
99 | deps = ["Libdl", "TranscodingStreams", "XZ_jll"]
100 | git-tree-sha1 = "82c4c000edf64b6bda6766377e69a1028f3549ee"
101 | uuid = "ba30903b-d9e8-5048-a5ec-d1f5b0d4b47b"
102 | version = "0.7.0"
103 |
104 | [[deps.CodecZlib]]
105 | deps = ["TranscodingStreams", "Zlib_jll"]
106 | git-tree-sha1 = "02aa26a4cf76381be7f66e020a3eddeb27b0a092"
107 | uuid = "944b1d66-785c-5afd-91f1-9de20f533193"
108 | version = "0.7.2"
109 |
110 | [[deps.CodecZstd]]
111 | deps = ["CEnum", "TranscodingStreams", "Zstd_jll"]
112 | git-tree-sha1 = "849470b337d0fa8449c21061de922386f32949d9"
113 | uuid = "6b39b394-51ab-5f42-8807-6242bab2b4c2"
114 | version = "0.7.2"
115 |
116 | [[deps.Compat]]
117 | deps = ["UUIDs"]
118 | git-tree-sha1 = "4e88377ae7ebeaf29a047aa1ee40826e0b708a5d"
119 | uuid = "34da2185-b29b-5c13-b0c7-acf172513d20"
120 | version = "4.7.0"
121 | weakdeps = ["Dates", "LinearAlgebra"]
122 |
123 | [deps.Compat.extensions]
124 | CompatLinearAlgebraExt = "LinearAlgebra"
125 |
126 | [[deps.CompilerSupportLibraries_jll]]
127 | deps = ["Artifacts", "Libdl"]
128 | uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae"
129 | version = "1.0.5+0"
130 |
131 | [[deps.CompositionsBase]]
132 | git-tree-sha1 = "802bb88cd69dfd1509f6670416bd4434015693ad"
133 | uuid = "a33af91c-f02d-484b-be07-31d278c5ca2b"
134 | version = "0.1.2"
135 |
136 | [deps.CompositionsBase.extensions]
137 | CompositionsBaseInverseFunctionsExt = "InverseFunctions"
138 |
139 | [deps.CompositionsBase.weakdeps]
140 | InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"
141 |
142 | [[deps.Conda]]
143 | deps = ["Downloads", "JSON", "VersionParsing"]
144 | git-tree-sha1 = "8c86e48c0db1564a1d49548d3515ced5d604c408"
145 | uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d"
146 | version = "1.9.1"
147 |
148 | [[deps.ConstructionBase]]
149 | deps = ["LinearAlgebra"]
150 | git-tree-sha1 = "fe2838a593b5f776e1597e086dcd47560d94e816"
151 | uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
152 | version = "1.5.3"
153 |
154 | [deps.ConstructionBase.extensions]
155 | ConstructionBaseIntervalSetsExt = "IntervalSets"
156 | ConstructionBaseStaticArraysExt = "StaticArrays"
157 |
158 | [deps.ConstructionBase.weakdeps]
159 | IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953"
160 | StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
161 |
162 | [[deps.Crayons]]
163 | git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15"
164 | uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
165 | version = "4.1.1"
166 |
167 | [[deps.DBInterface]]
168 | git-tree-sha1 = "9b0dc525a052b9269ccc5f7f04d5b3639c65bca5"
169 | uuid = "a10d1c49-ce27-4219-8d33-6db1a4562965"
170 | version = "2.5.0"
171 |
172 | [[deps.DataAPI]]
173 | git-tree-sha1 = "8da84edb865b0b5b0100c0666a9bc9a0b71c553c"
174 | uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a"
175 | version = "1.15.0"
176 |
177 | [[deps.DataFrames]]
178 | deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"]
179 | git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8"
180 | uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
181 | version = "1.6.1"
182 |
183 | [[deps.DataStructures]]
184 | deps = ["Compat", "InteractiveUtils", "OrderedCollections"]
185 | git-tree-sha1 = "cf25ccb972fec4e4817764d01c82386ae94f77b4"
186 | uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8"
187 | version = "0.18.14"
188 |
189 | [[deps.DataValueInterfaces]]
190 | git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6"
191 | uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464"
192 | version = "1.0.0"
193 |
194 | [[deps.Dates]]
195 | deps = ["Printf"]
196 | uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
197 |
198 | [[deps.DecFP]]
199 | deps = ["DecFP_jll", "Printf", "Random", "SpecialFunctions"]
200 | git-tree-sha1 = "4a10cec664e26d9d63597daf9e62147e79d636e3"
201 | uuid = "55939f99-70c6-5e9b-8bb0-5071ed7d61fd"
202 | version = "1.3.2"
203 |
204 | [[deps.DecFP_jll]]
205 | deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
206 | git-tree-sha1 = "e9a8da19f847bbfed4076071f6fef8665a30d9e5"
207 | uuid = "47200ebd-12ce-5be5-abb7-8e082af23329"
208 | version = "2.0.3+1"
209 |
210 | [[deps.DefineSingletons]]
211 | git-tree-sha1 = "0fba8b706d0178b4dc7fd44a96a92382c9065c2c"
212 | uuid = "244e2a9f-e319-4986-a169-4d1fe445cd52"
213 | version = "0.1.2"
214 |
215 | [[deps.Distributed]]
216 | deps = ["Random", "Serialization", "Sockets"]
217 | uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b"
218 |
219 | [[deps.DocStringExtensions]]
220 | deps = ["LibGit2"]
221 | git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d"
222 | uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
223 | version = "0.9.3"
224 |
225 | [[deps.Downloads]]
226 | deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
227 | uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
228 | version = "1.6.0"
229 |
230 | [[deps.DuckDB]]
231 | deps = ["DBInterface", "DataFrames", "Dates", "DuckDB_jll", "FixedPointDecimals", "Tables", "UUIDs", "WeakRefStrings"]
232 | git-tree-sha1 = "88cd745f64a570e7f865c49c17f59822f7f7e47b"
233 | uuid = "d2f5444f-75bc-4fdf-ac35-56f514c445e1"
234 | version = "0.8.1"
235 |
236 | [[deps.DuckDB_jll]]
237 | deps = ["Artifacts", "JLLWrappers", "Libdl"]
238 | git-tree-sha1 = "f23f3781c620a97a9d0f7e4e057e94f9c9ef70e1"
239 | uuid = "2cbbab25-fc8b-58cf-88d4-687a02676033"
240 | version = "0.8.1+0"
241 |
242 | [[deps.ExprTools]]
243 | git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00"
244 | uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
245 | version = "0.1.9"
246 |
247 | [[deps.FNVHash]]
248 | git-tree-sha1 = "d6de2c735a8bffce9bc481942dfa453cc815357e"
249 | uuid = "5207ad80-27db-4d23-8732-fa0bd339ea89"
250 | version = "0.1.0"
251 |
252 | [[deps.FileIO]]
253 | deps = ["Pkg", "Requires", "UUIDs"]
254 | git-tree-sha1 = "299dc33549f68299137e51e6d49a13b5b1da9673"
255 | uuid = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
256 | version = "1.16.1"
257 |
258 | [[deps.FilePathsBase]]
259 | deps = ["Compat", "Dates", "Mmap", "Printf", "Test", "UUIDs"]
260 | git-tree-sha1 = "e27c4ebe80e8699540f2d6c805cc12203b614f12"
261 | uuid = "48062228-2e41-5def-b9a4-89aafe57970f"
262 | version = "0.9.20"
263 |
264 | [[deps.FileWatching]]
265 | uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
266 |
267 | [[deps.FillArrays]]
268 | deps = ["LinearAlgebra", "Random", "SparseArrays", "Statistics"]
269 | git-tree-sha1 = "f0af9b12329a637e8fba7d6543f915fff6ba0090"
270 | uuid = "1a297f60-69ca-5386-bcde-b61e274b549b"
271 | version = "1.4.2"
272 |
273 | [[deps.FixedPointDecimals]]
274 | deps = ["Parsers"]
275 | git-tree-sha1 = "d58aa8e85901dee0915262c1c2697c4037281982"
276 | uuid = "fb4d412d-6eee-574d-9565-ede6634db7b0"
277 | version = "0.4.3"
278 |
279 | [[deps.Formatting]]
280 | deps = ["Printf"]
281 | git-tree-sha1 = "8339d61043228fdd3eb658d86c926cb282ae72a8"
282 | uuid = "59287772-0a20-5a39-b81b-1366585eb4c0"
283 | version = "0.4.2"
284 |
285 | [[deps.Future]]
286 | deps = ["Random"]
287 | uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"
288 |
289 | [[deps.GPUArraysCore]]
290 | deps = ["Adapt"]
291 | git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0"
292 | uuid = "46192b85-c4d5-4398-a991-12ede77f4527"
293 | version = "0.1.5"
294 |
295 | [[deps.IJulia]]
296 | deps = ["Base64", "Conda", "Dates", "InteractiveUtils", "JSON", "Libdl", "Logging", "Markdown", "MbedTLS", "Pkg", "Printf", "REPL", "Random", "SoftGlobalScope", "Test", "UUIDs", "ZMQ"]
297 | git-tree-sha1 = "47ac8cc196b81001a711f4b2c12c97372338f00c"
298 | uuid = "7073ff75-c697-5162-941a-fcdaad2a7d2a"
299 | version = "1.24.2"
300 |
301 | [[deps.InitialValues]]
302 | git-tree-sha1 = "4da0f88e9a39111c2fa3add390ab15f3a44f3ca3"
303 | uuid = "22cec73e-a1b8-11e9-2c92-598750a2cf9c"
304 | version = "0.3.1"
305 |
306 | [[deps.InlineStrings]]
307 | deps = ["Parsers"]
308 | git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461"
309 | uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48"
310 | version = "1.4.0"
311 |
312 | [[deps.InteractiveUtils]]
313 | deps = ["Markdown"]
314 | uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
315 |
316 | [[deps.InvertedIndices]]
317 | git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038"
318 | uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f"
319 | version = "1.3.0"
320 |
321 | [[deps.IrrationalConstants]]
322 | git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2"
323 | uuid = "92d709cd-6900-40b7-9082-c6be49f344b6"
324 | version = "0.2.2"
325 |
326 | [[deps.IteratorInterfaceExtensions]]
327 | git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856"
328 | uuid = "82899510-4779-5014-852e-03e436cf321d"
329 | version = "1.0.0"
330 |
331 | [[deps.JLLWrappers]]
332 | deps = ["Preferences"]
333 | git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1"
334 | uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
335 | version = "1.4.1"
336 |
337 | [[deps.JSON]]
338 | deps = ["Dates", "Mmap", "Parsers", "Unicode"]
339 | git-tree-sha1 = "31e996f0a15c7b280ba9f76636b3ff9e2ae58c9a"
340 | uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
341 | version = "0.21.4"
342 |
343 | [[deps.JSON3]]
344 | deps = ["Dates", "Mmap", "Parsers", "PrecompileTools", "StructTypes", "UUIDs"]
345 | git-tree-sha1 = "5b62d93f2582b09e469b3099d839c2d2ebf5066d"
346 | uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
347 | version = "1.13.1"
348 |
349 | [[deps.LZO_jll]]
350 | deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
351 | git-tree-sha1 = "e5b909bcf985c5e2605737d2ce278ed791b89be6"
352 | uuid = "dd4b983a-f0e5-5f8d-a1b7-129d4a5fb1ac"
353 | version = "2.10.1+0"
354 |
355 | [[deps.LaTeXStrings]]
356 | git-tree-sha1 = "f2355693d6778a178ade15952b7ac47a4ff97996"
357 | uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f"
358 | version = "1.3.0"
359 |
360 | [[deps.LazyArrays]]
361 | deps = ["ArrayLayouts", "FillArrays", "LinearAlgebra", "MacroTools", "MatrixFactorizations", "SparseArrays"]
362 | git-tree-sha1 = "a552e17ee600c6fa933f3f9bff7c380b2e032ba8"
363 | uuid = "5078a376-72f3-5289-bfd5-ec5146d43c02"
364 | version = "1.4.1"
365 | weakdeps = ["StaticArrays"]
366 |
367 | [deps.LazyArrays.extensions]
368 | LazyArraysStaticArraysExt = "StaticArrays"
369 |
370 | [[deps.LazyArtifacts]]
371 | deps = ["Artifacts", "Pkg"]
372 | uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
373 |
374 | [[deps.LibCURL]]
375 | deps = ["LibCURL_jll", "MozillaCACerts_jll"]
376 | uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
377 | version = "0.6.3"
378 |
379 | [[deps.LibCURL_jll]]
380 | deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
381 | uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
382 | version = "7.84.0+0"
383 |
384 | [[deps.LibGit2]]
385 | deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
386 | uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
387 |
388 | [[deps.LibSSH2_jll]]
389 | deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
390 | uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
391 | version = "1.10.2+0"
392 |
393 | [[deps.Libdl]]
394 | uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
395 |
396 | [[deps.Libiconv_jll]]
397 | deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
398 | git-tree-sha1 = "c7cb1f5d892775ba13767a87c7ada0b980ea0a71"
399 | uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531"
400 | version = "1.16.1+2"
401 |
402 | [[deps.LightBSON]]
403 | deps = ["DataStructures", "Dates", "DecFP", "FNVHash", "JSON3", "Sockets", "StructTypes", "Transducers", "UUIDs", "UnsafeArrays", "WeakRefStrings"]
404 | git-tree-sha1 = "66369db4570bcd852bde2dd39beaa559bc9890dd"
405 | uuid = "a4a7f996-b3a6-4de6-b9db-2fa5f350df41"
406 | version = "0.2.16"
407 |
408 | [[deps.LinearAlgebra]]
409 | deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"]
410 | uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
411 |
412 | [[deps.LogExpFunctions]]
413 | deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"]
414 | git-tree-sha1 = "c3ce8e7420b3a6e071e0fe4745f5d4300e37b13f"
415 | uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
416 | version = "0.3.24"
417 |
418 | [deps.LogExpFunctions.extensions]
419 | LogExpFunctionsChainRulesCoreExt = "ChainRulesCore"
420 | LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables"
421 | LogExpFunctionsInverseFunctionsExt = "InverseFunctions"
422 |
423 | [deps.LogExpFunctions.weakdeps]
424 | ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
425 | ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0"
426 | InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112"
427 |
428 | [[deps.Logging]]
429 | uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
430 |
431 | [[deps.Lz4_jll]]
432 | deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
433 | git-tree-sha1 = "5d494bc6e85c4c9b626ee0cab05daa4085486ab1"
434 | uuid = "5ced341a-0733-55b8-9ab6-a4889d929147"
435 | version = "1.9.3+0"
436 |
437 | [[deps.MacroTools]]
438 | deps = ["Markdown", "Random"]
439 | git-tree-sha1 = "42324d08725e200c23d4dfb549e0d5d89dede2d2"
440 | uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
441 | version = "0.5.10"
442 |
443 | [[deps.Markdown]]
444 | deps = ["Base64"]
445 | uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
446 |
447 | [[deps.MatrixFactorizations]]
448 | deps = ["ArrayLayouts", "LinearAlgebra", "Printf", "Random"]
449 | git-tree-sha1 = "6507b5bde6500ae31c01a1d893764e130b62256d"
450 | uuid = "a3b82374-2e81-5b9e-98ce-41277c0e4c87"
451 | version = "2.0.0"
452 |
453 | [[deps.MbedTLS]]
454 | deps = ["Dates", "MbedTLS_jll", "MozillaCACerts_jll", "Random", "Sockets"]
455 | git-tree-sha1 = "03a9b9718f5682ecb107ac9f7308991db4ce395b"
456 | uuid = "739be429-bea8-5141-9913-cc70e7f3736d"
457 | version = "1.1.7"
458 |
459 | [[deps.MbedTLS_jll]]
460 | deps = ["Artifacts", "Libdl"]
461 | uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
462 | version = "2.28.2+0"
463 |
464 | [[deps.MicroCollections]]
465 | deps = ["BangBang", "InitialValues", "Setfield"]
466 | git-tree-sha1 = "629afd7d10dbc6935ec59b32daeb33bc4460a42e"
467 | uuid = "128add7d-3638-4c79-886c-908ea0c25c34"
468 | version = "0.1.4"
469 |
470 | [[deps.Missings]]
471 | deps = ["DataAPI"]
472 | git-tree-sha1 = "f66bdc5de519e8f8ae43bdc598782d35a25b1272"
473 | uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28"
474 | version = "1.1.0"
475 |
476 | [[deps.Mmap]]
477 | uuid = "a63ad114-7e13-5084-954f-fe012c677804"
478 |
479 | [[deps.Mocking]]
480 | deps = ["Compat", "ExprTools"]
481 | git-tree-sha1 = "4cc0c5a83933648b615c36c2b956d94fda70641e"
482 | uuid = "78c3b35d-d492-501b-9361-3d52fe80e533"
483 | version = "0.7.7"
484 |
485 | [[deps.MozillaCACerts_jll]]
486 | uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
487 | version = "2022.10.11"
488 |
489 | [[deps.NetworkOptions]]
490 | uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
491 | version = "1.2.0"
492 |
493 | [[deps.OpenBLAS_jll]]
494 | deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"]
495 | uuid = "4536629a-c528-5b80-bd46-f80d51c5b363"
496 | version = "0.3.21+4"
497 |
498 | [[deps.OpenLibm_jll]]
499 | deps = ["Artifacts", "Libdl"]
500 | uuid = "05823500-19ac-5b8b-9628-191a04bc5112"
501 | version = "0.8.1+0"
502 |
503 | [[deps.OpenSpecFun_jll]]
504 | deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"]
505 | git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1"
506 | uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
507 | version = "0.5.5+0"
508 |
509 | [[deps.OrderedCollections]]
510 | git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3"
511 | uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
512 | version = "1.6.2"
513 |
514 | [[deps.Parquet2]]
515 | deps = ["AbstractTrees", "BitIntegers", "CodecLz4", "CodecZlib", "CodecZstd", "DataAPI", "Dates", "DecFP", "FilePathsBase", "FillArrays", "JSON3", "LazyArrays", "LightBSON", "Mmap", "OrderedCollections", "PooledArrays", "PrecompileTools", "SentinelArrays", "Snappy", "StaticArrays", "TableOperations", "Tables", "Thrift2", "Transducers", "UUIDs", "WeakRefStrings"]
516 | git-tree-sha1 = "8bb2f9e729a2becea1ed351253e14f5660e304ab"
517 | uuid = "98572fba-bba0-415d-956f-fa77e587d26d"
518 | version = "0.2.17"
519 |
520 | [[deps.Parsers]]
521 | deps = ["Dates", "PrecompileTools", "UUIDs"]
522 | git-tree-sha1 = "4b2e829ee66d4218e0cef22c0a64ee37cf258c29"
523 | uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
524 | version = "2.7.1"
525 |
526 | [[deps.Pkg]]
527 | deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
528 | uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
529 | version = "1.9.2"
530 |
531 | [[deps.PooledArrays]]
532 | deps = ["DataAPI", "Future"]
533 | git-tree-sha1 = "a6062fe4063cdafe78f4a0a81cfffb89721b30e7"
534 | uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720"
535 | version = "1.4.2"
536 |
537 | [[deps.PrecompileTools]]
538 | deps = ["Preferences"]
539 | git-tree-sha1 = "9673d39decc5feece56ef3940e5dafba15ba0f81"
540 | uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a"
541 | version = "1.1.2"
542 |
543 | [[deps.Preferences]]
544 | deps = ["TOML"]
545 | git-tree-sha1 = "7eb1686b4f04b82f96ed7a4ea5890a4f0c7a09f1"
546 | uuid = "21216c6a-2e73-6563-6e65-726566657250"
547 | version = "1.4.0"
548 |
549 | [[deps.PrettyTables]]
550 | deps = ["Crayons", "Formatting", "LaTeXStrings", "Markdown", "Reexport", "StringManipulation", "Tables"]
551 | git-tree-sha1 = "542b1bd03329c1d235110f96f1bb0eeffc48a87d"
552 | uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d"
553 | version = "2.2.6"
554 |
555 | [[deps.Printf]]
556 | deps = ["Unicode"]
557 | uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
558 |
559 | [[deps.RData]]
560 | deps = ["CategoricalArrays", "CodecZlib", "DataAPI", "DataFrames", "Dates", "FileIO", "Requires", "TimeZones", "Unicode"]
561 | git-tree-sha1 = "9a6220c8f59c38ddf6217638042ae6788973f617"
562 | uuid = "df47a6cb-8c03-5eed-afd8-b6050d6c41da"
563 | version = "1.0.0"
564 |
565 | [[deps.REPL]]
566 | deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
567 | uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
568 |
569 | [[deps.Random]]
570 | deps = ["SHA", "Serialization"]
571 | uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
572 |
573 | [[deps.ReadStatTables]]
574 | deps = ["CEnum", "DataAPI", "Dates", "InlineStrings", "PooledArrays", "PrecompileTools", "PrettyTables", "ReadStat_jll", "SentinelArrays", "StructArrays", "Tables"]
575 | git-tree-sha1 = "fd0de7ebae24cfb11e0d8d0bc5f59e24d7f304e5"
576 | uuid = "52522f7a-9570-4e34-8ac6-c005c74d4b84"
577 | version = "0.2.5"
578 |
579 | [[deps.ReadStat_jll]]
580 | deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Zlib_jll"]
581 | git-tree-sha1 = "28e990e90ca643e99f3ec0188089c1816e8b46f4"
582 | uuid = "a4dc8951-f1cc-5499-9034-9ec1c3e64557"
583 | version = "1.1.9+0"
584 |
585 | [[deps.RecipesBase]]
586 | deps = ["PrecompileTools"]
587 | git-tree-sha1 = "5c3d09cc4f31f5fc6af001c250bf1278733100ff"
588 | uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
589 | version = "1.3.4"
590 |
591 | [[deps.Reexport]]
592 | git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b"
593 | uuid = "189a3867-3050-52da-a836-e630ba90ab69"
594 | version = "1.2.2"
595 |
596 | [[deps.Requires]]
597 | deps = ["UUIDs"]
598 | git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7"
599 | uuid = "ae029012-a4dd-5104-9daa-d747884805df"
600 | version = "1.3.0"
601 |
602 | [[deps.SHA]]
603 | uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
604 | version = "0.7.0"
605 |
606 | [[deps.SQLite]]
607 | deps = ["DBInterface", "Random", "SQLite_jll", "Serialization", "Tables", "WeakRefStrings"]
608 | git-tree-sha1 = "eb9a473c9b191ced349d04efa612ec9f39c087ea"
609 | uuid = "0aa819cd-b072-5ff4-a722-6bc24af294d9"
610 | version = "1.6.0"
611 |
612 | [[deps.SQLite_jll]]
613 | deps = ["Artifacts", "JLLWrappers", "Libdl", "Zlib_jll"]
614 | git-tree-sha1 = "4619dd3363610d94fb42a95a6dc35b526a26d0ef"
615 | uuid = "76ed43ae-9a5d-5a62-8c75-30186b810ce8"
616 | version = "3.42.0+0"
617 |
618 | [[deps.Scratch]]
619 | deps = ["Dates"]
620 | git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a"
621 | uuid = "6c6a2e73-6563-6170-7368-637461726353"
622 | version = "1.2.0"
623 |
624 | [[deps.SentinelArrays]]
625 | deps = ["Dates", "Random"]
626 | git-tree-sha1 = "04bdff0b09c65ff3e06a05e3eb7b120223da3d39"
627 | uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c"
628 | version = "1.4.0"
629 |
630 | [[deps.Serialization]]
631 | uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
632 |
633 | [[deps.Setfield]]
634 | deps = ["ConstructionBase", "Future", "MacroTools", "StaticArraysCore"]
635 | git-tree-sha1 = "e2cc6d8c88613c05e1defb55170bf5ff211fbeac"
636 | uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46"
637 | version = "1.1.1"
638 |
639 | [[deps.Snappy]]
640 | deps = ["CEnum", "snappy_jll"]
641 | git-tree-sha1 = "72bae53c0691f4b6fd259587dab8821ae0e025f6"
642 | uuid = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9"
643 | version = "0.4.2"
644 |
645 | [[deps.Sockets]]
646 | uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
647 |
648 | [[deps.SoftGlobalScope]]
649 | deps = ["REPL"]
650 | git-tree-sha1 = "986ec2b6162ccb95de5892ed17832f95badf770c"
651 | uuid = "b85f4697-e234-5449-a836-ec8e2f98b302"
652 | version = "1.1.0"
653 |
654 | [[deps.SortingAlgorithms]]
655 | deps = ["DataStructures"]
656 | git-tree-sha1 = "c60ec5c62180f27efea3ba2908480f8055e17cee"
657 | uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c"
658 | version = "1.1.1"
659 |
660 | [[deps.SparseArrays]]
661 | deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"]
662 | uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
663 |
664 | [[deps.SpecialFunctions]]
665 | deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"]
666 | git-tree-sha1 = "7beb031cf8145577fbccacd94b8a8f4ce78428d3"
667 | uuid = "276daf66-3868-5448-9aa4-cd146d93841b"
668 | version = "2.3.0"
669 |
670 | [deps.SpecialFunctions.extensions]
671 | SpecialFunctionsChainRulesCoreExt = "ChainRulesCore"
672 |
673 | [deps.SpecialFunctions.weakdeps]
674 | ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
675 |
676 | [[deps.SplittablesBase]]
677 | deps = ["Setfield", "Test"]
678 | git-tree-sha1 = "e08a62abc517eb79667d0a29dc08a3b589516bb5"
679 | uuid = "171d559e-b47b-412a-8079-5efa626c420e"
680 | version = "0.1.15"
681 |
682 | [[deps.StaticArrays]]
683 | deps = ["LinearAlgebra", "Random", "StaticArraysCore"]
684 | git-tree-sha1 = "9cabadf6e7cd2349b6cf49f1915ad2028d65e881"
685 | uuid = "90137ffa-7385-5640-81b9-e52037218182"
686 | version = "1.6.2"
687 | weakdeps = ["Statistics"]
688 |
689 | [deps.StaticArrays.extensions]
690 | StaticArraysStatisticsExt = "Statistics"
691 |
692 | [[deps.StaticArraysCore]]
693 | git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d"
694 | uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c"
695 | version = "1.4.2"
696 |
697 | [[deps.Statistics]]
698 | deps = ["LinearAlgebra", "SparseArrays"]
699 | uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
700 | version = "1.9.0"
701 |
702 | [[deps.StatsAPI]]
703 | deps = ["LinearAlgebra"]
704 | git-tree-sha1 = "45a7769a04a3cf80da1c1c7c60caf932e6f4c9f7"
705 | uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0"
706 | version = "1.6.0"
707 |
708 | [[deps.StatsBase]]
709 | deps = ["DataAPI", "DataStructures", "LinearAlgebra", "LogExpFunctions", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"]
710 | git-tree-sha1 = "75ebe04c5bed70b91614d684259b661c9e6274a4"
711 | uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
712 | version = "0.34.0"
713 |
714 | [[deps.StringManipulation]]
715 | git-tree-sha1 = "46da2434b41f41ac3594ee9816ce5541c6096123"
716 | uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e"
717 | version = "0.3.0"
718 |
719 | [[deps.StructArrays]]
720 | deps = ["Adapt", "DataAPI", "GPUArraysCore", "StaticArraysCore", "Tables"]
721 | git-tree-sha1 = "521a0e828e98bb69042fec1809c1b5a680eb7389"
722 | uuid = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
723 | version = "0.6.15"
724 |
725 | [[deps.StructTypes]]
726 | deps = ["Dates", "UUIDs"]
727 | git-tree-sha1 = "ca4bccb03acf9faaf4137a9abc1881ed1841aa70"
728 | uuid = "856f2bd8-1eba-4b0a-8007-ebc267875bd4"
729 | version = "1.10.0"
730 |
731 | [[deps.SuiteSparse_jll]]
732 | deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"]
733 | uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c"
734 | version = "5.10.1+6"
735 |
736 | [[deps.TOML]]
737 | deps = ["Dates"]
738 | uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
739 | version = "1.0.3"
740 |
741 | [[deps.TableOperations]]
742 | deps = ["SentinelArrays", "Tables", "Test"]
743 | git-tree-sha1 = "e383c87cf2a1dc41fa30c093b2a19877c83e1bc1"
744 | uuid = "ab02a1b2-a7df-11e8-156e-fb1833f50b87"
745 | version = "1.2.0"
746 |
747 | [[deps.TableTraits]]
748 | deps = ["IteratorInterfaceExtensions"]
749 | git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39"
750 | uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c"
751 | version = "1.0.1"
752 |
753 | [[deps.Tables]]
754 | deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits", "Test"]
755 | git-tree-sha1 = "1544b926975372da01227b382066ab70e574a3ec"
756 | uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
757 | version = "1.10.1"
758 |
759 | [[deps.Tar]]
760 | deps = ["ArgTools", "SHA"]
761 | uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
762 | version = "1.10.0"
763 |
764 | [[deps.Test]]
765 | deps = ["InteractiveUtils", "Logging", "Random", "Serialization"]
766 | uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
767 |
768 | [[deps.Thrift2]]
769 | deps = ["MacroTools", "OrderedCollections", "PrecompileTools"]
770 | git-tree-sha1 = "00d618714271f283ea3829ab058d5e5bd1847f85"
771 | uuid = "9be31aac-5446-47db-bfeb-416acd2e4415"
772 | version = "0.1.4"
773 |
774 | [[deps.TimeZones]]
775 | deps = ["Dates", "Downloads", "InlineStrings", "LazyArtifacts", "Mocking", "Printf", "RecipesBase", "Scratch", "Unicode"]
776 | git-tree-sha1 = "cdaa0c2a4449724aded839550eca7d7240bb6938"
777 | uuid = "f269a46b-ccf7-5d73-abea-4c690281aa53"
778 | version = "1.10.0"
779 |
780 | [[deps.TranscodingStreams]]
781 | deps = ["Random", "Test"]
782 | git-tree-sha1 = "9a6ae7ed916312b41236fcef7e0af564ef934769"
783 | uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa"
784 | version = "0.9.13"
785 |
786 | [[deps.Transducers]]
787 | deps = ["Adapt", "ArgCheck", "BangBang", "Baselet", "CompositionsBase", "ConstructionBase", "DefineSingletons", "Distributed", "InitialValues", "Logging", "Markdown", "MicroCollections", "Requires", "Setfield", "SplittablesBase", "Tables"]
788 | git-tree-sha1 = "53bd5978b182fa7c57577bdb452c35e5b4fb73a5"
789 | uuid = "28d57a85-8fef-5791-bfe6-a80928e7c999"
790 | version = "0.4.78"
791 |
792 | [deps.Transducers.extensions]
793 | TransducersBlockArraysExt = "BlockArrays"
794 | TransducersDataFramesExt = "DataFrames"
795 | TransducersLazyArraysExt = "LazyArrays"
796 | TransducersOnlineStatsBaseExt = "OnlineStatsBase"
797 | TransducersReferenceablesExt = "Referenceables"
798 |
799 | [deps.Transducers.weakdeps]
800 | BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e"
801 | DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
802 | LazyArrays = "5078a376-72f3-5289-bfd5-ec5146d43c02"
803 | OnlineStatsBase = "925886fa-5bf2-5e8e-b522-a9147a512338"
804 | Referenceables = "42d2dcc6-99eb-4e98-b66c-637b7d73030e"
805 |
806 | [[deps.UUIDs]]
807 | deps = ["Random", "SHA"]
808 | uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
809 |
810 | [[deps.Unicode]]
811 | uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
812 |
813 | [[deps.UnsafeArrays]]
814 | git-tree-sha1 = "3350f94f6caa02f324a23645bf524fc9334c7488"
815 | uuid = "c4a57d5a-5b31-53a6-b365-19f8c011fbd6"
816 | version = "1.0.4"
817 |
818 | [[deps.VersionParsing]]
819 | git-tree-sha1 = "58d6e80b4ee071f5efd07fda82cb9fbe17200868"
820 | uuid = "81def892-9a0e-5fdd-b105-ffc91e053289"
821 | version = "1.3.0"
822 |
823 | [[deps.WeakRefStrings]]
824 | deps = ["DataAPI", "InlineStrings", "Parsers"]
825 | git-tree-sha1 = "b1be2855ed9ed8eac54e5caff2afcdb442d52c23"
826 | uuid = "ea10d353-3f73-51f8-a26c-33c1cb351aa5"
827 | version = "1.4.2"
828 |
829 | [[deps.XZ_jll]]
830 | deps = ["Artifacts", "JLLWrappers", "Libdl"]
831 | git-tree-sha1 = "2222b751598bd9f4885c9ce9cd23e83404baa8ce"
832 | uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800"
833 | version = "5.4.3+1"
834 |
835 | [[deps.ZMQ]]
836 | deps = ["FileWatching", "Sockets", "ZeroMQ_jll"]
837 | git-tree-sha1 = "356d2bdcc0bce90aabee1d1c0f6d6f301eda8f77"
838 | uuid = "c2297ded-f4af-51ae-bb23-16f91089e4e1"
839 | version = "1.2.2"
840 |
841 | [[deps.ZeroMQ_jll]]
842 | deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "libsodium_jll"]
843 | git-tree-sha1 = "fe5c65a526f066fb3000da137d5785d9649a8a47"
844 | uuid = "8f1865be-045e-5c20-9c9f-bfbfb0764568"
845 | version = "4.3.4+0"
846 |
847 | [[deps.Zlib_jll]]
848 | deps = ["Libdl"]
849 | uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
850 | version = "1.2.13+0"
851 |
852 | [[deps.Zstd_jll]]
853 | deps = ["Artifacts", "JLLWrappers", "Libdl"]
854 | git-tree-sha1 = "49ce682769cd5de6c72dcf1b94ed7790cd08974c"
855 | uuid = "3161d3a3-bdf6-5164-811a-617609db77b4"
856 | version = "1.5.5+0"
857 |
858 | [[deps.libblastrampoline_jll]]
859 | deps = ["Artifacts", "Libdl"]
860 | uuid = "8e850b90-86db-534c-a0d3-1478176c7d93"
861 | version = "5.8.0+0"
862 |
863 | [[deps.libsodium_jll]]
864 | deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"]
865 | git-tree-sha1 = "848ab3d00fe39d6fbc2a8641048f8f272af1c51e"
866 | uuid = "a9144af2-ca23-56d9-984f-0d03f7b5ccf8"
867 | version = "1.0.20+0"
868 |
869 | [[deps.nghttp2_jll]]
870 | deps = ["Artifacts", "Libdl"]
871 | uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
872 | version = "1.48.0+0"
873 |
874 | [[deps.p7zip_jll]]
875 | deps = ["Artifacts", "Libdl"]
876 | uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
877 | version = "17.4.0+0"
878 |
879 | [[deps.snappy_jll]]
880 | deps = ["Artifacts", "JLLWrappers", "LZO_jll", "Libdl", "Pkg", "Zlib_jll"]
881 | git-tree-sha1 = "985c1da710b0e43f7c52f037441021dfd0e3be14"
882 | uuid = "fe1e1685-f7be-5f59-ac9f-4ca204017dfd"
883 | version = "1.1.9+1"
884 |
--------------------------------------------------------------------------------
/Project.toml:
--------------------------------------------------------------------------------
1 | [deps]
2 | CodecXz = "ba30903b-d9e8-5048-a5ec-d1f5b0d4b47b"
3 | DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
4 | DuckDB = "d2f5444f-75bc-4fdf-ac35-56f514c445e1"
5 | IJulia = "7073ff75-c697-5162-941a-fcdaad2a7d2a"
6 | Parquet2 = "98572fba-bba0-415d-956f-fa77e587d26d"
7 | RData = "df47a6cb-8c03-5eed-afd8-b6050d6c41da"
8 | ReadStatTables = "52522f7a-9570-4e34-8ac6-c005c74d4b84"
9 | SQLite = "0aa819cd-b072-5ff4-a722-6bc24af294d9"
10 | StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
11 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Working with DataFrames.jl beyond CSV files
2 |
3 | [](https://www.youtube.com/watch?v=oorErKcAWIQ)
4 |
5 | This is an introductory part of the workshop
6 | prepared for [JuliaCon2023](https://juliacon.org/2023/).
7 |
8 | In order to run the tutorial, make sure that you have the Julia executable installed.
9 | The tutorial was developed under Julia 1.9.2.
10 |
11 | The simplest way to run it is to proceed as follows:
12 | 1. Clone the
13 | [tutorial repository](https://github.com/bkamins/JuliaCon2023-Tutorial)
14 | to a local folder on your computer.
15 | 2. Start Julia in your local folder using the `julia --project` command.
16 | 3. Run the following commands (this step needs to be run only once per installation; it ensures that the proper versions of the packages are downloaded and lets you double-check them):
17 | ```
18 | using Pkg
19 | Pkg.instantiate()
20 | Pkg.status()
21 | ```
22 | 4. Start Jupyter Notebook with:
23 | ```
24 | using IJulia
25 | notebook(dir=pwd())
26 | ```
27 | 5. In the Jupyter Notebook open and run the *ipynb* files with the tutorial material.
28 |
29 | ---
30 |
31 | *Preparation of this workshop has been supported by the Polish National Agency for Academic Exchange under the Strategic Partnerships programme, grant number BPI/PST/2021/1/00069/U/00001.*
32 |
33 | 
34 |
--------------------------------------------------------------------------------
/WDI2009.dta:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bkamins/JuliaCon2023-Tutorial/2da04fe2d76a2d468e96d7795b3d713de914a31f/WDI2009.dta
--------------------------------------------------------------------------------
/cars.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bkamins/JuliaCon2023-Tutorial/2da04fe2d76a2d468e96d7795b3d713de914a31f/cars.RData
--------------------------------------------------------------------------------
/juliacon2023_part2_largedata.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "1b05b2ef",
6 | "metadata": {},
7 | "source": [
8 | "# Working with DataFrames.jl beyond CSV files\n",
9 | "\n",
10 | "# Part 2: Using Parquet for data larger than RAM\n",
11 | "\n",
12 | "## Bogumił Kamiński\n",
13 | "### June 25, 2023"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "id": "3bd5a5f2",
19 | "metadata": {},
20 | "source": [
21 | "What is covered in part 2:\n",
22 | "* how to iteratively create a Parquet data store that jointly has more data than available RAM\n",
23 | "* how to manually process such data on a single machine (notebook-oriented process)"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "id": "8aeb9ca8",
29 | "metadata": {},
30 | "source": [
31 | "## Setup"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 1,
37 | "id": "391b440d",
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "using DataFrames"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 2,
47 | "id": "b6120192",
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "using Parquet2"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 3,
57 | "id": "e6ba8ab9",
58 | "metadata": {},
59 | "outputs": [],
60 | "source": [
61 | "using Random"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 4,
67 | "id": "e5551636",
68 | "metadata": {},
69 | "outputs": [],
70 | "source": [
71 | "using Statistics"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 5,
77 | "id": "a3d36eba",
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "using StatsBase"
82 | ]
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "id": "cd17df01",
87 | "metadata": {},
88 | "source": [
89 | "## Generate some large data"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 6,
95 | "id": "daa36b81",
96 | "metadata": {},
97 | "outputs": [
98 | {
99 | "data": {
100 | "text/plain": [
101 | "false"
102 | ]
103 | },
104 | "execution_count": 6,
105 | "metadata": {},
106 | "output_type": "execute_result"
107 | }
108 | ],
109 | "source": [
110 | "isdir(\"pq_experiment\") && rm(\"pq_experiment\"; recursive=true)"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 7,
116 | "id": "18b99893",
117 | "metadata": {},
118 | "outputs": [
119 | {
120 | "data": {
121 | "text/plain": [
122 | "\"pq_experiment\""
123 | ]
124 | },
125 | "execution_count": 7,
126 | "metadata": {},
127 | "output_type": "execute_result"
128 | }
129 | ],
130 | "source": [
131 | "mkdir(\"pq_experiment\")"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 8,
137 | "id": "7740f592",
138 | "metadata": {},
139 | "outputs": [],
140 | "source": [
141 | "Random.seed!(1234);"
142 | ]
143 | },
144 | {
145 | "cell_type": "markdown",
146 | "id": "32e53ec3",
147 | "metadata": {},
148 | "source": [
149 | "Create 501 groups (range `0.0:0.002:1.0`, both endpoints included) of data, each having $2^{20}$ = 1,048,576 rows and two `Float64` columns (I could have made it larger, but this should be enough as an example)."
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 9,
155 | "id": "a6dd4359",
156 | "metadata": {},
157 | "outputs": [
158 | {
159 | "name": "stderr",
160 | "output_type": "stream",
161 | "text": [
162 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mwriting file #1\n",
163 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mwriting file #2\n",
164 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mwriting file #3\n",
165 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mwriting file #4\n",
166 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mwriting file #5\n",
167 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mwriting file #6\n"
168 | ]
169 | },
170 | {
171 | "data": {
172 | "text/plain": [
173 | "\u001b[34m✏ \u001b[39mParquet2.FileWriter{IOStream}(pq_experiment/experiment_6.parquet)"
174 | ]
175 | },
176 | "execution_count": 9,
177 | "metadata": {},
178 | "output_type": "execute_result"
179 | }
180 | ],
181 | "source": [
182 | "let # create local scope for more consistent variable scoping behavior and avoid temporary variable leakage\n",
183 | " i = 1\n",
184 | " df = DataFrame() # temporary data frame to store intermediate results\n",
185 | " maxsize = 10^8 # define size of one chunk of data written to disk\n",
186 | " for μ in 0.0:0.002:1.0\n",
187 | " result = DataFrame(mu=μ, x=randn(2^20) .+ μ)\n",
188 | " append!(df, result) # keep appending data from partial simulations\n",
189 | " if nrow(df) > maxsize # if our data gets too big dump it to a consecutive file\n",
190 | " @info \"writing file #$i\"\n",
191 | " Parquet2.writefile(\"pq_experiment/experiment_$i.parquet\", @view df[1:maxsize, :])\n",
192 | " deleteat!(df, 1:maxsize) # drop data stored in a file\n",
193 | " i += 1\n",
194 | " end\n",
195 | " end\n",
196 | " if nrow(df) > 0 # if we have some unsaved data store it now\n",
197 | " @info \"writing file #$i\"\n",
198 | " Parquet2.writefile(\"pq_experiment/experiment_$i.parquet\", df)\n",
199 | " end\n",
200 | "end"
201 | ]
202 | },
203 | {
204 | "cell_type": "markdown",
205 | "id": "66605891",
206 | "metadata": {},
207 | "source": [
208 | "Note that reading the file is lazy. Actual data is not read yet:"
209 | ]
210 | },
211 | {
212 | "cell_type": "code",
213 | "execution_count": 10,
214 | "id": "a1337937",
215 | "metadata": {},
216 | "outputs": [
217 | {
218 | "data": {
219 | "text/plain": [
220 | "\u001b[34m≔ \u001b[39mParquet2.Dataset (837645573 bytes)\n",
221 | "\t1. \u001b[33m\"mu\"\u001b[39m: \u001b[36mFloat64\u001b[39m\n",
222 | "\t2. \u001b[33m\"x\"\u001b[39m: \u001b[36mFloat64\u001b[39m\n"
223 | ]
224 | },
225 | "execution_count": 10,
226 | "metadata": {},
227 | "output_type": "execute_result"
228 | }
229 | ],
230 | "source": [
231 | "pq_experiment = Parquet2.readfile(\"pq_experiment\", load_initial=true)"
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "id": "66f0e314",
237 | "metadata": {},
238 | "source": [
239 | "We have six chunks of data (each corresponding to one file, as we did not create row groups within files):"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 11,
245 | "id": "dfa6c314",
246 | "metadata": {},
247 | "outputs": [
248 | {
249 | "data": {
250 | "text/plain": [
251 | "6"
252 | ]
253 | },
254 | "execution_count": 11,
255 | "metadata": {},
256 | "output_type": "execute_result"
257 | }
258 | ],
259 | "source": [
260 | "length(pq_experiment)"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 12,
266 | "id": "0492ece9",
267 | "metadata": {},
268 | "outputs": [
269 | {
270 | "data": {
271 | "text/plain": [
272 | "6-element Vector{FilePathsBase.WindowsPath}:\n",
273 | " p\"C:/WORK/dev/DataFramesTutorials/JuliaCon2023-Tutorial/pq_experiment/experiment_1.parquet\"\n",
274 | " p\"C:/WORK/dev/DataFramesTutorials/JuliaCon2023-Tutorial/pq_experiment/experiment_2.parquet\"\n",
275 | " p\"C:/WORK/dev/DataFramesTutorials/JuliaCon2023-Tutorial/pq_experiment/experiment_3.parquet\"\n",
276 | " p\"C:/WORK/dev/DataFramesTutorials/JuliaCon2023-Tutorial/pq_experiment/experiment_4.parquet\"\n",
277 | " p\"C:/WORK/dev/DataFramesTutorials/JuliaCon2023-Tutorial/pq_experiment/experiment_5.parquet\"\n",
278 | " p\"C:/WORK/dev/DataFramesTutorials/JuliaCon2023-Tutorial/pq_experiment/experiment_6.parquet\""
279 | ]
280 | },
281 | "execution_count": 12,
282 | "metadata": {},
283 | "output_type": "execute_result"
284 | }
285 | ],
286 | "source": [
287 | "Parquet2.filelist(pq_experiment)"
288 | ]
289 | },
290 | {
291 | "cell_type": "markdown",
292 | "id": "903d2f1c",
293 | "metadata": {},
294 | "source": [
295 | "Note that the last file has fewer rows than the rest:"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 13,
301 | "id": "1270096b",
302 | "metadata": {},
303 | "outputs": [
304 | {
305 | "data": {
306 | "text/plain": [
307 | "6-element Vector{Int64}:\n",
308 | " 100000000\n",
309 | " 100000000\n",
310 | " 100000000\n",
311 | " 100000000\n",
312 | " 100000000\n",
313 | " 25336576"
314 | ]
315 | },
316 | "execution_count": 13,
317 | "metadata": {},
318 | "output_type": "execute_result"
319 | }
320 | ],
321 | "source": [
322 | "nrow.(pq_experiment)"
323 | ]
324 | },
325 | {
326 | "cell_type": "markdown",
327 | "id": "0d3ea748",
328 | "metadata": {},
329 | "source": [
330 | "The challenge we have in this dataset is that the same values of keys (`mu` column) are split across multiple files.\n",
331 | "\n",
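332 | "Assume we want to compute the mean of `x` for each key (each value of `mu`). We need to do it in two steps: first aggregate within each file, then combine the partial results across files.\n",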
333 | "\n",
334 | "This is a standard map-reduce pattern. In this tutorial we perform both steps manually on a single node:"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 14,
340 | "id": "2f0b7a3d",
341 | "metadata": {},
342 | "outputs": [
343 | {
344 | "name": "stderr",
345 | "output_type": "stream",
346 | "text": [
347 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mprocessing chunk of data #1\n",
348 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mprocessing chunk of data #2\n",
349 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mprocessing chunk of data #3\n",
350 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mprocessing chunk of data #4\n",
351 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mprocessing chunk of data #5\n",
352 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mprocessing chunk of data #6\n"
353 | ]
354 | },
355 | {
356 | "data": {
357 | "text/plain": [
358 | "6-element Vector{DataFrame}:\n",
359 | " \u001b[1m96×3 DataFrame\u001b[0m\n",
360 | "\u001b[1m Row \u001b[0m│\u001b[1m mu \u001b[0m\u001b[1m x_mean \u001b[0m\u001b[1m nrow \u001b[0m\n",
361 | " │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\n",
362 | "─────┼────────────────────────────────\n",
363 | " 1 │ 0.0 -0.000165159 1048576\n",
364 | " 2 │ 0.002 0.00181935 1048576\n",
365 | " 3 │ 0.004 0.00547356 1048576\n",
366 | " 4 │ 0.006 0.0053043 1048576\n",
367 | " 5 │ 0.008 0.00748101 1048576\n",
368 | " 6 │ 0.01 0.0105489 1048576\n",
369 | " 7 │ 0.012 0.0113697 1048576\n",
370 | " 8 │ 0.014 0.0133716 1048576\n",
371 | " 9 │ 0.016 0.0163869 1048576\n",
372 | " 10 │ 0.018 0.0192935 1048576\n",
373 | " 11 │ 0.02 0.0198156 1048576\n",
374 | " ⋮ │ ⋮ ⋮ ⋮\n",
375 | " 87 │ 0.172 0.171104 1048576\n",
376 | " 88 │ 0.174 0.175584 1048576\n",
377 | " 89 │ 0.176 0.176103 1048576\n",
378 | " 90 │ 0.178 0.178837 1048576\n",
379 | " 91 │ 0.18 0.182103 1048576\n",
380 | " 92 │ 0.182 0.182695 1048576\n",
381 | " 93 │ 0.184 0.181319 1048576\n",
382 | " 94 │ 0.186 0.186327 1048576\n",
383 | " 95 │ 0.188 0.188068 1048576\n",
384 | " 96 │ 0.19 0.187567 385280\n",
385 | "\u001b[36m 75 rows omitted\u001b[0m\n",
386 | " \u001b[1m96×3 DataFrame\u001b[0m\n",
387 | "\u001b[1m Row \u001b[0m│\u001b[1m mu \u001b[0m\u001b[1m x_mean \u001b[0m\u001b[1m nrow \u001b[0m\n",
388 | " │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\n",
389 | "─────┼────────────────────────────\n",
390 | " 1 │ 0.19 0.190411 663296\n",
391 | " 2 │ 0.192 0.19314 1048576\n",
392 | " 3 │ 0.194 0.193709 1048576\n",
393 | " 4 │ 0.196 0.195765 1048576\n",
394 | " 5 │ 0.198 0.198623 1048576\n",
395 | " 6 │ 0.2 0.200585 1048576\n",
396 | " 7 │ 0.202 0.200626 1048576\n",
397 | " 8 │ 0.204 0.205575 1048576\n",
398 | " 9 │ 0.206 0.205889 1048576\n",
399 | " 10 │ 0.208 0.207115 1048576\n",
400 | " 11 │ 0.21 0.20955 1048576\n",
401 | " ⋮ │ ⋮ ⋮ ⋮\n",
402 | " 87 │ 0.362 0.360563 1048576\n",
403 | " 88 │ 0.364 0.363005 1048576\n",
404 | " 89 │ 0.366 0.368417 1048576\n",
405 | " 90 │ 0.368 0.367043 1048576\n",
406 | " 91 │ 0.37 0.369158 1048576\n",
407 | " 92 │ 0.372 0.371584 1048576\n",
408 | " 93 │ 0.374 0.371797 1048576\n",
409 | " 94 │ 0.376 0.376891 1048576\n",
410 | " 95 │ 0.378 0.379768 1048576\n",
411 | " 96 │ 0.38 0.380511 770560\n",
412 | "\u001b[36m 75 rows omitted\u001b[0m\n",
413 | " \u001b[1m97×3 DataFrame\u001b[0m\n",
414 | "\u001b[1m Row \u001b[0m│\u001b[1m mu \u001b[0m\u001b[1m x_mean \u001b[0m\u001b[1m nrow \u001b[0m\n",
415 | " │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\n",
416 | "─────┼────────────────────────────\n",
417 | " 1 │ 0.38 0.379609 278016\n",
418 | " 2 │ 0.382 0.380177 1048576\n",
419 | " 3 │ 0.384 0.385076 1048576\n",
420 | " 4 │ 0.386 0.385815 1048576\n",
421 | " 5 │ 0.388 0.388314 1048576\n",
422 | " 6 │ 0.39 0.391087 1048576\n",
423 | " 7 │ 0.392 0.391428 1048576\n",
424 | " 8 │ 0.394 0.394603 1048576\n",
425 | " 9 │ 0.396 0.397369 1048576\n",
426 | " 10 │ 0.398 0.399379 1048576\n",
427 | " 11 │ 0.4 0.400048 1048576\n",
428 | " ⋮ │ ⋮ ⋮ ⋮\n",
429 | " 88 │ 0.554 0.553804 1048576\n",
430 | " 89 │ 0.556 0.557008 1048576\n",
431 | " 90 │ 0.558 0.558443 1048576\n",
432 | " 91 │ 0.56 0.559155 1048576\n",
433 | " 92 │ 0.562 0.5614 1048576\n",
434 | " 93 │ 0.564 0.565902 1048576\n",
435 | " 94 │ 0.566 0.565953 1048576\n",
436 | " 95 │ 0.568 0.567578 1048576\n",
437 | " 96 │ 0.57 0.570556 1048576\n",
438 | " 97 │ 0.572 0.574669 107264\n",
439 | "\u001b[36m 76 rows omitted\u001b[0m\n",
440 | " \u001b[1m96×3 DataFrame\u001b[0m\n",
441 | "\u001b[1m Row \u001b[0m│\u001b[1m mu \u001b[0m\u001b[1m x_mean \u001b[0m\u001b[1m nrow \u001b[0m\n",
442 | " │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\n",
443 | "─────┼────────────────────────────\n",
444 | " 1 │ 0.572 0.571994 941312\n",
445 | " 2 │ 0.574 0.573747 1048576\n",
446 | " 3 │ 0.576 0.575925 1048576\n",
447 | " 4 │ 0.578 0.57772 1048576\n",
448 | " 5 │ 0.58 0.578779 1048576\n",
449 | " 6 │ 0.582 0.582317 1048576\n",
450 | " 7 │ 0.584 0.585391 1048576\n",
451 | " 8 │ 0.586 0.585153 1048576\n",
452 | " 9 │ 0.588 0.587751 1048576\n",
453 | " 10 │ 0.59 0.5879 1048576\n",
454 | " 11 │ 0.592 0.593437 1048576\n",
455 | " ⋮ │ ⋮ ⋮ ⋮\n",
456 | " 87 │ 0.744 0.744916 1048576\n",
457 | " 88 │ 0.746 0.745429 1048576\n",
458 | " 89 │ 0.748 0.747461 1048576\n",
459 | " 90 │ 0.75 0.750765 1048576\n",
460 | " 91 │ 0.752 0.753685 1048576\n",
461 | " 92 │ 0.754 0.754218 1048576\n",
462 | " 93 │ 0.756 0.756017 1048576\n",
463 | " 94 │ 0.758 0.757438 1048576\n",
464 | " 95 │ 0.76 0.758445 1048576\n",
465 | " 96 │ 0.762 0.762476 492544\n",
466 | "\u001b[36m 75 rows omitted\u001b[0m\n",
467 | " \u001b[1m96×3 DataFrame\u001b[0m\n",
468 | "\u001b[1m Row \u001b[0m│\u001b[1m mu \u001b[0m\u001b[1m x_mean \u001b[0m\u001b[1m nrow \u001b[0m\n",
469 | " │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\n",
470 | "─────┼────────────────────────────\n",
471 | " 1 │ 0.762 0.762469 556032\n",
472 | " 2 │ 0.764 0.763604 1048576\n",
473 | " 3 │ 0.766 0.765431 1048576\n",
474 | " 4 │ 0.768 0.767295 1048576\n",
475 | " 5 │ 0.77 0.770628 1048576\n",
476 | " 6 │ 0.772 0.772683 1048576\n",
477 | " 7 │ 0.774 0.774978 1048576\n",
478 | " 8 │ 0.776 0.774604 1048576\n",
479 | " 9 │ 0.778 0.777869 1048576\n",
480 | " 10 │ 0.78 0.778066 1048576\n",
481 | " 11 │ 0.782 0.782015 1048576\n",
482 | " ⋮ │ ⋮ ⋮ ⋮\n",
483 | " 87 │ 0.934 0.932314 1048576\n",
484 | " 88 │ 0.936 0.93643 1048576\n",
485 | " 89 │ 0.938 0.937392 1048576\n",
486 | " 90 │ 0.94 0.94038 1048576\n",
487 | " 91 │ 0.942 0.942672 1048576\n",
488 | " 92 │ 0.944 0.943768 1048576\n",
489 | " 93 │ 0.946 0.946796 1048576\n",
490 | " 94 │ 0.948 0.947403 1048576\n",
491 | " 95 │ 0.95 0.95025 1048576\n",
492 | " 96 │ 0.952 0.953107 877824\n",
493 | "\u001b[36m 75 rows omitted\u001b[0m\n",
494 | " \u001b[1m25×3 DataFrame\u001b[0m\n",
495 | "\u001b[1m Row \u001b[0m│\u001b[1m mu \u001b[0m\u001b[1m x_mean \u001b[0m\u001b[1m nrow \u001b[0m\n",
496 | " │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\n",
497 | "─────┼────────────────────────────\n",
498 | " 1 │ 0.952 0.954526 170752\n",
499 | " 2 │ 0.954 0.95376 1048576\n",
500 | " 3 │ 0.956 0.955728 1048576\n",
501 | " 4 │ 0.958 0.95682 1048576\n",
502 | " 5 │ 0.96 0.959156 1048576\n",
503 | " 6 │ 0.962 0.962672 1048576\n",
504 | " 7 │ 0.964 0.965211 1048576\n",
505 | " 8 │ 0.966 0.964998 1048576\n",
506 | " 9 │ 0.968 0.968262 1048576\n",
507 | " 10 │ 0.97 0.969782 1048576\n",
508 | " 11 │ 0.972 0.971397 1048576\n",
509 | " ⋮ │ ⋮ ⋮ ⋮\n",
510 | " 16 │ 0.982 0.980645 1048576\n",
511 | " 17 │ 0.984 0.985538 1048576\n",
512 | " 18 │ 0.986 0.986935 1048576\n",
513 | " 19 │ 0.988 0.987878 1048576\n",
514 | " 20 │ 0.99 0.990001 1048576\n",
515 | " 21 │ 0.992 0.993618 1048576\n",
516 | " 22 │ 0.994 0.994494 1048576\n",
517 | " 23 │ 0.996 0.995764 1048576\n",
518 | " 24 │ 0.998 0.999058 1048576\n",
519 | " 25 │ 1.0 0.999101 1048576\n",
520 | "\u001b[36m 4 rows omitted\u001b[0m"
521 | ]
522 | },
523 | "execution_count": 14,
524 | "metadata": {},
525 | "output_type": "execute_result"
526 | }
527 | ],
528 | "source": [
529 | "agg1 = map(enumerate(pq_experiment)) do (i, rowset)\n",
530 | " @info \"processing chunk of data #$i\"\n",
531 | " df = DataFrame(rowset, copycols=false)\n",
532 | " gdf = groupby(df, :mu)\n",
533 | " return combine(gdf, :x => mean, nrow)\n",
534 | "end"
535 | ]
536 | },
537 | {
538 | "cell_type": "code",
539 | "execution_count": 15,
540 | "id": "53fff171",
541 | "metadata": {},
542 | "outputs": [
543 | {
544 | "data": {
545 | "text/html": [
546 | "
506×3 DataFrame
481 rows omitted
1 | 0.0 | -0.000165159 | 1048576 |
2 | 0.002 | 0.00181935 | 1048576 |
3 | 0.004 | 0.00547356 | 1048576 |
4 | 0.006 | 0.0053043 | 1048576 |
5 | 0.008 | 0.00748101 | 1048576 |
6 | 0.01 | 0.0105489 | 1048576 |
7 | 0.012 | 0.0113697 | 1048576 |
8 | 0.014 | 0.0133716 | 1048576 |
9 | 0.016 | 0.0163869 | 1048576 |
10 | 0.018 | 0.0192935 | 1048576 |
11 | 0.02 | 0.0198156 | 1048576 |
12 | 0.022 | 0.0234726 | 1048576 |
13 | 0.024 | 0.0237806 | 1048576 |
⋮ | ⋮ | ⋮ | ⋮ |
495 | 0.978 | 0.979726 | 1048576 |
496 | 0.98 | 0.980824 | 1048576 |
497 | 0.982 | 0.980645 | 1048576 |
498 | 0.984 | 0.985538 | 1048576 |
499 | 0.986 | 0.986935 | 1048576 |
500 | 0.988 | 0.987878 | 1048576 |
501 | 0.99 | 0.990001 | 1048576 |
502 | 0.992 | 0.993618 | 1048576 |
503 | 0.994 | 0.994494 | 1048576 |
504 | 0.996 | 0.995764 | 1048576 |
505 | 0.998 | 0.999058 | 1048576 |
506 | 1.0 | 0.999101 | 1048576 |
"
547 | ],
548 | "text/latex": [
549 | "\\begin{tabular}{r|ccc}\n",
550 | "\t& mu & x\\_mean & nrow\\\\\n",
551 | "\t\\hline\n",
552 | "\t& Float64 & Float64 & Int64\\\\\n",
553 | "\t\\hline\n",
554 | "\t1 & 0.0 & -0.000165159 & 1048576 \\\\\n",
555 | "\t2 & 0.002 & 0.00181935 & 1048576 \\\\\n",
556 | "\t3 & 0.004 & 0.00547356 & 1048576 \\\\\n",
557 | "\t4 & 0.006 & 0.0053043 & 1048576 \\\\\n",
558 | "\t5 & 0.008 & 0.00748101 & 1048576 \\\\\n",
559 | "\t6 & 0.01 & 0.0105489 & 1048576 \\\\\n",
560 | "\t7 & 0.012 & 0.0113697 & 1048576 \\\\\n",
561 | "\t8 & 0.014 & 0.0133716 & 1048576 \\\\\n",
562 | "\t9 & 0.016 & 0.0163869 & 1048576 \\\\\n",
563 | "\t10 & 0.018 & 0.0192935 & 1048576 \\\\\n",
564 | "\t11 & 0.02 & 0.0198156 & 1048576 \\\\\n",
565 | "\t12 & 0.022 & 0.0234726 & 1048576 \\\\\n",
566 | "\t13 & 0.024 & 0.0237806 & 1048576 \\\\\n",
567 | "\t14 & 0.026 & 0.0249626 & 1048576 \\\\\n",
568 | "\t15 & 0.028 & 0.0273897 & 1048576 \\\\\n",
569 | "\t16 & 0.03 & 0.030416 & 1048576 \\\\\n",
570 | "\t17 & 0.032 & 0.0332699 & 1048576 \\\\\n",
571 | "\t18 & 0.034 & 0.0349246 & 1048576 \\\\\n",
572 | "\t19 & 0.036 & 0.0382223 & 1048576 \\\\\n",
573 | "\t20 & 0.038 & 0.0385496 & 1048576 \\\\\n",
574 | "\t21 & 0.04 & 0.0399323 & 1048576 \\\\\n",
575 | "\t22 & 0.042 & 0.0425588 & 1048576 \\\\\n",
576 | "\t23 & 0.044 & 0.0439851 & 1048576 \\\\\n",
577 | "\t24 & 0.046 & 0.0462198 & 1048576 \\\\\n",
578 | "\t25 & 0.048 & 0.0465669 & 1048576 \\\\\n",
579 | "\t26 & 0.05 & 0.0487787 & 1048576 \\\\\n",
580 | "\t27 & 0.052 & 0.0511604 & 1048576 \\\\\n",
581 | "\t28 & 0.054 & 0.0538052 & 1048576 \\\\\n",
582 | "\t29 & 0.056 & 0.0564606 & 1048576 \\\\\n",
583 | "\t30 & 0.058 & 0.0583234 & 1048576 \\\\\n",
584 | "\t$\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ \\\\\n",
585 | "\\end{tabular}\n"
586 | ],
587 | "text/plain": [
588 | "\u001b[1m506×3 DataFrame\u001b[0m\n",
589 | "\u001b[1m Row \u001b[0m│\u001b[1m mu \u001b[0m\u001b[1m x_mean \u001b[0m\u001b[1m nrow \u001b[0m\n",
590 | " │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\n",
591 | "─────┼────────────────────────────────\n",
592 | " 1 │ 0.0 -0.000165159 1048576\n",
593 | " 2 │ 0.002 0.00181935 1048576\n",
594 | " 3 │ 0.004 0.00547356 1048576\n",
595 | " 4 │ 0.006 0.0053043 1048576\n",
596 | " 5 │ 0.008 0.00748101 1048576\n",
597 | " 6 │ 0.01 0.0105489 1048576\n",
598 | " 7 │ 0.012 0.0113697 1048576\n",
599 | " 8 │ 0.014 0.0133716 1048576\n",
600 | " 9 │ 0.016 0.0163869 1048576\n",
601 | " 10 │ 0.018 0.0192935 1048576\n",
602 | " 11 │ 0.02 0.0198156 1048576\n",
603 | " ⋮ │ ⋮ ⋮ ⋮\n",
604 | " 497 │ 0.982 0.980645 1048576\n",
605 | " 498 │ 0.984 0.985538 1048576\n",
606 | " 499 │ 0.986 0.986935 1048576\n",
607 | " 500 │ 0.988 0.987878 1048576\n",
608 | " 501 │ 0.99 0.990001 1048576\n",
609 | " 502 │ 0.992 0.993618 1048576\n",
610 | " 503 │ 0.994 0.994494 1048576\n",
611 | " 504 │ 0.996 0.995764 1048576\n",
612 | " 505 │ 0.998 0.999058 1048576\n",
613 | " 506 │ 1.0 0.999101 1048576\n",
614 | "\u001b[36m 485 rows omitted\u001b[0m"
615 | ]
616 | },
617 | "execution_count": 15,
618 | "metadata": {},
619 | "output_type": "execute_result"
620 | }
621 | ],
622 | "source": [
623 | "agg2 = reduce(vcat, agg1)"
624 | ]
625 | },
626 | {
627 | "cell_type": "code",
628 | "execution_count": 16,
629 | "id": "b8c8b31b",
630 | "metadata": {},
631 | "outputs": [
632 | {
633 | "data": {
634 | "text/html": [
635 | "501×2 DataFrame
476 rows omitted
1 | 0.0 | -0.000165159 |
2 | 0.002 | 0.00181935 |
3 | 0.004 | 0.00547356 |
4 | 0.006 | 0.0053043 |
5 | 0.008 | 0.00748101 |
6 | 0.01 | 0.0105489 |
7 | 0.012 | 0.0113697 |
8 | 0.014 | 0.0133716 |
9 | 0.016 | 0.0163869 |
10 | 0.018 | 0.0192935 |
11 | 0.02 | 0.0198156 |
12 | 0.022 | 0.0234726 |
13 | 0.024 | 0.0237806 |
⋮ | ⋮ | ⋮ |
490 | 0.978 | 0.979726 |
491 | 0.98 | 0.980824 |
492 | 0.982 | 0.980645 |
493 | 0.984 | 0.985538 |
494 | 0.986 | 0.986935 |
495 | 0.988 | 0.987878 |
496 | 0.99 | 0.990001 |
497 | 0.992 | 0.993618 |
498 | 0.994 | 0.994494 |
499 | 0.996 | 0.995764 |
500 | 0.998 | 0.999058 |
501 | 1.0 | 0.999101 |
"
636 | ],
637 | "text/latex": [
638 | "\\begin{tabular}{r|cc}\n",
639 | "\t& mu & x\\_mean\\\\\n",
640 | "\t\\hline\n",
641 | "\t& Float64 & Float64\\\\\n",
642 | "\t\\hline\n",
643 | "\t1 & 0.0 & -0.000165159 \\\\\n",
644 | "\t2 & 0.002 & 0.00181935 \\\\\n",
645 | "\t3 & 0.004 & 0.00547356 \\\\\n",
646 | "\t4 & 0.006 & 0.0053043 \\\\\n",
647 | "\t5 & 0.008 & 0.00748101 \\\\\n",
648 | "\t6 & 0.01 & 0.0105489 \\\\\n",
649 | "\t7 & 0.012 & 0.0113697 \\\\\n",
650 | "\t8 & 0.014 & 0.0133716 \\\\\n",
651 | "\t9 & 0.016 & 0.0163869 \\\\\n",
652 | "\t10 & 0.018 & 0.0192935 \\\\\n",
653 | "\t11 & 0.02 & 0.0198156 \\\\\n",
654 | "\t12 & 0.022 & 0.0234726 \\\\\n",
655 | "\t13 & 0.024 & 0.0237806 \\\\\n",
656 | "\t14 & 0.026 & 0.0249626 \\\\\n",
657 | "\t15 & 0.028 & 0.0273897 \\\\\n",
658 | "\t16 & 0.03 & 0.030416 \\\\\n",
659 | "\t17 & 0.032 & 0.0332699 \\\\\n",
660 | "\t18 & 0.034 & 0.0349246 \\\\\n",
661 | "\t19 & 0.036 & 0.0382223 \\\\\n",
662 | "\t20 & 0.038 & 0.0385496 \\\\\n",
663 | "\t21 & 0.04 & 0.0399323 \\\\\n",
664 | "\t22 & 0.042 & 0.0425588 \\\\\n",
665 | "\t23 & 0.044 & 0.0439851 \\\\\n",
666 | "\t24 & 0.046 & 0.0462198 \\\\\n",
667 | "\t25 & 0.048 & 0.0465669 \\\\\n",
668 | "\t26 & 0.05 & 0.0487787 \\\\\n",
669 | "\t27 & 0.052 & 0.0511604 \\\\\n",
670 | "\t28 & 0.054 & 0.0538052 \\\\\n",
671 | "\t29 & 0.056 & 0.0564606 \\\\\n",
672 | "\t30 & 0.058 & 0.0583234 \\\\\n",
673 | "\t$\\dots$ & $\\dots$ & $\\dots$ \\\\\n",
674 | "\\end{tabular}\n"
675 | ],
676 | "text/plain": [
677 | "\u001b[1m501×2 DataFrame\u001b[0m\n",
678 | "\u001b[1m Row \u001b[0m│\u001b[1m mu \u001b[0m\u001b[1m x_mean \u001b[0m\n",
679 | " │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\n",
680 | "─────┼───────────────────────\n",
681 | " 1 │ 0.0 -0.000165159\n",
682 | " 2 │ 0.002 0.00181935\n",
683 | " 3 │ 0.004 0.00547356\n",
684 | " 4 │ 0.006 0.0053043\n",
685 | " 5 │ 0.008 0.00748101\n",
686 | " 6 │ 0.01 0.0105489\n",
687 | " 7 │ 0.012 0.0113697\n",
688 | " 8 │ 0.014 0.0133716\n",
689 | " 9 │ 0.016 0.0163869\n",
690 | " 10 │ 0.018 0.0192935\n",
691 | " 11 │ 0.02 0.0198156\n",
692 | " ⋮ │ ⋮ ⋮\n",
693 | " 492 │ 0.982 0.980645\n",
694 | " 493 │ 0.984 0.985538\n",
695 | " 494 │ 0.986 0.986935\n",
696 | " 495 │ 0.988 0.987878\n",
697 | " 496 │ 0.99 0.990001\n",
698 | " 497 │ 0.992 0.993618\n",
699 | " 498 │ 0.994 0.994494\n",
700 | " 499 │ 0.996 0.995764\n",
701 | " 500 │ 0.998 0.999058\n",
702 | " 501 │ 1.0 0.999101\n",
703 | "\u001b[36m 480 rows omitted\u001b[0m"
704 | ]
705 | },
706 | "execution_count": 16,
707 | "metadata": {},
708 | "output_type": "execute_result"
709 | }
710 | ],
711 | "source": [
712 | "agg3 = combine(groupby(agg2, :mu)) do sdf\n",
713 | " return (; x_mean = mean(sdf.x_mean, Weights(sdf.nrow)))\n",
714 | "end"
715 | ]
716 | },
717 | {
718 | "cell_type": "markdown",
719 | "id": "eb93a9ec",
720 | "metadata": {},
721 | "source": [
722 | "Other common possible scenarios:\n",
723 | "\n",
724 | "* using multiple threads for in-core data\n",
725 | "* using several machines in a cluster"
726 | ]
727 | },
728 | {
729 | "cell_type": "markdown",
730 | "id": "a895c32b",
731 | "metadata": {},
732 | "source": [
733 | "What I presented above gives full flexibility, but requires manual handling of reduction.\n",
734 | "\n",
735 | "For common operations [DTables.jl](https://github.com/JuliaParallel/DTables.jl) provides distributed table structures and data manipulation operations built on top of Dagger.jl."
736 | ]
737 | },
738 | {
739 | "cell_type": "markdown",
740 | "id": "79263dbb",
741 | "metadata": {},
742 | "source": [
743 | "In part 3 we discuss some limitations of the Parquet format that one needs to keep in mind when working with it."
744 | ]
745 | },
746 | {
747 | "cell_type": "markdown",
748 | "id": "e7820da9",
749 | "metadata": {},
750 | "source": [
751 | "*Preparation of this workshop has been supported by the Polish National Agency for Academic Exchange under the Strategic Partnerships programme, grant number BPI/PST/2021/1/00069/U/00001.*\n",
752 | "\n",
753 | ""
754 | ]
755 | }
756 | ],
757 | "metadata": {
758 | "kernelspec": {
759 | "display_name": "Julia 1.9.2",
760 | "language": "julia",
761 | "name": "julia-1.9"
762 | },
763 | "language_info": {
764 | "file_extension": ".jl",
765 | "mimetype": "application/julia",
766 | "name": "julia",
767 | "version": "1.9.2"
768 | }
769 | },
770 | "nbformat": 4,
771 | "nbformat_minor": 5
772 | }
773 |
--------------------------------------------------------------------------------
/juliacon2023_part3_issues.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "81950dea",
6 | "metadata": {},
7 | "source": [
8 | "# Working with DataFrames.jl beyond CSV files\n",
9 | "\n",
10 | "# Part 3: Important limitations of Parquet\n",
11 | "\n",
12 | "## Bogumił Kamiński\n",
13 | "### June 25, 2023"
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "id": "4acb3ec2",
19 | "metadata": {},
20 | "source": [
21 | "What is covered in part 3:\n",
22 | "* Limitations of `RowGroup` size\n",
23 | "* Avoid excessive copying of data"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "id": "cf3d92ee",
29 | "metadata": {},
30 | "source": [
31 | "## Setup"
32 | ]
33 | },
34 | {
35 | "cell_type": "code",
36 | "execution_count": 1,
37 | "id": "3ac0783f",
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "using DataFrames"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": 2,
47 | "id": "8bc85c87",
48 | "metadata": {},
49 | "outputs": [],
50 | "source": [
51 | "using Parquet2"
52 | ]
53 | },
54 | {
55 | "cell_type": "markdown",
56 | "id": "180ff9ba",
57 | "metadata": {},
58 | "source": [
59 | "## Handling tables with a large number of rows"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 3,
65 | "id": "453115bc",
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "isfile(\"large_df.parquet\") && rm(\"large_df.parquet\")"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 4,
75 | "id": "4b246cce",
76 | "metadata": {},
77 | "outputs": [
78 | {
79 | "data": {
80 | "text/html": [
81 | "300000000×1 DataFrame
299999975 rows omitted
1 | 0.970803 |
2 | 0.841825 |
3 | 0.0484231 |
4 | 0.764995 |
5 | 0.438937 |
6 | 0.972333 |
7 | 0.357896 |
8 | 0.830436 |
9 | 0.090152 |
10 | 0.780723 |
11 | 0.144802 |
12 | 0.903739 |
13 | 0.375271 |
⋮ | ⋮ |
299999989 | 0.865466 |
299999990 | 0.850942 |
299999991 | 0.866739 |
299999992 | 0.281398 |
299999993 | 0.751348 |
299999994 | 0.819741 |
299999995 | 0.235294 |
299999996 | 0.760385 |
299999997 | 0.588597 |
299999998 | 0.580177 |
299999999 | 0.456726 |
300000000 | 0.554389 |
"
82 | ],
83 | "text/latex": [
84 | "\\begin{tabular}{r|c}\n",
85 | "\t& x\\\\\n",
86 | "\t\\hline\n",
87 | "\t& Float64\\\\\n",
88 | "\t\\hline\n",
89 | "\t1 & 0.970803 \\\\\n",
90 | "\t2 & 0.841825 \\\\\n",
91 | "\t3 & 0.0484231 \\\\\n",
92 | "\t4 & 0.764995 \\\\\n",
93 | "\t5 & 0.438937 \\\\\n",
94 | "\t6 & 0.972333 \\\\\n",
95 | "\t7 & 0.357896 \\\\\n",
96 | "\t8 & 0.830436 \\\\\n",
97 | "\t9 & 0.090152 \\\\\n",
98 | "\t10 & 0.780723 \\\\\n",
99 | "\t11 & 0.144802 \\\\\n",
100 | "\t12 & 0.903739 \\\\\n",
101 | "\t13 & 0.375271 \\\\\n",
102 | "\t14 & 0.713255 \\\\\n",
103 | "\t15 & 0.855335 \\\\\n",
104 | "\t16 & 0.633159 \\\\\n",
105 | "\t17 & 0.815826 \\\\\n",
106 | "\t18 & 0.0735042 \\\\\n",
107 | "\t19 & 0.573114 \\\\\n",
108 | "\t20 & 0.403656 \\\\\n",
109 | "\t21 & 0.172389 \\\\\n",
110 | "\t22 & 0.365429 \\\\\n",
111 | "\t23 & 0.290728 \\\\\n",
112 | "\t24 & 0.228912 \\\\\n",
113 | "\t25 & 0.0985188 \\\\\n",
114 | "\t26 & 0.387811 \\\\\n",
115 | "\t27 & 0.978131 \\\\\n",
116 | "\t28 & 0.859664 \\\\\n",
117 | "\t29 & 0.922425 \\\\\n",
118 | "\t30 & 0.868172 \\\\\n",
119 | "\t$\\dots$ & $\\dots$ \\\\\n",
120 | "\\end{tabular}\n"
121 | ],
122 | "text/plain": [
123 | "\u001b[1m300000000×1 DataFrame\u001b[0m\n",
124 | "\u001b[1m Row \u001b[0m│\u001b[1m x \u001b[0m\n",
125 | " │\u001b[90m Float64 \u001b[0m\n",
126 | "───────────┼───────────\n",
127 | " 1 │ 0.970803\n",
128 | " 2 │ 0.841825\n",
129 | " 3 │ 0.0484231\n",
130 | " 4 │ 0.764995\n",
131 | " 5 │ 0.438937\n",
132 | " 6 │ 0.972333\n",
133 | " 7 │ 0.357896\n",
134 | " 8 │ 0.830436\n",
135 | " 9 │ 0.090152\n",
136 | " 10 │ 0.780723\n",
137 | " 11 │ 0.144802\n",
138 | " ⋮ │ ⋮\n",
139 | " 299999991 │ 0.866739\n",
140 | " 299999992 │ 0.281398\n",
141 | " 299999993 │ 0.751348\n",
142 | " 299999994 │ 0.819741\n",
143 | " 299999995 │ 0.235294\n",
144 | " 299999996 │ 0.760385\n",
145 | " 299999997 │ 0.588597\n",
146 | " 299999998 │ 0.580177\n",
147 | " 299999999 │ 0.456726\n",
148 | " 300000000 │ 0.554389\n",
149 | "\u001b[36m 299999979 rows omitted\u001b[0m"
150 | ]
151 | },
152 | "execution_count": 4,
153 | "metadata": {},
154 | "output_type": "execute_result"
155 | }
156 | ],
157 | "source": [
158 | "large_df = DataFrame(x=rand(3*10^8))"
159 | ]
160 | },
161 | {
162 | "cell_type": "markdown",
163 | "id": "42e65e33",
164 | "metadata": {},
165 | "source": [
166 | "This table has too many rows and cannot be stored in Parquet as one `RowGroup`: its 3*10^8 `Float64` values occupy 2.4*10^9 bytes, which is more than an `Int32` can represent."
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": 5,
172 | "id": "59088cc4",
173 | "metadata": {},
174 | "outputs": [
175 | {
176 | "ename": "LoadError",
177 | "evalue": "InexactError: trunc(Int32, 2400000000)",
178 | "output_type": "error",
179 | "traceback": [
180 | "InexactError: trunc(Int32, 2400000000)",
181 | "",
182 | "Stacktrace:",
183 | " [1] throw_inexacterror(f::Symbol, #unused#::Type{Int32}, val::Int64)",
184 | " @ Core .\\boot.jl:634",
185 | " ...",
186 | " [25] top-level scope",
187 | " @ In[5]:1"
188 | ]
189 | }
190 | ],
191 | "source": [
192 | "Parquet2.writefile(\"large_df.parquet\", large_df)"
193 | ]
194 | },
195 | {
196 | "cell_type": "markdown",
197 | "id": "84101f37",
198 | "metadata": {},
199 | "source": [
200 | "We need to split it into partitions of smaller size using `Iterators.partition`:"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 6,
206 | "id": "18ba68dd",
207 | "metadata": {},
208 | "outputs": [
209 | {
210 | "data": {
211 | "text/plain": [
212 | "\u001b[34m✏ \u001b[39mParquet2.FileWriter{IOStream}(large_df.parquet)"
213 | ]
214 | },
215 | "execution_count": 6,
216 | "metadata": {},
217 | "output_type": "execute_result"
218 | }
219 | ],
220 | "source": [
221 | "Parquet2.writefile(\"large_df.parquet\", Iterators.partition(large_df, 10^8))"
222 | ]
223 | },
224 | {
225 | "cell_type": "markdown",
226 | "id": "4f2ffa33",
227 | "metadata": {},
228 | "source": [
229 | "Drop original data to save memory:"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": 7,
235 | "id": "6d1fc286",
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "large_df = nothing"
240 | ]
241 | },
242 | {
243 | "cell_type": "markdown",
244 | "id": "85895503",
245 | "metadata": {},
246 | "source": [
247 | "## Impact of `copycols` keyword argument when fetching data to a `DataFrame`"
248 | ]
249 | },
250 | {
251 | "cell_type": "markdown",
252 | "id": "6e716993",
253 | "metadata": {},
254 | "source": [
255 | "`copycols=true` option:"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 8,
261 | "id": "493809aa",
262 | "metadata": {},
263 | "outputs": [
264 | {
265 | "data": {
266 | "text/plain": [
267 | "12000208304"
268 | ]
269 | },
270 | "execution_count": 8,
271 | "metadata": {},
272 | "output_type": "execute_result"
273 | }
274 | ],
275 | "source": [
276 | "DataFrame(Parquet2.readfile(\"large_df.parquet\"))\n",
277 | "GC.gc(); GC.gc(); GC.gc(); GC.gc()\n",
278 | "@allocated DataFrame(Parquet2.readfile(\"large_df.parquet\"))"
279 | ]
280 | },
281 | {
282 | "cell_type": "markdown",
283 | "id": "a900295c",
284 | "metadata": {},
285 | "source": [
286 | "`copycols=false` option:"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": 9,
292 | "id": "b2ba467a",
293 | "metadata": {},
294 | "outputs": [
295 | {
296 | "data": {
297 | "text/plain": [
298 | "9600208336"
299 | ]
300 | },
301 | "execution_count": 9,
302 | "metadata": {},
303 | "output_type": "execute_result"
304 | }
305 | ],
306 | "source": [
307 | "DataFrame(Parquet2.readfile(\"large_df.parquet\"), copycols=false)\n",
308 | "GC.gc(); GC.gc(); GC.gc(); GC.gc()\n",
309 | "@allocated DataFrame(Parquet2.readfile(\"large_df.parquet\"), copycols=false)"
310 | ]
311 | },
312 | {
313 | "cell_type": "markdown",
314 | "id": "9d2f8738",
315 | "metadata": {},
316 | "source": [
317 | "**This issue has been fixed in Parquet2.jl version 0.2.18. Since this version you can omit passing `copycols=false`. Excessive copying is automatically avoided.**"
318 | ]
319 | },
320 | {
321 | "cell_type": "markdown",
322 | "id": "97ea16bc",
323 | "metadata": {},
324 | "source": [
325 | "*Preparation of this workshop has been supported by the Polish National Agency for Academic Exchange under the Strategic Partnerships programme, grant number BPI/PST/2021/1/00069/U/00001.*\n",
326 | "\n",
327 | ""
328 | ]
329 | }
330 | ],
331 | "metadata": {
332 | "kernelspec": {
333 | "display_name": "Julia 1.9.2",
334 | "language": "julia",
335 | "name": "julia-1.9"
336 | },
337 | "language_info": {
338 | "file_extension": ".jl",
339 | "mimetype": "application/julia",
340 | "name": "julia",
341 | "version": "1.9.2"
342 | }
343 | },
344 | "nbformat": 4,
345 | "nbformat_minor": 5
346 | }
347 |
--------------------------------------------------------------------------------
/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/bkamins/JuliaCon2023-Tutorial/2da04fe2d76a2d468e96d7795b3d713de914a31f/logo.png
--------------------------------------------------------------------------------