├── LICENSE ├── Manifest.toml ├── Project.toml ├── README.md ├── WDI2009.dta ├── cars.RData ├── juliacon2023_part1_intro.ipynb ├── juliacon2023_part2_largedata.ipynb ├── juliacon2023_part3_issues.ipynb └── logo.png /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Bogumił Kamiński 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /Manifest.toml: -------------------------------------------------------------------------------- 1 | # This file is machine-generated - editing it directly is not advised 2 | 3 | julia_version = "1.9.2" 4 | manifest_format = "2.0" 5 | project_hash = "3e8dcfe1bfca6cff0aed9017c4b73bc43ca70ef3" 6 | 7 | [[deps.AbstractTrees]] 8 | git-tree-sha1 = "faa260e4cb5aba097a73fab382dd4b5819d8ec8c" 9 | uuid = "1520ce14-60c1-5f80-bbc7-55ef81b5835c" 10 | version = "0.4.4" 11 | 12 | [[deps.Adapt]] 13 | deps = ["LinearAlgebra", "Requires"] 14 | git-tree-sha1 = "76289dc51920fdc6e0013c872ba9551d54961c24" 15 | uuid = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" 16 | version = "3.6.2" 17 | weakdeps = ["StaticArrays"] 18 | 19 | [deps.Adapt.extensions] 20 | AdaptStaticArraysExt = "StaticArrays" 21 | 22 | [[deps.ArgCheck]] 23 | git-tree-sha1 = "a3a402a35a2f7e0b87828ccabbd5ebfbebe356b4" 24 | uuid = "dce04be8-c92d-5529-be00-80e4d2c0e197" 25 | version = "2.3.0" 26 | 27 | [[deps.ArgTools]] 28 | uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" 29 | version = "1.1.1" 30 | 31 | [[deps.ArrayLayouts]] 32 | deps = ["FillArrays", "LinearAlgebra", "SparseArrays"] 33 | git-tree-sha1 = "06fb6abc448771b8eac175fd675c2e4453c4e7bd" 34 | uuid = "4c555306-a7a7-4459-81d9-ec55ddd5c99a" 35 | version = "1.0.13" 36 | 37 | [[deps.Artifacts]] 38 | uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" 39 | 40 | [[deps.BangBang]] 41 | deps = ["Compat", "ConstructionBase", "InitialValues", "LinearAlgebra", "Requires", "Setfield", "Tables"] 42 | git-tree-sha1 = "e28912ce94077686443433c2800104b061a827ed" 43 | uuid = "198e06fe-97b7-11e9-32a5-e1d131e6ad66" 44 | version = "0.3.39" 45 | 46 | [deps.BangBang.extensions] 47 | BangBangChainRulesCoreExt = "ChainRulesCore" 48 | BangBangDataFramesExt = "DataFrames" 49 | BangBangStaticArraysExt = "StaticArrays" 50 | BangBangStructArraysExt = "StructArrays" 51 | BangBangTypedTablesExt = "TypedTables" 52 | 53 | 
[deps.BangBang.weakdeps] 54 | ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" 55 | DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" 56 | StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" 57 | StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" 58 | TypedTables = "9d95f2ec-7b3d-5a63-8d20-e2491e220bb9" 59 | 60 | [[deps.Base64]] 61 | uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" 62 | 63 | [[deps.Baselet]] 64 | git-tree-sha1 = "aebf55e6d7795e02ca500a689d326ac979aaf89e" 65 | uuid = "9718e550-a3fa-408a-8086-8db961cd8217" 66 | version = "0.1.1" 67 | 68 | [[deps.BitIntegers]] 69 | deps = ["Random"] 70 | git-tree-sha1 = "fc54d5837033a170f3bad307f993e156eefc345f" 71 | uuid = "c3b6d118-76ef-56ca-8cc7-ebb389d030a1" 72 | version = "0.2.7" 73 | 74 | [[deps.CEnum]] 75 | git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" 76 | uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" 77 | version = "0.4.2" 78 | 79 | [[deps.CategoricalArrays]] 80 | deps = ["DataAPI", "Future", "Missings", "Printf", "Requires", "Statistics", "Unicode"] 81 | git-tree-sha1 = "1568b28f91293458345dabba6a5ea3f183250a61" 82 | uuid = "324d7699-5711-5eae-9e2f-1d82baa6b597" 83 | version = "0.10.8" 84 | weakdeps = ["JSON", "RecipesBase", "SentinelArrays", "StructTypes"] 85 | 86 | [deps.CategoricalArrays.extensions] 87 | CategoricalArraysJSONExt = "JSON" 88 | CategoricalArraysRecipesBaseExt = "RecipesBase" 89 | CategoricalArraysSentinelArraysExt = "SentinelArrays" 90 | CategoricalArraysStructTypesExt = "StructTypes" 91 | 92 | [[deps.CodecLz4]] 93 | deps = ["Lz4_jll", "TranscodingStreams"] 94 | git-tree-sha1 = "59fe0cb37784288d6b9f1baebddbf75457395d40" 95 | uuid = "5ba52731-8f18-5e0d-9241-30f10d1ec561" 96 | version = "0.4.0" 97 | 98 | [[deps.CodecXz]] 99 | deps = ["Libdl", "TranscodingStreams", "XZ_jll"] 100 | git-tree-sha1 = "82c4c000edf64b6bda6766377e69a1028f3549ee" 101 | uuid = "ba30903b-d9e8-5048-a5ec-d1f5b0d4b47b" 102 | version = "0.7.0" 103 | 104 | [[deps.CodecZlib]] 105 | deps = 
["TranscodingStreams", "Zlib_jll"] 106 | git-tree-sha1 = "02aa26a4cf76381be7f66e020a3eddeb27b0a092" 107 | uuid = "944b1d66-785c-5afd-91f1-9de20f533193" 108 | version = "0.7.2" 109 | 110 | [[deps.CodecZstd]] 111 | deps = ["CEnum", "TranscodingStreams", "Zstd_jll"] 112 | git-tree-sha1 = "849470b337d0fa8449c21061de922386f32949d9" 113 | uuid = "6b39b394-51ab-5f42-8807-6242bab2b4c2" 114 | version = "0.7.2" 115 | 116 | [[deps.Compat]] 117 | deps = ["UUIDs"] 118 | git-tree-sha1 = "4e88377ae7ebeaf29a047aa1ee40826e0b708a5d" 119 | uuid = "34da2185-b29b-5c13-b0c7-acf172513d20" 120 | version = "4.7.0" 121 | weakdeps = ["Dates", "LinearAlgebra"] 122 | 123 | [deps.Compat.extensions] 124 | CompatLinearAlgebraExt = "LinearAlgebra" 125 | 126 | [[deps.CompilerSupportLibraries_jll]] 127 | deps = ["Artifacts", "Libdl"] 128 | uuid = "e66e0078-7015-5450-92f7-15fbd957f2ae" 129 | version = "1.0.5+0" 130 | 131 | [[deps.CompositionsBase]] 132 | git-tree-sha1 = "802bb88cd69dfd1509f6670416bd4434015693ad" 133 | uuid = "a33af91c-f02d-484b-be07-31d278c5ca2b" 134 | version = "0.1.2" 135 | 136 | [deps.CompositionsBase.extensions] 137 | CompositionsBaseInverseFunctionsExt = "InverseFunctions" 138 | 139 | [deps.CompositionsBase.weakdeps] 140 | InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" 141 | 142 | [[deps.Conda]] 143 | deps = ["Downloads", "JSON", "VersionParsing"] 144 | git-tree-sha1 = "8c86e48c0db1564a1d49548d3515ced5d604c408" 145 | uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d" 146 | version = "1.9.1" 147 | 148 | [[deps.ConstructionBase]] 149 | deps = ["LinearAlgebra"] 150 | git-tree-sha1 = "fe2838a593b5f776e1597e086dcd47560d94e816" 151 | uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9" 152 | version = "1.5.3" 153 | 154 | [deps.ConstructionBase.extensions] 155 | ConstructionBaseIntervalSetsExt = "IntervalSets" 156 | ConstructionBaseStaticArraysExt = "StaticArrays" 157 | 158 | [deps.ConstructionBase.weakdeps] 159 | IntervalSets = "8197267c-284f-5f27-9208-e0e47529a953" 160 | StaticArrays 
= "90137ffa-7385-5640-81b9-e52037218182" 161 | 162 | [[deps.Crayons]] 163 | git-tree-sha1 = "249fe38abf76d48563e2f4556bebd215aa317e15" 164 | uuid = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f" 165 | version = "4.1.1" 166 | 167 | [[deps.DBInterface]] 168 | git-tree-sha1 = "9b0dc525a052b9269ccc5f7f04d5b3639c65bca5" 169 | uuid = "a10d1c49-ce27-4219-8d33-6db1a4562965" 170 | version = "2.5.0" 171 | 172 | [[deps.DataAPI]] 173 | git-tree-sha1 = "8da84edb865b0b5b0100c0666a9bc9a0b71c553c" 174 | uuid = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" 175 | version = "1.15.0" 176 | 177 | [[deps.DataFrames]] 178 | deps = ["Compat", "DataAPI", "DataStructures", "Future", "InlineStrings", "InvertedIndices", "IteratorInterfaceExtensions", "LinearAlgebra", "Markdown", "Missings", "PooledArrays", "PrecompileTools", "PrettyTables", "Printf", "REPL", "Random", "Reexport", "SentinelArrays", "SortingAlgorithms", "Statistics", "TableTraits", "Tables", "Unicode"] 179 | git-tree-sha1 = "04c738083f29f86e62c8afc341f0967d8717bdb8" 180 | uuid = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" 181 | version = "1.6.1" 182 | 183 | [[deps.DataStructures]] 184 | deps = ["Compat", "InteractiveUtils", "OrderedCollections"] 185 | git-tree-sha1 = "cf25ccb972fec4e4817764d01c82386ae94f77b4" 186 | uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" 187 | version = "0.18.14" 188 | 189 | [[deps.DataValueInterfaces]] 190 | git-tree-sha1 = "bfc1187b79289637fa0ef6d4436ebdfe6905cbd6" 191 | uuid = "e2d170a0-9d28-54be-80f0-106bbe20a464" 192 | version = "1.0.0" 193 | 194 | [[deps.Dates]] 195 | deps = ["Printf"] 196 | uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" 197 | 198 | [[deps.DecFP]] 199 | deps = ["DecFP_jll", "Printf", "Random", "SpecialFunctions"] 200 | git-tree-sha1 = "4a10cec664e26d9d63597daf9e62147e79d636e3" 201 | uuid = "55939f99-70c6-5e9b-8bb0-5071ed7d61fd" 202 | version = "1.3.2" 203 | 204 | [[deps.DecFP_jll]] 205 | deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] 206 | git-tree-sha1 = "e9a8da19f847bbfed4076071f6fef8665a30d9e5" 
207 | uuid = "47200ebd-12ce-5be5-abb7-8e082af23329" 208 | version = "2.0.3+1" 209 | 210 | [[deps.DefineSingletons]] 211 | git-tree-sha1 = "0fba8b706d0178b4dc7fd44a96a92382c9065c2c" 212 | uuid = "244e2a9f-e319-4986-a169-4d1fe445cd52" 213 | version = "0.1.2" 214 | 215 | [[deps.Distributed]] 216 | deps = ["Random", "Serialization", "Sockets"] 217 | uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" 218 | 219 | [[deps.DocStringExtensions]] 220 | deps = ["LibGit2"] 221 | git-tree-sha1 = "2fb1e02f2b635d0845df5d7c167fec4dd739b00d" 222 | uuid = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae" 223 | version = "0.9.3" 224 | 225 | [[deps.Downloads]] 226 | deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] 227 | uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" 228 | version = "1.6.0" 229 | 230 | [[deps.DuckDB]] 231 | deps = ["DBInterface", "DataFrames", "Dates", "DuckDB_jll", "FixedPointDecimals", "Tables", "UUIDs", "WeakRefStrings"] 232 | git-tree-sha1 = "88cd745f64a570e7f865c49c17f59822f7f7e47b" 233 | uuid = "d2f5444f-75bc-4fdf-ac35-56f514c445e1" 234 | version = "0.8.1" 235 | 236 | [[deps.DuckDB_jll]] 237 | deps = ["Artifacts", "JLLWrappers", "Libdl"] 238 | git-tree-sha1 = "f23f3781c620a97a9d0f7e4e057e94f9c9ef70e1" 239 | uuid = "2cbbab25-fc8b-58cf-88d4-687a02676033" 240 | version = "0.8.1+0" 241 | 242 | [[deps.ExprTools]] 243 | git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00" 244 | uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" 245 | version = "0.1.9" 246 | 247 | [[deps.FNVHash]] 248 | git-tree-sha1 = "d6de2c735a8bffce9bc481942dfa453cc815357e" 249 | uuid = "5207ad80-27db-4d23-8732-fa0bd339ea89" 250 | version = "0.1.0" 251 | 252 | [[deps.FileIO]] 253 | deps = ["Pkg", "Requires", "UUIDs"] 254 | git-tree-sha1 = "299dc33549f68299137e51e6d49a13b5b1da9673" 255 | uuid = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" 256 | version = "1.16.1" 257 | 258 | [[deps.FilePathsBase]] 259 | deps = ["Compat", "Dates", "Mmap", "Printf", "Test", "UUIDs"] 260 | git-tree-sha1 = 
"e27c4ebe80e8699540f2d6c805cc12203b614f12" 261 | uuid = "48062228-2e41-5def-b9a4-89aafe57970f" 262 | version = "0.9.20" 263 | 264 | [[deps.FileWatching]] 265 | uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" 266 | 267 | [[deps.FillArrays]] 268 | deps = ["LinearAlgebra", "Random", "SparseArrays", "Statistics"] 269 | git-tree-sha1 = "f0af9b12329a637e8fba7d6543f915fff6ba0090" 270 | uuid = "1a297f60-69ca-5386-bcde-b61e274b549b" 271 | version = "1.4.2" 272 | 273 | [[deps.FixedPointDecimals]] 274 | deps = ["Parsers"] 275 | git-tree-sha1 = "d58aa8e85901dee0915262c1c2697c4037281982" 276 | uuid = "fb4d412d-6eee-574d-9565-ede6634db7b0" 277 | version = "0.4.3" 278 | 279 | [[deps.Formatting]] 280 | deps = ["Printf"] 281 | git-tree-sha1 = "8339d61043228fdd3eb658d86c926cb282ae72a8" 282 | uuid = "59287772-0a20-5a39-b81b-1366585eb4c0" 283 | version = "0.4.2" 284 | 285 | [[deps.Future]] 286 | deps = ["Random"] 287 | uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820" 288 | 289 | [[deps.GPUArraysCore]] 290 | deps = ["Adapt"] 291 | git-tree-sha1 = "2d6ca471a6c7b536127afccfa7564b5b39227fe0" 292 | uuid = "46192b85-c4d5-4398-a991-12ede77f4527" 293 | version = "0.1.5" 294 | 295 | [[deps.IJulia]] 296 | deps = ["Base64", "Conda", "Dates", "InteractiveUtils", "JSON", "Libdl", "Logging", "Markdown", "MbedTLS", "Pkg", "Printf", "REPL", "Random", "SoftGlobalScope", "Test", "UUIDs", "ZMQ"] 297 | git-tree-sha1 = "47ac8cc196b81001a711f4b2c12c97372338f00c" 298 | uuid = "7073ff75-c697-5162-941a-fcdaad2a7d2a" 299 | version = "1.24.2" 300 | 301 | [[deps.InitialValues]] 302 | git-tree-sha1 = "4da0f88e9a39111c2fa3add390ab15f3a44f3ca3" 303 | uuid = "22cec73e-a1b8-11e9-2c92-598750a2cf9c" 304 | version = "0.3.1" 305 | 306 | [[deps.InlineStrings]] 307 | deps = ["Parsers"] 308 | git-tree-sha1 = "9cc2baf75c6d09f9da536ddf58eb2f29dedaf461" 309 | uuid = "842dd82b-1e85-43dc-bf29-5d0ee9dffc48" 310 | version = "1.4.0" 311 | 312 | [[deps.InteractiveUtils]] 313 | deps = ["Markdown"] 314 | uuid = 
"b77e0a4c-d291-57a0-90e8-8db25a27a240" 315 | 316 | [[deps.InvertedIndices]] 317 | git-tree-sha1 = "0dc7b50b8d436461be01300fd8cd45aa0274b038" 318 | uuid = "41ab1584-1d38-5bbf-9106-f11c6c58b48f" 319 | version = "1.3.0" 320 | 321 | [[deps.IrrationalConstants]] 322 | git-tree-sha1 = "630b497eafcc20001bba38a4651b327dcfc491d2" 323 | uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" 324 | version = "0.2.2" 325 | 326 | [[deps.IteratorInterfaceExtensions]] 327 | git-tree-sha1 = "a3f24677c21f5bbe9d2a714f95dcd58337fb2856" 328 | uuid = "82899510-4779-5014-852e-03e436cf321d" 329 | version = "1.0.0" 330 | 331 | [[deps.JLLWrappers]] 332 | deps = ["Preferences"] 333 | git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1" 334 | uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" 335 | version = "1.4.1" 336 | 337 | [[deps.JSON]] 338 | deps = ["Dates", "Mmap", "Parsers", "Unicode"] 339 | git-tree-sha1 = "31e996f0a15c7b280ba9f76636b3ff9e2ae58c9a" 340 | uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" 341 | version = "0.21.4" 342 | 343 | [[deps.JSON3]] 344 | deps = ["Dates", "Mmap", "Parsers", "PrecompileTools", "StructTypes", "UUIDs"] 345 | git-tree-sha1 = "5b62d93f2582b09e469b3099d839c2d2ebf5066d" 346 | uuid = "0f8b85d8-7281-11e9-16c2-39a750bddbf1" 347 | version = "1.13.1" 348 | 349 | [[deps.LZO_jll]] 350 | deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] 351 | git-tree-sha1 = "e5b909bcf985c5e2605737d2ce278ed791b89be6" 352 | uuid = "dd4b983a-f0e5-5f8d-a1b7-129d4a5fb1ac" 353 | version = "2.10.1+0" 354 | 355 | [[deps.LaTeXStrings]] 356 | git-tree-sha1 = "f2355693d6778a178ade15952b7ac47a4ff97996" 357 | uuid = "b964fa9f-0449-5b57-a5c2-d3ea65f4040f" 358 | version = "1.3.0" 359 | 360 | [[deps.LazyArrays]] 361 | deps = ["ArrayLayouts", "FillArrays", "LinearAlgebra", "MacroTools", "MatrixFactorizations", "SparseArrays"] 362 | git-tree-sha1 = "a552e17ee600c6fa933f3f9bff7c380b2e032ba8" 363 | uuid = "5078a376-72f3-5289-bfd5-ec5146d43c02" 364 | version = "1.4.1" 365 | weakdeps = ["StaticArrays"] 366 
| 367 | [deps.LazyArrays.extensions] 368 | LazyArraysStaticArraysExt = "StaticArrays" 369 | 370 | [[deps.LazyArtifacts]] 371 | deps = ["Artifacts", "Pkg"] 372 | uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" 373 | 374 | [[deps.LibCURL]] 375 | deps = ["LibCURL_jll", "MozillaCACerts_jll"] 376 | uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" 377 | version = "0.6.3" 378 | 379 | [[deps.LibCURL_jll]] 380 | deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] 381 | uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" 382 | version = "7.84.0+0" 383 | 384 | [[deps.LibGit2]] 385 | deps = ["Base64", "NetworkOptions", "Printf", "SHA"] 386 | uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" 387 | 388 | [[deps.LibSSH2_jll]] 389 | deps = ["Artifacts", "Libdl", "MbedTLS_jll"] 390 | uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" 391 | version = "1.10.2+0" 392 | 393 | [[deps.Libdl]] 394 | uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" 395 | 396 | [[deps.Libiconv_jll]] 397 | deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] 398 | git-tree-sha1 = "c7cb1f5d892775ba13767a87c7ada0b980ea0a71" 399 | uuid = "94ce4f54-9a6c-5748-9c1c-f9c7231a4531" 400 | version = "1.16.1+2" 401 | 402 | [[deps.LightBSON]] 403 | deps = ["DataStructures", "Dates", "DecFP", "FNVHash", "JSON3", "Sockets", "StructTypes", "Transducers", "UUIDs", "UnsafeArrays", "WeakRefStrings"] 404 | git-tree-sha1 = "66369db4570bcd852bde2dd39beaa559bc9890dd" 405 | uuid = "a4a7f996-b3a6-4de6-b9db-2fa5f350df41" 406 | version = "0.2.16" 407 | 408 | [[deps.LinearAlgebra]] 409 | deps = ["Libdl", "OpenBLAS_jll", "libblastrampoline_jll"] 410 | uuid = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" 411 | 412 | [[deps.LogExpFunctions]] 413 | deps = ["DocStringExtensions", "IrrationalConstants", "LinearAlgebra"] 414 | git-tree-sha1 = "c3ce8e7420b3a6e071e0fe4745f5d4300e37b13f" 415 | uuid = "2ab3a3ac-af41-5b50-aa03-7779005ae688" 416 | version = "0.3.24" 417 | 418 | [deps.LogExpFunctions.extensions] 419 | 
LogExpFunctionsChainRulesCoreExt = "ChainRulesCore" 420 | LogExpFunctionsChangesOfVariablesExt = "ChangesOfVariables" 421 | LogExpFunctionsInverseFunctionsExt = "InverseFunctions" 422 | 423 | [deps.LogExpFunctions.weakdeps] 424 | ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" 425 | ChangesOfVariables = "9e997f8a-9a97-42d5-a9f1-ce6bfc15e2c0" 426 | InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" 427 | 428 | [[deps.Logging]] 429 | uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" 430 | 431 | [[deps.Lz4_jll]] 432 | deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] 433 | git-tree-sha1 = "5d494bc6e85c4c9b626ee0cab05daa4085486ab1" 434 | uuid = "5ced341a-0733-55b8-9ab6-a4889d929147" 435 | version = "1.9.3+0" 436 | 437 | [[deps.MacroTools]] 438 | deps = ["Markdown", "Random"] 439 | git-tree-sha1 = "42324d08725e200c23d4dfb549e0d5d89dede2d2" 440 | uuid = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" 441 | version = "0.5.10" 442 | 443 | [[deps.Markdown]] 444 | deps = ["Base64"] 445 | uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" 446 | 447 | [[deps.MatrixFactorizations]] 448 | deps = ["ArrayLayouts", "LinearAlgebra", "Printf", "Random"] 449 | git-tree-sha1 = "6507b5bde6500ae31c01a1d893764e130b62256d" 450 | uuid = "a3b82374-2e81-5b9e-98ce-41277c0e4c87" 451 | version = "2.0.0" 452 | 453 | [[deps.MbedTLS]] 454 | deps = ["Dates", "MbedTLS_jll", "MozillaCACerts_jll", "Random", "Sockets"] 455 | git-tree-sha1 = "03a9b9718f5682ecb107ac9f7308991db4ce395b" 456 | uuid = "739be429-bea8-5141-9913-cc70e7f3736d" 457 | version = "1.1.7" 458 | 459 | [[deps.MbedTLS_jll]] 460 | deps = ["Artifacts", "Libdl"] 461 | uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" 462 | version = "2.28.2+0" 463 | 464 | [[deps.MicroCollections]] 465 | deps = ["BangBang", "InitialValues", "Setfield"] 466 | git-tree-sha1 = "629afd7d10dbc6935ec59b32daeb33bc4460a42e" 467 | uuid = "128add7d-3638-4c79-886c-908ea0c25c34" 468 | version = "0.1.4" 469 | 470 | [[deps.Missings]] 471 | deps = ["DataAPI"] 472 | 
git-tree-sha1 = "f66bdc5de519e8f8ae43bdc598782d35a25b1272" 473 | uuid = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" 474 | version = "1.1.0" 475 | 476 | [[deps.Mmap]] 477 | uuid = "a63ad114-7e13-5084-954f-fe012c677804" 478 | 479 | [[deps.Mocking]] 480 | deps = ["Compat", "ExprTools"] 481 | git-tree-sha1 = "4cc0c5a83933648b615c36c2b956d94fda70641e" 482 | uuid = "78c3b35d-d492-501b-9361-3d52fe80e533" 483 | version = "0.7.7" 484 | 485 | [[deps.MozillaCACerts_jll]] 486 | uuid = "14a3606d-f60d-562e-9121-12d972cd8159" 487 | version = "2022.10.11" 488 | 489 | [[deps.NetworkOptions]] 490 | uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" 491 | version = "1.2.0" 492 | 493 | [[deps.OpenBLAS_jll]] 494 | deps = ["Artifacts", "CompilerSupportLibraries_jll", "Libdl"] 495 | uuid = "4536629a-c528-5b80-bd46-f80d51c5b363" 496 | version = "0.3.21+4" 497 | 498 | [[deps.OpenLibm_jll]] 499 | deps = ["Artifacts", "Libdl"] 500 | uuid = "05823500-19ac-5b8b-9628-191a04bc5112" 501 | version = "0.8.1+0" 502 | 503 | [[deps.OpenSpecFun_jll]] 504 | deps = ["Artifacts", "CompilerSupportLibraries_jll", "JLLWrappers", "Libdl", "Pkg"] 505 | git-tree-sha1 = "13652491f6856acfd2db29360e1bbcd4565d04f1" 506 | uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" 507 | version = "0.5.5+0" 508 | 509 | [[deps.OrderedCollections]] 510 | git-tree-sha1 = "2e73fe17cac3c62ad1aebe70d44c963c3cfdc3e3" 511 | uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" 512 | version = "1.6.2" 513 | 514 | [[deps.Parquet2]] 515 | deps = ["AbstractTrees", "BitIntegers", "CodecLz4", "CodecZlib", "CodecZstd", "DataAPI", "Dates", "DecFP", "FilePathsBase", "FillArrays", "JSON3", "LazyArrays", "LightBSON", "Mmap", "OrderedCollections", "PooledArrays", "PrecompileTools", "SentinelArrays", "Snappy", "StaticArrays", "TableOperations", "Tables", "Thrift2", "Transducers", "UUIDs", "WeakRefStrings"] 516 | git-tree-sha1 = "8bb2f9e729a2becea1ed351253e14f5660e304ab" 517 | uuid = "98572fba-bba0-415d-956f-fa77e587d26d" 518 | version = "0.2.17" 519 | 520 | 
[[deps.Parsers]] 521 | deps = ["Dates", "PrecompileTools", "UUIDs"] 522 | git-tree-sha1 = "4b2e829ee66d4218e0cef22c0a64ee37cf258c29" 523 | uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" 524 | version = "2.7.1" 525 | 526 | [[deps.Pkg]] 527 | deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] 528 | uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" 529 | version = "1.9.2" 530 | 531 | [[deps.PooledArrays]] 532 | deps = ["DataAPI", "Future"] 533 | git-tree-sha1 = "a6062fe4063cdafe78f4a0a81cfffb89721b30e7" 534 | uuid = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" 535 | version = "1.4.2" 536 | 537 | [[deps.PrecompileTools]] 538 | deps = ["Preferences"] 539 | git-tree-sha1 = "9673d39decc5feece56ef3940e5dafba15ba0f81" 540 | uuid = "aea7be01-6a6a-4083-8856-8a6e6704d82a" 541 | version = "1.1.2" 542 | 543 | [[deps.Preferences]] 544 | deps = ["TOML"] 545 | git-tree-sha1 = "7eb1686b4f04b82f96ed7a4ea5890a4f0c7a09f1" 546 | uuid = "21216c6a-2e73-6563-6e65-726566657250" 547 | version = "1.4.0" 548 | 549 | [[deps.PrettyTables]] 550 | deps = ["Crayons", "Formatting", "LaTeXStrings", "Markdown", "Reexport", "StringManipulation", "Tables"] 551 | git-tree-sha1 = "542b1bd03329c1d235110f96f1bb0eeffc48a87d" 552 | uuid = "08abe8d2-0d0c-5749-adfa-8a2ac140af0d" 553 | version = "2.2.6" 554 | 555 | [[deps.Printf]] 556 | deps = ["Unicode"] 557 | uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" 558 | 559 | [[deps.RData]] 560 | deps = ["CategoricalArrays", "CodecZlib", "DataAPI", "DataFrames", "Dates", "FileIO", "Requires", "TimeZones", "Unicode"] 561 | git-tree-sha1 = "9a6220c8f59c38ddf6217638042ae6788973f617" 562 | uuid = "df47a6cb-8c03-5eed-afd8-b6050d6c41da" 563 | version = "1.0.0" 564 | 565 | [[deps.REPL]] 566 | deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] 567 | uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" 568 | 569 | [[deps.Random]] 570 | deps = 
["SHA", "Serialization"] 571 | uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" 572 | 573 | [[deps.ReadStatTables]] 574 | deps = ["CEnum", "DataAPI", "Dates", "InlineStrings", "PooledArrays", "PrecompileTools", "PrettyTables", "ReadStat_jll", "SentinelArrays", "StructArrays", "Tables"] 575 | git-tree-sha1 = "fd0de7ebae24cfb11e0d8d0bc5f59e24d7f304e5" 576 | uuid = "52522f7a-9570-4e34-8ac6-c005c74d4b84" 577 | version = "0.2.5" 578 | 579 | [[deps.ReadStat_jll]] 580 | deps = ["Artifacts", "JLLWrappers", "Libdl", "Libiconv_jll", "Zlib_jll"] 581 | git-tree-sha1 = "28e990e90ca643e99f3ec0188089c1816e8b46f4" 582 | uuid = "a4dc8951-f1cc-5499-9034-9ec1c3e64557" 583 | version = "1.1.9+0" 584 | 585 | [[deps.RecipesBase]] 586 | deps = ["PrecompileTools"] 587 | git-tree-sha1 = "5c3d09cc4f31f5fc6af001c250bf1278733100ff" 588 | uuid = "3cdcf5f2-1ef4-517c-9805-6587b60abb01" 589 | version = "1.3.4" 590 | 591 | [[deps.Reexport]] 592 | git-tree-sha1 = "45e428421666073eab6f2da5c9d310d99bb12f9b" 593 | uuid = "189a3867-3050-52da-a836-e630ba90ab69" 594 | version = "1.2.2" 595 | 596 | [[deps.Requires]] 597 | deps = ["UUIDs"] 598 | git-tree-sha1 = "838a3a4188e2ded87a4f9f184b4b0d78a1e91cb7" 599 | uuid = "ae029012-a4dd-5104-9daa-d747884805df" 600 | version = "1.3.0" 601 | 602 | [[deps.SHA]] 603 | uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" 604 | version = "0.7.0" 605 | 606 | [[deps.SQLite]] 607 | deps = ["DBInterface", "Random", "SQLite_jll", "Serialization", "Tables", "WeakRefStrings"] 608 | git-tree-sha1 = "eb9a473c9b191ced349d04efa612ec9f39c087ea" 609 | uuid = "0aa819cd-b072-5ff4-a722-6bc24af294d9" 610 | version = "1.6.0" 611 | 612 | [[deps.SQLite_jll]] 613 | deps = ["Artifacts", "JLLWrappers", "Libdl", "Zlib_jll"] 614 | git-tree-sha1 = "4619dd3363610d94fb42a95a6dc35b526a26d0ef" 615 | uuid = "76ed43ae-9a5d-5a62-8c75-30186b810ce8" 616 | version = "3.42.0+0" 617 | 618 | [[deps.Scratch]] 619 | deps = ["Dates"] 620 | git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a" 621 | uuid = 
"6c6a2e73-6563-6170-7368-637461726353" 622 | version = "1.2.0" 623 | 624 | [[deps.SentinelArrays]] 625 | deps = ["Dates", "Random"] 626 | git-tree-sha1 = "04bdff0b09c65ff3e06a05e3eb7b120223da3d39" 627 | uuid = "91c51154-3ec4-41a3-a24f-3f23e20d615c" 628 | version = "1.4.0" 629 | 630 | [[deps.Serialization]] 631 | uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" 632 | 633 | [[deps.Setfield]] 634 | deps = ["ConstructionBase", "Future", "MacroTools", "StaticArraysCore"] 635 | git-tree-sha1 = "e2cc6d8c88613c05e1defb55170bf5ff211fbeac" 636 | uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46" 637 | version = "1.1.1" 638 | 639 | [[deps.Snappy]] 640 | deps = ["CEnum", "snappy_jll"] 641 | git-tree-sha1 = "72bae53c0691f4b6fd259587dab8821ae0e025f6" 642 | uuid = "59d4ed8c-697a-5b28-a4c7-fe95c22820f9" 643 | version = "0.4.2" 644 | 645 | [[deps.Sockets]] 646 | uuid = "6462fe0b-24de-5631-8697-dd941f90decc" 647 | 648 | [[deps.SoftGlobalScope]] 649 | deps = ["REPL"] 650 | git-tree-sha1 = "986ec2b6162ccb95de5892ed17832f95badf770c" 651 | uuid = "b85f4697-e234-5449-a836-ec8e2f98b302" 652 | version = "1.1.0" 653 | 654 | [[deps.SortingAlgorithms]] 655 | deps = ["DataStructures"] 656 | git-tree-sha1 = "c60ec5c62180f27efea3ba2908480f8055e17cee" 657 | uuid = "a2af1166-a08f-5f64-846c-94a0d3cef48c" 658 | version = "1.1.1" 659 | 660 | [[deps.SparseArrays]] 661 | deps = ["Libdl", "LinearAlgebra", "Random", "Serialization", "SuiteSparse_jll"] 662 | uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" 663 | 664 | [[deps.SpecialFunctions]] 665 | deps = ["IrrationalConstants", "LogExpFunctions", "OpenLibm_jll", "OpenSpecFun_jll"] 666 | git-tree-sha1 = "7beb031cf8145577fbccacd94b8a8f4ce78428d3" 667 | uuid = "276daf66-3868-5448-9aa4-cd146d93841b" 668 | version = "2.3.0" 669 | 670 | [deps.SpecialFunctions.extensions] 671 | SpecialFunctionsChainRulesCoreExt = "ChainRulesCore" 672 | 673 | [deps.SpecialFunctions.weakdeps] 674 | ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" 675 | 676 | [[deps.SplittablesBase]] 
677 | deps = ["Setfield", "Test"] 678 | git-tree-sha1 = "e08a62abc517eb79667d0a29dc08a3b589516bb5" 679 | uuid = "171d559e-b47b-412a-8079-5efa626c420e" 680 | version = "0.1.15" 681 | 682 | [[deps.StaticArrays]] 683 | deps = ["LinearAlgebra", "Random", "StaticArraysCore"] 684 | git-tree-sha1 = "9cabadf6e7cd2349b6cf49f1915ad2028d65e881" 685 | uuid = "90137ffa-7385-5640-81b9-e52037218182" 686 | version = "1.6.2" 687 | weakdeps = ["Statistics"] 688 | 689 | [deps.StaticArrays.extensions] 690 | StaticArraysStatisticsExt = "Statistics" 691 | 692 | [[deps.StaticArraysCore]] 693 | git-tree-sha1 = "36b3d696ce6366023a0ea192b4cd442268995a0d" 694 | uuid = "1e83bf80-4336-4d27-bf5d-d5a4f845583c" 695 | version = "1.4.2" 696 | 697 | [[deps.Statistics]] 698 | deps = ["LinearAlgebra", "SparseArrays"] 699 | uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" 700 | version = "1.9.0" 701 | 702 | [[deps.StatsAPI]] 703 | deps = ["LinearAlgebra"] 704 | git-tree-sha1 = "45a7769a04a3cf80da1c1c7c60caf932e6f4c9f7" 705 | uuid = "82ae8749-77ed-4fe6-ae5f-f523153014b0" 706 | version = "1.6.0" 707 | 708 | [[deps.StatsBase]] 709 | deps = ["DataAPI", "DataStructures", "LinearAlgebra", "LogExpFunctions", "Missings", "Printf", "Random", "SortingAlgorithms", "SparseArrays", "Statistics", "StatsAPI"] 710 | git-tree-sha1 = "75ebe04c5bed70b91614d684259b661c9e6274a4" 711 | uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" 712 | version = "0.34.0" 713 | 714 | [[deps.StringManipulation]] 715 | git-tree-sha1 = "46da2434b41f41ac3594ee9816ce5541c6096123" 716 | uuid = "892a3eda-7b42-436c-8928-eab12a02cf0e" 717 | version = "0.3.0" 718 | 719 | [[deps.StructArrays]] 720 | deps = ["Adapt", "DataAPI", "GPUArraysCore", "StaticArraysCore", "Tables"] 721 | git-tree-sha1 = "521a0e828e98bb69042fec1809c1b5a680eb7389" 722 | uuid = "09ab397b-f2b6-538f-b94a-2f83cf4a842a" 723 | version = "0.6.15" 724 | 725 | [[deps.StructTypes]] 726 | deps = ["Dates", "UUIDs"] 727 | git-tree-sha1 = "ca4bccb03acf9faaf4137a9abc1881ed1841aa70" 728 | uuid = 
"856f2bd8-1eba-4b0a-8007-ebc267875bd4" 729 | version = "1.10.0" 730 | 731 | [[deps.SuiteSparse_jll]] 732 | deps = ["Artifacts", "Libdl", "Pkg", "libblastrampoline_jll"] 733 | uuid = "bea87d4a-7f5b-5778-9afe-8cc45184846c" 734 | version = "5.10.1+6" 735 | 736 | [[deps.TOML]] 737 | deps = ["Dates"] 738 | uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" 739 | version = "1.0.3" 740 | 741 | [[deps.TableOperations]] 742 | deps = ["SentinelArrays", "Tables", "Test"] 743 | git-tree-sha1 = "e383c87cf2a1dc41fa30c093b2a19877c83e1bc1" 744 | uuid = "ab02a1b2-a7df-11e8-156e-fb1833f50b87" 745 | version = "1.2.0" 746 | 747 | [[deps.TableTraits]] 748 | deps = ["IteratorInterfaceExtensions"] 749 | git-tree-sha1 = "c06b2f539df1c6efa794486abfb6ed2022561a39" 750 | uuid = "3783bdb8-4a98-5b6b-af9a-565f29a5fe9c" 751 | version = "1.0.1" 752 | 753 | [[deps.Tables]] 754 | deps = ["DataAPI", "DataValueInterfaces", "IteratorInterfaceExtensions", "LinearAlgebra", "OrderedCollections", "TableTraits", "Test"] 755 | git-tree-sha1 = "1544b926975372da01227b382066ab70e574a3ec" 756 | uuid = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" 757 | version = "1.10.1" 758 | 759 | [[deps.Tar]] 760 | deps = ["ArgTools", "SHA"] 761 | uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" 762 | version = "1.10.0" 763 | 764 | [[deps.Test]] 765 | deps = ["InteractiveUtils", "Logging", "Random", "Serialization"] 766 | uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" 767 | 768 | [[deps.Thrift2]] 769 | deps = ["MacroTools", "OrderedCollections", "PrecompileTools"] 770 | git-tree-sha1 = "00d618714271f283ea3829ab058d5e5bd1847f85" 771 | uuid = "9be31aac-5446-47db-bfeb-416acd2e4415" 772 | version = "0.1.4" 773 | 774 | [[deps.TimeZones]] 775 | deps = ["Dates", "Downloads", "InlineStrings", "LazyArtifacts", "Mocking", "Printf", "RecipesBase", "Scratch", "Unicode"] 776 | git-tree-sha1 = "cdaa0c2a4449724aded839550eca7d7240bb6938" 777 | uuid = "f269a46b-ccf7-5d73-abea-4c690281aa53" 778 | version = "1.10.0" 779 | 780 | [[deps.TranscodingStreams]] 781 
| deps = ["Random", "Test"] 782 | git-tree-sha1 = "9a6ae7ed916312b41236fcef7e0af564ef934769" 783 | uuid = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" 784 | version = "0.9.13" 785 | 786 | [[deps.Transducers]] 787 | deps = ["Adapt", "ArgCheck", "BangBang", "Baselet", "CompositionsBase", "ConstructionBase", "DefineSingletons", "Distributed", "InitialValues", "Logging", "Markdown", "MicroCollections", "Requires", "Setfield", "SplittablesBase", "Tables"] 788 | git-tree-sha1 = "53bd5978b182fa7c57577bdb452c35e5b4fb73a5" 789 | uuid = "28d57a85-8fef-5791-bfe6-a80928e7c999" 790 | version = "0.4.78" 791 | 792 | [deps.Transducers.extensions] 793 | TransducersBlockArraysExt = "BlockArrays" 794 | TransducersDataFramesExt = "DataFrames" 795 | TransducersLazyArraysExt = "LazyArrays" 796 | TransducersOnlineStatsBaseExt = "OnlineStatsBase" 797 | TransducersReferenceablesExt = "Referenceables" 798 | 799 | [deps.Transducers.weakdeps] 800 | BlockArrays = "8e7c35d0-a365-5155-bbbb-fb81a777f24e" 801 | DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" 802 | LazyArrays = "5078a376-72f3-5289-bfd5-ec5146d43c02" 803 | OnlineStatsBase = "925886fa-5bf2-5e8e-b522-a9147a512338" 804 | Referenceables = "42d2dcc6-99eb-4e98-b66c-637b7d73030e" 805 | 806 | [[deps.UUIDs]] 807 | deps = ["Random", "SHA"] 808 | uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" 809 | 810 | [[deps.Unicode]] 811 | uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" 812 | 813 | [[deps.UnsafeArrays]] 814 | git-tree-sha1 = "3350f94f6caa02f324a23645bf524fc9334c7488" 815 | uuid = "c4a57d5a-5b31-53a6-b365-19f8c011fbd6" 816 | version = "1.0.4" 817 | 818 | [[deps.VersionParsing]] 819 | git-tree-sha1 = "58d6e80b4ee071f5efd07fda82cb9fbe17200868" 820 | uuid = "81def892-9a0e-5fdd-b105-ffc91e053289" 821 | version = "1.3.0" 822 | 823 | [[deps.WeakRefStrings]] 824 | deps = ["DataAPI", "InlineStrings", "Parsers"] 825 | git-tree-sha1 = "b1be2855ed9ed8eac54e5caff2afcdb442d52c23" 826 | uuid = "ea10d353-3f73-51f8-a26c-33c1cb351aa5" 827 | version = "1.4.2" 828 
| 829 | [[deps.XZ_jll]] 830 | deps = ["Artifacts", "JLLWrappers", "Libdl"] 831 | git-tree-sha1 = "2222b751598bd9f4885c9ce9cd23e83404baa8ce" 832 | uuid = "ffd25f8a-64ca-5728-b0f7-c24cf3aae800" 833 | version = "5.4.3+1" 834 | 835 | [[deps.ZMQ]] 836 | deps = ["FileWatching", "Sockets", "ZeroMQ_jll"] 837 | git-tree-sha1 = "356d2bdcc0bce90aabee1d1c0f6d6f301eda8f77" 838 | uuid = "c2297ded-f4af-51ae-bb23-16f91089e4e1" 839 | version = "1.2.2" 840 | 841 | [[deps.ZeroMQ_jll]] 842 | deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg", "libsodium_jll"] 843 | git-tree-sha1 = "fe5c65a526f066fb3000da137d5785d9649a8a47" 844 | uuid = "8f1865be-045e-5c20-9c9f-bfbfb0764568" 845 | version = "4.3.4+0" 846 | 847 | [[deps.Zlib_jll]] 848 | deps = ["Libdl"] 849 | uuid = "83775a58-1f1d-513f-b197-d71354ab007a" 850 | version = "1.2.13+0" 851 | 852 | [[deps.Zstd_jll]] 853 | deps = ["Artifacts", "JLLWrappers", "Libdl"] 854 | git-tree-sha1 = "49ce682769cd5de6c72dcf1b94ed7790cd08974c" 855 | uuid = "3161d3a3-bdf6-5164-811a-617609db77b4" 856 | version = "1.5.5+0" 857 | 858 | [[deps.libblastrampoline_jll]] 859 | deps = ["Artifacts", "Libdl"] 860 | uuid = "8e850b90-86db-534c-a0d3-1478176c7d93" 861 | version = "5.8.0+0" 862 | 863 | [[deps.libsodium_jll]] 864 | deps = ["Artifacts", "JLLWrappers", "Libdl", "Pkg"] 865 | git-tree-sha1 = "848ab3d00fe39d6fbc2a8641048f8f272af1c51e" 866 | uuid = "a9144af2-ca23-56d9-984f-0d03f7b5ccf8" 867 | version = "1.0.20+0" 868 | 869 | [[deps.nghttp2_jll]] 870 | deps = ["Artifacts", "Libdl"] 871 | uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" 872 | version = "1.48.0+0" 873 | 874 | [[deps.p7zip_jll]] 875 | deps = ["Artifacts", "Libdl"] 876 | uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" 877 | version = "17.4.0+0" 878 | 879 | [[deps.snappy_jll]] 880 | deps = ["Artifacts", "JLLWrappers", "LZO_jll", "Libdl", "Pkg", "Zlib_jll"] 881 | git-tree-sha1 = "985c1da710b0e43f7c52f037441021dfd0e3be14" 882 | uuid = "fe1e1685-f7be-5f59-ac9f-4ca204017dfd" 883 | version = "1.1.9+1" 884 | 
-------------------------------------------------------------------------------- /Project.toml: -------------------------------------------------------------------------------- 1 | [deps] 2 | CodecXz = "ba30903b-d9e8-5048-a5ec-d1f5b0d4b47b" 3 | DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" 4 | DuckDB = "d2f5444f-75bc-4fdf-ac35-56f514c445e1" 5 | IJulia = "7073ff75-c697-5162-941a-fcdaad2a7d2a" 6 | Parquet2 = "98572fba-bba0-415d-956f-fa77e587d26d" 7 | RData = "df47a6cb-8c03-5eed-afd8-b6050d6c41da" 8 | ReadStatTables = "52522f7a-9570-4e34-8ac6-c005c74d4b84" 9 | SQLite = "0aa819cd-b072-5ff4-a722-6bc24af294d9" 10 | StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Working with DataFrames.jl beyond CSV files 2 | 3 | [![JuliaCon2023 Talk](https://img.youtube.com/vi/oorErKcAWIQ/0.jpg)](https://www.youtube.com/watch?v=oorErKcAWIQ) 4 | 5 | This is an introductory part of the workshop 6 | prepared for [JuliaCon2023](https://juliacon.org/2023/). 7 | 8 | In order to run the tutorial make sure that you have Julia executable installed. 9 | The tutorial was developed under Julia 1.9.2. 10 | 11 | The simplest way to run it is to proceed as follows: 12 | 1. Clone the 13 | [tutorial repository](https://github.com/bkamins/JuliaCon2023-Tutorial) 14 | to a local folder on your computer. 15 | 2. Start Julia in your local folder using the `julia --project` command. 16 | 3. Run the following commands (this step needs to be run only once per installation and is made to double check that you have proper versions of packages downloaded): 17 | ``` 18 | using Pkg 19 | Pkg.instantiate() 20 | Pkg.status() 21 | ``` 22 | 4. Start Jupyter Notebook with: 23 | ``` 24 | using IJulia 25 | notebook(dir=pwd()) 26 | ``` 27 | 5. In the Jupyter Notebook open and run the *ipynb* files with the tutorial material. 
28 | 29 | --- 30 | 31 | *Preparation of this workshop has been supported by the Polish National Agency for Academic Exchange under the Strategic Partnerships programme, grant number BPI/PST/2021/1/00069/U/00001.* 32 | 33 | ![SGH & NAWA](logo.png) 34 | -------------------------------------------------------------------------------- /WDI2009.dta: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bkamins/JuliaCon2023-Tutorial/2da04fe2d76a2d468e96d7795b3d713de914a31f/WDI2009.dta -------------------------------------------------------------------------------- /cars.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bkamins/JuliaCon2023-Tutorial/2da04fe2d76a2d468e96d7795b3d713de914a31f/cars.RData -------------------------------------------------------------------------------- /juliacon2023_part2_largedata.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "1b05b2ef", 6 | "metadata": {}, 7 | "source": [ 8 | "# Working with DataFrames.jl beyond CSV files\n", 9 | "\n", 10 | "# Part 2: Using Parquet for data larger than RAM\n", 11 | "\n", 12 | "## Bogumił Kamiński\n", 13 | "### June 25, 2023" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "3bd5a5f2", 19 | "metadata": {}, 20 | "source": [ 21 | "What is covered in part 2:\n", 22 | "* how to iteratively create Parquet data store that jointly has more data than available RAM\n", 23 | "* how to manually process such data on a single machine (notebook-oriented process)" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "8aeb9ca8", 29 | "metadata": {}, 30 | "source": [ 31 | "## Setup" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 1, 37 | "id": "391b440d", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "using DataFrames" 42 | ] 
43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "id": "b6120192", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "using Parquet2" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 3, 57 | "id": "e6ba8ab9", 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "using Random" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "id": "e5551636", 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "using Statistics" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 5, 77 | "id": "a3d36eba", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "using StatsBase" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "id": "cd17df01", 87 | "metadata": {}, 88 | "source": [ 89 | "## Generate some large data" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 6, 95 | "id": "daa36b81", 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "data": { 100 | "text/plain": [ 101 | "false" 102 | ] 103 | }, 104 | "execution_count": 6, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "isdir(\"pq_experiment\") && rm(\"pq_experiment\"; recursive=true)" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 7, 116 | "id": "18b99893", 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/plain": [ 122 | "\"pq_experiment\"" 123 | ] 124 | }, 125 | "execution_count": 7, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "mkdir(\"pq_experiment\")" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 8, 137 | "id": "7740f592", 138 | "metadata": {}, 139 | "outputs": [], 140 | "source": [ 141 | "Random.seed!(1234);" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "id": "32e53ec3", 147 | "metadata": {}, 148 | "source": [ 149 | "Create 501 groups (range
`0.0:0.002:1.0`) of data, each having $2^{20}$ = 1,048,576 rows and two `Float64` columns (I could have made it larger, but this should be enough as an example)." 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 9, 155 | "id": "a6dd4359", 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "name": "stderr", 160 | "output_type": "stream", 161 | "text": [ 162 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mwriting file #1\n", 163 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mwriting file #2\n", 164 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mwriting file #3\n", 165 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mwriting file #4\n", 166 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mwriting file #5\n", 167 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mwriting file #6\n" 168 | ] 169 | }, 170 | { 171 | "data": { 172 | "text/plain": [ 173 | "\u001b[34m✏ \u001b[39mParquet2.FileWriter{IOStream}(pq_experiment/experiment_6.parquet)" 174 | ] 175 | }, 176 | "execution_count": 9, 177 | "metadata": {}, 178 | "output_type": "execute_result" 179 | } 180 | ], 181 | "source": [ 182 | "let # create local scope for more consistent variable scoping behavior and avoid temporary variable leakage\n", 183 | " i = 1\n", 184 | " df = DataFrame() # temporary data frame to store intermediate results\n", 185 | " maxsize = 10^8 # define size of one chunk of data written to disk\n", 186 | " for μ in 0.0:0.002:1.0\n", 187 | " result = DataFrame(mu=μ, x=randn(2^20) .+ μ)\n", 188 | " append!(df, result) # keep appending data from partial simulations\n", 189 | " if nrow(df) > maxsize # if our data gets too big dump it to a consecutive file\n", 190 | " @info \"writing file #$i\"\n", 191 | "
Parquet2.writefile(\"pq_experiment/experiment_$i.parquet\", @view df[1:maxsize, :])\n", 192 | " deleteat!(df, 1:maxsize) # drop data stored in a file\n", 193 | " i += 1\n", 194 | " end\n", 195 | " end\n", 196 | " if nrow(df) > 0 # if we have some unsaved data store it now\n", 197 | " @info \"writing file #$i\"\n", 198 | " Parquet2.writefile(\"pq_experiment/experiment_$i.parquet\", df)\n", 199 | " end\n", 200 | "end" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "id": "66605891", 206 | "metadata": {}, 207 | "source": [ 208 | "Note that reading the file is lazy. Actual data is not read yet:" 209 | ] 210 | }, 211 | { 212 | "cell_type": "code", 213 | "execution_count": 10, 214 | "id": "a1337937", 215 | "metadata": {}, 216 | "outputs": [ 217 | { 218 | "data": { 219 | "text/plain": [ 220 | "\u001b[34m≔ \u001b[39mParquet2.Dataset (837645573 bytes)\n", 221 | "\t1. \u001b[33m\"mu\"\u001b[39m: \u001b[36mFloat64\u001b[39m\n", 222 | "\t2. \u001b[33m\"x\"\u001b[39m: \u001b[36mFloat64\u001b[39m\n" 223 | ] 224 | }, 225 | "execution_count": 10, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "pq_experiment = Parquet2.readfile(\"pq_experiment\", load_initial=true)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "id": "66f0e314", 237 | "metadata": {}, 238 | "source": [ 239 | "We have six chunks of data (each corresponding to one file, as we did not create row groups within files):" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 11, 245 | "id": "dfa6c314", 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "6" 252 | ] 253 | }, 254 | "execution_count": 11, 255 | "metadata": {}, 256 | "output_type": "execute_result" 257 | } 258 | ], 259 | "source": [ 260 | "length(pq_experiment)" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 12, 266 | "id": "0492ece9", 267 | "metadata": {}, 268 | "outputs": [ 269 | { 
270 | "data": { 271 | "text/plain": [ 272 | "6-element Vector{FilePathsBase.WindowsPath}:\n", 273 | " p\"C:/WORK/dev/DataFramesTutorials/JuliaCon2023-Tutorial/pq_experiment/experiment_1.parquet\"\n", 274 | " p\"C:/WORK/dev/DataFramesTutorials/JuliaCon2023-Tutorial/pq_experiment/experiment_2.parquet\"\n", 275 | " p\"C:/WORK/dev/DataFramesTutorials/JuliaCon2023-Tutorial/pq_experiment/experiment_3.parquet\"\n", 276 | " p\"C:/WORK/dev/DataFramesTutorials/JuliaCon2023-Tutorial/pq_experiment/experiment_4.parquet\"\n", 277 | " p\"C:/WORK/dev/DataFramesTutorials/JuliaCon2023-Tutorial/pq_experiment/experiment_5.parquet\"\n", 278 | " p\"C:/WORK/dev/DataFramesTutorials/JuliaCon2023-Tutorial/pq_experiment/experiment_6.parquet\"" 279 | ] 280 | }, 281 | "execution_count": 12, 282 | "metadata": {}, 283 | "output_type": "execute_result" 284 | } 285 | ], 286 | "source": [ 287 | "Parquet2.filelist(pq_experiment)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "markdown", 292 | "id": "903d2f1c", 293 | "metadata": {}, 294 | "source": [ 295 | "Note that the last file has fewer rows than the rest:" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 13, 301 | "id": "1270096b", 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "data": { 306 | "text/plain": [ 307 | "6-element Vector{Int64}:\n", 308 | " 100000000\n", 309 | " 100000000\n", 310 | " 100000000\n", 311 | " 100000000\n", 312 | " 100000000\n", 313 | " 25336576" 314 | ] 315 | }, 316 | "execution_count": 13, 317 | "metadata": {}, 318 | "output_type": "execute_result" 319 | } 320 | ], 321 | "source": [ 322 | "nrow.(pq_experiment)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "id": "0d3ea748", 328 | "metadata": {}, 329 | "source": [ 330 | "The challenge we have in this dataset is that the same values of keys (`mu` column) are split across multiple files.\n", 331 | "\n", 332 | "Assume we want to get the mean of `x` for each key.
We need to do it in two steps.\n", 333 | "\n", 334 | "This is a standard map-reduce pattern. In this tutorial we perform both steps manually on a single node:" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 14, 340 | "id": "2f0b7a3d", 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "name": "stderr", 345 | "output_type": "stream", 346 | "text": [ 347 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mprocessing chunk of data #1\n", 348 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mprocessing chunk of data #2\n", 349 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mprocessing chunk of data #3\n", 350 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mprocessing chunk of data #4\n", 351 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mprocessing chunk of data #5\n", 352 | "\u001b[36m\u001b[1m[ \u001b[22m\u001b[39m\u001b[36m\u001b[1mInfo: \u001b[22m\u001b[39mprocessing chunk of data #6\n" 353 | ] 354 | }, 355 | { 356 | "data": { 357 | "text/plain": [ 358 | "6-element Vector{DataFrame}:\n", 359 | " \u001b[1m96×3 DataFrame\u001b[0m\n", 360 | "\u001b[1m Row \u001b[0m│\u001b[1m mu \u001b[0m\u001b[1m x_mean \u001b[0m\u001b[1m nrow \u001b[0m\n", 361 | " │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", 362 | "─────┼────────────────────────────────\n", 363 | " 1 │ 0.0 -0.000165159 1048576\n", 364 | " 2 │ 0.002 0.00181935 1048576\n", 365 | " 3 │ 0.004 0.00547356 1048576\n", 366 | " 4 │ 0.006 0.0053043 1048576\n", 367 | " 5 │ 0.008 0.00748101 1048576\n", 368 | " 6 │ 0.01 0.0105489 1048576\n", 369 | " 7 │ 0.012 0.0113697 1048576\n", 370 | " 8 │ 0.014 0.0133716 1048576\n", 371 | " 9 │ 0.016 0.0163869 1048576\n", 372 | " 10 │ 0.018 0.0192935 1048576\n", 373 | " 11 │ 0.02 0.0198156 1048576\n", 374 | " ⋮ │ ⋮ ⋮ 
⋮\n", 375 | " 87 │ 0.172 0.171104 1048576\n", 376 | " 88 │ 0.174 0.175584 1048576\n", 377 | " 89 │ 0.176 0.176103 1048576\n", 378 | " 90 │ 0.178 0.178837 1048576\n", 379 | " 91 │ 0.18 0.182103 1048576\n", 380 | " 92 │ 0.182 0.182695 1048576\n", 381 | " 93 │ 0.184 0.181319 1048576\n", 382 | " 94 │ 0.186 0.186327 1048576\n", 383 | " 95 │ 0.188 0.188068 1048576\n", 384 | " 96 │ 0.19 0.187567 385280\n", 385 | "\u001b[36m 75 rows omitted\u001b[0m\n", 386 | " \u001b[1m96×3 DataFrame\u001b[0m\n", 387 | "\u001b[1m Row \u001b[0m│\u001b[1m mu \u001b[0m\u001b[1m x_mean \u001b[0m\u001b[1m nrow \u001b[0m\n", 388 | " │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", 389 | "─────┼────────────────────────────\n", 390 | " 1 │ 0.19 0.190411 663296\n", 391 | " 2 │ 0.192 0.19314 1048576\n", 392 | " 3 │ 0.194 0.193709 1048576\n", 393 | " 4 │ 0.196 0.195765 1048576\n", 394 | " 5 │ 0.198 0.198623 1048576\n", 395 | " 6 │ 0.2 0.200585 1048576\n", 396 | " 7 │ 0.202 0.200626 1048576\n", 397 | " 8 │ 0.204 0.205575 1048576\n", 398 | " 9 │ 0.206 0.205889 1048576\n", 399 | " 10 │ 0.208 0.207115 1048576\n", 400 | " 11 │ 0.21 0.20955 1048576\n", 401 | " ⋮ │ ⋮ ⋮ ⋮\n", 402 | " 87 │ 0.362 0.360563 1048576\n", 403 | " 88 │ 0.364 0.363005 1048576\n", 404 | " 89 │ 0.366 0.368417 1048576\n", 405 | " 90 │ 0.368 0.367043 1048576\n", 406 | " 91 │ 0.37 0.369158 1048576\n", 407 | " 92 │ 0.372 0.371584 1048576\n", 408 | " 93 │ 0.374 0.371797 1048576\n", 409 | " 94 │ 0.376 0.376891 1048576\n", 410 | " 95 │ 0.378 0.379768 1048576\n", 411 | " 96 │ 0.38 0.380511 770560\n", 412 | "\u001b[36m 75 rows omitted\u001b[0m\n", 413 | " \u001b[1m97×3 DataFrame\u001b[0m\n", 414 | "\u001b[1m Row \u001b[0m│\u001b[1m mu \u001b[0m\u001b[1m x_mean \u001b[0m\u001b[1m nrow \u001b[0m\n", 415 | " │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", 416 | "─────┼────────────────────────────\n", 417 | " 1 │ 0.38 0.379609 278016\n", 418 | " 2 │ 0.382 0.380177 
1048576\n", 419 | " 3 │ 0.384 0.385076 1048576\n", 420 | " 4 │ 0.386 0.385815 1048576\n", 421 | " 5 │ 0.388 0.388314 1048576\n", 422 | " 6 │ 0.39 0.391087 1048576\n", 423 | " 7 │ 0.392 0.391428 1048576\n", 424 | " 8 │ 0.394 0.394603 1048576\n", 425 | " 9 │ 0.396 0.397369 1048576\n", 426 | " 10 │ 0.398 0.399379 1048576\n", 427 | " 11 │ 0.4 0.400048 1048576\n", 428 | " ⋮ │ ⋮ ⋮ ⋮\n", 429 | " 88 │ 0.554 0.553804 1048576\n", 430 | " 89 │ 0.556 0.557008 1048576\n", 431 | " 90 │ 0.558 0.558443 1048576\n", 432 | " 91 │ 0.56 0.559155 1048576\n", 433 | " 92 │ 0.562 0.5614 1048576\n", 434 | " 93 │ 0.564 0.565902 1048576\n", 435 | " 94 │ 0.566 0.565953 1048576\n", 436 | " 95 │ 0.568 0.567578 1048576\n", 437 | " 96 │ 0.57 0.570556 1048576\n", 438 | " 97 │ 0.572 0.574669 107264\n", 439 | "\u001b[36m 76 rows omitted\u001b[0m\n", 440 | " \u001b[1m96×3 DataFrame\u001b[0m\n", 441 | "\u001b[1m Row \u001b[0m│\u001b[1m mu \u001b[0m\u001b[1m x_mean \u001b[0m\u001b[1m nrow \u001b[0m\n", 442 | " │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", 443 | "─────┼────────────────────────────\n", 444 | " 1 │ 0.572 0.571994 941312\n", 445 | " 2 │ 0.574 0.573747 1048576\n", 446 | " 3 │ 0.576 0.575925 1048576\n", 447 | " 4 │ 0.578 0.57772 1048576\n", 448 | " 5 │ 0.58 0.578779 1048576\n", 449 | " 6 │ 0.582 0.582317 1048576\n", 450 | " 7 │ 0.584 0.585391 1048576\n", 451 | " 8 │ 0.586 0.585153 1048576\n", 452 | " 9 │ 0.588 0.587751 1048576\n", 453 | " 10 │ 0.59 0.5879 1048576\n", 454 | " 11 │ 0.592 0.593437 1048576\n", 455 | " ⋮ │ ⋮ ⋮ ⋮\n", 456 | " 87 │ 0.744 0.744916 1048576\n", 457 | " 88 │ 0.746 0.745429 1048576\n", 458 | " 89 │ 0.748 0.747461 1048576\n", 459 | " 90 │ 0.75 0.750765 1048576\n", 460 | " 91 │ 0.752 0.753685 1048576\n", 461 | " 92 │ 0.754 0.754218 1048576\n", 462 | " 93 │ 0.756 0.756017 1048576\n", 463 | " 94 │ 0.758 0.757438 1048576\n", 464 | " 95 │ 0.76 0.758445 1048576\n", 465 | " 96 │ 0.762 0.762476 492544\n", 466 | "\u001b[36m 75 rows 
omitted\u001b[0m\n", 467 | " \u001b[1m96×3 DataFrame\u001b[0m\n", 468 | "\u001b[1m Row \u001b[0m│\u001b[1m mu \u001b[0m\u001b[1m x_mean \u001b[0m\u001b[1m nrow \u001b[0m\n", 469 | " │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", 470 | "─────┼────────────────────────────\n", 471 | " 1 │ 0.762 0.762469 556032\n", 472 | " 2 │ 0.764 0.763604 1048576\n", 473 | " 3 │ 0.766 0.765431 1048576\n", 474 | " 4 │ 0.768 0.767295 1048576\n", 475 | " 5 │ 0.77 0.770628 1048576\n", 476 | " 6 │ 0.772 0.772683 1048576\n", 477 | " 7 │ 0.774 0.774978 1048576\n", 478 | " 8 │ 0.776 0.774604 1048576\n", 479 | " 9 │ 0.778 0.777869 1048576\n", 480 | " 10 │ 0.78 0.778066 1048576\n", 481 | " 11 │ 0.782 0.782015 1048576\n", 482 | " ⋮ │ ⋮ ⋮ ⋮\n", 483 | " 87 │ 0.934 0.932314 1048576\n", 484 | " 88 │ 0.936 0.93643 1048576\n", 485 | " 89 │ 0.938 0.937392 1048576\n", 486 | " 90 │ 0.94 0.94038 1048576\n", 487 | " 91 │ 0.942 0.942672 1048576\n", 488 | " 92 │ 0.944 0.943768 1048576\n", 489 | " 93 │ 0.946 0.946796 1048576\n", 490 | " 94 │ 0.948 0.947403 1048576\n", 491 | " 95 │ 0.95 0.95025 1048576\n", 492 | " 96 │ 0.952 0.953107 877824\n", 493 | "\u001b[36m 75 rows omitted\u001b[0m\n", 494 | " \u001b[1m25×3 DataFrame\u001b[0m\n", 495 | "\u001b[1m Row \u001b[0m│\u001b[1m mu \u001b[0m\u001b[1m x_mean \u001b[0m\u001b[1m nrow \u001b[0m\n", 496 | " │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", 497 | "─────┼────────────────────────────\n", 498 | " 1 │ 0.952 0.954526 170752\n", 499 | " 2 │ 0.954 0.95376 1048576\n", 500 | " 3 │ 0.956 0.955728 1048576\n", 501 | " 4 │ 0.958 0.95682 1048576\n", 502 | " 5 │ 0.96 0.959156 1048576\n", 503 | " 6 │ 0.962 0.962672 1048576\n", 504 | " 7 │ 0.964 0.965211 1048576\n", 505 | " 8 │ 0.966 0.964998 1048576\n", 506 | " 9 │ 0.968 0.968262 1048576\n", 507 | " 10 │ 0.97 0.969782 1048576\n", 508 | " 11 │ 0.972 0.971397 1048576\n", 509 | " ⋮ │ ⋮ ⋮ ⋮\n", 510 | " 16 │ 0.982 0.980645 1048576\n", 511 | " 
17 │ 0.984 0.985538 1048576\n", 512 | " 18 │ 0.986 0.986935 1048576\n", 513 | " 19 │ 0.988 0.987878 1048576\n", 514 | " 20 │ 0.99 0.990001 1048576\n", 515 | " 21 │ 0.992 0.993618 1048576\n", 516 | " 22 │ 0.994 0.994494 1048576\n", 517 | " 23 │ 0.996 0.995764 1048576\n", 518 | " 24 │ 0.998 0.999058 1048576\n", 519 | " 25 │ 1.0 0.999101 1048576\n", 520 | "\u001b[36m 4 rows omitted\u001b[0m" 521 | ] 522 | }, 523 | "execution_count": 14, 524 | "metadata": {}, 525 | "output_type": "execute_result" 526 | } 527 | ], 528 | "source": [ 529 | "agg1 = map(enumerate(pq_experiment)) do (i, rowset)\n", 530 | " @info \"processing chunk of data #$i\"\n", 531 | " df = DataFrame(rowset, copycols=false)\n", 532 | " gdf = groupby(df, :mu)\n", 533 | " return combine(gdf, :x => mean, nrow)\n", 534 | "end" 535 | ] 536 | }, 537 | { 538 | "cell_type": "code", 539 | "execution_count": 15, 540 | "id": "53fff171", 541 | "metadata": {}, 542 | "outputs": [ 543 | { 544 | "data": { 545 | "text/html": [ 546 | "
506×3 DataFrame
481 rows omitted
Rowmux_meannrow
Float64Float64Int64
10.0-0.0001651591048576
20.0020.001819351048576
30.0040.005473561048576
40.0060.00530431048576
50.0080.007481011048576
60.010.01054891048576
70.0120.01136971048576
80.0140.01337161048576
90.0160.01638691048576
100.0180.01929351048576
110.020.01981561048576
120.0220.02347261048576
130.0240.02378061048576
4950.9780.9797261048576
4960.980.9808241048576
4970.9820.9806451048576
4980.9840.9855381048576
4990.9860.9869351048576
5000.9880.9878781048576
5010.990.9900011048576
5020.9920.9936181048576
5030.9940.9944941048576
5040.9960.9957641048576
5050.9980.9990581048576
5061.00.9991011048576
" 547 | ], 548 | "text/latex": [ 549 | "\\begin{tabular}{r|ccc}\n", 550 | "\t& mu & x\\_mean & nrow\\\\\n", 551 | "\t\\hline\n", 552 | "\t& Float64 & Float64 & Int64\\\\\n", 553 | "\t\\hline\n", 554 | "\t1 & 0.0 & -0.000165159 & 1048576 \\\\\n", 555 | "\t2 & 0.002 & 0.00181935 & 1048576 \\\\\n", 556 | "\t3 & 0.004 & 0.00547356 & 1048576 \\\\\n", 557 | "\t4 & 0.006 & 0.0053043 & 1048576 \\\\\n", 558 | "\t5 & 0.008 & 0.00748101 & 1048576 \\\\\n", 559 | "\t6 & 0.01 & 0.0105489 & 1048576 \\\\\n", 560 | "\t7 & 0.012 & 0.0113697 & 1048576 \\\\\n", 561 | "\t8 & 0.014 & 0.0133716 & 1048576 \\\\\n", 562 | "\t9 & 0.016 & 0.0163869 & 1048576 \\\\\n", 563 | "\t10 & 0.018 & 0.0192935 & 1048576 \\\\\n", 564 | "\t11 & 0.02 & 0.0198156 & 1048576 \\\\\n", 565 | "\t12 & 0.022 & 0.0234726 & 1048576 \\\\\n", 566 | "\t13 & 0.024 & 0.0237806 & 1048576 \\\\\n", 567 | "\t14 & 0.026 & 0.0249626 & 1048576 \\\\\n", 568 | "\t15 & 0.028 & 0.0273897 & 1048576 \\\\\n", 569 | "\t16 & 0.03 & 0.030416 & 1048576 \\\\\n", 570 | "\t17 & 0.032 & 0.0332699 & 1048576 \\\\\n", 571 | "\t18 & 0.034 & 0.0349246 & 1048576 \\\\\n", 572 | "\t19 & 0.036 & 0.0382223 & 1048576 \\\\\n", 573 | "\t20 & 0.038 & 0.0385496 & 1048576 \\\\\n", 574 | "\t21 & 0.04 & 0.0399323 & 1048576 \\\\\n", 575 | "\t22 & 0.042 & 0.0425588 & 1048576 \\\\\n", 576 | "\t23 & 0.044 & 0.0439851 & 1048576 \\\\\n", 577 | "\t24 & 0.046 & 0.0462198 & 1048576 \\\\\n", 578 | "\t25 & 0.048 & 0.0465669 & 1048576 \\\\\n", 579 | "\t26 & 0.05 & 0.0487787 & 1048576 \\\\\n", 580 | "\t27 & 0.052 & 0.0511604 & 1048576 \\\\\n", 581 | "\t28 & 0.054 & 0.0538052 & 1048576 \\\\\n", 582 | "\t29 & 0.056 & 0.0564606 & 1048576 \\\\\n", 583 | "\t30 & 0.058 & 0.0583234 & 1048576 \\\\\n", 584 | "\t$\\dots$ & $\\dots$ & $\\dots$ & $\\dots$ \\\\\n", 585 | "\\end{tabular}\n" 586 | ], 587 | "text/plain": [ 588 | "\u001b[1m506×3 DataFrame\u001b[0m\n", 589 | "\u001b[1m Row \u001b[0m│\u001b[1m mu \u001b[0m\u001b[1m x_mean \u001b[0m\u001b[1m nrow \u001b[0m\n", 590 | " 
│\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\u001b[90m Int64 \u001b[0m\n", 591 | "─────┼────────────────────────────────\n", 592 | " 1 │ 0.0 -0.000165159 1048576\n", 593 | " 2 │ 0.002 0.00181935 1048576\n", 594 | " 3 │ 0.004 0.00547356 1048576\n", 595 | " 4 │ 0.006 0.0053043 1048576\n", 596 | " 5 │ 0.008 0.00748101 1048576\n", 597 | " 6 │ 0.01 0.0105489 1048576\n", 598 | " 7 │ 0.012 0.0113697 1048576\n", 599 | " 8 │ 0.014 0.0133716 1048576\n", 600 | " 9 │ 0.016 0.0163869 1048576\n", 601 | " 10 │ 0.018 0.0192935 1048576\n", 602 | " 11 │ 0.02 0.0198156 1048576\n", 603 | " ⋮ │ ⋮ ⋮ ⋮\n", 604 | " 497 │ 0.982 0.980645 1048576\n", 605 | " 498 │ 0.984 0.985538 1048576\n", 606 | " 499 │ 0.986 0.986935 1048576\n", 607 | " 500 │ 0.988 0.987878 1048576\n", 608 | " 501 │ 0.99 0.990001 1048576\n", 609 | " 502 │ 0.992 0.993618 1048576\n", 610 | " 503 │ 0.994 0.994494 1048576\n", 611 | " 504 │ 0.996 0.995764 1048576\n", 612 | " 505 │ 0.998 0.999058 1048576\n", 613 | " 506 │ 1.0 0.999101 1048576\n", 614 | "\u001b[36m 485 rows omitted\u001b[0m" 615 | ] 616 | }, 617 | "execution_count": 15, 618 | "metadata": {}, 619 | "output_type": "execute_result" 620 | } 621 | ], 622 | "source": [ 623 | "agg2 = reduce(vcat, agg1)" 624 | ] 625 | }, 626 | { 627 | "cell_type": "code", 628 | "execution_count": 16, 629 | "id": "b8c8b31b", 630 | "metadata": {}, 631 | "outputs": [ 632 | { 633 | "data": { 634 | "text/html": [ 635 | "
501×2 DataFrame
476 rows omitted
Rowmux_mean
Float64Float64
10.0-0.000165159
20.0020.00181935
30.0040.00547356
40.0060.0053043
50.0080.00748101
60.010.0105489
70.0120.0113697
80.0140.0133716
90.0160.0163869
100.0180.0192935
110.020.0198156
120.0220.0234726
130.0240.0237806
4900.9780.979726
4910.980.980824
4920.9820.980645
4930.9840.985538
4940.9860.986935
4950.9880.987878
4960.990.990001
4970.9920.993618
4980.9940.994494
4990.9960.995764
5000.9980.999058
5011.00.999101
" 636 | ], 637 | "text/latex": [ 638 | "\\begin{tabular}{r|cc}\n", 639 | "\t& mu & x\\_mean\\\\\n", 640 | "\t\\hline\n", 641 | "\t& Float64 & Float64\\\\\n", 642 | "\t\\hline\n", 643 | "\t1 & 0.0 & -0.000165159 \\\\\n", 644 | "\t2 & 0.002 & 0.00181935 \\\\\n", 645 | "\t3 & 0.004 & 0.00547356 \\\\\n", 646 | "\t4 & 0.006 & 0.0053043 \\\\\n", 647 | "\t5 & 0.008 & 0.00748101 \\\\\n", 648 | "\t6 & 0.01 & 0.0105489 \\\\\n", 649 | "\t7 & 0.012 & 0.0113697 \\\\\n", 650 | "\t8 & 0.014 & 0.0133716 \\\\\n", 651 | "\t9 & 0.016 & 0.0163869 \\\\\n", 652 | "\t10 & 0.018 & 0.0192935 \\\\\n", 653 | "\t11 & 0.02 & 0.0198156 \\\\\n", 654 | "\t12 & 0.022 & 0.0234726 \\\\\n", 655 | "\t13 & 0.024 & 0.0237806 \\\\\n", 656 | "\t14 & 0.026 & 0.0249626 \\\\\n", 657 | "\t15 & 0.028 & 0.0273897 \\\\\n", 658 | "\t16 & 0.03 & 0.030416 \\\\\n", 659 | "\t17 & 0.032 & 0.0332699 \\\\\n", 660 | "\t18 & 0.034 & 0.0349246 \\\\\n", 661 | "\t19 & 0.036 & 0.0382223 \\\\\n", 662 | "\t20 & 0.038 & 0.0385496 \\\\\n", 663 | "\t21 & 0.04 & 0.0399323 \\\\\n", 664 | "\t22 & 0.042 & 0.0425588 \\\\\n", 665 | "\t23 & 0.044 & 0.0439851 \\\\\n", 666 | "\t24 & 0.046 & 0.0462198 \\\\\n", 667 | "\t25 & 0.048 & 0.0465669 \\\\\n", 668 | "\t26 & 0.05 & 0.0487787 \\\\\n", 669 | "\t27 & 0.052 & 0.0511604 \\\\\n", 670 | "\t28 & 0.054 & 0.0538052 \\\\\n", 671 | "\t29 & 0.056 & 0.0564606 \\\\\n", 672 | "\t30 & 0.058 & 0.0583234 \\\\\n", 673 | "\t$\\dots$ & $\\dots$ & $\\dots$ \\\\\n", 674 | "\\end{tabular}\n" 675 | ], 676 | "text/plain": [ 677 | "\u001b[1m501×2 DataFrame\u001b[0m\n", 678 | "\u001b[1m Row \u001b[0m│\u001b[1m mu \u001b[0m\u001b[1m x_mean \u001b[0m\n", 679 | " │\u001b[90m Float64 \u001b[0m\u001b[90m Float64 \u001b[0m\n", 680 | "─────┼───────────────────────\n", 681 | " 1 │ 0.0 -0.000165159\n", 682 | " 2 │ 0.002 0.00181935\n", 683 | " 3 │ 0.004 0.00547356\n", 684 | " 4 │ 0.006 0.0053043\n", 685 | " 5 │ 0.008 0.00748101\n", 686 | " 6 │ 0.01 0.0105489\n", 687 | " 7 │ 0.012 0.0113697\n", 688 | " 8 │ 0.014 
0.0133716\n", 689 | " 9 │ 0.016 0.0163869\n", 690 | " 10 │ 0.018 0.0192935\n", 691 | " 11 │ 0.02 0.0198156\n", 692 | " ⋮ │ ⋮ ⋮\n", 693 | " 492 │ 0.982 0.980645\n", 694 | " 493 │ 0.984 0.985538\n", 695 | " 494 │ 0.986 0.986935\n", 696 | " 495 │ 0.988 0.987878\n", 697 | " 496 │ 0.99 0.990001\n", 698 | " 497 │ 0.992 0.993618\n", 699 | " 498 │ 0.994 0.994494\n", 700 | " 499 │ 0.996 0.995764\n", 701 | " 500 │ 0.998 0.999058\n", 702 | " 501 │ 1.0 0.999101\n", 703 | "\u001b[36m 480 rows omitted\u001b[0m" 704 | ] 705 | }, 706 | "execution_count": 16, 707 | "metadata": {}, 708 | "output_type": "execute_result" 709 | } 710 | ], 711 | "source": [ 712 | "agg3 = combine(groupby(agg2, :mu)) do sdf\n", 713 | " return (; x_mean = mean(sdf.x_mean, Weights(sdf.nrow)))\n", 714 | "end" 715 | ] 716 | }, 717 | { 718 | "cell_type": "markdown", 719 | "id": "eb93a9ec", 720 | "metadata": {}, 721 | "source": [ 722 | "Other common possible scenarios:\n", 723 | "\n", 724 | "* using multiple-threads for in-core data\n", 725 | "* using several machines in a cluster" 726 | ] 727 | }, 728 | { 729 | "cell_type": "markdown", 730 | "id": "a895c32b", 731 | "metadata": {}, 732 | "source": [ 733 | "What I presented above gives full flexibility, but requires manual handling of reduction.\n", 734 | "\n", 735 | "For common operations [DTables.jl](https://github.com/JuliaParallel/DTables.jl) provides distributed table structures and data manipulation operations built on top of Dagger.jl." 736 | ] 737 | }, 738 | { 739 | "cell_type": "markdown", 740 | "id": "79263dbb", 741 | "metadata": {}, 742 | "source": [ 743 | "In part 3 we discuss some limitations of Parquet format that one needs to keep in mind when working with it." 
744 | ] 745 | }, 746 | { 747 | "cell_type": "markdown", 748 | "id": "e7820da9", 749 | "metadata": {}, 750 | "source": [ 751 | "*Preparation of this workshop has been supported by the Polish National Agency for Academic Exchange under the Strategic Partnerships programme, grant number BPI/PST/2021/1/00069/U/00001.*\n", 752 | "\n", 753 | "![SGH & NAWA](logo.png)" 754 | ] 755 | } 756 | ], 757 | "metadata": { 758 | "kernelspec": { 759 | "display_name": "Julia 1.9.2", 760 | "language": "julia", 761 | "name": "julia-1.9" 762 | }, 763 | "language_info": { 764 | "file_extension": ".jl", 765 | "mimetype": "application/julia", 766 | "name": "julia", 767 | "version": "1.9.2" 768 | } 769 | }, 770 | "nbformat": 4, 771 | "nbformat_minor": 5 772 | } 773 | -------------------------------------------------------------------------------- /juliacon2023_part3_issues.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "81950dea", 6 | "metadata": {}, 7 | "source": [ 8 | "# Working with DataFrames.jl beyond CSV files\n", 9 | "\n", 10 | "# Part 3: Important limitations of Parquet\n", 11 | "\n", 12 | "## Bogumił Kamiński\n", 13 | "### June 25, 2023" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "id": "4acb3ec2", 19 | "metadata": {}, 20 | "source": [ 21 | "What is covered in part 3:\n", 22 | "* Limitations of `RowGroup` size\n", 23 | "* Avoid excessive copying of data" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "id": "cf3d92ee", 29 | "metadata": {}, 30 | "source": [ 31 | "## Setup" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 1, 37 | "id": "3ac0783f", 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "using DataFrames" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "id": "8bc85c87", 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "using Parquet2" 52 | ] 53 | }, 54 | { 55 |
"cell_type": "markdown", 56 | "id": "180ff9ba", 57 | "metadata": {}, 58 | "source": [ 59 | "## Handling tables with large number of rows" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 3, 65 | "id": "453115bc", 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "isfile(\"large_df.parquet\") && rm(\"large_df.parquet\")" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 4, 75 | "id": "4b246cce", 76 | "metadata": {}, 77 | "outputs": [ 78 | { 79 | "data": { 80 | "text/html": [ 81 | "
300000000×1 DataFrame
299999975 rows omitted
Rowx
Float64
10.970803
20.841825
30.0484231
40.764995
50.438937
60.972333
70.357896
80.830436
90.090152
100.780723
110.144802
120.903739
130.375271
2999999890.865466
2999999900.850942
2999999910.866739
2999999920.281398
2999999930.751348
2999999940.819741
2999999950.235294
2999999960.760385
2999999970.588597
2999999980.580177
2999999990.456726
3000000000.554389
" 82 | ], 83 | "text/latex": [ 84 | "\\begin{tabular}{r|c}\n", 85 | "\t& x\\\\\n", 86 | "\t\\hline\n", 87 | "\t& Float64\\\\\n", 88 | "\t\\hline\n", 89 | "\t1 & 0.970803 \\\\\n", 90 | "\t2 & 0.841825 \\\\\n", 91 | "\t3 & 0.0484231 \\\\\n", 92 | "\t4 & 0.764995 \\\\\n", 93 | "\t5 & 0.438937 \\\\\n", 94 | "\t6 & 0.972333 \\\\\n", 95 | "\t7 & 0.357896 \\\\\n", 96 | "\t8 & 0.830436 \\\\\n", 97 | "\t9 & 0.090152 \\\\\n", 98 | "\t10 & 0.780723 \\\\\n", 99 | "\t11 & 0.144802 \\\\\n", 100 | "\t12 & 0.903739 \\\\\n", 101 | "\t13 & 0.375271 \\\\\n", 102 | "\t14 & 0.713255 \\\\\n", 103 | "\t15 & 0.855335 \\\\\n", 104 | "\t16 & 0.633159 \\\\\n", 105 | "\t17 & 0.815826 \\\\\n", 106 | "\t18 & 0.0735042 \\\\\n", 107 | "\t19 & 0.573114 \\\\\n", 108 | "\t20 & 0.403656 \\\\\n", 109 | "\t21 & 0.172389 \\\\\n", 110 | "\t22 & 0.365429 \\\\\n", 111 | "\t23 & 0.290728 \\\\\n", 112 | "\t24 & 0.228912 \\\\\n", 113 | "\t25 & 0.0985188 \\\\\n", 114 | "\t26 & 0.387811 \\\\\n", 115 | "\t27 & 0.978131 \\\\\n", 116 | "\t28 & 0.859664 \\\\\n", 117 | "\t29 & 0.922425 \\\\\n", 118 | "\t30 & 0.868172 \\\\\n", 119 | "\t$\\dots$ & $\\dots$ \\\\\n", 120 | "\\end{tabular}\n" 121 | ], 122 | "text/plain": [ 123 | "\u001b[1m300000000×1 DataFrame\u001b[0m\n", 124 | "\u001b[1m Row \u001b[0m│\u001b[1m x \u001b[0m\n", 125 | " │\u001b[90m Float64 \u001b[0m\n", 126 | "───────────┼───────────\n", 127 | " 1 │ 0.970803\n", 128 | " 2 │ 0.841825\n", 129 | " 3 │ 0.0484231\n", 130 | " 4 │ 0.764995\n", 131 | " 5 │ 0.438937\n", 132 | " 6 │ 0.972333\n", 133 | " 7 │ 0.357896\n", 134 | " 8 │ 0.830436\n", 135 | " 9 │ 0.090152\n", 136 | " 10 │ 0.780723\n", 137 | " 11 │ 0.144802\n", 138 | " ⋮ │ ⋮\n", 139 | " 299999991 │ 0.866739\n", 140 | " 299999992 │ 0.281398\n", 141 | " 299999993 │ 0.751348\n", 142 | " 299999994 │ 0.819741\n", 143 | " 299999995 │ 0.235294\n", 144 | " 299999996 │ 0.760385\n", 145 | " 299999997 │ 0.588597\n", 146 | " 299999998 │ 0.580177\n", 147 | " 299999999 │ 0.456726\n", 148 | " 300000000 │ 0.554389\n", 
149 | "\u001b[36m 299999979 rows omitted\u001b[0m" 150 | ] 151 | }, 152 | "execution_count": 4, 153 | "metadata": {}, 154 | "output_type": "execute_result" 155 | } 156 | ], 157 | "source": [ 158 | "large_df = DataFrame(x=rand(3*10^8))" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "id": "42e65e33", 164 | "metadata": {}, 165 | "source": [ 166 | "This table has too many rows and cannot be stored in Parquet as one `RowGroup`." 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 5, 172 | "id": "59088cc4", 173 | "metadata": {}, 174 | "outputs": [ 175 | { 176 | "ename": "LoadError", 177 | "evalue": "InexactError: trunc(Int32, 2400000000)", 178 | "output_type": "error", 179 | "traceback": [ 180 | "InexactError: trunc(Int32, 2400000000)", 181 | "", 182 | "Stacktrace:", 183 | " [1] throw_inexacterror(f::Symbol, #unused#::Type{Int32}, val::Int64)", 184 | " @ Core .\\boot.jl:634", 185 | " ...", 186 | " [25] top-level scope", 187 | " @ In[5]:1" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "Parquet2.writefile(\"large_df.parquet\", large_df)" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "id": "84101f37", 198 | "metadata": {}, 199 | "source": [ 200 | "We need to split it into partitions of smaller size using `Iterators.partition`:" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 6, 206 | "id": "18ba68dd", 207 | "metadata": {}, 208 | "outputs": [ 209 | { 210 | "data": { 211 | "text/plain": [ 212 | "\u001b[34m✏ \u001b[39mParquet2.FileWriter{IOStream}(large_df.parquet)" 213 | ] 214 | }, 215 | "execution_count": 6, 216 | "metadata": {}, 217 | "output_type": "execute_result" 218 | } 219 | ], 220 | "source": [ 221 | "Parquet2.writefile(\"large_df.parquet\", Iterators.partition(large_df, 10^8))" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "id": "4f2ffa33", 227 | "metadata": {}, 228 | "source": [ 229 | "Drop original data to save memory:" 230 | ] 231 | }, 232 | { 233 | 
"cell_type": "code", 234 | "execution_count": 7, 235 | "id": "6d1fc286", 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "large_df = nothing" 240 | ] 241 | }, 242 | { 243 | "cell_type": "markdown", 244 | "id": "85895503", 245 | "metadata": {}, 246 | "source": [ 247 | "## Impact of `copycols` keyword argument when fetching data to a `DataFrame`" 248 | ] 249 | }, 250 | { 251 | "cell_type": "markdown", 252 | "id": "6e716993", 253 | "metadata": {}, 254 | "source": [ 255 | "`copycols=true` option:" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 8, 261 | "id": "493809aa", 262 | "metadata": {}, 263 | "outputs": [ 264 | { 265 | "data": { 266 | "text/plain": [ 267 | "12000208304" 268 | ] 269 | }, 270 | "execution_count": 8, 271 | "metadata": {}, 272 | "output_type": "execute_result" 273 | } 274 | ], 275 | "source": [ 276 | "DataFrame(Parquet2.readfile(\"large_df.parquet\"))\n", 277 | "GC.gc(); GC.gc(); GC.gc(); GC.gc()\n", 278 | "@allocated DataFrame(Parquet2.readfile(\"large_df.parquet\"))" 279 | ] 280 | }, 281 | { 282 | "cell_type": "markdown", 283 | "id": "a900295c", 284 | "metadata": {}, 285 | "source": [ 286 | "`copycols=false` option:" 287 | ] 288 | }, 289 | { 290 | "cell_type": "code", 291 | "execution_count": 9, 292 | "id": "b2ba467a", 293 | "metadata": {}, 294 | "outputs": [ 295 | { 296 | "data": { 297 | "text/plain": [ 298 | "9600208336" 299 | ] 300 | }, 301 | "execution_count": 9, 302 | "metadata": {}, 303 | "output_type": "execute_result" 304 | } 305 | ], 306 | "source": [ 307 | "DataFrame(Parquet2.readfile(\"large_df.parquet\"), copycols=false)\n", 308 | "GC.gc(); GC.gc(); GC.gc(); GC.gc()\n", 309 | "@allocated DataFrame(Parquet2.readfile(\"large_df.parquet\"), copycols=false)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "markdown", 314 | "id": "9d2f8738", 315 | "metadata": {}, 316 | "source": [ 317 | "**This issue has been fixed in Parquet2.jl version 0.2.18. 
Since this version you can omit passing `copycols=false`. Excessive copying is automatically avoided.**" 318 | ] 319 | }, 320 | { 321 | "cell_type": "markdown", 322 | "id": "97ea16bc", 323 | "metadata": {}, 324 | "source": [ 325 | "*Preparation of this workshop has been supported by the Polish National Agency for Academic Exchange under the Strategic Partnerships programme, grant number BPI/PST/2021/1/00069/U/00001.*\n", 326 | "\n", 327 | "![SGH & NAWA](logo.png)" 328 | ] 329 | } 330 | ], 331 | "metadata": { 332 | "kernelspec": { 333 | "display_name": "Julia 1.9.2", 334 | "language": "julia", 335 | "name": "julia-1.9" 336 | }, 337 | "language_info": { 338 | "file_extension": ".jl", 339 | "mimetype": "application/julia", 340 | "name": "julia", 341 | "version": "1.9.2" 342 | } 343 | }, 344 | "nbformat": 4, 345 | "nbformat_minor": 5 346 | } 347 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bkamins/JuliaCon2023-Tutorial/2da04fe2d76a2d468e96d7795b3d713de914a31f/logo.png --------------------------------------------------------------------------------