├── .github └── workflows │ └── build.yml ├── .gitignore ├── .mergify.yml ├── Cargo.lock ├── Cargo.toml ├── LICENSE ├── README.md ├── architecture.md ├── clippy.toml ├── data ├── department.csv ├── employee.csv ├── rank.csv └── test_data.csv ├── doc └── query_engine.jpg ├── makefile └── src ├── catalog.rs ├── datasource ├── csv.rs ├── empty.rs ├── memory.rs └── mod.rs ├── datatype.rs ├── db.rs ├── error.rs ├── lib.rs ├── logical_plan ├── dataframe.rs ├── expression.rs ├── literal.rs ├── mod.rs ├── plan.rs └── schema.rs ├── main.rs ├── optimizer ├── mod.rs └── projection_push_down.rs ├── physical_plan ├── aggregate │ ├── avg.rs │ ├── count.rs │ ├── max.rs │ ├── min.rs │ ├── mod.rs │ └── sum.rs ├── cross_join.rs ├── expression │ ├── binary.rs │ ├── cast.rs │ ├── column.rs │ ├── literal.rs │ ├── mod.rs │ └── unary.rs ├── hash_join.rs ├── limit.rs ├── mod.rs ├── nested_loop_join.rs ├── offset.rs ├── plan.rs ├── projection.rs ├── scan.rs ├── selection.rs └── visitor.rs ├── planner └── mod.rs ├── sql ├── mod.rs ├── parser.rs └── planner.rs └── utils.rs /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: auto build and test 2 | on: 3 | pull_request: 4 | branches: [main] 5 | env: 6 | RUST_TOOLCHAIN: nightly-2022-04-09 7 | CARGO_TERM_COLOR: always 8 | jobs: 9 | run-test: 10 | name: normal check 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout 14 | uses: actions/checkout@v3 15 | - name: Install rust toolchain@v1 16 | uses: actions-rs/toolchain@v1 17 | with: 18 | toolchain: ${{ env.RUST_TOOLCHAIN }} 19 | components: rustfmt, clippy 20 | - name: Run rust clippy check 21 | run: | 22 | # If new CI checks are added, the one with `--locked` must be run first. 23 | cargo clippy --all-targets --locked -- -D warnings 24 | - name: Run test 25 | run: cargo test 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /target 2 | lcov.info 3 | .vscode 4 | -------------------------------------------------------------------------------- /.mergify.yml: -------------------------------------------------------------------------------- 1 | pull_request_rules: 2 | - name: Automatic merge on approval 3 | conditions: 4 | - "#approved-reviews-by>=1" 5 | actions: 6 | merge: 7 | method: squash 8 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 
3 | version = 3 4 | 5 | [[package]] 6 | name = "aho-corasick" 7 | version = "0.7.18" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f" 10 | dependencies = [ 11 | "memchr", 12 | ] 13 | 14 | [[package]] 15 | name = "arrow" 16 | version = "13.0.0" 17 | source = "registry+https://github.com/rust-lang/crates.io-index" 18 | checksum = "5c6bee230122beb516ead31935a61f683715f987c6f003eff44ad6986624105a" 19 | dependencies = [ 20 | "bitflags", 21 | "chrono", 22 | "comfy-table", 23 | "csv", 24 | "flatbuffers", 25 | "half", 26 | "hex", 27 | "indexmap", 28 | "lazy_static", 29 | "lexical-core", 30 | "multiversion", 31 | "num", 32 | "rand", 33 | "regex", 34 | "serde", 35 | "serde_derive", 36 | "serde_json", 37 | ] 38 | 39 | [[package]] 40 | name = "autocfg" 41 | version = "1.1.0" 42 | source = "registry+https://github.com/rust-lang/crates.io-index" 43 | checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" 44 | 45 | [[package]] 46 | name = "bitflags" 47 | version = "1.3.2" 48 | source = "registry+https://github.com/rust-lang/crates.io-index" 49 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a" 50 | 51 | [[package]] 52 | name = "bstr" 53 | version = "0.2.17" 54 | source = "registry+https://github.com/rust-lang/crates.io-index" 55 | checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223" 56 | dependencies = [ 57 | "lazy_static", 58 | "memchr", 59 | "regex-automata", 60 | "serde", 61 | ] 62 | 63 | [[package]] 64 | name = "cfg-if" 65 | version = "1.0.0" 66 | source = "registry+https://github.com/rust-lang/crates.io-index" 67 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 68 | 69 | [[package]] 70 | name = "chrono" 71 | version = "0.4.19" 72 | source = "registry+https://github.com/rust-lang/crates.io-index" 73 | checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73" 74 | dependencies = [ 75 | "libc", 76 | "num-integer", 77 | "num-traits", 78 | "winapi", 79 | ] 80 | 81 | [[package]] 82 | name = "comfy-table" 83 | version = "5.0.1" 84 | source = "registry+https://github.com/rust-lang/crates.io-index" 85 | checksum = "b103d85ca6e209388771bfb7aa6b68a7aeec4afbf6f0a0264bfbf50360e5212e" 86 | dependencies = [ 87 | "strum", 88 | "strum_macros", 89 | "unicode-width", 90 | ] 91 | 92 | [[package]] 93 | name = "csv" 94 | version = "1.1.6" 95 | source = "registry+https://github.com/rust-lang/crates.io-index" 96 | checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1" 97 | dependencies = [ 98 | "bstr", 99 | "csv-core", 100 | "itoa 0.4.8", 101 | "ryu", 102 | "serde", 103 | ] 104 | 105 | [[package]] 106 | name = "csv-core" 107 | version = "0.1.10" 108 | source = "registry+https://github.com/rust-lang/crates.io-index" 109 | checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90" 110 | dependencies = [ 111 | "memchr", 112 | ] 113 | 114 | [[package]] 115 | name = "flatbuffers" 116 | version = "2.1.2" 117 | source = "registry+https://github.com/rust-lang/crates.io-index" 118 | checksum = "86b428b715fdbdd1c364b84573b5fdc0f84f8e423661b9f398735278bc7f2b6a" 119 | dependencies = [ 120 | "bitflags", 121 | "smallvec", 122 | "thiserror", 123 | ] 124 | 125 | [[package]] 126 | name = "getrandom" 127 | version = "0.2.6" 128 | source = "registry+https://github.com/rust-lang/crates.io-index" 129 | checksum = 
"9be70c98951c83b8d2f8f60d7065fa6d5146873094452a1008da8c2f1e4205ad" 130 | dependencies = [ 131 | "cfg-if", 132 | "libc", 133 | "wasi", 134 | ] 135 | 136 | [[package]] 137 | name = "half" 138 | version = "1.8.2" 139 | source = "registry+https://github.com/rust-lang/crates.io-index" 140 | checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7" 141 | 142 | [[package]] 143 | name = "hashbrown" 144 | version = "0.11.2" 145 | source = "registry+https://github.com/rust-lang/crates.io-index" 146 | checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e" 147 | 148 | [[package]] 149 | name = "heck" 150 | version = "0.3.3" 151 | source = "registry+https://github.com/rust-lang/crates.io-index" 152 | checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c" 153 | dependencies = [ 154 | "unicode-segmentation", 155 | ] 156 | 157 | [[package]] 158 | name = "hex" 159 | version = "0.4.3" 160 | source = "registry+https://github.com/rust-lang/crates.io-index" 161 | checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" 162 | 163 | [[package]] 164 | name = "indexmap" 165 | version = "1.8.1" 166 | source = "registry+https://github.com/rust-lang/crates.io-index" 167 | checksum = "0f647032dfaa1f8b6dc29bd3edb7bbef4861b8b8007ebb118d6db284fd59f6ee" 168 | dependencies = [ 169 | "autocfg", 170 | "hashbrown", 171 | ] 172 | 173 | [[package]] 174 | name = "itoa" 175 | version = "0.4.8" 176 | source = "registry+https://github.com/rust-lang/crates.io-index" 177 | checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4" 178 | 179 | [[package]] 180 | name = "itoa" 181 | version = "1.0.1" 182 | source = "registry+https://github.com/rust-lang/crates.io-index" 183 | checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35" 184 | 185 | [[package]] 186 | name = "lazy_static" 187 | version = "1.4.0" 188 | source = "registry+https://github.com/rust-lang/crates.io-index" 189 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" 190 | 191 | [[package]] 192 | name = "lexical-core" 193 | version = "0.8.3" 194 | source = "registry+https://github.com/rust-lang/crates.io-index" 195 | checksum = "92912c4af2e7d9075be3e5e3122c4d7263855fa6cce34fbece4dd08e5884624d" 196 | dependencies = [ 197 | "lexical-parse-float", 198 | "lexical-parse-integer", 199 | "lexical-util", 200 | "lexical-write-float", 201 | "lexical-write-integer", 202 | ] 203 | 204 | [[package]] 205 | name = "lexical-parse-float" 206 | version = "0.8.3" 207 | source = "registry+https://github.com/rust-lang/crates.io-index" 208 | checksum = "f518eed87c3be6debe6d26b855c97358d8a11bf05acec137e5f53080f5ad2dd8" 209 | dependencies = [ 210 | "lexical-parse-integer", 211 | "lexical-util", 212 | "static_assertions", 213 | ] 214 | 215 | [[package]] 216 | name = "lexical-parse-integer" 217 | version = "0.8.3" 218 | source = "registry+https://github.com/rust-lang/crates.io-index" 219 | checksum = "afc852ec67c6538bbb2b9911116a385b24510e879a69ab516e6a151b15a79168" 220 | dependencies = [ 221 | "lexical-util", 222 | "static_assertions", 223 | ] 224 | 225 | [[package]] 226 | name = "lexical-util" 227 | version = "0.8.3" 228 | source = "registry+https://github.com/rust-lang/crates.io-index" 229 | checksum = "c72a9d52c5c4e62fa2cdc2cb6c694a39ae1382d9c2a17a466f18e272a0930eb1" 230 | dependencies = [ 231 | "static_assertions", 232 | ] 233 | 234 | [[package]] 235 | name = "lexical-write-float" 236 | version = "0.8.4" 237 | source = 
"registry+https://github.com/rust-lang/crates.io-index" 238 | checksum = "8a89ec1d062e481210c309b672f73a0567b7855f21e7d2fae636df44d12e97f9" 239 | dependencies = [ 240 | "lexical-util", 241 | "lexical-write-integer", 242 | "static_assertions", 243 | ] 244 | 245 | [[package]] 246 | name = "lexical-write-integer" 247 | version = "0.8.3" 248 | source = "registry+https://github.com/rust-lang/crates.io-index" 249 | checksum = "094060bd2a7c2ff3a16d5304a6ae82727cb3cc9d1c70f813cc73f744c319337e" 250 | dependencies = [ 251 | "lexical-util", 252 | "static_assertions", 253 | ] 254 | 255 | [[package]] 256 | name = "libc" 257 | version = "0.2.125" 258 | source = "registry+https://github.com/rust-lang/crates.io-index" 259 | checksum = "5916d2ae698f6de9bfb891ad7a8d65c09d232dc58cc4ac433c7da3b2fd84bc2b" 260 | 261 | [[package]] 262 | name = "log" 263 | version = "0.4.17" 264 | source = "registry+https://github.com/rust-lang/crates.io-index" 265 | checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" 266 | dependencies = [ 267 | "cfg-if", 268 | ] 269 | 270 | [[package]] 271 | name = "memchr" 272 | version = "2.5.0" 273 | source = "registry+https://github.com/rust-lang/crates.io-index" 274 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" 275 | 276 | [[package]] 277 | name = "multiversion" 278 | version = "0.6.1" 279 | source = "registry+https://github.com/rust-lang/crates.io-index" 280 | checksum = "025c962a3dd3cc5e0e520aa9c612201d127dcdf28616974961a649dca64f5373" 281 | dependencies = [ 282 | "multiversion-macros", 283 | ] 284 | 285 | [[package]] 286 | name = "multiversion-macros" 287 | version = "0.6.1" 288 | source = "registry+https://github.com/rust-lang/crates.io-index" 289 | checksum = "a8a3e2bde382ebf960c1f3e79689fa5941625fe9bf694a1cb64af3e85faff3af" 290 | dependencies = [ 291 | "proc-macro2", 292 | "quote", 293 | "syn", 294 | ] 295 | 296 | [[package]] 297 | name = "naive-db" 298 | version = "0.1.0" 299 | dependencies = [ 300 | "arrow", 301 | "log", 302 | "ordered-float", 303 | "sqlparser", 304 | "twox-hash", 305 | ] 306 | 307 | [[package]] 308 | name = "num" 309 | version = "0.4.0" 310 | source = "registry+https://github.com/rust-lang/crates.io-index" 311 | checksum = "43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606" 312 | dependencies = [ 313 | "num-bigint", 314 | "num-complex", 315 | "num-integer", 316 | "num-iter", 317 | "num-rational", 318 | "num-traits", 319 | ] 320 | 321 | [[package]] 322 | name = "num-bigint" 323 | version = "0.4.3" 324 | source = "registry+https://github.com/rust-lang/crates.io-index" 325 | checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f" 326 | dependencies = [ 327 | "autocfg", 328 | "num-integer", 329 | "num-traits", 330 | ] 331 | 332 | [[package]] 333 | name = "num-complex" 334 | version = "0.4.1" 335 | source = "registry+https://github.com/rust-lang/crates.io-index" 336 | checksum = "97fbc387afefefd5e9e39493299f3069e14a140dd34dc19b4c1c1a8fddb6a790" 337 | dependencies = [ 338 | "num-traits", 339 | ] 340 | 341 | [[package]] 342 | name = "num-integer" 343 | version = "0.1.45" 344 | source = "registry+https://github.com/rust-lang/crates.io-index" 345 | checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9" 346 | dependencies = [ 347 | "autocfg", 348 | "num-traits", 349 | ] 350 | 351 | [[package]] 352 | name = "num-iter" 353 | version = "0.1.43" 354 | source = "registry+https://github.com/rust-lang/crates.io-index" 355 | checksum = 
"7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252" 356 | dependencies = [ 357 | "autocfg", 358 | "num-integer", 359 | "num-traits", 360 | ] 361 | 362 | [[package]] 363 | name = "num-rational" 364 | version = "0.4.0" 365 | source = "registry+https://github.com/rust-lang/crates.io-index" 366 | checksum = "d41702bd167c2df5520b384281bc111a4b5efcf7fbc4c9c222c815b07e0a6a6a" 367 | dependencies = [ 368 | "autocfg", 369 | "num-bigint", 370 | "num-integer", 371 | "num-traits", 372 | ] 373 | 374 | [[package]] 375 | name = "num-traits" 376 | version = "0.2.15" 377 | source = "registry+https://github.com/rust-lang/crates.io-index" 378 | checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" 379 | dependencies = [ 380 | "autocfg", 381 | ] 382 | 383 | [[package]] 384 | name = "ordered-float" 385 | version = "3.0.0" 386 | source = "registry+https://github.com/rust-lang/crates.io-index" 387 | checksum = "96bcbab4bfea7a59c2c0fe47211a1ac4e3e96bea6eb446d704f310bc5c732ae2" 388 | dependencies = [ 389 | "num-traits", 390 | ] 391 | 392 | [[package]] 393 | name = "ppv-lite86" 394 | version = "0.2.16" 395 | source = "registry+https://github.com/rust-lang/crates.io-index" 396 | checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872" 397 | 398 | [[package]] 399 | name = "proc-macro2" 400 | version = "1.0.38" 401 | source = "registry+https://github.com/rust-lang/crates.io-index" 402 | checksum = "9027b48e9d4c9175fa2218adf3557f91c1137021739951d4932f5f8268ac48aa" 403 | dependencies = [ 404 | "unicode-xid", 405 | ] 406 | 407 | [[package]] 408 | name = "quote" 409 | version = "1.0.18" 410 | source = "registry+https://github.com/rust-lang/crates.io-index" 411 | checksum = "a1feb54ed693b93a84e14094943b84b7c4eae204c512b7ccb95ab0c66d278ad1" 412 | dependencies = [ 413 | "proc-macro2", 414 | ] 415 | 416 | [[package]] 417 | name = "rand" 418 | version = "0.8.5" 419 | source = "registry+https://github.com/rust-lang/crates.io-index" 420 | checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 421 | dependencies = [ 422 | "libc", 423 | "rand_chacha", 424 | "rand_core", 425 | ] 426 | 427 | [[package]] 428 | name = "rand_chacha" 429 | version = "0.3.1" 430 | source = "registry+https://github.com/rust-lang/crates.io-index" 431 | checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" 432 | dependencies = [ 433 | "ppv-lite86", 434 | "rand_core", 435 | ] 436 | 437 | [[package]] 438 | name = "rand_core" 439 | version = "0.6.3" 440 | source = "registry+https://github.com/rust-lang/crates.io-index" 441 | checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7" 442 | dependencies = [ 443 | "getrandom", 444 | ] 445 | 446 | [[package]] 447 | name = "regex" 448 | version = "1.5.5" 449 | source = "registry+https://github.com/rust-lang/crates.io-index" 450 | checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286" 451 | dependencies = [ 452 | "aho-corasick", 453 | "memchr", 454 | "regex-syntax", 455 | ] 456 | 457 | [[package]] 458 | name = "regex-automata" 459 | version = "0.1.10" 460 | source = "registry+https://github.com/rust-lang/crates.io-index" 461 | checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" 462 | 463 | [[package]] 464 | name = "regex-syntax" 465 | version = "0.6.25" 466 | source = "registry+https://github.com/rust-lang/crates.io-index" 467 | checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b" 468 | 469 | [[package]] 470 | name = 
"rustversion" 471 | version = "1.0.6" 472 | source = "registry+https://github.com/rust-lang/crates.io-index" 473 | checksum = "f2cc38e8fa666e2de3c4aba7edeb5ffc5246c1c2ed0e3d17e560aeeba736b23f" 474 | 475 | [[package]] 476 | name = "ryu" 477 | version = "1.0.9" 478 | source = "registry+https://github.com/rust-lang/crates.io-index" 479 | checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f" 480 | 481 | [[package]] 482 | name = "serde" 483 | version = "1.0.137" 484 | source = "registry+https://github.com/rust-lang/crates.io-index" 485 | checksum = "61ea8d54c77f8315140a05f4c7237403bf38b72704d031543aa1d16abbf517d1" 486 | 487 | [[package]] 488 | name = "serde_derive" 489 | version = "1.0.137" 490 | source = "registry+https://github.com/rust-lang/crates.io-index" 491 | checksum = "1f26faba0c3959972377d3b2d306ee9f71faee9714294e41bb777f83f88578be" 492 | dependencies = [ 493 | "proc-macro2", 494 | "quote", 495 | "syn", 496 | ] 497 | 498 | [[package]] 499 | name = "serde_json" 500 | version = "1.0.81" 501 | source = "registry+https://github.com/rust-lang/crates.io-index" 502 | checksum = "9b7ce2b32a1aed03c558dc61a5cd328f15aff2dbc17daad8fb8af04d2100e15c" 503 | dependencies = [ 504 | "indexmap", 505 | "itoa 1.0.1", 506 | "ryu", 507 | "serde", 508 | ] 509 | 510 | [[package]] 511 | name = "smallvec" 512 | version = "1.8.0" 513 | source = "registry+https://github.com/rust-lang/crates.io-index" 514 | checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83" 515 | 516 | [[package]] 517 | name = "sqlparser" 518 | version = "0.9.0" 519 | source = "registry+https://github.com/rust-lang/crates.io-index" 520 | checksum = "4fa863a2dfc4879a35647c51dadf495a2ad53745eaf3723fda27006e745fb0ba" 521 | dependencies = [ 522 | "log", 523 | ] 524 | 525 | [[package]] 526 | name = "static_assertions" 527 | version = "1.1.0" 528 | source = "registry+https://github.com/rust-lang/crates.io-index" 529 | checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" 530 | 531 | [[package]] 532 | name = "strum" 533 | version = "0.23.0" 534 | source = "registry+https://github.com/rust-lang/crates.io-index" 535 | checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb" 536 | 537 | [[package]] 538 | name = "strum_macros" 539 | version = "0.23.1" 540 | source = "registry+https://github.com/rust-lang/crates.io-index" 541 | checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38" 542 | dependencies = [ 543 | "heck", 544 | "proc-macro2", 545 | "quote", 546 | "rustversion", 547 | "syn", 548 | ] 549 | 550 | [[package]] 551 | name = "syn" 552 | version = "1.0.93" 553 | source = "registry+https://github.com/rust-lang/crates.io-index" 554 | checksum = "04066589568b72ec65f42d65a1a52436e954b168773148893c020269563decf2" 555 | dependencies = [ 556 | "proc-macro2", 557 | "quote", 558 | "unicode-xid", 559 | ] 560 | 561 | [[package]] 562 | name = "thiserror" 563 | version = "1.0.31" 564 | source = "registry+https://github.com/rust-lang/crates.io-index" 565 | checksum = "bd829fe32373d27f76265620b5309d0340cb8550f523c1dda251d6298069069a" 566 | dependencies = [ 567 | "thiserror-impl", 568 | ] 569 | 570 | [[package]] 571 | name = "thiserror-impl" 572 | version = "1.0.31" 573 | source = "registry+https://github.com/rust-lang/crates.io-index" 574 | checksum = "0396bc89e626244658bef819e22d0cc459e795a5ebe878e6ec336d1674a8d79a" 575 | dependencies = [ 576 | "proc-macro2", 577 | "quote", 578 | "syn", 579 | ] 580 | 581 | [[package]] 582 | name = "twox-hash" 583 | 
version = "1.6.3" 584 | source = "registry+https://github.com/rust-lang/crates.io-index" 585 | checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" 586 | dependencies = [ 587 | "cfg-if", 588 | "rand", 589 | "static_assertions", 590 | ] 591 | 592 | [[package]] 593 | name = "unicode-segmentation" 594 | version = "1.9.0" 595 | source = "registry+https://github.com/rust-lang/crates.io-index" 596 | checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99" 597 | 598 | [[package]] 599 | name = "unicode-width" 600 | version = "0.1.9" 601 | source = "registry+https://github.com/rust-lang/crates.io-index" 602 | checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973" 603 | 604 | [[package]] 605 | name = "unicode-xid" 606 | version = "0.2.3" 607 | source = "registry+https://github.com/rust-lang/crates.io-index" 608 | checksum = "957e51f3646910546462e67d5f7599b9e4fb8acdd304b087a6494730f9eebf04" 609 | 610 | [[package]] 611 | name = "wasi" 612 | version = "0.10.2+wasi-snapshot-preview1" 613 | source = "registry+https://github.com/rust-lang/crates.io-index" 614 | checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6" 615 | 616 | [[package]] 617 | name = "winapi" 618 | version = "0.3.9" 619 | source = "registry+https://github.com/rust-lang/crates.io-index" 620 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" 621 | dependencies = [ 622 | "winapi-i686-pc-windows-gnu", 623 | "winapi-x86_64-pc-windows-gnu", 624 | ] 625 | 626 | [[package]] 627 | name = "winapi-i686-pc-windows-gnu" 628 | version = "0.4.0" 629 | source = "registry+https://github.com/rust-lang/crates.io-index" 630 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" 631 | 632 | [[package]] 633 | name = "winapi-x86_64-pc-windows-gnu" 634 | version = "0.4.0" 635 | source = "registry+https://github.com/rust-lang/crates.io-index" 636 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" 637 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "naive-db" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | arrow = { version = "13", features = ["prettyprint"] } 8 | sqlparser = "0.9.0" 9 | log = "0.4" 10 | twox-hash = "1.6.3" 11 | ordered-float = "3.0.0" 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Naive Query Engine (Toy for Learning) 😄 2 | 3 | This is a query engine with a `SQL` interface, built purely as a toy for learning how query engines work. You can check the [TODO](https://github.com/Veeupup/naive-query-engine#todo) list to see the current progress. 4 | 5 | It is simple enough to learn from (although even a simple engine takes so much work to finish.. TAT 😭). For now it only has a basic architecture, and most operators and planners are not implemented yet (they will be added in the future). 6 | 7 | This project is inspired by (and most ideas come from) [how-query-engines-work](https://github.com/andygrove/how-query-engines-work) and is just for learning purposes. Many ideas also come from [arrow-datafusion](https://github.com/apache/arrow-datafusion). 8 | 9 | It uses [arrow](https://github.com/apache/arrow-rs) as the in-memory columnar format and [sqlparser](https://github.com/sqlparser-rs/sqlparser-rs) as the SQL parser. 10 | 11 | ## architecture 12 | 13 | ![query_engine](./doc/query_engine.jpg) 14 | 15 | ## how to use 16 | 17 | For now, we can use `NaiveDB` as below, with csv files as table storage. 18 | 19 | ```rust 20 | use naive_db::print_result; 21 | use naive_db::CsvConfig; 22 | use naive_db::NaiveDB; 23 | use naive_db::Result; 24 | 25 | fn main() -> Result<()> { 26 | let mut db = NaiveDB::default(); 27 | 28 | db.create_csv_table("t1", "data/test_data.csv", CsvConfig::default())?; 29 | 30 | // select 31 | let ret = db.run_sql("select id, name, age + 100 from t1 where id < 9 limit 3 offset 2")?; 32 | print_result(&ret)?; 33 | 34 | // Inner Join 35 | db.create_csv_table("employee", "data/employee.csv", CsvConfig::default())?; 36 | db.create_csv_table("rank", "data/rank.csv", CsvConfig::default())?; 37 | db.create_csv_table("department", "data/department.csv", CsvConfig::default())?; 38 | 39 | let ret = db.run_sql( 40 | " 41 | select id, name, rank_name, department_name 42 | from employee 43 | join rank on 44 | employee.rank = rank.id 45 | join department on 46 | employee.department_id = department.id 47 | ", 48 | )?; 49 | print_result(&ret)?; 50 | 51 | // cross join 52 | let ret = db.run_sql("select * from employee join rank")?; 53 | print_result(&ret)?; 54 | 55 | // aggregate 56 | let ret = db.run_sql( 57 | " 58 | select count(id), sum(age), sum(score), avg(score), max(score), min(score) 59 | from t1 group by id % 3", 60 | )?; 61 | print_result(&ret)?; 62 | 63 | Ok(()) 64 | } 65 | ``` 66 | 67 | output will be: 68 | 69 | ``` 70 | +----+-------+-----------+ 71 | | id | name | age + 100 | 72 | +----+-------+-----------+ 73 | | 4 | lynne | 118 | 74 | | 5 | alice | 119 | 75 | | 6 | bob | 120 | 76 | +----+-------+-----------+ 77 | +----+-------+-------------+-----------------+ 78 | | id | name | rank_name | department_name | 79 | +----+-------+-------------+-----------------+ 80 | | 2 | lynne | master | IT | 81 | | 1 | vee | diamond | IT | 82 | | 3 | Alex | master | Marketing | 83 | | 4 | jack | diamond 
| Marketing | 84 | | 5 | mike | grandmaster | Human Resource | 85 | +----+-------+-------------+-----------------+ 86 | +----+-------+---------------+------+----+-------------+ 87 | | id | name | department_id | rank | id | rank_name | 88 | +----+-------+---------------+------+----+-------------+ 89 | | 1 | vee | 1 | 1 | 1 | master | 90 | | 2 | lynne | 1 | 0 | 2 | diamond | 91 | | 3 | Alex | 2 | 0 | 3 | grandmaster | 92 | | 4 | jack | 2 | 1 | 4 | master | 93 | | 5 | mike | 3 | 2 | 5 | diamond | 94 | | 1 | vee | 1 | 1 | 1 | grandmaster | 95 | | 2 | lynne | 1 | 0 | 2 | master | 96 | | 3 | Alex | 2 | 0 | 3 | diamond | 97 | | 4 | jack | 2 | 1 | 4 | grandmaster | 98 | | 5 | mike | 3 | 2 | 5 | master | 99 | | 1 | vee | 1 | 1 | 1 | diamond | 100 | | 2 | lynne | 1 | 0 | 2 | grandmaster | 101 | | 3 | Alex | 2 | 0 | 3 | master | 102 | | 4 | jack | 2 | 1 | 4 | diamond | 103 | | 5 | mike | 3 | 2 | 5 | grandmaster | 104 | +----+-------+---------------+------+----+-------------+ 105 | +-----------+----------+--------------------+-------------------+------------+------------+ 106 | | count(id) | sum(age) | sum(score) | avg(score) | max(score) | min(score) | 107 | +-----------+----------+--------------------+-------------------+------------+------------+ 108 | | 3 | 61 | 255.6 | 85.2 | 90.1 | 81.1 | 109 | | 3 | 62 | 243.29000000000002 | 81.09666666666668 | 99.99 | 60 | 110 | | 2 | 43 | 167.7 | 83.85 | 85.5 | 82.2 | 111 | +-----------+----------+--------------------+-------------------+------------+------------+ 112 | ``` 113 | 114 | ## how it works 115 | 116 | `NaiveDB` is quite simple, and a query runs through a few clear stages: 117 | 118 | ```rust 119 | impl NaiveDB { 120 | pub fn run_sql(&self, sql: &str) -> Result<Vec<RecordBatch>> { 121 | // 1. sql -> statement 122 | let statement = SQLParser::parse(sql)?; 123 | // 2. statement -> logical plan 124 | let sql_planner = SQLPlanner::new(&self.catalog); 125 | let logical_plan = sql_planner.statement_to_plan(statement)?; 126 | // 3. optimize 127 | let optimizer = Optimizer::default(); 128 | let logical_plan = optimizer.optimize(logical_plan); 129 | // 4. logical plan -> physical plan 130 | let physical_plan = QueryPlanner::create_physical_plan(&logical_plan)?; 131 | // 5. execute 132 | physical_plan.execute() 133 | } 134 | } 135 | ```
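For step 3, the only rewrite rule shipped so far lives in `src/optimizer/projection_push_down.rs`. To give a feel for what such a rule does, here is a minimal sketch of projection push-down over a deliberately simplified plan type — the `Plan` enum and `push_down_projection` helper below are hypothetical illustrations, not this repo's real `LogicalPlan` API:

```rust
// Hypothetical, simplified plan type for illustration only.
#[derive(Debug)]
enum Plan {
    Scan { table: String, projection: Option<Vec<String>> },
    Projection { exprs: Vec<String>, input: Box<Plan> },
}

// Push projected column names down into the scan so the datasource
// only has to materialize the columns the query actually uses.
fn push_down_projection(plan: Plan) -> Plan {
    match plan {
        Plan::Projection { exprs, input } => match *input {
            Plan::Scan { table, .. } => Plan::Scan { table, projection: Some(exprs) },
            other => Plan::Projection { exprs, input: Box::new(push_down_projection(other)) },
        },
        other => other,
    }
}

fn main() {
    let plan = Plan::Projection {
        exprs: vec!["id".into(), "name".into()],
        input: Box::new(Plan::Scan { table: "t1".into(), projection: None }),
    };
    // Prints: Scan { table: "t1", projection: Some(["id", "name"]) }
    println!("{:?}", push_down_projection(plan));
}
```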
136 | 137 | 138 | ## TODO 139 | 140 | - [x] type system 141 | - [x] datasource 142 | - [x] mem source 143 | - [x] csv as datasource 144 | - [x] empty datasource 145 | - [x] logical plan & expressions 146 | - [ ] build logical plans 147 | - [x] projection 148 | - [x] filter 149 | - [x] aggregate 150 | - [x] limit 151 | - [x] join 152 | - [x] physical plan & expressions 153 | - [x] physical scan 154 | - [x] physical projection 155 | - [x] physical filter 156 | - [x] physical limit 157 | - [x] join 158 | - algorithms 159 | - [x] (dumb😊) nested loop join 160 | - [x] hash join 161 | - [ ] sort-merge join 162 | - [x] inner join 163 | - [x] cross join 164 | - [ ] physical expression 165 | - [x] column expr 166 | - [x] binary operation expr(add/sub/mul/div/and/or...) 167 | - [x] literal expr 168 | - [x] unary expr 169 | - [x] aggr expr 170 | - [ ] so much work to do... TAT 171 | - [ ] query planner 172 | - [x] scan 173 | - [x] limit 174 | - [x] join 175 | - [x] aggregate 176 | - [ ] ... 177 | - [ ] query optimization 178 | - [ ] more rules needed 179 | - [ ] sql support 180 | - [x] parser 181 | - [ ] SQL planner: statement -> logical plan 182 | - [x] scan 183 | - [x] projection 184 | - [x] selection 185 | - [x] limit 186 | - [x] join 187 | - [x] aggregate 188 | - [x] group by 189 | - [ ] scalar function 190 | -------------------------------------------------------------------------------- /architecture.md: -------------------------------------------------------------------------------- 1 | # Architecture 2 | 3 | ## datatype 4 | 5 | We use `arrow` as the datatype system. 6 | 7 | ## datasource 8 | 9 | Scans currently return fully materialized batches; we may later return scan results through an async streaming trait (see the sketch at the end of this document). 10 | 11 | ## overview 12 | 13 | ```rust 14 | pub struct NaiveDB { 15 | catalog: HashMap<String, TableRef>, 16 | } 17 | 18 | impl NaiveDB { 19 | pub fn run_sql(&self, sql: &str) -> Result<Vec<RecordBatch>> { 20 | let statement = SQLParser::parse(sql)?; 21 | let logical_plan = SQLPlanner::new(&self.catalog).statement_to_plan(statement)?; 22 | let optimizer = Optimizer::default(); 23 | let logical_plan = optimizer.optimize(logical_plan); 24 | let physical_plan = QueryPlanner::create_physical_plan(&logical_plan)?; 25 | physical_plan.execute() 26 | } 27 | } 28 | 29 | fn main() { 30 | let db = NaiveDB::default(); 31 | db.run_sql("select a, b from t where a > 1"); 32 | } 33 | 34 | 35 | ```
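The streaming datasource mentioned above could look roughly like the following. This is a design sketch only — it assumes the `async-trait` and `futures` crates, which this project does not depend on yet, and the `AsyncTableSource`/`RecordBatchStream` names are hypothetical:

```rust
use std::pin::Pin;

use arrow::record_batch::RecordBatch;
use async_trait::async_trait;
use futures::Stream;

// A fallible stream of record batches, decoded lazily by the source.
pub type RecordBatchStream =
    Pin<Box<dyn Stream<Item = Result<RecordBatch, arrow::error::ArrowError>> + Send>>;

#[async_trait]
pub trait AsyncTableSource: Send + Sync {
    /// Start a scan and hand back a stream instead of fully materialized
    /// batches, so a large table never has to fit in memory at once.
    async fn scan(&self, projection: Option<Vec<usize>>) -> RecordBatchStream;
}
```

Executors would then poll batches as they arrive, similar in spirit to the `SendableRecordBatchStream` used by `arrow-datafusion`.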
36 | -------------------------------------------------------------------------------- /clippy.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Veeupup/naive-query-engine/b942af6a8505d4083b384ef8a5969988f97d330f/clippy.toml -------------------------------------------------------------------------------- /data/department.csv: -------------------------------------------------------------------------------- 1 | id,department_name 2 | 1,IT 3 | 2,Marketing 4 | 3,Human Resource -------------------------------------------------------------------------------- /data/employee.csv: -------------------------------------------------------------------------------- 1 | id,name,department_id,rank 2 | 1,vee,1,1 3 | 2,lynne,1,0 4 | 3,Alex,2,0 5 | 4,jack,2,1 6 | 5,mike,3,2 -------------------------------------------------------------------------------- /data/rank.csv: -------------------------------------------------------------------------------- 1 | id,rank_name 2 | 0,master 3 | 1,diamond 4 | 2,grandmaster -------------------------------------------------------------------------------- /data/test_data.csv: -------------------------------------------------------------------------------- 1 | id,name,age,score 2 | 1,veeupup,23,60.0 3 | 2,alex,20,90.1 4 | 4,lynne,18,99.99 5 | 5,alice,19,81.1 6 | 6,bob,20,82.2 7 | 7,jack,21,83.3 8 | 8,cock,22,84.4 9 | 9,primer,23,85.5 -------------------------------------------------------------------------------- /doc/query_engine.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Veeupup/naive-query-engine/b942af6a8505d4083b384ef8a5969988f97d330f/doc/query_engine.jpg -------------------------------------------------------------------------------- /makefile: -------------------------------------------------------------------------------- 1 | lint: 2 | cargo fmt 3 | cargo clippy --all-targets --all-features -- -D warnings 4 | 5 | fix: 6 | cargo fix --allow-dirty 7 | -------------------------------------------------------------------------------- /src/catalog.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-14 16:00:32 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | use std::collections::HashMap; 8 | 9 | use crate::datasource::{EmptyTable, MemTable}; 10 | use crate::error::ErrorCode; 11 | use crate::logical_plan::plan::{LogicalPlan, TableScan}; 12 | use crate::logical_plan::schema::NaiveSchema; 13 | use crate::logical_plan::DataFrame; 14 | use crate::{ 15 | datasource::{CsvConfig, CsvTable, TableRef}, 16 | error::Result, 17 | }; 18 | use arrow::record_batch::RecordBatch; 19 | 20 | #[derive(Default)] 21 | pub struct Catalog { 22 | tables: HashMap<String, TableRef>, 23 | } 24 | 25 | impl Catalog { 26 | /// add csv table 27 | pub fn add_csv_table( 28 | &mut self, 29 | table: &str, 30 | csv_file: &str, 31 | csv_conf: CsvConfig, 32 | ) -> Result<()> { 33 | let source = CsvTable::try_create(csv_file, csv_conf)?; 34 | self.tables.insert(table.to_string(), source); 35 | Ok(()) 36 | } 37 | 38 | #[allow(unused)] 39 | /// add memory table 40 | pub fn add_memory_table( 41 | &mut self, 42 | table: &str, 43 | schema: NaiveSchema, 44 | batches: Vec<RecordBatch>, 45 | ) -> Result<()> { 46 | let source = MemTable::try_create(schema, batches)?; 47 | self.tables.insert(table.to_string(), source); 48 | Ok(()) 49 | } 50 | 51 | #[allow(unused)] 52 | /// add empty table 53 | pub fn add_empty_table(&mut self, table: &str, schema: NaiveSchema) -> Result<()> { 54 | let source = EmptyTable::try_create(schema)?; 55 | self.tables.insert(table.to_string(), source); 56 | Ok(()) 57 | } 58 | 59 | /// get table 60 | pub fn get_table(&self, table: &str) -> Result<TableRef> { 61 | self.tables 62 | .get(table) 63 | .cloned() 64 | .ok_or_else(|| ErrorCode::NoSuchTable(format!("No table name: {}", table))) 65 | } 66 | 67 | #[allow(unused)] 68 | /// get dataframe by table name 69 | pub fn get_table_df(&self, table: &str) -> Result<DataFrame> { 70 | let source = self 71 | .tables 72 | .get(table) 73 | .cloned() 74 | .ok_or_else(|| ErrorCode::NoSuchTable(format!("No table name: {}", table)))?; 75 | let plan = LogicalPlan::TableScan(TableScan { 76 | source, 77 | projection: None, 78 | }); 79 | Ok(DataFrame { plan }) 80 | } 81 | } 82 | -------------------------------------------------------------------------------- /src/datasource/csv.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-12 16:45:18 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | use std::env; 8 | use std::fs::File; 9 | use std::iter::Iterator; 10 | use std::path::Path; 11 | use std::sync::Arc; 12 | 13 | use crate::error::Result; 14 | use crate::logical_plan::schema::NaiveSchema; 15 | 16 | use arrow::csv; 17 | use arrow::datatypes::Schema; 18 | use arrow::record_batch::RecordBatch; 19 | 20 | use super::TableSource; 21 | use crate::datasource::TableRef; 22 | 23 | pub struct CsvConfig { 24 | pub has_header: bool, 25 | pub delimiter: u8, 26 | pub max_read_records: Option<usize>, 27 | pub batch_size: usize, 28 | pub file_projection: Option<Vec<usize>>, 29 | pub datetime_format: Option<String>, 30 | } 31 | 32 | impl Default for CsvConfig { 33 | fn default() -> Self { 34 | Self { 35 | has_header: true, 36 | delimiter: b',', 37 | max_read_records: Some(3), 38 | batch_size: 1_000_000, 39 | file_projection: None, 40 | datetime_format: None, 41 | } 42 | } 43 | } 44 | 45 | #[derive(Debug, Clone)] 46 | pub struct CsvTable { 47 | schema: NaiveSchema, 48 | batches: Vec<RecordBatch>, 49 | } 50 | 51 | impl CsvTable { 52 | #[allow(unused, clippy::iter_next_loop)] 53 | pub fn try_create(filename: &str, csv_config: CsvConfig) -> Result<TableRef> { 54 | let orig_schema = Self::infer_schema_from_csv(filename, &csv_config)?; 55 | let 
schema = NaiveSchema::from_unqualified(&orig_schema); 56 | 57 | let mut file = File::open(env::current_dir()?.join(Path::new(filename)))?; 58 | let mut reader = csv::Reader::new( 59 | file, 60 | Arc::new(orig_schema), 61 | csv_config.has_header, 62 | Some(csv_config.delimiter), 63 | csv_config.batch_size, 64 | None, 65 | csv_config.file_projection.clone(), 66 | csv_config.datetime_format, 67 | ); 68 | let mut batches = vec![]; 69 | 70 | for record in reader.next() { 71 | batches.push(record?); 72 | } 73 | 74 | Ok(Arc::new(Self { schema, batches })) 75 | } 76 | 77 | fn infer_schema_from_csv(filename: &str, csv_config: &CsvConfig) -> Result<Schema> { 78 | let mut file = File::open(env::current_dir()?.join(Path::new(filename)))?; 79 | let (schema, _) = arrow::csv::reader::infer_reader_schema( 80 | &mut file, 81 | csv_config.delimiter, 82 | csv_config.max_read_records, 83 | csv_config.has_header, 84 | )?; 85 | Ok(schema) 86 | } 87 | } 88 | 89 | impl TableSource for CsvTable { 90 | fn schema(&self) -> &NaiveSchema { 91 | &self.schema 92 | } 93 | 94 | fn scan(&self, _projection: Option<Vec<usize>>) -> Result<Vec<RecordBatch>> { 95 | Ok(self.batches.clone()) 96 | } 97 | 98 | fn source_name(&self) -> String { 99 | "CsvTable".into() 100 | } 101 | } 102 | 103 | #[cfg(test)] 104 | mod tests { 105 | use super::*; 106 | use arrow::{ 107 | array::{ArrayRef, Float64Array, Int64Array, StringArray}, 108 | datatypes::{DataType, Field, Schema}, 109 | }; 110 | 111 | #[test] 112 | fn test_infer_schema() -> Result<()> { 113 | let table = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?; 114 | let schema = table.schema(); 115 | 116 | let excepted = Arc::new(Schema::new(vec![ 117 | Field::new("id", DataType::Int64, false), 118 | Field::new("name", DataType::Utf8, false), 119 | Field::new("age", DataType::Int64, false), 120 | Field::new("score", DataType::Float64, false), 121 | ])); 122 | 123 | assert_eq!(schema.fields().len(), excepted.fields().len()); 124 | 125 | let iter = schema.fields().iter().zip(excepted.fields().iter()); 126 | for (field, excepted) in iter { 127 | assert_eq!(field.name(), excepted.name()); 128 | assert_eq!(field.data_type(), excepted.data_type()); 129 | assert_eq!(field.is_nullable(), excepted.is_nullable()); 130 | } 131 | 132 | Ok(()) 133 | } 134 | 135 | #[test] 136 | fn test_read_from_csv() -> Result<()> { 137 | let table = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?; 138 | 139 | let batches = table.scan(None)?; 140 | 141 | assert_eq!(batches.len(), 1); 142 | let record_batch = &batches[0]; 143 | assert_eq!(record_batch.columns().len(), 4); 144 | 145 | let id_excepted: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 4, 5, 6, 7, 8, 9])); 146 | let name_excepted: ArrayRef = Arc::new(StringArray::from(vec![ 147 | "veeupup", "alex", "lynne", "alice", "bob", "jack", "cock", "primer", 148 | ])); 149 | let age_excepted: ArrayRef = 150 | Arc::new(Int64Array::from(vec![23, 20, 18, 19, 20, 21, 22, 23])); 151 | let score_excepted: ArrayRef = Arc::new(Float64Array::from(vec![ 152 | 60.0, 90.1, 99.99, 81.1, 82.2, 83.3, 84.4, 85.5, 153 | ])); 154 | 155 | assert_eq!(record_batch.column(0), &id_excepted); 156 | assert_eq!(record_batch.column(1), &name_excepted); 157 | assert_eq!(record_batch.column(2), &age_excepted); 158 | assert_eq!(record_batch.column(3), &score_excepted); 159 | 160 | Ok(()) 161 | } 162 | } 163 | -------------------------------------------------------------------------------- /src/datasource/empty.rs: --------------------------------------------------------------------------------
1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-12 16:16:58 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | use super::TableSource; 8 | use crate::datasource::TableRef; 9 | use crate::error::Result; 10 | use crate::logical_plan::schema::NaiveSchema; 11 | 12 | use arrow::record_batch::RecordBatch; 13 | use std::sync::Arc; 14 | 15 | /// Empty Table with schema but no data 16 | #[derive(Debug, Clone)] 17 | pub struct EmptyTable { 18 | schema: NaiveSchema, 19 | } 20 | 21 | impl EmptyTable { 22 | #[allow(unused)] 23 | pub fn try_create(schema: NaiveSchema) -> Result<TableRef> { 24 | Ok(Arc::new(Self { schema })) 25 | } 26 | } 27 | 28 | impl TableSource for EmptyTable { 29 | fn schema(&self) -> &NaiveSchema { 30 | &self.schema 31 | } 32 | 33 | fn scan(&self, _projection: Option<Vec<usize>>) -> Result<Vec<RecordBatch>> { 34 | Ok(vec![]) 35 | } 36 | 37 | fn source_name(&self) -> String { 38 | "EmptyTable".into() 39 | } 40 | } 41 | 42 | #[cfg(test)] 43 | mod tests { 44 | use super::*; 45 | use arrow::datatypes::{DataType, Field, Schema}; 46 | 47 | #[test] 48 | fn test_empty_table() -> Result<()> { 49 | let schema = Schema::new(vec![ 50 | Field::new("a", DataType::Int32, false), 51 | Field::new("b", DataType::Int32, false), 52 | ]); 53 | let schema = NaiveSchema::from_qualified("t1", &schema); 54 | 55 | let table = EmptyTable::try_create(schema)?; 56 | let batches = table.scan(None)?; 57 | 58 | assert!(batches.is_empty()); 59 | 60 | Ok(()) 61 | } 62 | } 63 | -------------------------------------------------------------------------------- /src/datasource/memory.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-12 16:14:35 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | use arrow::record_batch::RecordBatch; 8 | use std::sync::Arc; 9 | 10 | use super::{TableRef, TableSource}; 11 | use crate::{error::Result, logical_plan::schema::NaiveSchema}; 12 | 13 | #[derive(Debug, Clone)] 14 | pub struct MemTable { 15 | schema: NaiveSchema, 16 | batches: Vec<RecordBatch>, 17 | } 18 | 19 | impl MemTable { 20 | #[allow(unused)] 21 | pub fn try_create(schema: NaiveSchema, batches: Vec<RecordBatch>) -> Result<TableRef> { 22 | Ok(Arc::new(Self { schema, batches })) 23 | } 24 | } 25 | 26 | impl TableSource for MemTable { 27 | fn schema(&self) -> &NaiveSchema { 28 | &self.schema 29 | } 30 | 31 | fn scan(&self, projection: Option<Vec<usize>>) -> Result<Vec<RecordBatch>> { 32 | if let Some(projection) = projection { 33 | let batches = self 34 | .batches 35 | .iter() 36 | .map(|record_batch| record_batch.project(projection.as_ref()).unwrap()) 37 | .collect::<Vec<_>>(); 38 | return Ok(batches); 39 | } 40 | Ok(self.batches.clone()) 41 | } 42 | 43 | fn source_name(&self) -> String { 44 | "MemTable".into() 45 | } 46 | } 47 | 48 | #[cfg(test)] 49 | mod tests { 50 | use super::MemTable; 51 | use crate::error::Result; 52 | use crate::logical_plan::schema::NaiveSchema; 53 | use arrow::array::Int32Array; 54 | use arrow::datatypes::{DataType, Field, Schema}; 55 | use arrow::record_batch::RecordBatch; 56 | use std::sync::Arc; 57 | 58 | #[test] 59 | fn mem_table_test() -> Result<()> { 60 | let schema = Arc::new(Schema::new(vec![ 61 | Field::new("a", DataType::Int32, false), 62 | Field::new("b", DataType::Int32, false), 63 | Field::new("c", DataType::Int32, false), 64 | Field::new("d", DataType::Int32, true), 65 | ])); 66 | let schema = NaiveSchema::from_qualified("t1", &schema); 67 | 68 | let batch = RecordBatch::try_new( 69 | schema.clone().into(), 70 | vec![ 71 | Arc::new(Int32Array::from(vec![1, 2, 3])), 72 | Arc::new(Int32Array::from(vec![4, 5, 6])), 73 
| Arc::new(Int32Array::from(vec![7, 8, 9])), 74 | Arc::new(Int32Array::from(vec![None, None, Some(9)])), 75 | ], 76 | )?; 77 | 78 | let mem_table = MemTable::try_create(schema, vec![batch])?; 79 | 80 | // scan 81 | let batches = mem_table.scan(Some(vec![2, 1]))?; 82 | let batch2 = &batches[0]; 83 | 84 | assert_eq!(2, batch2.schema().fields().len()); 85 | assert_eq!("t1.c", batch2.schema().field(0).name()); 86 | assert_eq!("t1.b", batch2.schema().field(1).name()); 87 | assert_eq!(2, batch2.num_columns()); 88 | 89 | Ok(()) 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/datasource/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-12 16:08:23 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | mod csv; 8 | mod empty; 9 | mod memory; 10 | 11 | use std::fmt::Debug; 12 | use std::sync::Arc; 13 | 14 | use crate::error::Result; 15 | use crate::logical_plan::schema::NaiveSchema; 16 | use arrow::record_batch::RecordBatch; 17 | 18 | pub type TableRef = Arc<dyn TableSource>; 19 | 20 | pub trait TableSource: Debug { 21 | fn schema(&self) -> &NaiveSchema; 22 | 23 | // TODO(veeupup): return async stream record batch 24 | /// for scan 25 | fn scan(&self, projection: Option<Vec<usize>>) -> Result<Vec<RecordBatch>>; 26 | 27 | fn source_name(&self) -> String; 28 | } 29 | 30 | pub use csv::CsvConfig; 31 | pub use csv::CsvTable; 32 | pub use empty::EmptyTable; 33 | pub use memory::MemTable; 34 | -------------------------------------------------------------------------------- /src/datatype.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-13 15:10:51 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | use arrow::{ 8 | array::{Array, ArrayRef}, 9 | datatypes::DataType, 10 | }; 11 | 12 | use crate::logical_plan::expression::ScalarValue; 13 | 14 | #[derive(Debug, Clone)] 15 | pub enum ColumnValue { 16 | /// Array of values 17 | Array(ArrayRef), 18 | /// A single value, 19 | Const(ScalarValue, usize), 20 | } 21 | 22 | impl ColumnValue { 23 | pub fn data_type(&self) -> DataType { 24 | match self { 25 | ColumnValue::Array(array) => array.data_type().clone(), 26 | ColumnValue::Const(scalar, _) => scalar.data_field().data_type().clone(), 27 | } 28 | } 29 | 30 | pub fn into_array(self) -> ArrayRef { 31 | match self { 32 | ColumnValue::Array(array) => array, 33 | ColumnValue::Const(scalar, num_rows) => scalar.into_array(num_rows), 34 | } 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/db.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-14 15:26:40 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | use arrow::record_batch::RecordBatch; 8 | 9 | use crate::catalog::Catalog; 10 | use crate::datasource::CsvConfig; 11 | use crate::error::Result; 12 | 13 | use crate::optimizer::Optimizer; 14 | use crate::planner::QueryPlanner; 15 | use crate::sql::parser::SQLParser; 16 | use crate::sql::planner::SQLPlanner; 17 | 18 | #[derive(Default)] 19 | pub struct NaiveDB { 20 | catalog: Catalog, 21 | } 22 | 23 | impl NaiveDB { 24 | pub fn run_sql(&self, sql: &str) -> Result<Vec<RecordBatch>> { 25 | // 1. sql -> statement 26 | let statement = SQLParser::parse(sql)?; 27 | // 2. statement -> logical plan 28 | let sql_planner = SQLPlanner::new(&self.catalog); 29 | let logical_plan = sql_planner.statement_to_plan(statement)?; 30 | // 3. 
optimize 31 | let optimizer = Optimizer::default(); 32 | let logical_plan = optimizer.optimize(logical_plan); 33 | // 4. logical plan -> physical plan 34 | let physical_plan = QueryPlanner::create_physical_plan(&logical_plan)?; 35 | // 5. execute 36 | physical_plan.execute() 37 | } 38 | 39 | pub fn create_csv_table( 40 | &mut self, 41 | table: &str, 42 | csv_file: &str, 43 | csv_conf: CsvConfig, 44 | ) -> Result<()> { 45 | self.catalog.add_csv_table(table, csv_file, csv_conf) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /src/error.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-12 16:45:51 4 | * @Email: code@tanweime.com 5 | */ 6 | use arrow::error::ArrowError; 7 | use sqlparser::parser::ParserError; 8 | use std::io; 9 | 10 | pub type Result = std::result::Result; 11 | 12 | #[derive(Debug)] 13 | pub enum ErrorCode { 14 | /// Error return by arrow 15 | ArrowError(ArrowError), 16 | 17 | IoError(io::Error), 18 | 19 | NoSuchField, 20 | 21 | ColumnNotExists(String), 22 | 23 | LogicalError(String), 24 | 25 | NoSuchTable(String), 26 | 27 | ParserError(ParserError), 28 | 29 | IntervalError(String), 30 | 31 | PlanError(String), 32 | 33 | NoMatchFunction(String), 34 | 35 | NotSupported(String), 36 | 37 | NotImplemented, 38 | #[allow(unused)] 39 | Others, 40 | } 41 | 42 | impl From for ErrorCode { 43 | fn from(e: ArrowError) -> Self { 44 | ErrorCode::ArrowError(e) 45 | } 46 | } 47 | 48 | impl From for ErrorCode { 49 | fn from(e: io::Error) -> Self { 50 | ErrorCode::IoError(e) 51 | } 52 | } 53 | impl From for ErrorCode { 54 | fn from(e: ParserError) -> Self { 55 | ErrorCode::ParserError(e) 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-12 16:08:43 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | mod catalog; 8 | mod datasource; 9 | mod datatype; 10 | mod db; 11 | mod error; 12 | mod logical_plan; 13 | mod optimizer; 14 | mod physical_plan; 15 | mod planner; 16 | mod sql; 17 | mod utils; 18 | 19 | pub use datasource::CsvConfig; 20 | pub use db::NaiveDB; 21 | pub use error::Result; 22 | pub use utils::*; 23 | -------------------------------------------------------------------------------- /src/logical_plan/dataframe.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-12 22:52:47 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | use std::sync::Arc; 8 | 9 | use crate::logical_plan::expression::LogicalExpr; 10 | use crate::logical_plan::plan::{Aggregate, Filter, LogicalPlan, Projection}; 11 | 12 | use super::expression::{AggregateFunction, Column}; 13 | use super::plan::{Join, JoinType, Limit, Offset}; 14 | use super::schema::NaiveSchema; 15 | use crate::error::{ErrorCode, Result}; 16 | 17 | #[derive(Clone)] 18 | pub struct DataFrame { 19 | pub plan: LogicalPlan, 20 | } 21 | 22 | impl DataFrame { 23 | pub fn new(plan: LogicalPlan) -> Self { 24 | Self { plan } 25 | } 26 | 27 | pub fn project(self, exprs: Vec) -> Result { 28 | // TODO(veeupup): Ambiguous reference of field 29 | let mut fields = vec![]; 30 | for expr in &exprs { 31 | fields.push(expr.data_field(&self.plan)?); 32 | } 33 | let schema = NaiveSchema::new(fields); 34 | Ok(Self { 35 | plan: LogicalPlan::Projection(Projection { 36 | 
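// Added note (an inference from this file, not an upstream comment): the
// output schema was already computed from `exprs` just above, so downstream
// stages (optimizer, physical planner) can read `plan.schema()` from this
// Projection node without re-deriving field types.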
input: Arc::new(self.plan), 37 | exprs, 38 | schema, 39 | }), 40 | }) 41 | } 42 | 43 | pub fn filter(self, expr: LogicalExpr) -> Self { 44 | Self { 45 | plan: LogicalPlan::Filter(Filter { 46 | input: Arc::new(self.plan), 47 | predicate: expr, 48 | }), 49 | } 50 | } 51 | 52 | #[allow(unused)] 53 | pub fn aggregate( 54 | self, 55 | group_expr: Vec, 56 | aggr_expr: Vec, 57 | ) -> Self { 58 | let mut group_fields = group_expr 59 | .iter() 60 | .map(|expr| expr.data_field(&self.plan).unwrap()) 61 | .collect::>(); 62 | let mut aggr_fields = aggr_expr 63 | .iter() 64 | .map(|expr| expr.data_field(&self.plan).unwrap()) 65 | .collect::>(); 66 | group_fields.append(&mut aggr_fields); 67 | let schema = NaiveSchema::new(group_fields); 68 | Self { 69 | plan: LogicalPlan::Aggregate(Aggregate { 70 | input: Arc::new(self.plan), 71 | group_expr, 72 | aggr_expr, 73 | schema, 74 | }), 75 | } 76 | } 77 | 78 | pub fn limit(self, n: usize) -> DataFrame { 79 | Self { 80 | plan: LogicalPlan::Limit(Limit { 81 | input: Arc::new(self.plan), 82 | n, 83 | }), 84 | } 85 | } 86 | 87 | pub fn offset(self, n: usize) -> DataFrame { 88 | Self { 89 | plan: LogicalPlan::Offset(Offset { 90 | input: Arc::new(self.plan), 91 | n, 92 | }), 93 | } 94 | } 95 | 96 | pub fn join( 97 | &self, 98 | right: &LogicalPlan, 99 | join_type: JoinType, 100 | join_keys: (Vec, Vec), 101 | ) -> Result { 102 | if join_keys.0.len() != join_keys.1.len() { 103 | return Err(ErrorCode::PlanError( 104 | "left_keys length must be equal to right_keys length".to_string(), 105 | )); 106 | } 107 | 108 | let (left_keys, right_keys) = join_keys; 109 | let on: Vec<(_, _)> = left_keys.into_iter().zip(right_keys.into_iter()).collect(); 110 | 111 | let left_schema = self.plan.schema(); 112 | let join_schema = left_schema.join(right.schema()); 113 | // TODO(ywq) test on it. 114 | if on.is_empty() { 115 | return Ok(Self::new(LogicalPlan::CrossJoin(Join { 116 | left: Arc::new(self.plan.clone()), 117 | right: Arc::new(right.clone()), 118 | on, 119 | join_type, 120 | schema: join_schema, 121 | }))); 122 | } 123 | Ok(Self::new(LogicalPlan::Join(Join { 124 | left: Arc::new(self.plan.clone()), 125 | right: Arc::new(right.clone()), 126 | on, 127 | join_type, 128 | schema: join_schema, 129 | }))) 130 | } 131 | 132 | #[allow(unused)] 133 | pub fn schema(&self) -> &NaiveSchema { 134 | self.plan.schema() 135 | } 136 | 137 | pub fn logical_plan(self) -> LogicalPlan { 138 | self.plan 139 | } 140 | } 141 | 142 | #[cfg(test)] 143 | mod tests { 144 | 145 | use super::*; 146 | use crate::{catalog::Catalog, logical_plan::schema::NaiveField}; 147 | 148 | use crate::error::Result; 149 | use crate::logical_plan::expression::*; 150 | use arrow::datatypes::DataType; 151 | 152 | #[test] 153 | fn create_logical_plan() -> Result<()> { 154 | let schema = NaiveSchema::new(vec![ 155 | NaiveField::new(None, "state", DataType::Int64, true), 156 | NaiveField::new(None, "id", DataType::Int64, true), 157 | NaiveField::new(None, "first_name", DataType::Utf8, true), 158 | NaiveField::new(None, "last_name", DataType::Utf8, true), 159 | NaiveField::new(None, "salary", DataType::Int64, true), 160 | ]); 161 | let mut catalog = Catalog::default(); 162 | catalog.add_empty_table("empty", schema)?; 163 | 164 | let _plan = catalog 165 | .get_table_df("empty")? 
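// Added note (inferred from the builder methods above): each chained call
// wraps the previous plan, so this test builds the tree inside-out as
// Projection -> Filter -> (scan of "empty").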
166 | .filter(LogicalExpr::BinaryExpr(BinaryExpr { 167 | left: Box::new(LogicalExpr::column(None, "state".to_string())), 168 | op: Operator::Eq, 169 | right: Box::new(LogicalExpr::Literal(ScalarValue::Utf8(Some( 170 | "CO".to_string(), 171 | )))), 172 | })) 173 | .project(vec![ 174 | LogicalExpr::column(None, "id".to_string()), 175 | LogicalExpr::column(None, "first_name".to_string()), 176 | LogicalExpr::column(None, "last_name".to_string()), 177 | LogicalExpr::column(None, "state".to_string()), 178 | LogicalExpr::column(None, "salary".to_string()), 179 | ]); 180 | 181 | Ok(()) 182 | } 183 | } 184 | -------------------------------------------------------------------------------- /src/logical_plan/expression.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-12 20:28:35 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | use std::iter::repeat; 8 | 9 | use arrow::array::StringArray; 10 | use arrow::array::{new_null_array, ArrayRef, BooleanArray, Float64Array, Int64Array, UInt64Array}; 11 | 12 | use arrow::datatypes::DataType; 13 | use std::sync::Arc; 14 | 15 | use crate::error::{ErrorCode, Result}; 16 | 17 | use crate::logical_plan::plan::LogicalPlan; 18 | 19 | use super::schema::NaiveField; 20 | 21 | #[derive(Clone, Debug)] 22 | pub enum LogicalExpr { 23 | #[allow(unused)] 24 | /// An expression with a specific name. 25 | Alias(Box, String), 26 | /// A named reference to a qualified filed in a schema. 27 | Column(Column), 28 | /// A constant value. 29 | Literal(ScalarValue), 30 | /// A binary expression such as "age > 21" 31 | BinaryExpr(BinaryExpr), 32 | /// A unary expression such as "-id" 33 | UnaryExpr(UnaryExpr), 34 | #[allow(unused)] 35 | /// Negation of an expression. The expression's type must be a boolean to make sense. 36 | Not(Box), 37 | #[allow(unused)] 38 | /// Casts the expression to a given type and will return a runtime error if the expression cannot be cast. 39 | /// This expression is guaranteed to have a fixed type. 40 | Cast(CastExpr), 41 | #[allow(unused)] 42 | /// Represents the call of an aggregate built-in function with arguments. 43 | AggregateFunction(AggregateFunction), 44 | // Represents a reference to all fields in a schema. 
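// Added example (illustrative; the expansion itself presumably happens in
// the SQL planner, outside this file): `SELECT * FROM t1` parses to
// `LogicalExpr::Wildcard`, and since `data_field` below rejects Wildcard,
// a planner must expand `*` into one `Column` per schema field first.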
45 | Wildcard, 46 | // TODO(veeupup): add more expresssions 47 | } 48 | 49 | impl LogicalExpr { 50 | pub fn column(table: Option, name: String) -> LogicalExpr { 51 | LogicalExpr::Column(Column { table, name }) 52 | } 53 | 54 | /// TODO(veeupup): consider return Vec 55 | pub fn data_field(&self, input: &LogicalPlan) -> Result { 56 | match self { 57 | LogicalExpr::Alias(expr, alias) => { 58 | let field = expr.data_field(input)?; 59 | Ok(NaiveField::new( 60 | None, 61 | alias, 62 | field.data_type().clone(), 63 | field.is_nullable(), 64 | )) 65 | } 66 | LogicalExpr::Column(Column { name, table }) => match table { 67 | Some(table) => input.schema().field_with_qualified_name(table, name), 68 | None => input.schema().field_with_unqualified_name(name), 69 | }, 70 | LogicalExpr::Literal(scalar_val) => Ok(scalar_val.data_field()), 71 | LogicalExpr::BinaryExpr(expr) => expr.data_field(input), 72 | LogicalExpr::Not(expr) => Ok(NaiveField::new( 73 | None, 74 | format!("Not {}", expr.data_field(input)?.name()).as_str(), 75 | DataType::Boolean, 76 | true, 77 | )), 78 | LogicalExpr::Cast(expr) => Ok(NaiveField::new( 79 | None, 80 | expr.data_field(input)?.name(), 81 | expr.data_type.clone(), 82 | true, 83 | )), 84 | LogicalExpr::UnaryExpr(scalar_func) => scalar_func.data_field(input), 85 | LogicalExpr::AggregateFunction(aggr_func) => aggr_func.data_field(input), 86 | LogicalExpr::Wildcard => Err(ErrorCode::IntervalError( 87 | "Wildcard not supported in logical plan".to_string(), 88 | )), 89 | } 90 | } 91 | 92 | pub fn and(self, other: LogicalExpr) -> LogicalExpr { 93 | binary_expr(self, Operator::And, other) 94 | } 95 | 96 | pub fn try_create_scalar_func(func_name: &str, exprs: &[LogicalExpr]) -> Result { 97 | if exprs.len() != 1 { 98 | return Err(ErrorCode::PlanError( 99 | "Scalar Func only has one parameter".to_string(), 100 | )); 101 | } 102 | match func_name { 103 | "abs" => Ok(LogicalExpr::UnaryExpr(UnaryExpr { 104 | func: UnaryOperator::Abs, 105 | arg: Box::new(exprs[0].clone()), 106 | })), 107 | _ => { 108 | return Err(ErrorCode::NoMatchFunction(format!( 109 | "Not match scalar func: {}", 110 | func_name 111 | ))); 112 | } 113 | } 114 | } 115 | 116 | pub fn try_create_aggregate_func( 117 | func_name: &str, 118 | exprs: &[LogicalExpr], 119 | ) -> Result { 120 | if exprs.len() != 1 { 121 | return Err(ErrorCode::PlanError( 122 | "Aggregate Func Now only Support One parameter".to_string(), 123 | )); 124 | } 125 | match func_name { 126 | "count" => Ok(LogicalExpr::AggregateFunction(AggregateFunction { 127 | fun: AggregateFunc::Count, 128 | args: Box::new(exprs[0].clone()), 129 | })), 130 | "sum" => Ok(LogicalExpr::AggregateFunction(AggregateFunction { 131 | fun: AggregateFunc::Sum, 132 | args: Box::new(exprs[0].clone()), 133 | })), 134 | "avg" => Ok(LogicalExpr::AggregateFunction(AggregateFunction { 135 | fun: AggregateFunc::Avg, 136 | args: Box::new(exprs[0].clone()), 137 | })), 138 | "min" => Ok(LogicalExpr::AggregateFunction(AggregateFunction { 139 | fun: AggregateFunc::Min, 140 | args: Box::new(exprs[0].clone()), 141 | })), 142 | "max" => Ok(LogicalExpr::AggregateFunction(AggregateFunction { 143 | fun: AggregateFunc::Max, 144 | args: Box::new(exprs[0].clone()), 145 | })), 146 | _ => { 147 | return Err(ErrorCode::NoMatchFunction(format!( 148 | "Not match aggregate func: {}", 149 | func_name 150 | ))); 151 | } 152 | } 153 | } 154 | } 155 | 156 | /// return a new expression l r 157 | pub fn binary_expr(l: LogicalExpr, op: Operator, r: LogicalExpr) -> LogicalExpr { 158 | 
LogicalExpr::BinaryExpr(BinaryExpr { 159 | left: Box::new(l), 160 | op, 161 | right: Box::new(r), 162 | }) 163 | } 164 | 165 | /// A named reference to a qualified field in a schema. 166 | #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] 167 | pub struct Column { 168 | pub table: Option, 169 | pub name: String, 170 | } 171 | 172 | #[derive(Debug, Clone)] 173 | 174 | pub enum ScalarValue { 175 | /// represents `DataType::Null` (castable to/from any other type) 176 | Null, 177 | /// true or false value 178 | Boolean(Option), 179 | /// 64bit float 180 | Float64(Option), 181 | /// signed 64bit int 182 | Int64(Option), 183 | /// unsigned 64bit int 184 | UInt64(Option), 185 | /// utf-8 encoded string. 186 | Utf8(Option), 187 | } 188 | 189 | macro_rules! build_array_from_option { 190 | ($DATA_TYPE:ident, $ARRAY_TYPE:ident, $EXPR:expr, $SIZE:expr) => {{ 191 | match $EXPR { 192 | Some(value) => Arc::new($ARRAY_TYPE::from_value(value, $SIZE)), 193 | None => new_null_array(&DataType::$DATA_TYPE, $SIZE), 194 | } 195 | }}; 196 | } 197 | 198 | impl ScalarValue { 199 | pub fn data_field(&self) -> NaiveField { 200 | match self { 201 | ScalarValue::Null => NaiveField::new(None, "Null", DataType::Null, true), 202 | ScalarValue::Boolean(_) => NaiveField::new(None, "bool", DataType::Boolean, true), 203 | ScalarValue::Float64(_) => NaiveField::new(None, "f64", DataType::Float64, true), 204 | ScalarValue::Int64(_) => NaiveField::new(None, "i64", DataType::Int64, true), 205 | ScalarValue::UInt64(_) => NaiveField::new(None, "u64", DataType::UInt64, true), 206 | ScalarValue::Utf8(_) => NaiveField::new(None, "string", DataType::Utf8, true), 207 | } 208 | } 209 | 210 | pub fn into_array(self, size: usize) -> ArrayRef { 211 | match self { 212 | ScalarValue::Null => new_null_array(&DataType::Null, size), 213 | ScalarValue::Boolean(e) => Arc::new(BooleanArray::from(vec![e; size])) as ArrayRef, 214 | ScalarValue::Float64(e) => build_array_from_option!(Float64, Float64Array, e, size), 215 | ScalarValue::Int64(e) => build_array_from_option!(Int64, Int64Array, e, size), 216 | ScalarValue::UInt64(e) => build_array_from_option!(UInt64, UInt64Array, e, size), 217 | ScalarValue::Utf8(e) => match e { 218 | Some(value) => Arc::new(StringArray::from_iter_values(repeat(value).take(size))), 219 | None => new_null_array(&DataType::Utf8, size), 220 | }, 221 | } 222 | } 223 | } 224 | 225 | #[derive(Debug, Clone)] 226 | pub struct BinaryExpr { 227 | /// Left-hand side of the expression 228 | pub left: Box, 229 | /// The comparison operator 230 | pub op: Operator, 231 | /// Right-hand side of the expression 232 | pub right: Box, 233 | } 234 | 235 | impl BinaryExpr { 236 | pub fn data_field(&self, input: &LogicalPlan) -> Result { 237 | let left = self.left.data_field(input)?; 238 | let left = left.name(); 239 | let right = match &*self.right { 240 | LogicalExpr::Literal(scalar_val) => match scalar_val { 241 | ScalarValue::Boolean(Some(val)) => val.to_string(), 242 | ScalarValue::Int64(Some(val)) => val.to_string(), 243 | ScalarValue::UInt64(Some(val)) => val.to_string(), 244 | ScalarValue::Float64(Some(val)) => val.to_string(), 245 | ScalarValue::Utf8(Some(val)) => val.to_string(), 246 | _ => "null".to_string(), 247 | }, 248 | _ => self.right.data_field(input)?.name().clone(), 249 | }; 250 | let field = match self.op { 251 | Operator::Eq => NaiveField::new( 252 | None, 253 | format!("{} = {}", left, right).as_str(), 254 | DataType::Boolean, 255 | true, 256 | ), 257 | Operator::NotEq => NaiveField::new( 258 | None, 259 | 
format!("{} != {}", left, right).as_str(), 260 | DataType::Boolean, 261 | true, 262 | ), 263 | Operator::Lt => NaiveField::new( 264 | None, 265 | format!("{} < {}", left, right).as_str(), 266 | DataType::Boolean, 267 | true, 268 | ), 269 | Operator::LtEq => NaiveField::new( 270 | None, 271 | format!("{} <= {}", left, right).as_str(), 272 | DataType::Boolean, 273 | true, 274 | ), 275 | Operator::Gt => NaiveField::new( 276 | None, 277 | format!("{} > {}", left, right).as_str(), 278 | DataType::Boolean, 279 | true, 280 | ), 281 | Operator::GtEq => NaiveField::new( 282 | None, 283 | format!("{} >= {}", left, right).as_str(), 284 | DataType::Boolean, 285 | true, 286 | ), 287 | Operator::Plus => NaiveField::new( 288 | None, 289 | format!("{} + {}", left, right).as_str(), 290 | self.left.data_field(input)?.data_type().clone(), 291 | true, 292 | ), 293 | Operator::Minus => NaiveField::new( 294 | None, 295 | format!("{} - {}", left, right).as_str(), 296 | self.left.data_field(input)?.data_type().clone(), 297 | true, 298 | ), 299 | Operator::Multiply => NaiveField::new( 300 | None, 301 | format!("{} * {}", left, right).as_str(), 302 | self.left.data_field(input)?.data_type().clone(), 303 | true, 304 | ), 305 | Operator::Divide => NaiveField::new( 306 | None, 307 | format!("{} / {}", left, right).as_str(), 308 | self.left.data_field(input)?.data_type().clone(), 309 | true, 310 | ), 311 | Operator::Modulos => NaiveField::new( 312 | None, 313 | format!("{} % {}", left, right).as_str(), 314 | self.left.data_field(input)?.data_type().clone(), 315 | true, 316 | ), 317 | Operator::And => NaiveField::new( 318 | None, 319 | format!("{} and {}", left, right).as_str(), 320 | DataType::Boolean, 321 | true, 322 | ), 323 | Operator::Or => NaiveField::new( 324 | None, 325 | format!("{} or {}", left, right).as_str(), 326 | DataType::Boolean, 327 | true, 328 | ), 329 | }; 330 | Ok(field) 331 | } 332 | } 333 | 334 | #[derive(Debug, Clone)] 335 | pub enum Operator { 336 | /// Expressions are equal 337 | Eq, 338 | /// Expressions are not equal 339 | NotEq, 340 | /// Left side is smaller than right side 341 | Lt, 342 | /// Left side is smaller or equal to right side 343 | LtEq, 344 | /// Left side is greater than right side 345 | Gt, 346 | /// Left side is greater or equal to right side 347 | GtEq, 348 | /// Addition 349 | Plus, 350 | /// Subtraction 351 | Minus, 352 | /// Multiplication operator, like `*` 353 | Multiply, 354 | /// Division operator, like `/` 355 | Divide, 356 | /// Remainder operator, like `%` 357 | Modulos, 358 | /// Logical AND, like `&&` 359 | And, 360 | /// Logical OR, like `||` 361 | Or, 362 | } 363 | 364 | #[derive(Debug, Clone)] 365 | pub struct UnaryExpr { 366 | /// The function 367 | pub func: UnaryOperator, 368 | /// List of expressions to feed to the functions as arguments 369 | /// TODO(veeupup): we should check the args' type and nums 370 | pub arg: Box, 371 | } 372 | 373 | impl UnaryExpr { 374 | pub fn data_field(&self, input: &LogicalPlan) -> Result { 375 | // TODO(veeupup): we should make unary func more specific and should check if valid before creating them 376 | let field = self.arg.data_field(input)?; 377 | // TODO(ywq): add more exprs 378 | let field = match self.func { 379 | UnaryOperator::Abs => NaiveField::new( 380 | None, 381 | format!("abs({})", field.name()).as_str(), 382 | DataType::Int64, 383 | true, 384 | ), 385 | _ => unimplemented!(), 386 | }; 387 | Ok(field) 388 | } 389 | } 390 | 391 | #[derive(Debug, Clone)] 392 | pub enum UnaryOperator { 393 | // Math functions 394 | 
Abs, 395 | #[allow(unused)] 396 | Sin, 397 | #[allow(unused)] 398 | Cos, 399 | #[allow(unused)] 400 | Tan, 401 | // String functions 402 | #[allow(unused)] 403 | Trim, 404 | #[allow(unused)] 405 | LTrim, 406 | #[allow(unused)] 407 | RTrim, 408 | #[allow(unused)] 409 | CharacterLength, 410 | #[allow(unused)] 411 | Lower, 412 | #[allow(unused)] 413 | Upper, 414 | #[allow(unused)] 415 | Repeat, 416 | #[allow(unused)] 417 | Replace, 418 | #[allow(unused)] 419 | Reverse, 420 | #[allow(unused)] 421 | Substr, 422 | } 423 | 424 | #[derive(Debug, Clone)] 425 | pub struct CastExpr { 426 | /// The expression being cast 427 | pub expr: Box, 428 | /// The `DataType` the expression will yield 429 | pub data_type: DataType, 430 | } 431 | 432 | impl CastExpr { 433 | pub fn data_field(&self, input: &LogicalPlan) -> Result { 434 | Ok(NaiveField::new( 435 | None, 436 | self.expr.data_field(input)?.name(), 437 | self.data_type.clone(), 438 | true, 439 | )) 440 | } 441 | } 442 | 443 | #[derive(Debug, Clone)] 444 | pub struct AggregateFunction { 445 | /// Name of the function 446 | pub fun: AggregateFunc, 447 | /// List of expressions to feed to the functions as arguments 448 | pub args: Box, 449 | } 450 | 451 | impl AggregateFunction { 452 | pub fn data_field(&self, input: &LogicalPlan) -> Result { 453 | let dt = self.args.data_field(input)?; 454 | let field = match self.fun { 455 | AggregateFunc::Count => NaiveField::new( 456 | None, 457 | format!("count({})", dt.name()).as_str(), 458 | dt.data_type().clone(), 459 | true, 460 | ), 461 | AggregateFunc::Sum => NaiveField::new( 462 | None, 463 | format!("sum({})", dt.name()).as_str(), 464 | dt.data_type().clone(), 465 | true, 466 | ), 467 | AggregateFunc::Min => NaiveField::new( 468 | None, 469 | format!("min({})", dt.name()).as_str(), 470 | dt.data_type().clone(), 471 | true, 472 | ), 473 | AggregateFunc::Max => NaiveField::new( 474 | None, 475 | format!("max({})", dt.name()).as_str(), 476 | dt.data_type().clone(), 477 | true, 478 | ), 479 | AggregateFunc::Avg => NaiveField::new( 480 | None, 481 | format!("avg({})", dt.name()).as_str(), 482 | dt.data_type().clone(), 483 | true, 484 | ), 485 | }; 486 | Ok(field) 487 | } 488 | } 489 | 490 | #[derive(Debug, Clone)] 491 | pub enum AggregateFunc { 492 | #[allow(unused)] 493 | Count, 494 | #[allow(unused)] 495 | Sum, 496 | #[allow(unused)] 497 | Min, 498 | #[allow(unused)] 499 | Max, 500 | #[allow(unused)] 501 | Avg, 502 | } 503 | -------------------------------------------------------------------------------- /src/logical_plan/literal.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-16 23:36:56 4 | * @Last Modified by: Veeupup 5 | * @Last Modified time: 2022-05-16 23:45:02 6 | */ 7 | 8 | use super::expression::{LogicalExpr, ScalarValue}; 9 | 10 | pub fn lit(n: T) -> LogicalExpr { 11 | n.lit() 12 | } 13 | 14 | pub trait Literal { 15 | fn lit(&self) -> LogicalExpr; 16 | } 17 | 18 | impl Literal for String { 19 | fn lit(&self) -> LogicalExpr { 20 | LogicalExpr::Literal(ScalarValue::Utf8(Some(self.clone()))) 21 | } 22 | } 23 | 24 | impl Literal for &str { 25 | fn lit(&self) -> LogicalExpr { 26 | LogicalExpr::Literal(ScalarValue::Utf8(Some((*self).to_owned()))) 27 | } 28 | } 29 | 30 | macro_rules! 
impl_literal { 31 | ($TYPE: ty, $SCALAR: ident) => { 32 | impl Literal for $TYPE { 33 | fn lit(&self) -> LogicalExpr { 34 | LogicalExpr::Literal(ScalarValue::$SCALAR(Some(self.clone()))) 35 | } 36 | } 37 | }; 38 | } 39 | 40 | impl_literal!(bool, Boolean); 41 | impl_literal!(i64, Int64); 42 | impl_literal!(u64, UInt64); 43 | impl_literal!(f64, Float64); 44 | -------------------------------------------------------------------------------- /src/logical_plan/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-12 20:15:59 4 | * @Email: code@tanweime.com 5 | * 6 | * A logical plan represents a relation (a set of tuples) with a known schema. Each logical plan can 7 | * have zero or more logical plans as inputs. It is convenient for a logical plan to expose its child plans 8 | * so that a visitor pattern can be used to walk through the plan. 9 | * 10 | */ 11 | 12 | mod dataframe; 13 | pub mod expression; 14 | pub mod literal; 15 | pub mod plan; 16 | pub mod schema; 17 | 18 | pub use dataframe::DataFrame; 19 | -------------------------------------------------------------------------------- /src/logical_plan/plan.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-13 14:09:04 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | use crate::datasource::TableRef; 8 | use crate::logical_plan::expression::{Column, LogicalExpr}; 9 | 10 | use std::fmt::{Debug, Display, Formatter, Result}; 11 | use std::sync::Arc; 12 | 13 | use super::expression::AggregateFunction; 14 | use super::schema::NaiveSchema; 15 | 16 | #[derive(Clone)] 17 | pub enum LogicalPlan { 18 | /// Evaluates an arbitrary list of expressions (essentially a 19 | /// SELECT with an expression list) on its input. 20 | Projection(Projection), 21 | 22 | /// Filters rows from its input that do not match an 23 | /// expression (essentially a WHERE clause with a predicate 24 | /// expression). 25 | /// 26 | /// Semantically, `` is evaluated for each row of the input; 27 | /// If the value of `` is true, the input row is passed to 28 | /// the output. If the value of `` is false, the row is 29 | /// discarded. 30 | Filter(Filter), 31 | 32 | #[allow(unused)] 33 | /// Aggregates its input based on a set of grouping and aggregate 34 | /// expressions (e.g. SUM). 35 | Aggregate(Aggregate), 36 | 37 | /// Join two logical plans on one or more join columns 38 | Join(Join), 39 | 40 | CrossJoin(Join), 41 | 42 | /// Produces the first `n` tuples from its input and discards the rest. 43 | Limit(Limit), 44 | 45 | /// Adjusts the starting point at which the rest of the expressions begin to effect. 46 | Offset(Offset), 47 | 48 | /// Produces rows from a table provider by reference or from the context 49 | TableScan(TableScan), 50 | } 51 | 52 | impl LogicalPlan { 53 | pub fn schema(&self) -> &NaiveSchema { 54 | match self { 55 | LogicalPlan::Projection(Projection { schema, .. }) => schema, 56 | LogicalPlan::Filter(Filter { input, .. }) => input.schema(), 57 | LogicalPlan::Aggregate(Aggregate { schema, .. }) => schema, 58 | LogicalPlan::Join(Join { schema, .. }) => schema, 59 | LogicalPlan::Limit(Limit { input, .. }) => input.schema(), 60 | LogicalPlan::Offset(Offset { input, .. }) => input.schema(), 61 | LogicalPlan::TableScan(TableScan { source, .. }) => source.schema(), 62 | LogicalPlan::CrossJoin(Join { schema, .. 
}) => schema, 63 | } 64 | } 65 | 66 | #[allow(unused)] 67 | pub fn children(&self) -> Vec> { 68 | match self { 69 | LogicalPlan::Projection(Projection { input, .. }) => vec![input.clone()], 70 | LogicalPlan::Filter(Filter { input, .. }) => vec![input.clone()], 71 | LogicalPlan::Aggregate(Aggregate { input, .. }) => vec![input.clone()], 72 | LogicalPlan::Join(Join { left, right, .. }) => vec![left.clone(), right.clone()], 73 | LogicalPlan::Limit(Limit { input, .. }) => vec![input.clone()], 74 | LogicalPlan::Offset(Offset { input, .. }) => vec![input.clone()], 75 | LogicalPlan::TableScan(_) => vec![], 76 | LogicalPlan::CrossJoin(Join { left, right, .. }) => vec![left.clone(), right.clone()], 77 | } 78 | } 79 | } 80 | 81 | impl Display for LogicalPlan { 82 | fn fmt(&self, f: &mut Formatter<'_>) -> Result { 83 | Debug::fmt(&self, f) 84 | } 85 | } 86 | 87 | impl Debug for LogicalPlan { 88 | fn fmt(&self, f: &mut Formatter<'_>) -> Result { 89 | do_pretty_print(self, f, 0) 90 | } 91 | } 92 | 93 | #[derive(Debug, Clone)] 94 | pub struct Projection { 95 | /// The list of expressions 96 | pub exprs: Vec, 97 | /// The incoming logical plan 98 | pub input: Arc, 99 | /// The schema description of the output 100 | pub schema: NaiveSchema, 101 | } 102 | 103 | #[derive(Debug, Clone)] 104 | pub struct Filter { 105 | /// The predicate expression, which must have Boolean type. 106 | pub predicate: LogicalExpr, 107 | /// The incoming logical plan 108 | pub input: Arc, 109 | } 110 | 111 | #[derive(Debug, Clone)] 112 | pub struct TableScan { 113 | /// The source of the table 114 | pub source: TableRef, 115 | /// Optional column indices to use as a projection 116 | pub projection: Option>, 117 | } 118 | 119 | /// Aggregates its input based on a set of grouping and aggregate 120 | /// expressions (e.g. SUM). 121 | #[derive(Debug, Clone)] 122 | pub struct Aggregate { 123 | /// The incoming logical plan 124 | pub input: Arc, 125 | /// Grouping expressions 126 | pub group_expr: Vec, 127 | /// Aggregate expressions 128 | pub aggr_expr: Vec, 129 | /// The schema description of the aggregate output 130 | pub schema: NaiveSchema, 131 | } 132 | 133 | #[derive(Debug, Clone, Copy, Eq, PartialEq)] 134 | pub enum JoinType { 135 | Inner, 136 | Left, 137 | Right, 138 | Cross, 139 | } 140 | 141 | /// Join two logical plans on one or more join columns 142 | #[derive(Debug, Clone)] 143 | pub struct Join { 144 | /// Left input 145 | pub left: Arc, 146 | /// Right input 147 | pub right: Arc, 148 | /// Equijoin clause expressed as pairs of (left, right) join columns, cross join don't have on conditions 149 | pub on: Vec<(Column, Column)>, 150 | /// Join type 151 | pub join_type: JoinType, 152 | /// The output schema, containing fields from the left and right inputs 153 | pub schema: NaiveSchema, 154 | } 155 | 156 | /// Produces the first `n` tuples from its input and discards the rest. 157 | #[derive(Debug, Clone)] 158 | pub struct Limit { 159 | /// The limit 160 | pub n: usize, 161 | /// The logical plan 162 | pub input: Arc, 163 | } 164 | 165 | /// Adjusts the starting point at which the rest of the expressions begin to effect. 166 | #[derive(Debug, Clone)] 167 | pub struct Offset { 168 | /// The offset. 169 | pub n: usize, 170 | /// The logical plan. 
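// Added example (hypothetical query and plan shape, not from the source):
// for `... LIMIT 3 OFFSET 2`, an Offset { n: 2 } node would skip the first
// two input rows and a Limit { n: 3 } above it would keep at most three
// of the remaining ones.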
171 | pub input: Arc, 172 | } 173 | 174 | fn do_pretty_print(plan: &LogicalPlan, f: &mut Formatter<'_>, depth: usize) -> Result { 175 | write!(f, "{}", " ".repeat(depth))?; 176 | 177 | match plan { 178 | LogicalPlan::Projection(Projection { 179 | exprs, 180 | input, 181 | schema, 182 | }) => { 183 | writeln!(f, "Projection:")?; 184 | 185 | write!(f, "{}", " ".repeat(depth + 1))?; 186 | writeln!(f, "exprs: {:?}", exprs)?; 187 | 188 | write!(f, "{}", " ".repeat(depth + 1))?; 189 | writeln!(f, "input:")?; 190 | do_pretty_print(input.as_ref(), f, depth + 2)?; 191 | 192 | write!(f, "{}", " ".repeat(depth + 1))?; 193 | writeln!(f, "schema: {:?}", schema) 194 | } 195 | LogicalPlan::Filter(Filter { predicate, input }) => { 196 | writeln!(f, "Filter:")?; 197 | 198 | write!(f, "{}", " ".repeat(depth + 1))?; 199 | writeln!(f, "predicate: {:?}", predicate)?; 200 | 201 | write!(f, "{}", " ".repeat(depth + 1))?; 202 | writeln!(f, "input:")?; 203 | do_pretty_print(input.as_ref(), f, depth + 2) 204 | } 205 | LogicalPlan::Aggregate(Aggregate { 206 | input, 207 | group_expr, 208 | aggr_expr, 209 | schema, 210 | }) => { 211 | writeln!(f, "Aggregate:")?; 212 | 213 | write!(f, "{}", " ".repeat(depth + 1))?; 214 | writeln!(f, "input:")?; 215 | do_pretty_print(input.as_ref(), f, depth + 2)?; 216 | 217 | write!(f, "{}", " ".repeat(depth + 1))?; 218 | writeln!(f, "group_expr: {:?}", group_expr)?; 219 | 220 | write!(f, "{}", " ".repeat(depth + 1))?; 221 | writeln!(f, "aggr_expr: {:?}", aggr_expr)?; 222 | 223 | write!(f, "{}", " ".repeat(depth + 1))?; 224 | writeln!(f, "schema: {:?}", schema) 225 | } 226 | LogicalPlan::Join(Join { 227 | left, 228 | right, 229 | on, 230 | join_type, 231 | schema, 232 | }) => { 233 | writeln!(f, "Join:")?; 234 | 235 | write!(f, "{}", " ".repeat(depth + 1))?; 236 | writeln!(f, "left:")?; 237 | do_pretty_print(left.as_ref(), f, depth + 2)?; 238 | 239 | write!(f, "{}", " ".repeat(depth + 1))?; 240 | writeln!(f, "right:")?; 241 | do_pretty_print(right.as_ref(), f, depth + 2)?; 242 | 243 | write!(f, "{}", " ".repeat(depth + 1))?; 244 | writeln!(f, "on: {:?}", on)?; 245 | 246 | write!(f, "{}", " ".repeat(depth + 1))?; 247 | writeln!(f, "join_type: {:?}", join_type)?; 248 | 249 | write!(f, "{}", " ".repeat(depth + 1))?; 250 | writeln!(f, "schema: {:?}", schema) 251 | } 252 | LogicalPlan::Limit(Limit { n, input }) => { 253 | writeln!(f, "Limit:")?; 254 | 255 | write!(f, "{}", " ".repeat(depth + 1))?; 256 | writeln!(f, "n: {}", n)?; 257 | 258 | write!(f, "{}", " ".repeat(depth + 1))?; 259 | writeln!(f, "input:")?; 260 | do_pretty_print(input.as_ref(), f, depth + 2) 261 | } 262 | LogicalPlan::Offset(Offset { n, input }) => { 263 | writeln!(f, "Offset:")?; 264 | 265 | write!(f, "{}", " ".repeat(depth + 1))?; 266 | writeln!(f, "n: {}", n)?; 267 | 268 | write!(f, "{}", " ".repeat(depth + 1))?; 269 | writeln!(f, "input:")?; 270 | do_pretty_print(input.as_ref(), f, depth + 2) 271 | } 272 | LogicalPlan::TableScan(TableScan { source, projection }) => { 273 | writeln!(f, "TableScan:")?; 274 | 275 | write!(f, "{}", " ".repeat(depth + 1))?; 276 | writeln!(f, "source: {:?}", source.source_name())?; 277 | 278 | write!(f, "{}", " ".repeat(depth + 1))?; 279 | writeln!(f, "projection: {:?}", projection) 280 | } 281 | LogicalPlan::CrossJoin(Join { 282 | left, 283 | right, 284 | on: _, 285 | join_type, 286 | schema, 287 | }) => { 288 | writeln!(f, "Join:")?; 289 | 290 | write!(f, "{}", " ".repeat(depth + 1))?; 291 | writeln!(f, "left:")?; 292 | do_pretty_print(left.as_ref(), f, depth + 2)?; 293 | 294 | 
write!(f, "{}", " ".repeat(depth + 1))?; 295 | writeln!(f, "right:")?; 296 | do_pretty_print(right.as_ref(), f, depth + 2)?; 297 | 298 | write!(f, "{}", " ".repeat(depth + 1))?; 299 | writeln!(f, "join_type: {:?}", join_type)?; 300 | 301 | write!(f, "{}", " ".repeat(depth + 1))?; 302 | writeln!(f, "schema: {:?}", schema) 303 | } 304 | } 305 | } 306 | 307 | #[cfg(test)] 308 | mod tests { 309 | use super::*; 310 | use crate::datasource::EmptyTable; 311 | 312 | use crate::error::Result; 313 | use crate::logical_plan::expression::*; 314 | 315 | /// Create LogicalPlan 316 | #[test] 317 | fn create_logical_plan() -> Result<()> { 318 | let schema = NaiveSchema::empty(); 319 | let source = EmptyTable::try_create(schema)?; 320 | 321 | let scan = LogicalPlan::TableScan(TableScan { 322 | source, 323 | projection: None, 324 | }); 325 | 326 | let filter_expr = LogicalExpr::BinaryExpr(BinaryExpr { 327 | left: Box::new(LogicalExpr::column(None, "state".to_string())), 328 | op: Operator::Eq, 329 | right: Box::new(LogicalExpr::Literal(ScalarValue::Utf8(Some( 330 | "CO".to_string(), 331 | )))), 332 | }); 333 | 334 | let _selection = LogicalPlan::Filter(Filter { 335 | predicate: filter_expr, 336 | input: Arc::new(scan), 337 | }); 338 | 339 | let _projection = vec![ 340 | LogicalExpr::column(None, "id".to_string()), 341 | LogicalExpr::column(None, "first_name".to_string()), 342 | LogicalExpr::column(None, "last_name".to_string()), 343 | LogicalExpr::column(None, "state".to_string()), 344 | LogicalExpr::column(None, "salary".to_string()), 345 | ]; 346 | 347 | Ok(()) 348 | } 349 | 350 | #[test] 351 | fn print_logical_plan() { 352 | let schema = NaiveSchema::empty(); 353 | let source = EmptyTable::try_create(schema.clone()).unwrap(); 354 | 355 | let scan = LogicalPlan::TableScan(TableScan { 356 | source, 357 | projection: None, 358 | }); 359 | 360 | assert_eq!( 361 | "TableScan:\ 362 | \n source: \"EmptyTable\"\ 363 | \n projection: None\n", 364 | format!("{}", scan) 365 | ); 366 | 367 | let scan = Arc::new(scan); 368 | 369 | let limit = LogicalPlan::Limit(Limit { 370 | n: 233, 371 | input: scan.clone(), 372 | }); 373 | 374 | assert_eq!( 375 | "Limit:\ 376 | \n n: 233\ 377 | \n input:\ 378 | \n TableScan:\ 379 | \n source: \"EmptyTable\"\ 380 | \n projection: None\n", 381 | format!("{}", limit) 382 | ); 383 | 384 | let join = LogicalPlan::Join(Join { 385 | left: scan.clone(), 386 | right: scan, 387 | on: vec![], 388 | join_type: JoinType::Inner, 389 | schema, 390 | }); 391 | 392 | assert_eq!( 393 | "Join:\ 394 | \n left:\ 395 | \n TableScan:\ 396 | \n source: \"EmptyTable\"\ 397 | \n projection: None\ 398 | \n right:\ 399 | \n TableScan:\ 400 | \n source: \"EmptyTable\"\ 401 | \n projection: None\ 402 | \n on: []\ 403 | \n join_type: Inner\ 404 | \n schema: NaiveSchema { fields: [] }\n", 405 | format!("{}", join) 406 | ); 407 | } 408 | } 409 | -------------------------------------------------------------------------------- /src/logical_plan/schema.rs: -------------------------------------------------------------------------------- 1 | // Licensed to the Apache Software Foundation (ASF) under one 2 | // or more contributor license agreements. See the NOTICE file 3 | // distributed with this work for additional information 4 | // regarding copyright ownership. The ASF licenses this file 5 | // to you under the Apache License, Version 2.0 (the 6 | // "License"); you may not use this file except in compliance 7 | // with the License. 
You may obtain a copy of the License at
8 | //
9 | //   http://www.apache.org/licenses/LICENSE-2.0
10 | //
11 | // Unless required by applicable law or agreed to in writing,
12 | // software distributed under the License is distributed on an
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14 | // KIND, either express or implied. See the License for the
15 | // specific language governing permissions and limitations
16 | // under the License.
17 | 
18 | /*
19 |  * @Author: Veeupup
20 |  * @Date: 2022-05-18 13:45:10
21 |  * @Last Modified by: Veeupup
22 |  * @Last Modified time: 2022-05-18 17:30:21
23 |  *
24 |  * An Arrow Field does not carry a table/relation name as one of its properties,
25 |  * so we need our own schema type that pairs the inner schema with a table name.
26 |  *
27 |  * Code ideas come from https://github.com/apache/arrow-datafusion/
28 |  *
29 |  */
30 | 
31 | use arrow::datatypes::{DataType, SchemaRef};
32 | use arrow::datatypes::{Field, Schema};
33 | 
34 | use crate::error::ErrorCode;
35 | use crate::error::Result;
36 | 
37 | #[derive(Debug, Clone)]
38 | pub struct NaiveSchema {
39 |     pub fields: Vec<NaiveField>,
40 | }
41 | 
42 | impl NaiveSchema {
43 |     #[allow(unused)]
44 |     pub fn empty() -> Self {
45 |         Self { fields: vec![] }
46 |     }
47 | 
48 |     pub fn new(fields: Vec<NaiveField>) -> Self {
49 |         // TODO(veeupup): check if we have duplicated name field
50 |         Self { fields }
51 |     }
52 | 
53 |     #[allow(unused)]
54 |     pub fn from_qualified(qualifier: &str, schema: &Schema) -> Self {
55 |         Self::new(
56 |             schema
57 |                 .fields()
58 |                 .iter()
59 |                 .map(|field| NaiveField {
60 |                     field: field.clone(),
61 |                     qualifier: Some(qualifier.to_owned()),
62 |                 })
63 |                 .collect(),
64 |         )
65 |     }
66 | 
67 |     pub fn from_unqualified(schema: &Schema) -> Self {
68 |         Self::new(
69 |             schema
70 |                 .fields()
71 |                 .iter()
72 |                 .map(|field| NaiveField {
73 |                     field: field.clone(),
74 |                     qualifier: None,
75 |                 })
76 |                 .collect(),
77 |         )
78 |     }
79 | 
80 |     /// join two schemas
81 |     pub fn join(&self, schema: &NaiveSchema) -> Self {
82 |         let mut fields = self.fields.clone();
83 |         fields.extend_from_slice(schema.fields().as_slice());
84 |         Self::new(fields)
85 |     }
86 | 
87 |     pub fn fields(&self) -> &Vec<NaiveField> {
88 |         &self.fields
89 |     }
90 | 
91 |     #[allow(unused)]
92 |     pub fn field(&self, i: usize) -> &NaiveField {
93 |         &self.fields[i]
94 |     }
95 | 
96 |     #[allow(unused)]
97 |     pub fn index_of(&self, name: &str) -> Result<usize> {
98 |         for i in 0..self.fields().len() {
99 |             if self.fields[i].name() == name {
100 |                 return Ok(i);
101 |             }
102 |         }
103 |         Err(ErrorCode::NoSuchField)
104 |     }
105 | 
106 |     #[allow(unused)]
107 |     /// Find the field with the given name
108 |     pub fn field_with_name(&self, relation_name: Option<&str>, name: &str) -> Result<NaiveField> {
109 |         if let Some(relation_name) = relation_name {
110 |             self.field_with_qualified_name(relation_name, name)
111 |         } else {
112 |             self.field_with_unqualified_name(name)
113 |         }
114 |     }
115 | 
116 |     pub fn field_with_unqualified_name(&self, name: &str) -> Result<NaiveField> {
117 |         let matches = self
118 |             .fields
119 |             .iter()
120 |             .filter(|field| field.name() == name)
121 |             .collect::<Vec<_>>();
122 |         match matches.len() {
123 |             0 => Err(ErrorCode::PlanError(format!("No field named '{}'", name))),
124 |             _ => Ok(matches[0].to_owned()),
125 |             // TODO(veeupup): if several fields share the same name, we should return an error
126 |             // _ => Err(ErrorCode::PlanError(format!(
127 |             //     "Ambiguous reference to field named '{}'",
128 |             //     name
129 |             // ))),
130 |         }
131 |     }
132 | 
133 |     pub fn field_with_qualified_name(&self, relation_name: &str, name: &str) -> Result<NaiveField> {
134 |         let matches = self
135 |             .fields
136 |             .iter()
137 |             .filter(|field| {
138 |                 field.qualifier == Some(relation_name.to_owned()) && field.name() == name
139 |             })
140 |             .collect::<Vec<_>>();
141 |         match matches.len() {
142 |             0 => Err(ErrorCode::PlanError(format!("No field named '{}'", name))),
143 |             _ => Ok(matches[0].to_owned()),
144 |             // TODO(veeupup): if several fields share the same name, we should return an error
145 |             // _ => Err(ErrorCode::PlanError(format!(
146 |             //     "Ambiguous reference to field named '{}'",
147 |             //     name
148 |             // ))),
149 |         }
150 |     }
151 | }
152 | 
153 | impl From<NaiveSchema> for Schema {
154 |     fn from(schema: NaiveSchema) -> Self {
155 |         Schema::new(
156 |             schema
157 |                 .fields
158 |                 .into_iter()
159 |                 .map(|f| {
160 |                     if f.qualifier().is_some() {
161 |                         Field::new(
162 |                             f.qualified_name().as_str(),
163 |                             f.data_type().to_owned(),
164 |                             f.is_nullable(),
165 |                         )
166 |                     } else {
167 |                         f.field
168 |                     }
169 |                 })
170 |                 .collect(),
171 |         )
172 |     }
173 | }
174 | 
175 | // impl Into<Schema> for NaiveSchema {
176 | //     /// Convert a schema into a DFSchema
177 | //     fn into(self) -> Schema {
178 | //         Schema::new(
179 | //             self.fields
180 | //                 .into_iter()
181 | //                 .map(|f| {
182 | //                     if f.qualifier().is_some() {
183 | //                         Field::new(
184 | //                             f.qualified_name().as_str(),
185 | //                             f.data_type().to_owned(),
186 | //                             f.is_nullable(),
187 | //                         )
188 | //                     } else {
189 | //                         f.field
190 | //                     }
191 | //                 })
192 | //                 .collect(),
193 | //         )
194 | //     }
195 | // }
196 | 
197 | impl From<NaiveSchema> for SchemaRef {
198 |     fn from(schema: NaiveSchema) -> Self {
199 |         SchemaRef::new(schema.into())
200 |     }
201 | }
202 | 
203 | // impl Into<SchemaRef> for NaiveSchema {
204 | //     fn into(self) -> SchemaRef {
205 | //         SchemaRef::new(self.into())
206 | //     }
207 | // }
208 | 
209 | /// NaiveField wraps an Arrow field and adds an optional qualifier
210 | #[derive(Debug, Clone, PartialEq, Eq)]
211 | pub struct NaiveField {
212 |     /// Optional qualifier (usually a table or relation name)
213 |     qualifier: Option<String>,
214 |     /// Arrow field definition
215 |     field: Field,
216 | }
217 | 
218 | impl NaiveField {
219 |     pub fn new(qualifier: Option<&str>, name: &str, data_type: DataType, nullable: bool) -> Self {
220 |         Self {
221 |             qualifier: qualifier.map(|s| s.to_owned()),
222 |             field: Field::new(name, data_type, nullable),
223 |         }
224 |     }
225 | 
226 |     #[allow(unused)]
227 |     pub fn from(field: Field) -> Self {
228 |         Self {
229 |             qualifier: None,
230 |             field,
231 |         }
232 |     }
233 | 
234 |     #[allow(unused)]
235 |     pub fn from_qualified(qualifier: &str, field: Field) -> Self {
236 |         Self {
237 |             qualifier: Some(qualifier.to_owned()),
238 |             field,
239 |         }
240 |     }
241 | 
242 |     pub fn name(&self) -> &String {
243 |         self.field.name()
244 |     }
245 | 
246 |     /// Returns an immutable reference to the `NaiveField`'s data-type
247 |     pub fn data_type(&self) -> &DataType {
248 |         self.field.data_type()
249 |     }
250 | 
251 |     /// Indicates whether this `NaiveField` supports null values
252 |     pub fn is_nullable(&self) -> bool {
253 |         self.field.is_nullable()
254 |     }
255 | 
256 |     /// Returns the `NaiveField`'s qualified name
257 |     pub fn qualified_name(&self) -> String {
258 |         if let Some(relation_name) = &self.qualifier {
259 |             format!("{}.{}", relation_name, self.field.name())
260 |         } else {
261 |             self.field.name().to_owned()
262 |         }
263 |     }
264 | 
265 |     /// Get the optional qualifier
266 |     pub fn qualifier(&self) -> Option<&String> {
267 |         self.qualifier.as_ref()
268 |     }
269 | }
270 | 
271 | impl From<NaiveField> for Field {
272 |     fn from(field: NaiveField) -> Self {
273 |         Field::new(field.name(), field.data_type().clone(), field.is_nullable())
274 |     }
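// Worked example (values borrowed from the MemTable test in memory.rs):
// NaiveField::new(Some("t1"), "c", DataType::Int32, false).qualified_name()
// returns "t1.c", which is why projected batches expose column names like
// "t1.c" / "t1.b" after the NaiveSchema -> Schema conversion above flattens
// each qualifier into the field name.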
275 | } 276 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use naive_db::print_result; 2 | use naive_db::CsvConfig; 3 | use naive_db::NaiveDB; 4 | use naive_db::Result; 5 | 6 | fn main() -> Result<()> { 7 | let mut db = NaiveDB::default(); 8 | 9 | db.create_csv_table("t1", "data/test_data.csv", CsvConfig::default())?; 10 | 11 | // select 12 | let ret = db.run_sql("select id, name, age + 100 from t1 where id < 9 limit 3 offset 2")?; 13 | print_result(&ret)?; 14 | 15 | // Join 16 | db.create_csv_table("employee", "data/employee.csv", CsvConfig::default())?; 17 | db.create_csv_table("rank", "data/rank.csv", CsvConfig::default())?; 18 | db.create_csv_table("department", "data/department.csv", CsvConfig::default())?; 19 | 20 | let ret = db.run_sql( 21 | " 22 | select id, name, rank_name, department_name 23 | from employee 24 | join rank on 25 | employee.rank = rank.id 26 | join department on 27 | employee.department_id = department.id 28 | ", 29 | )?; 30 | print_result(&ret)?; 31 | 32 | let ret = db.run_sql("select * from employee join rank")?; 33 | print_result(&ret)?; 34 | 35 | // aggregate 36 | let ret = db.run_sql( 37 | " 38 | select count(id), sum(age), sum(score), avg(score), max(score), min(score) 39 | from t1 group by id % 3", 40 | )?; 41 | print_result(&ret)?; 42 | 43 | Ok(()) 44 | } 45 | -------------------------------------------------------------------------------- /src/optimizer/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-13 17:59:40 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | mod projection_push_down; 8 | 9 | use crate::logical_plan::plan::LogicalPlan; 10 | use std::sync::Arc; 11 | 12 | #[derive(Default)] 13 | pub struct Optimizer { 14 | rules: Vec>, 15 | } 16 | 17 | pub trait OptimizerRule { 18 | fn optimize(&self, plan: &LogicalPlan) -> LogicalPlan; 19 | } 20 | 21 | impl Optimizer { 22 | pub fn optimize(&self, plan: LogicalPlan) -> LogicalPlan { 23 | let mut plan = plan; 24 | for rule in &self.rules { 25 | plan = rule.optimize(&plan); 26 | } 27 | plan 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/optimizer/projection_push_down.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-13 18:54:33 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | use super::OptimizerRule; 8 | use crate::logical_plan::plan::LogicalPlan; 9 | 10 | pub struct ProjectionPushDown; 11 | 12 | impl OptimizerRule for ProjectionPushDown { 13 | fn optimize(&self, plan: &LogicalPlan) -> LogicalPlan { 14 | // TODO(veeupup): do projection push down 15 | plan.clone() 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /src/physical_plan/aggregate/avg.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-20 19:09:44 4 | * @Last Modified by: Veeupup 5 | * @Last Modified time: 2022-05-20 21:19:45 6 | */ 7 | 8 | use arrow::array::Array; 9 | use arrow::array::PrimitiveArray; 10 | use arrow::datatypes::DataType; 11 | 12 | use arrow::datatypes::Float64Type; 13 | use arrow::datatypes::Int64Type; 14 | use arrow::datatypes::UInt64Type; 15 | use arrow::record_batch::RecordBatch; 16 | 17 | use super::AggregateOperator; 18 | use 
crate::error::ErrorCode; 19 | use crate::logical_plan::expression::ScalarValue; 20 | use crate::logical_plan::schema::NaiveField; 21 | use crate::logical_plan::schema::NaiveSchema; 22 | use crate::physical_plan::ColumnExpr; 23 | use crate::physical_plan::PhysicalExpr; 24 | use crate::Result; 25 | 26 | #[derive(Debug, Clone)] 27 | pub struct Avg { 28 | sum: f64, 29 | cnt: u32, 30 | // physical column 31 | col_expr: ColumnExpr, 32 | } 33 | 34 | impl Avg { 35 | pub fn create(col_expr: ColumnExpr) -> Box { 36 | Box::new(Self { 37 | sum: 0.0, 38 | cnt: 0, 39 | col_expr, 40 | }) 41 | } 42 | } 43 | 44 | macro_rules! update_match { 45 | ($COL: expr, $DT: ty, $SELF: expr) => {{ 46 | let col = $COL.as_any().downcast_ref::>().unwrap(); 47 | for val in col.into_iter().flatten() { 48 | $SELF.sum += val as f64; 49 | $SELF.cnt += 1; 50 | } 51 | }}; 52 | } 53 | 54 | macro_rules! update_value { 55 | ($COL: expr, $DT: ty, $IDX: expr, $SELF: expr) => {{ 56 | let col = $COL.as_any().downcast_ref::>().unwrap(); 57 | if !col.is_null($IDX) { 58 | $SELF.sum += col.value($IDX) as f64; 59 | $SELF.cnt += 1; 60 | } 61 | }}; 62 | } 63 | 64 | impl AggregateOperator for Avg { 65 | fn data_field(&self, schema: &NaiveSchema) -> Result { 66 | // find by name 67 | if let Some(name) = &self.col_expr.name { 68 | let field = schema.field_with_unqualified_name(name)?; 69 | return Ok(NaiveField::new( 70 | None, 71 | format!("avg({})", field.name()).as_str(), 72 | DataType::Float64, 73 | false, 74 | )); 75 | } 76 | 77 | if let Some(idx) = &self.col_expr.idx { 78 | let field = schema.field(*idx); 79 | return Ok(NaiveField::new( 80 | None, 81 | format!("avg({})", field.name()).as_str(), 82 | DataType::Float64, 83 | false, 84 | )); 85 | } 86 | 87 | Err(ErrorCode::LogicalError( 88 | "ColumnExpr must has name or idx".to_string(), 89 | )) 90 | } 91 | 92 | fn update_batch(&mut self, data: &RecordBatch) -> Result<()> { 93 | let col = self.col_expr.evaluate(data)?.into_array(); 94 | match col.data_type() { 95 | DataType::Int64 => update_match!(col, Int64Type, self), 96 | DataType::UInt64 => update_match!(col, UInt64Type, self), 97 | DataType::Float64 => update_match!(col, Float64Type, self), 98 | _ => { 99 | return Err(ErrorCode::NotSupported(format!( 100 | "Avg func for {:?} is not supported", 101 | col.data_type() 102 | ))) 103 | } 104 | } 105 | 106 | Ok(()) 107 | } 108 | 109 | fn update(&mut self, data: &RecordBatch, idx: usize) -> Result<()> { 110 | let col = self.col_expr.evaluate(data)?.into_array(); 111 | match col.data_type() { 112 | DataType::Int64 => update_value!(col, Int64Type, idx, self), 113 | DataType::UInt64 => update_value!(col, UInt64Type, idx, self), 114 | DataType::Float64 => update_value!(col, Float64Type, idx, self), 115 | _ => unimplemented!(), 116 | } 117 | Ok(()) 118 | } 119 | 120 | fn evaluate(&self) -> Result { 121 | Ok(ScalarValue::Float64(Some(self.sum / self.cnt as f64))) 122 | } 123 | 124 | fn clear_state(&mut self) { 125 | self.sum = 0.0; 126 | self.cnt = 0; 127 | } 128 | } 129 | -------------------------------------------------------------------------------- /src/physical_plan/aggregate/count.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-20 19:06:45 4 | * @Last Modified by: Veeupup 5 | * @Last Modified time: 2022-05-20 21:13:03 6 | */ 7 | 8 | use arrow::datatypes::DataType; 9 | 10 | use arrow::record_batch::RecordBatch; 11 | 12 | use super::AggregateOperator; 13 | use crate::error::ErrorCode; 14 | use 
crate::logical_plan::expression::ScalarValue; 15 | use crate::logical_plan::schema::NaiveField; 16 | use crate::physical_plan::aggregate::NaiveSchema; 17 | use crate::physical_plan::ColumnExpr; 18 | use crate::physical_plan::PhysicalExpr; 19 | use crate::Result; 20 | 21 | #[derive(Debug, Clone)] 22 | pub struct Count { 23 | cnt: u64, 24 | col_expr: ColumnExpr, 25 | } 26 | 27 | impl Count { 28 | pub fn create(col_expr: ColumnExpr) -> Box<dyn AggregateOperator> { 29 | Box::new(Self { cnt: 0, col_expr }) 30 | } 31 | } 32 | 33 | impl AggregateOperator for Count { 34 | fn data_field(&self, schema: &NaiveSchema) -> Result<NaiveField> { 35 | // find by name 36 | if let Some(name) = &self.col_expr.name { 37 | let field = schema.field_with_unqualified_name(name)?; 38 | return Ok(NaiveField::new( 39 | None, 40 | format!("count({})", field.name()).as_str(), 41 | DataType::UInt64, 42 | false, 43 | )); 44 | } 45 | 46 | if let Some(idx) = &self.col_expr.idx { 47 | let field = schema.field(*idx); 48 | return Ok(NaiveField::new( 49 | None, 50 | format!("count({})", field.name()).as_str(), 51 | DataType::UInt64, 52 | false, 53 | )); 54 | } 55 | 56 | Err(ErrorCode::LogicalError( 57 | "ColumnExpr must have name or idx".to_string(), 58 | )) 59 | } 60 | 61 | fn update_batch(&mut self, data: &RecordBatch) -> Result<()> { 62 | let col = self.col_expr.evaluate(data)?.into_array(); 63 | self.cnt += (col.len() - col.null_count()) as u64; 64 | Ok(()) 65 | } 66 | 67 | fn update(&mut self, data: &RecordBatch, idx: usize) -> Result<()> { 68 | let col = self.col_expr.evaluate(data)?.into_array(); 69 | if !col.is_null(idx) { 70 | self.cnt += 1; 71 | } 72 | Ok(()) 73 | } 74 | 75 | fn evaluate(&self) -> Result<ScalarValue> { 76 | Ok(ScalarValue::UInt64(Some(self.cnt))) 77 | } 78 | 79 | fn clear_state(&mut self) { 80 | self.cnt = 0; 81 | } 82 | } 83 | -------------------------------------------------------------------------------- /src/physical_plan/aggregate/max.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-20 19:09:44 4 | * @Last Modified by: Veeupup 5 | * @Last Modified time: 2022-05-20 21:19:45 6 | */ 7 | 8 | use arrow::array::Array; 9 | use arrow::array::PrimitiveArray; 10 | use arrow::datatypes::DataType; 11 | 12 | use arrow::datatypes::Float64Type; 13 | use arrow::datatypes::Int64Type; 14 | use arrow::datatypes::UInt64Type; 15 | use arrow::record_batch::RecordBatch; 16 | use ordered_float::OrderedFloat; 17 | 18 | use super::AggregateOperator; 19 | use crate::error::ErrorCode; 20 | use crate::logical_plan::expression::ScalarValue; 21 | use crate::logical_plan::schema::NaiveField; 22 | use crate::logical_plan::schema::NaiveSchema; 23 | use crate::physical_plan::ColumnExpr; 24 | use crate::physical_plan::PhysicalExpr; 25 | use crate::Result; 26 | 27 | #[derive(Debug, Clone)] 28 | pub struct Max { 29 | // TODO(veeupup): should use generic type for Int64, UInt64, Float64 30 | val: OrderedFloat<f64>, 31 | // physical column 32 | col_expr: ColumnExpr, 33 | } 34 | 35 | impl Max { 36 | pub fn create(col_expr: ColumnExpr) -> Box<dyn AggregateOperator> { 37 | Box::new(Self { 38 | val: OrderedFloat::from(f64::MIN), 39 | col_expr, 40 | }) 41 | } 42 | } 43 | 44 | macro_rules! update_match { 45 | ($COL: expr, $DT: ty, $SELF: expr) => {{ 46 | let col = $COL.as_any().downcast_ref::<PrimitiveArray<$DT>>().unwrap(); 47 | for val in col.into_iter().flatten() { 48 | let val = OrderedFloat::from(val as f64); 49 | if val > $SELF.val { 50 | $SELF.val = val; 51 | } 52 | } 53 | }}; 54 | } 55 | 56 | macro_rules! update_value { 57 | ($COL: expr, $DT: ty, $IDX: expr, $SELF: expr) => {{ 58 | let col = $COL.as_any().downcast_ref::<PrimitiveArray<$DT>>().unwrap(); 59 | if !col.is_null($IDX) { 60 | let val = OrderedFloat::from(col.value($IDX) as f64); 61 | if val > $SELF.val { 62 | $SELF.val = val; 63 | } 64 | } 65 | }}; 66 | } 67 | 68 | impl AggregateOperator for Max { 69 | fn data_field(&self, schema: &NaiveSchema) -> Result<NaiveField> { 70 | // find by name 71 | if let Some(name) = &self.col_expr.name { 72 | let field = schema.field_with_unqualified_name(name)?; 73 | return Ok(NaiveField::new( 74 | None, 75 | format!("max({})", field.name()).as_str(), 76 | DataType::Float64, 77 | false, 78 | )); 79 | } 80 | 81 | if let Some(idx) = &self.col_expr.idx { 82 | let field = schema.field(*idx); 83 | return Ok(NaiveField::new( 84 | None, 85 | format!("max({})", field.name()).as_str(), 86 | DataType::Float64, 87 | false, 88 | )); 89 | } 90 | 91 | Err(ErrorCode::LogicalError( 92 | "ColumnExpr must have name or idx".to_string(), 93 | )) 94 | } 95 | 96 | fn update_batch(&mut self, data: &RecordBatch) -> Result<()> { 97 | let col = self.col_expr.evaluate(data)?.into_array(); 98 | match col.data_type() { 99 | DataType::Int64 => update_match!(col, Int64Type, self), 100 | DataType::UInt64 => update_match!(col, UInt64Type, self), 101 | DataType::Float64 => update_match!(col, Float64Type, self), 102 | _ => { 103 | return Err(ErrorCode::NotSupported(format!( 104 | "Max func for {:?} is not supported", 105 | col.data_type() 106 | ))) 107 | } 108 | } 109 | 110 | Ok(()) 111 | } 112 | 113 | fn update(&mut self, data: &RecordBatch, idx: usize) -> Result<()> { 114 | let col = self.col_expr.evaluate(data)?.into_array(); 115 | match col.data_type() { 116 | DataType::Int64 => update_value!(col, Int64Type, idx, self), 117 | DataType::UInt64 => update_value!(col, UInt64Type, idx, self), 118 | DataType::Float64 => update_value!(col, Float64Type, idx, self), 119 | _ => unimplemented!(), 120 | } 121 | Ok(()) 122 | } 123 | 124 | fn evaluate(&self) -> Result<ScalarValue> { 125 | Ok(ScalarValue::Float64(Some(self.val.into()))) 126 | } 127 | 128 | fn clear_state(&mut self) { 129 | self.val = OrderedFloat::from(f64::MIN); 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/physical_plan/aggregate/min.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-20 19:09:44 4 | * @Last Modified by: Veeupup 5 | * @Last Modified time: 2022-05-20 21:19:45 6 | */ 7 | 8 | use arrow::array::Array; 9 | use arrow::array::PrimitiveArray; 10 | use arrow::datatypes::DataType; 11 | 12 | use arrow::datatypes::Float64Type; 13 | use arrow::datatypes::Int64Type; 14 | use arrow::datatypes::UInt64Type; 15 | use arrow::record_batch::RecordBatch; 16 | use ordered_float::OrderedFloat; 17 | 18 | use super::AggregateOperator; 19 | use crate::error::ErrorCode; 20 | use crate::logical_plan::expression::ScalarValue; 21 | use crate::logical_plan::schema::NaiveField; 22 | use crate::logical_plan::schema::NaiveSchema; 23 | use crate::physical_plan::ColumnExpr; 24 | use crate::physical_plan::PhysicalExpr; 25 | use crate::Result; 26 | 27 | #[derive(Debug, Clone)] 28 | pub struct Min { 29 | // TODO(veeupup): should use generic type for Int64, UInt64, Float64 30 | val: OrderedFloat<f64>, 31 | // physical column 32 | col_expr: ColumnExpr, 33 | } 34 | 35 | impl Min { 36 | pub fn create(col_expr: ColumnExpr) -> Box<dyn AggregateOperator> { 37 | Box::new(Self { 38 | val: OrderedFloat::from(f64::MAX), 39 | col_expr, 40 | }) 41 | } 42 | } 43 | 44 | macro_rules! update_match { 45 | ($COL: expr, $DT: ty, $SELF: expr) => {{ 46 | let col = $COL.as_any().downcast_ref::<PrimitiveArray<$DT>>().unwrap(); 47 | for val in col.into_iter().flatten() { 48 | let val = OrderedFloat::from(val as f64); 49 | if val < $SELF.val { 50 | $SELF.val = val; 51 | } 52 | } 53 | }}; 54 | } 55 | 56 | macro_rules! update_value { 57 | ($COL: expr, $DT: ty, $IDX: expr, $SELF: expr) => {{ 58 | let col = $COL.as_any().downcast_ref::<PrimitiveArray<$DT>>().unwrap(); 59 | if !col.is_null($IDX) { 60 | let val = OrderedFloat::from(col.value($IDX) as f64); 61 | if val < $SELF.val { 62 | $SELF.val = val; 63 | } 64 | } 65 | }}; 66 | } 67 | 68 | impl AggregateOperator for Min { 69 | fn data_field(&self, schema: &NaiveSchema) -> Result<NaiveField> { 70 | // find by name 71 | if let Some(name) = &self.col_expr.name { 72 | let field = schema.field_with_unqualified_name(name)?; 73 | return Ok(NaiveField::new( 74 | None, 75 | format!("min({})", field.name()).as_str(), 76 | DataType::Float64, 77 | false, 78 | )); 79 | } 80 | 81 | if let Some(idx) = &self.col_expr.idx { 82 | let field = schema.field(*idx); 83 | return Ok(NaiveField::new( 84 | None, 85 | format!("min({})", field.name()).as_str(), 86 | DataType::Float64, 87 | false, 88 | )); 89 | } 90 | 91 | Err(ErrorCode::LogicalError( 92 | "ColumnExpr must have name or idx".to_string(), 93 | )) 94 | } 95 | 96 | fn update_batch(&mut self, data: &RecordBatch) -> Result<()> { 97 | let col = self.col_expr.evaluate(data)?.into_array(); 98 | match col.data_type() { 99 | DataType::Int64 => update_match!(col, Int64Type, self), 100 | DataType::UInt64 => update_match!(col, UInt64Type, self), 101 | DataType::Float64 => update_match!(col, Float64Type, self), 102 | _ => { 103 | return Err(ErrorCode::NotSupported(format!( 104 | "Min func for {:?} is not supported", 105 | col.data_type() 106 | ))) 107 | } 108 | } 109 | 110 | Ok(()) 111 | } 112 | 113 | fn update(&mut self, data: &RecordBatch, idx: usize) -> Result<()> { 114 | let col = self.col_expr.evaluate(data)?.into_array(); 115 | match col.data_type() { 116 | DataType::Int64 => update_value!(col, Int64Type, idx, self), 117 | DataType::UInt64 => update_value!(col, UInt64Type, idx, self), 118 | DataType::Float64 => update_value!(col, Float64Type, idx, self), 119 | _ => unimplemented!(), 120 | } 121 | Ok(()) 122 | } 123 | 124 | fn evaluate(&self) -> Result<ScalarValue> { 125 | Ok(ScalarValue::Float64(Some(self.val.into()))) 126 | } 127 | 128 | fn clear_state(&mut self) { 129 | self.val = OrderedFloat::from(f64::MAX); 130 | } 131 | } 132 |
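// Max/Min above fold every numeric input into an `OrderedFloat<f64>` so the running
// extreme has a total order even in the presence of NaN (plain f64 is only PartialOrd).
// A tiny illustration using the same crate; the helper is not part of the engine:
#[allow(dead_code)]
fn ordered_float_sketch() {
    use ordered_float::OrderedFloat;
    let a = OrderedFloat::from(1.0_f64);
    let nan = OrderedFloat::from(f64::NAN);
    // OrderedFloat defines NaN as greater than every other value, so comparisons
    // never panic and sorting/min/max stay well-defined
    assert!(nan > a);
}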
-------------------------------------------------------------------------------- /src/physical_plan/aggregate/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-20 14:12:40 4 | * @Last Modified by: Veeupup 5 | * @Last Modified time: 2022-05-20 21:24:37 6 | */ 7 | 8 | pub mod avg; 9 | pub mod count; 10 | pub mod max; 11 | pub mod min; 12 | pub mod sum; 13 | 14 | use std::collections::HashMap; 15 | use std::fmt::Debug; 16 | use std::sync::{Arc, Mutex}; 17 | 18 | use crate::error::ErrorCode; 19 | use crate::logical_plan::schema::NaiveField; 20 | use crate::logical_plan::{expression::ScalarValue, schema::NaiveSchema}; 21 | 22 | use super::{concat_batches, PhysicalPlan, PhysicalPlanRef}; 23 | 24 | use crate::physical_plan::PhysicalExprRef; 25 | use crate::Result; 26 | use arrow::array::{PrimitiveArray, StringArray}; 27 | use arrow::datatypes::{DataType, Field, Int64Type, Schema, UInt64Type}; 28 | use arrow::record_batch::RecordBatch; 29 | 30 | #[derive(Debug)] 31 | pub struct PhysicalAggregatePlan { 32 | pub group_expr: Vec<PhysicalExprRef>, 33 | pub aggr_ops: Mutex<Vec<Box<dyn AggregateOperator>>>, 34 | pub input: PhysicalPlanRef, 35 | pub schema: NaiveSchema, 36 | } 37 | 38 | impl PhysicalAggregatePlan { 39 | pub fn create( 40 | group_expr: Vec<PhysicalExprRef>, 41 | aggr_ops: Vec<Box<dyn AggregateOperator>>, 42 | input: PhysicalPlanRef, 43 | ) -> PhysicalPlanRef { 44 | let schema = input.schema().clone(); 45 | Arc::new(Self { 46 | group_expr, 47 | aggr_ops: Mutex::new(aggr_ops), 48 | input, 49 | schema, 50 | }) 51 | } 52 | } 53 | 54 | macro_rules! group_by_datatype { 55 | ($VAL: expr, $DT: ty, $GROUP_DT: ty, $GROUP_IDXS: expr, $AGGR_OPS: expr, $SINGLE_BATCH: expr, $SCHEMA: expr, $LEN: expr) => {{ 56 | let group_val = $VAL.as_any().downcast_ref::<PrimitiveArray<$DT>>().unwrap(); 57 | // group val -> Vec<usize> 58 | // such as group by number % 3, then we will have group_idxs like 59 | // 0 -> [0,3,6], 1 -> [1,2,5] ... 60 | let mut group_idxs = HashMap::<$GROUP_DT, Vec<usize>>::new(); 61 | 62 | // split into different groups 63 | for (idx, val) in group_val.iter().enumerate() { 64 | if let Some(val) = val { 65 | if let Some(idxs) = group_idxs.get_mut(&val) { 66 | idxs.push(idx); 67 | } else { 68 | group_idxs.insert(val, vec![idx]); 69 | } 70 | } 71 | } 72 | 73 | // for each group, calculate aggregating value 74 | let mut batches = vec![]; 75 | 76 | for group_idx in group_idxs.values() { 77 | for idx in group_idx { 78 | for i in 0..$LEN { 79 | $AGGR_OPS.get_mut(i).unwrap().update(&$SINGLE_BATCH, *idx)?; 80 | } 81 | } 82 | 83 | let mut arrays = vec![]; 84 | // let aggr_ops = self.aggr_ops.lock().unwrap(); 85 | for aggr_op in $AGGR_OPS.iter() { 86 | let x = aggr_op.evaluate()?; 87 | arrays.push(x.into_array(1)); 88 | } 89 | 90 | let record_batch = RecordBatch::try_new($SCHEMA.clone(), arrays)?; 91 | batches.push(record_batch); 92 | 93 | // for next group aggregate usage 94 | for i in 0..$LEN { 95 | $AGGR_OPS.get_mut(i).unwrap().clear_state(); 96 | } 97 | } 98 | 99 | let single_batch = concat_batches(&$SCHEMA, &batches)?; 100 | Ok(vec![single_batch]) 101 | }}; 102 | } 103 | 104 | impl PhysicalPlan for PhysicalAggregatePlan { 105 | fn schema(&self) -> &NaiveSchema { 106 | &self.schema 107 | } 108 | 109 | fn children(&self) -> Result<Vec<PhysicalPlanRef>> { 110 | Ok(vec![self.input.clone()]) 111 | } 112 | 113 | fn execute(&self) -> Result<Vec<RecordBatch>> { 114 | // output schema 115 | let mut aggr_ops = self.aggr_ops.lock().unwrap(); 116 | let len = aggr_ops.len(); 117 | let mut fields: Vec<Field> = vec![]; 118 | for aggr_op in aggr_ops.iter() { 119 | fields.push(aggr_op.data_field(self.schema())?.into()); 120 | } 121 | let schema = Arc::new(Schema::new(fields)); 122 | 123 | if self.group_expr.is_empty() { 124 | let batches = self.input.execute()?; 125 | 126 | for batch in &batches { 127 | for i in 0..len { 128 | aggr_ops.get_mut(i).unwrap().update_batch(batch)?; 129 | } 130 | } 131 | 132 | let mut arrays = vec![]; 133 | for aggr_op in aggr_ops.iter() { 134 | let x = aggr_op.evaluate()?; 135 | arrays.push(x.into_array(1)); 136 | } 137 | 138 | let record_batch = RecordBatch::try_new(schema, arrays)?; 139 | Ok(vec![record_batch]) 140 | } else { 141 | // TODO(veeupup): support multi group by expr 142 | // such as `select sum(id) from t1 group by id % 3, age % 2` 143 | let batches = self.input.execute()?; 144 | let single_batch = concat_batches(&self.input.schema().clone().into(), &batches)?; 145 | 146 | let group_by_expr = &self.group_expr[0]; 147 | 148 | let val = group_by_expr.evaluate(&single_batch)?.into_array(); 149 | match val.data_type() { 150 | DataType::Int64 => group_by_datatype!( 151 | val, 152 | Int64Type, 153 | i64, 154 | group_idxs, 155 | aggr_ops, 156 | single_batch, 157 | schema, 158 | len 159 | ), 160 | DataType::UInt64 => group_by_datatype!( 161 | val, 162 | UInt64Type, 163 | u64, 164 | group_idxs, 165 | aggr_ops, 166 | single_batch, 167 | schema, 168 | len 169 | ), 170 | DataType::Utf8 => { 171 | let group_val = val.as_any().downcast_ref::<StringArray>().unwrap(); 172 | // group val -> Vec<usize> 173 | // such as group by number % 3, then we will have group_idxs like 174 | // 0 -> [0,3,6], 1 -> [1,2,5] ... 175 | let mut group_idxs = HashMap::<String, Vec<usize>>::new(); 176 | 177 | // split into different groups 178 | for (idx, val) in group_val.iter().enumerate() { 179 | if let Some(val) = val { 180 | if let Some(idxs) = group_idxs.get_mut(val) { 181 | idxs.push(idx); 182 | } else { 183 | group_idxs.insert(val.to_string(), vec![idx]); 184 | } 185 | } 186 | } 187 | 188 | // for each group, calculate aggregating value 189 | let mut batches = vec![]; 190 | 191 | for group_idx in group_idxs.values() { 192 | for idx in group_idx { 193 | for i in 0..len { 194 | aggr_ops.get_mut(i).unwrap().update(&single_batch, *idx)?; 195 | } 196 | } 197 | 198 | let mut arrays = vec![]; 199 | // let aggr_ops = self.aggr_ops.lock().unwrap(); 200 | for aggr_op in aggr_ops.iter() { 201 | let x = aggr_op.evaluate()?; 202 | arrays.push(x.into_array(1)); 203 | } 204 | 205 | let record_batch = RecordBatch::try_new(schema.clone(), arrays)?; 206 | batches.push(record_batch); 207 | 208 | // for next group aggregate usage 209 | for i in 0..len { 210 | aggr_ops.get_mut(i).unwrap().clear_state(); 211 | } 212 | } 213 | 214 | let single_batch = concat_batches(&schema, &batches)?; 215 | Ok(vec![single_batch]) 216 | } 217 | _ => Err(ErrorCode::NotSupported( 218 | "group by only supports `Int64`, `UInt64`, `String`".to_string(), 219 | )), 220 | } 221 | } 222 | } 223 | } 224 | 225 | pub trait AggregateOperator: Debug { 226 | fn data_field(&self, schema: &NaiveSchema) -> Result<NaiveField>; 227 | 228 | fn update_batch(&mut self, data: &RecordBatch) -> Result<()>; 229 | 230 | fn update(&mut self, data: &RecordBatch, idx: usize) -> Result<()>; 231 | 232 | fn evaluate(&self) -> Result<ScalarValue>; 233 | 234 | fn clear_state(&mut self); 235 | } 236 | -------------------------------------------------------------------------------- /src/physical_plan/aggregate/sum.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-20 19:09:44 4 | * @Last Modified by: Veeupup 5 | * @Last Modified time: 2022-05-20 21:19:45 6 | */ 7 | 8 | use arrow::array::Array; 9 | use arrow::array::PrimitiveArray; 10 | use arrow::datatypes::DataType; 11 | 12 | use arrow::datatypes::Float64Type; 13 | use arrow::datatypes::Int64Type; 14 | use arrow::datatypes::UInt64Type; 15 | use arrow::record_batch::RecordBatch; 16 | 17 | use super::AggregateOperator; 18 | use crate::error::ErrorCode; 19 | use crate::logical_plan::expression::ScalarValue; 20 | use crate::logical_plan::schema::NaiveField; 21 | use crate::logical_plan::schema::NaiveSchema; 22 | use crate::physical_plan::ColumnExpr; 23 | use crate::physical_plan::PhysicalExpr; 24 | use crate::Result; 25 | 26 | #[derive(Debug, Clone)] 27 | pub struct Sum { 28 | // TODO(veeupup): should use generic type for Int64, UInt64, Float64 29 | sum: f64, 30 | // physical column 31 | col_expr: ColumnExpr, 32 | } 33 | 34 | impl Sum { 35 | pub fn create(col_expr: ColumnExpr) -> Box<dyn AggregateOperator> { 36 | Box::new(Self { sum: 0.0, col_expr }) 37 | } 38 | } 39 |
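// A minimal, self-contained check of the Sum accumulator, assuming an in-memory
// batch with a single Int64 column; the module below is an illustrative sketch,
// not part of the engine.
#[cfg(test)]
mod sum_sketch {
    use super::*;
    use arrow::array::{ArrayRef, Int64Array};
    use arrow::datatypes::{Field, Schema};
    use std::sync::Arc;

    #[test]
    fn sum_accumulates_batch() -> Result<()> {
        let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int64, false)]));
        let col: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 3]));
        let batch = RecordBatch::try_new(schema, vec![col])?;
        // address the column by index, the same way the planner can
        let mut sum = Sum::create(ColumnExpr { name: None, idx: Some(0) });
        sum.update_batch(&batch)?;
        match sum.evaluate()? {
            ScalarValue::Float64(Some(v)) => assert!((v - 6.0).abs() < 1e-9),
            _ => panic!("unexpected aggregate value"),
        }
        Ok(())
    }
}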
40 | macro_rules! update_match { 41 | ($COL: expr, $DT: ty, $SELF: expr) => {{ 42 | let col = $COL.as_any().downcast_ref::<PrimitiveArray<$DT>>().unwrap(); 43 | for val in col.into_iter().flatten() { 44 | $SELF.sum += val as f64; 45 | } 46 | }}; 47 | } 48 | 49 | macro_rules! update_value { 50 | ($COL: expr, $DT: ty, $IDX: expr, $SELF: expr) => {{ 51 | let col = $COL.as_any().downcast_ref::<PrimitiveArray<$DT>>().unwrap(); 52 | if !col.is_null($IDX) { 53 | $SELF.sum += col.value($IDX) as f64; 54 | } 55 | }}; 56 | } 57 | 58 | impl AggregateOperator for Sum { 59 | fn data_field(&self, schema: &NaiveSchema) -> Result<NaiveField> { 60 | // find by name 61 | if let Some(name) = &self.col_expr.name { 62 | let field = schema.field_with_unqualified_name(name)?; 63 | return Ok(NaiveField::new( 64 | None, 65 | format!("sum({})", field.name()).as_str(), 66 | DataType::Float64, 67 | false, 68 | )); 69 | } 70 | 71 | if let Some(idx) = &self.col_expr.idx { 72 | let field = schema.field(*idx); 73 | return Ok(NaiveField::new( 74 | None, 75 | format!("sum({})", field.name()).as_str(), 76 | DataType::Float64, 77 | false, 78 | )); 79 | } 80 | 81 | Err(ErrorCode::LogicalError( 82 | "ColumnExpr must have name or idx".to_string(), 83 | )) 84 | } 85 | 86 | fn update_batch(&mut self, data: &RecordBatch) -> Result<()> { 87 | let col = self.col_expr.evaluate(data)?.into_array(); 88 | match col.data_type() { 89 | DataType::Int64 => update_match!(col, Int64Type, self), 90 | DataType::UInt64 => update_match!(col, UInt64Type, self), 91 | DataType::Float64 => update_match!(col, Float64Type, self), 92 | _ => { 93 | return Err(ErrorCode::NotSupported(format!( 94 | "Sum func for {:?} is not supported", 95 | col.data_type() 96 | ))) 97 | } 98 | } 99 | 100 | Ok(()) 101 | } 102 | 103 | fn update(&mut self, data: &RecordBatch, idx: usize) -> Result<()> { 104 | let col = self.col_expr.evaluate(data)?.into_array(); 105 | match col.data_type() { 106 | DataType::Int64 => update_value!(col, Int64Type, idx, self), 107 | DataType::UInt64 => update_value!(col, UInt64Type, idx, self), 108 | DataType::Float64 => update_value!(col, Float64Type, idx, self), 109 | _ => unimplemented!(), 110 | } 111 | Ok(()) 112 | } 113 | 114 | fn evaluate(&self) -> Result<ScalarValue> { 115 | Ok(ScalarValue::Float64(Some(self.sum))) 116 | } 117 | 118 | fn clear_state(&mut self) { 119 | self.sum = 0.0; 120 | } 121 | } 122 | -------------------------------------------------------------------------------- /src/physical_plan/cross_join.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: ywq 3 | * @Date: 2022-05-24 4 | */ 5 | use super::PhysicalPlan; 6 | use super::PhysicalPlanRef; 7 | use crate::logical_plan::plan::JoinType; 8 | use crate::logical_plan::schema::NaiveSchema; 9 | 10 | use crate::Result; 11 | use arrow::array::Array; 12 | use arrow::array::Float64Array; 13 | use arrow::array::Int64Array; 14 | use arrow::array::PrimitiveArray; 15 | use arrow::array::StringArray; 16 | use arrow::array::UInt64Array; 17 | use arrow::datatypes::DataType; 18 | use arrow::datatypes::Float64Type; 19 | use arrow::datatypes::Int64Type; 20 | use arrow::datatypes::SchemaRef; 21 | use arrow::datatypes::UInt64Type; 22 | use arrow::record_batch::RecordBatch; 23 | use std::sync::Arc; 24 | 25 | #[derive(Debug)] 26 | pub struct CrossJoin { 27 | left: PhysicalPlanRef, 28 | right: PhysicalPlanRef, 29 | #[allow(unused)] 30 | join_type: JoinType, 31 | schema: NaiveSchema, 32 | } 33 | 34 | impl CrossJoin { 35 | #[allow(unused)] 36 | pub fn create( 37 | left: PhysicalPlanRef, 38 | right: PhysicalPlanRef, 39 | join_type: JoinType, 40 | schema: NaiveSchema, 41 | ) -> PhysicalPlanRef { 42 | Arc::new(Self { 43 | left, 44 | right, 45 | join_type, 46 | schema, 47 | }) 48 | } 49 | } 50 | 51 | impl PhysicalPlan for CrossJoin { 52 | fn schema(&self) -> &NaiveSchema { 53 | &self.schema 54 | } 55 | 56 | fn execute(&self) -> Result<Vec<RecordBatch>> { 57 | // TODO(ywq) 58 | let outer_table = self.left.execute()?; 59 | let inner_table = self.right.execute()?; 60 | 61 | let mut batches: Vec<RecordBatch> = vec![]; 62 | 63 | for outer in &outer_table { 64 | for inner in &inner_table { 65 | let mut columns: Vec<Arc<dyn Array>> = vec![]; 66 | let left_rows = outer.num_rows(); 67 | let right_rows = inner.num_rows(); 68 | for i in 0..self.left.schema().fields().len() { 69 | let array = outer.column(i); 70 | let dt = self.left.schema().field(i).data_type(); 71 | match dt { 72 | // TODO(ywq): refactor with macro 73 | // each left row is repeated once per right row, so that together with 74 | // the block-repeated right side every (left, right) pair appears exactly once 75 | DataType::Int64 => { 76 | let mut t_vec = vec![]; 77 | let left_col = array 78 | .as_any() 79 | .downcast_ref::<PrimitiveArray<Int64Type>>() 80 | .unwrap(); 81 | for k in 0..left_col.len() { 82 | for _ in 0..right_rows { 83 | t_vec.push(left_col.value(k)) 84 | } 85 | } 86 | columns.push(Arc::new(Int64Array::from(t_vec))); 87 | } 88 | DataType::UInt64 => { 89 | let mut t_vec = vec![]; 90 | let left_col = array 91 | .as_any() 92 | .downcast_ref::<PrimitiveArray<UInt64Type>>() 93 | .unwrap(); 94 | for k in 0..left_col.len() { 95 | for _ in 0..right_rows { 96 | t_vec.push(left_col.value(k)) 97 | } 98 | } 99 | columns.push(Arc::new(UInt64Array::from(t_vec))); 100 | } 101 | DataType::Float64 => { 102 | let mut t_vec = vec![]; 103 | let left_col = array 104 | .as_any() 105 | .downcast_ref::<PrimitiveArray<Float64Type>>() 106 | .unwrap(); 107 | for k in 0..left_col.len() { 108 | for _ in 0..right_rows { 109 | t_vec.push(left_col.value(k)) 110 | } 111 | } 112 | columns.push(Arc::new(Float64Array::from(t_vec))); 113 | } 114 | DataType::Utf8 => { 115 | let mut t_vec = vec![]; 116 | let left_col = array.as_any().downcast_ref::<StringArray>().unwrap(); 117 | for k in 0..left_col.len() { 118 | for _ in 0..right_rows { 119 | t_vec.push(left_col.value(k)) 120 | } 121 | } 122 | columns.push(Arc::new(StringArray::from(t_vec))); 123 | } 124 | _ => unimplemented!(), 125 | } 126 | } 127 | for i in 0..self.right.schema().fields().len() { 128 | let array = inner.column(i); 129 | let dt = self.right.schema().field(i).data_type(); 130 | match dt { 131 | DataType::Int64 => { 132 | let mut t_vec = vec![]; 133 | let right_col = array 134 | .as_any() 135 | .downcast_ref::<PrimitiveArray<Int64Type>>() 136 | .unwrap(); 137 | for _ in 0..left_rows { 138 | for k in 0..right_col.len() { 139 | t_vec.push(right_col.value(k)) 140 | } 141 | } 142 | columns.push(Arc::new(Int64Array::from(t_vec))); 143 | } 144 | DataType::UInt64 => { 145 | let mut t_vec = vec![]; 146 | let right_col = array 147 | .as_any() 148 | .downcast_ref::<PrimitiveArray<UInt64Type>>() 149 | .unwrap(); 150 | for _ in 0..left_rows { 151 | for k in 0..right_col.len() { 152 | t_vec.push(right_col.value(k)) 153 | } 154 | } 155 | columns.push(Arc::new(UInt64Array::from(t_vec))); 156 | } 157 | DataType::Float64 => { 158 | let mut t_vec = vec![]; 159 | let right_col = array 160 | .as_any() 161 | .downcast_ref::<PrimitiveArray<Float64Type>>() 162 | .unwrap(); 163 | for _ in 0..left_rows { 164 | for k in 0..right_col.len() { 165 | t_vec.push(right_col.value(k)) 166 | } 167 | } 168 | columns.push(Arc::new(Float64Array::from(t_vec))); 169 | } 170 | DataType::Utf8 => { 171 | let mut t_vec = vec![]; 172 | let right_col = array.as_any().downcast_ref::<StringArray>().unwrap(); 173 | for _ in 0..left_rows { 174 | for k in 0..right_col.len() { 175 | t_vec.push(right_col.value(k)) 176 | } 177 | } 178 | columns.push(Arc::new(StringArray::from(t_vec))); 179 | } 180 | _ => unimplemented!(), 181 | } 182 | } 183 | // new batch 184 | let batch = RecordBatch::try_new(SchemaRef::from(self.schema.clone()), columns)?; 185 | batches.push(batch); 186 | } 187 | } 188 | Ok(batches) 189 | } 190 | 191 | fn children(&self) -> Result<Vec<PhysicalPlanRef>> { 192 | Ok(vec![self.left.clone(), self.right.clone()]) 193 | } 194 | } 195 | -------------------------------------------------------------------------------- /src/physical_plan/expression/binary.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-14 20:26:01 4 | * @Email: code@tanweime.com 5 | */ 6 | use arrow::{ 7 | array::{BooleanArray, PrimitiveArray}, 8 | compute::{ 9 | and_kleene, eq_dyn, gt_dyn, gt_eq_dyn, 10 | kernels::arithmetic::{add, divide, modulus, multiply, subtract}, 11 | lt_dyn, lt_eq_dyn, neq_dyn, or_kleene, 12 | }, 13 | datatypes::{DataType, Float64Type, Int64Type, UInt64Type}, 14 | record_batch::RecordBatch, 15 | }; 16 | use std::any::Any; 17 | use std::sync::Arc; 18 | 19 | use super::{PhysicalExpr, PhysicalExprRef}; 20 | use crate::{datatype::ColumnValue, error::ErrorCode, logical_plan::expression::Operator}; 21 | 22 | macro_rules! compare_bin { 23 | ($OP:expr, $LEFT: expr, $RIGHT: expr) => { 24 | $OP($LEFT, $RIGHT) 25 | .map_err(|e| e.into()) 26 | .map(|a| ColumnValue::Array(Arc::new(a))) 27 | }; 28 | } 29 | 30 | macro_rules! binary_op { 31 | ($OP:expr, $LEFT_DT: expr, $RIGHT_DT: expr, $LEFT: expr, $RIGHT: expr, $SELF_OP: expr) => {{ 32 | if $LEFT_DT == DataType::Boolean && $RIGHT_DT == DataType::Boolean { 33 | let left = $LEFT.as_any().downcast_ref::<BooleanArray>().unwrap(); 34 | let right = $RIGHT.as_any().downcast_ref::<BooleanArray>().unwrap(); 35 | let ret = $OP(left, right)?; 36 | Ok(ColumnValue::Array(Arc::new(ret))) 37 | } else { 38 | Err(ErrorCode::IntervalError(format!( 39 | "Cannot evaluate binary expression {:?} with types {:?} and {:?}", 40 | $SELF_OP, $LEFT_DT, $RIGHT_DT 41 | ))) 42 | } 43 | }}; 44 | } 45 | 46 | macro_rules! 
arithmetic_op { 47 | ($OP:expr, $LEFT_DT: expr, $LEFT: expr, $RIGHT: expr) => {{ 48 | match $LEFT_DT { 49 | DataType::Int64 => { 50 | let left = $LEFT 51 | .as_any() 52 | .downcast_ref::<PrimitiveArray<Int64Type>>() 53 | .unwrap(); 54 | let right = $RIGHT 55 | .as_any() 56 | .downcast_ref::<PrimitiveArray<Int64Type>>() 57 | .unwrap(); 58 | let x = $OP(left, right)?; 59 | Ok(ColumnValue::Array(Arc::new(x))) 60 | } 61 | DataType::UInt64 => { 62 | let left = $LEFT 63 | .as_any() 64 | .downcast_ref::<PrimitiveArray<UInt64Type>>() 65 | .unwrap(); 66 | let right = $RIGHT 67 | .as_any() 68 | .downcast_ref::<PrimitiveArray<UInt64Type>>() 69 | .unwrap(); 70 | let x = $OP(left, right)?; 71 | Ok(ColumnValue::Array(Arc::new(x))) 72 | } 73 | DataType::Float64 => { 74 | let left = $LEFT 75 | .as_any() 76 | .downcast_ref::<PrimitiveArray<Float64Type>>() 77 | .unwrap(); 78 | let right = $RIGHT 79 | .as_any() 80 | .downcast_ref::<PrimitiveArray<Float64Type>>() 81 | .unwrap(); 82 | let x = $OP(left, right)?; 83 | Ok(ColumnValue::Array(Arc::new(x))) 84 | } 85 | _ => unimplemented!(), 86 | } 87 | }}; 88 | } 89 | 90 | #[derive(Debug)] 91 | pub struct PhysicalBinaryExpr { 92 | left: PhysicalExprRef, 93 | op: Operator, 94 | right: PhysicalExprRef, 95 | } 96 | 97 | impl PhysicalBinaryExpr { 98 | pub fn create(left: PhysicalExprRef, op: Operator, right: PhysicalExprRef) -> PhysicalExprRef { 99 | Arc::new(Self { left, op, right }) 100 | } 101 | } 102 | 103 | impl PhysicalExpr for PhysicalBinaryExpr { 104 | fn as_any(&self) -> &dyn Any { 105 | self 106 | } 107 | 108 | fn evaluate(&self, input: &RecordBatch) -> crate::Result<ColumnValue> { 109 | let left_value = self.left.evaluate(input)?; 110 | let right_value = self.right.evaluate(input)?; 111 | 112 | let left_data_type = left_value.data_type(); 113 | let right_data_type = right_value.data_type(); 114 | if left_value.data_type() != right_value.data_type() { 115 | return Err(ErrorCode::IntervalError(format!( 116 | "Cannot evaluate binary expression {:?} with types {:?} and {:?}", 117 | self.op, left_data_type, right_data_type 118 | ))); 119 | } 120 | 121 | // TODO(veeupup): speed up if left_value or right_value is scalar 122 | 123 | let left_array = left_value.into_array(); 124 | let right_array = right_value.into_array(); 125 | 126 | match self.op { 127 | Operator::Eq => compare_bin!(eq_dyn, &left_array, &right_array), 128 | Operator::NotEq => compare_bin!(neq_dyn, &left_array, &right_array), 129 | Operator::Lt => compare_bin!(lt_dyn, &left_array, &right_array), 130 | Operator::LtEq => compare_bin!(lt_eq_dyn, &left_array, &right_array), 131 | Operator::Gt => compare_bin!(gt_dyn, &left_array, &right_array), 132 | Operator::GtEq => compare_bin!(gt_eq_dyn, &left_array, &right_array), 133 | Operator::And => binary_op!( 134 | and_kleene, 135 | left_data_type, 136 | right_data_type, 137 | left_array, 138 | right_array, 139 | Operator::And 140 | ), 141 | Operator::Or => binary_op!( 142 | or_kleene, 143 | left_data_type, 144 | right_data_type, 145 | left_array, 146 | right_array, 147 | Operator::Or 148 | ), 149 | Operator::Plus => arithmetic_op!(add, left_data_type, left_array, right_array), 150 | Operator::Minus => arithmetic_op!(subtract, left_data_type, left_array, right_array), 151 | Operator::Multiply => arithmetic_op!(multiply, left_data_type, left_array, right_array), 152 | Operator::Divide => arithmetic_op!(divide, left_data_type, left_array, right_array), 153 | Operator::Modulos => arithmetic_op!(modulus, left_data_type, left_array, right_array), 154 | } 155 | } 156 | } 157 | -------------------------------------------------------------------------------- /src/physical_plan/expression/cast.rs: 
-------------------------------------------------------------------------------- 1 | /* 2 | * @Author: ywqzzy 3 | * @Date: 2022-05-20 4 | */ 5 | use arrow::{datatypes::DataType, record_batch::RecordBatch}; 6 | use core::fmt; 7 | use std::any::Any; 8 | use std::{ 9 | fmt::{Debug, Formatter}, 10 | sync::Arc, 11 | }; 12 | 13 | use super::{PhysicalExpr, PhysicalExprRef}; 14 | use crate::datatype::ColumnValue; 15 | 16 | pub struct PhysicalCastExpr { 17 | #[allow(unused)] 18 | expr: PhysicalExprRef, 19 | data_type: DataType, 20 | } 21 | 22 | impl Debug for PhysicalCastExpr { 23 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { 24 | f.debug_struct("CastExpr") 25 | .field("name", &"CAST") 26 | .field("return_type", &self.data_type) 27 | .finish() 28 | } 29 | } 30 | 31 | impl PhysicalCastExpr { 32 | pub fn create(expr: PhysicalExprRef, data_type: &DataType) -> PhysicalExprRef { 33 | Arc::new(Self { 34 | expr, 35 | data_type: data_type.clone(), 36 | }) 37 | } 38 | } 39 | 40 | impl PhysicalExpr for PhysicalCastExpr { 41 | fn as_any(&self) -> &dyn Any { 42 | self 43 | } 44 | 45 | fn evaluate(&self, _input: &RecordBatch) -> crate::Result<ColumnValue> { 46 | // let value = self.expr.evaluate(input)?; 47 | 48 | // let from_data_type = value.data_type(); 49 | // let value_array = value.into_array(); 50 | // let field_array_builder = arrow::array::make_builder(&self.data_type, input.num_rows()); 51 | 52 | match self.data_type { 53 | DataType::Null => todo!(), 54 | DataType::Boolean => todo!(), 55 | DataType::Int8 => todo!(), 56 | DataType::Int16 => todo!(), 57 | DataType::Int32 => todo!(), 58 | DataType::Int64 => todo!(), 59 | DataType::UInt8 => todo!(), 60 | DataType::UInt16 => todo!(), 61 | DataType::UInt32 => todo!(), 62 | DataType::UInt64 => todo!(), 63 | DataType::Float16 => todo!(), 64 | DataType::Float32 => todo!(), 65 | DataType::Float64 => todo!(), 66 | DataType::Timestamp(_, _) => todo!(), 67 | DataType::Date32 => todo!(), 68 | DataType::Date64 => todo!(), 69 | DataType::Time32(_) => todo!(), 70 | DataType::Time64(_) => todo!(), 71 | DataType::Duration(_) => todo!(), 72 | DataType::Interval(_) => todo!(), 73 | DataType::Binary => todo!(), 74 | DataType::FixedSizeBinary(_) => todo!(), 75 | DataType::LargeBinary => todo!(), 76 | DataType::Utf8 => todo!(), 77 | DataType::LargeUtf8 => todo!(), 78 | DataType::List(_) => todo!(), 79 | DataType::FixedSizeList(_, _) => todo!(), 80 | DataType::LargeList(_) => todo!(), 81 | DataType::Struct(_) => todo!(), 82 | DataType::Union(_, _) => todo!(), 83 | DataType::Dictionary(_, _) => todo!(), 84 | DataType::Decimal(_, _) => todo!(), 85 | DataType::Map(_, _) => todo!(), 86 | } 87 | } 88 | } 89 |
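// The evaluate body above is still a skeleton. arrow ships a generic cast kernel
// that could back it; a minimal sketch under that assumption
// (`arrow::compute::kernels::cast::cast`, present in arrow 13), with a plain Int64
// input. Illustrative only, not wired into the expression tree.
#[allow(dead_code)]
fn cast_sketch() -> crate::Result<()> {
    use arrow::array::{ArrayRef, Int64Array};
    use arrow::compute::kernels::cast::cast;
    use arrow::datatypes::DataType;
    use std::sync::Arc;

    let input: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 3]));
    // cast returns a new ArrayRef of the requested target type
    let as_f64 = cast(&input, &DataType::Float64)?;
    assert_eq!(as_f64.data_type(), &DataType::Float64);
    Ok(())
}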
-------------------------------------------------------------------------------- /src/physical_plan/expression/column.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-13 14:56:36 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | use std::any::Any; 8 | use std::sync::Arc; 9 | 10 | use arrow::record_batch::RecordBatch; 11 | 12 | use super::PhysicalExpr; 13 | use crate::datatype::ColumnValue; 14 | use crate::error::{ErrorCode, Result}; 15 | use crate::physical_plan::PhysicalExprRef; 16 | 17 | #[derive(Debug, Clone)] 18 | pub struct ColumnExpr { 19 | pub name: Option<String>, 20 | pub idx: Option<usize>, 21 | } 22 | 23 | impl ColumnExpr { 24 | pub fn try_create(name: Option<String>, idx: Option<usize>) -> Result<PhysicalExprRef> { 25 | if name.is_none() && idx.is_none() { 26 | return Err(ErrorCode::LogicalError( 27 | "ColumnExpr must have name or idx".to_string(), 28 | )); 29 | } 30 | Ok(Arc::new(Self { name, idx })) 31 | } 32 | } 33 | 34 | impl PhysicalExpr for ColumnExpr { 35 | fn as_any(&self) -> &dyn Any { 36 | self 37 | } 38 | 39 | fn evaluate(&self, input: &RecordBatch) -> Result<ColumnValue> { 40 | // prefer idx first 41 | if let Some(idx) = self.idx { 42 | let column = input.column(idx).clone(); 43 | return Ok(ColumnValue::Array(column)); 44 | } 45 | // then name 46 | if let Some(name) = &self.name { 47 | for (idx, field) in input.schema().fields().iter().enumerate() { 48 | if field.name() == name { 49 | let column = input.column(idx).clone(); 50 | return Ok(ColumnValue::Array(column)); 51 | } 52 | } 53 | } 54 | Err(ErrorCode::LogicalError( 55 | "ColumnExpr must have name or idx".to_string(), 56 | )) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/physical_plan/expression/literal.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-14 21:30:10 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | use crate::logical_plan::expression::ScalarValue; 8 | use std::any::Any; 9 | use std::sync::Arc; 10 | 11 | use super::{PhysicalExpr, PhysicalExprRef}; 12 | use crate::datatype::ColumnValue; 13 | use crate::Result; 14 | use arrow::record_batch::RecordBatch; 15 | 16 | #[derive(Debug)] 17 | pub struct PhysicalLiteralExpr { 18 | pub literal: ScalarValue, 19 | } 20 | 21 | impl PhysicalLiteralExpr { 22 | pub fn create(literal: ScalarValue) -> PhysicalExprRef { 23 | Arc::new(Self { literal }) 24 | } 25 | } 26 | 27 | impl PhysicalExpr for PhysicalLiteralExpr { 28 | fn as_any(&self) -> &dyn Any { 29 | self 30 | } 31 | 32 | fn evaluate(&self, input: &RecordBatch) -> Result<ColumnValue> { 33 | Ok(ColumnValue::Const(self.literal.clone(), input.num_rows())) 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/physical_plan/expression/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-13 14:26:45 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | mod binary; 8 | mod cast; 9 | mod column; 10 | mod literal; 11 | mod unary; 12 | 13 | pub use binary::PhysicalBinaryExpr; 14 | pub use cast::PhysicalCastExpr; 15 | pub use column::ColumnExpr; 16 | pub use literal::PhysicalLiteralExpr; 17 | pub use unary::PhysicalUnaryExpr; 18 | 19 | use crate::{datatype::ColumnValue, error::Result}; 20 | use arrow::record_batch::RecordBatch; 21 | use std::any::Any; 22 | use std::fmt::Debug; 23 | use std::sync::Arc; 24 | 25 | pub trait PhysicalExpr: Debug { 26 | fn as_any(&self) -> &dyn Any; 27 | 28 | fn evaluate(&self, input: &RecordBatch) -> Result<ColumnValue>; 29 | } 30 | 31 | pub type PhysicalExprRef = Arc<dyn PhysicalExpr>; 32 | -------------------------------------------------------------------------------- /src/physical_plan/expression/unary.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: ywqzzy 3 | * @Date: 2022-05-19 4 | */ 5 | use arrow::{ 6 | array::PrimitiveArray, 7 | datatypes::{DataType, Float32Type, Float64Type}, 8 | record_batch::RecordBatch, 9 | }; 10 | use core::fmt; 11 | use std::any::Any; 12 | use std::{ 13 | fmt::{Debug, Formatter}, 14 | sync::Arc, 15 | }; 16 | 17 | use super::{PhysicalExpr, PhysicalExprRef}; 18 | use crate::{datatype::ColumnValue, logical_plan::expression::UnaryOperator}; 19 | 20 | macro_rules! 
unary_arith_op { 21 | ($OP:ident, $DT: expr, $COL: expr) => {{ 22 | match $DT { 23 | DataType::Float64 => { 24 | let value = $COL 25 | .as_any() 26 | .downcast_ref::<PrimitiveArray<Float64Type>>() 27 | .unwrap(); 28 | let res: PrimitiveArray<Float64Type> = 29 | arrow::compute::kernels::arity::unary(value, |x| x.$OP()); 30 | Ok(ColumnValue::Array(Arc::new(res))) 31 | } 32 | DataType::Float32 => { 33 | let value = $COL 34 | .as_any() 35 | .downcast_ref::<PrimitiveArray<Float32Type>>() 36 | .unwrap(); 37 | let res: PrimitiveArray<Float32Type> = 38 | arrow::compute::kernels::arity::unary(value, |x| x.$OP()); 39 | Ok(ColumnValue::Array(Arc::new(res))) 40 | } 41 | _ => unimplemented!(), 42 | } 43 | }}; 44 | } 45 | 46 | pub struct PhysicalUnaryExpr { 47 | expr: PhysicalExprRef, 48 | func: UnaryOperator, 49 | name: String, 50 | return_type: DataType, 51 | } 52 | 53 | impl Debug for PhysicalUnaryExpr { 54 | fn fmt(&self, f: &mut Formatter) -> fmt::Result { 55 | f.debug_struct("UnaryExpr") 56 | .field("func", &"") 57 | .field("name", &self.name) 58 | .field("expr", &self.expr) 59 | .field("return_type", &self.return_type) 60 | .finish() 61 | } 62 | } 63 | 64 | impl PhysicalUnaryExpr { 65 | pub fn create( 66 | expr: PhysicalExprRef, 67 | func: UnaryOperator, 68 | name: String, 69 | return_type: &DataType, 70 | ) -> PhysicalExprRef { 71 | Arc::new(Self { 72 | expr, 73 | func, 74 | name, 75 | return_type: return_type.clone(), 76 | }) 77 | } 78 | } 79 | 80 | impl PhysicalExpr for PhysicalUnaryExpr { 81 | fn as_any(&self) -> &dyn Any { 82 | self 83 | } 84 | 85 | fn evaluate(&self, input: &RecordBatch) -> crate::Result<ColumnValue> { 86 | let value = self.expr.evaluate(input)?; 87 | 88 | let data_type = value.data_type(); 89 | 90 | let value_array = value.into_array(); 91 | 92 | match self.func { 93 | UnaryOperator::Abs => unary_arith_op!(abs, data_type, value_array), 94 | UnaryOperator::Sin => unary_arith_op!(sin, data_type, value_array), 95 | UnaryOperator::Cos => unary_arith_op!(cos, data_type, value_array), 96 | UnaryOperator::Tan => unary_arith_op!(tan, data_type, value_array), 97 | UnaryOperator::Trim => todo!(), 98 | UnaryOperator::LTrim => todo!(), 99 | UnaryOperator::RTrim => todo!(), 100 | UnaryOperator::CharacterLength => todo!(), 101 | UnaryOperator::Lower => todo!(), 102 | UnaryOperator::Upper => todo!(), 103 | UnaryOperator::Repeat => todo!(), 104 | UnaryOperator::Replace => todo!(), 105 | UnaryOperator::Reverse => todo!(), 106 | UnaryOperator::Substr => todo!(), 107 | } 108 | } 109 | } 110 | 111 | #[cfg(test)] 112 | mod tests { 113 | use super::*; 114 | use crate::datasource::{CsvConfig, CsvTable}; 115 | use crate::error::Result; 116 | use crate::logical_plan::expression::UnaryOperator; 117 | use crate::physical_plan::expression::ColumnExpr; 118 | use crate::physical_plan::PhysicalUnaryExpr; 119 | use arrow::array::{ArrayRef, Float64Array}; 120 | use arrow::datatypes::DataType; 121 | 122 | #[test] 123 | fn test_abs_expression() -> Result<()> { 124 | let table = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?; 125 | let abs_expr = PhysicalUnaryExpr::create( 126 | ColumnExpr::try_create(Some("score".to_string()), None)?, 127 | UnaryOperator::Abs, 128 | "abs".to_string(), 129 | &DataType::Float64, 130 | ); 131 | let batches = table.scan(Some(vec![3]))?; 132 | let res = abs_expr.evaluate(&batches[0])?.into_array(); 133 | let score_expected: ArrayRef = Arc::new(Float64Array::from(vec![ 134 | Some(60.0), 135 | Some(90.1), 136 | Some(99.99), 137 | Some(81.1), 138 | Some(82.2), 139 | Some(83.3), 140 | Some(84.4), 141 | Some(85.5), 142 | ])); 143 | assert_eq!(&res, 
&score_expected); 144 | Ok(()) 145 | } 146 | 147 | #[test] 148 | fn test_sin_expression() -> Result<()> { 149 | let table = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?; 150 | let sin_expr = PhysicalUnaryExpr::create( 151 | ColumnExpr::try_create(Some("score".to_string()), None)?, 152 | UnaryOperator::Sin, 153 | "sin".to_string(), 154 | &DataType::Float64, 155 | ); 156 | let batches = table.scan(Some(vec![3]))?; 157 | let res = sin_expr.evaluate(&batches[0])?.into_array(); 158 | let score_expected: ArrayRef = Arc::new(Float64Array::from(vec![ 159 | Some(-0.3048106211022167), 160 | Some(0.8447976840197418), 161 | Some(-0.5149633680424761), 162 | Some(-0.5492019627147913), 163 | Some(0.49565689358989423), 164 | Some(0.9988580516952367), 165 | Some(0.4104993826174394), 166 | Some(-0.6264561960895026), 167 | ])); 168 | assert_eq!(&res, &score_expected); 169 | Ok(()) 170 | } 171 | } 172 | -------------------------------------------------------------------------------- /src/physical_plan/hash_join.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-19 14:17:29 4 | * @Last Modified by: Veeupup 5 | * @Last Modified time: 2022-05-19 17:28:50 6 | */ 7 | 8 | use arrow::array::ArrayRef; 9 | use arrow::array::Int64Builder; 10 | use arrow::array::PrimitiveArray; 11 | use arrow::array::StringArray; 12 | use arrow::compute; 13 | use arrow::compute::concat; 14 | use arrow::datatypes::DataType; 15 | 16 | use arrow::datatypes::Int64Type; 17 | use arrow::datatypes::SchemaRef; 18 | use arrow::datatypes::UInt64Type; 19 | use arrow::record_batch::RecordBatch; 20 | 21 | use twox_hash::XxHash64; 22 | 23 | use super::PhysicalPlan; 24 | use super::PhysicalPlanRef; 25 | use crate::error::ErrorCode; 26 | use crate::logical_plan::expression::Column; 27 | use crate::logical_plan::plan::JoinType; 28 | use crate::logical_plan::schema::NaiveSchema; 29 | use crate::physical_plan::ColumnExpr; 30 | 31 | use crate::Result; 32 | use std::collections::HashMap; 33 | 34 | use std::hash::Hasher; 35 | 36 | use std::sync::Arc; 37 | use std::sync::Mutex; 38 | 39 | /// HashJoin runs in two phases: 40 | /// 1. the build phase builds a HashMap over the outer table, hashing each row's on-column value 41 | ///    hashmap: col hash val -> vec of row ids 42 | /// 2. the probe phase scans the inner table, probing the map with each on-column value 43 | #[derive(Debug)] 44 | pub struct HashJoin { 45 | left: PhysicalPlanRef, 46 | right: PhysicalPlanRef, 47 | on: Vec<(Column, Column)>, 48 | #[allow(unused)] 49 | join_type: JoinType, 50 | schema: NaiveSchema, 51 | /// on col hash val and row id 52 | /// chain hash table 53 | hashtable: Mutex<HashMap<u64, Vec<usize>>>, 54 | /// data, combine all data in one record batch 55 | data: Mutex<Option<RecordBatch>>, 56 | } 57 | 58 | macro_rules! build_match { 59 | ($LEFT_COL: expr, $TYPE: ty, $SINGLE_BATCH: expr, $HASHTABLE: expr, $WRITE_DT: ident) => {{ 60 | let left_col = $LEFT_COL 61 | .as_any() 62 | .downcast_ref::<PrimitiveArray<$TYPE>>() 63 | .unwrap(); 64 | 65 | // build hashmap 66 | for i in 0..$SINGLE_BATCH.num_rows() { 67 | let left_val = left_col.value(i); 68 | let mut hasher = XxHash64::default(); 69 | hasher.$WRITE_DT(left_val); 70 | let hash_val = hasher.finish(); 71 | if let Some(vec) = $HASHTABLE.get_mut(&hash_val) { 72 | vec.push(i); 73 | } else { 74 | $HASHTABLE.insert(hash_val, vec![i]); 75 | } 76 | } 77 | }}; 78 | } 79 | 
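// The build phase boils down to the sketch below: hash each key with XxHash64 and
// chain its row ids under the hash value; the probe side then looks up the same
// hash and re-checks real key equality. This helper mirrors the macro above for a
// plain i64 slice and is illustrative only, not part of the engine.
#[allow(dead_code)]
fn build_index_sketch(keys: &[i64]) -> HashMap<u64, Vec<usize>> {
    let mut index: HashMap<u64, Vec<usize>> = HashMap::new();
    for (row, key) in keys.iter().enumerate() {
        let mut hasher = XxHash64::default();
        hasher.write_i64(*key);
        // rows whose keys collide on the hash are chained; the probe phase must
        // still compare the actual key values before emitting a match
        index.entry(hasher.finish()).or_insert_with(Vec::new).push(row);
    }
    index
}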
80 | macro_rules! probe_match { 81 | ($RIGHT_COL: expr, $LEFT_COL: expr, $TYPE: ty, $RIGHT_BATCH: expr, $HASHTABLE: expr, $OUTER_POS: expr, $INNER_POS: expr, $WRITE_DT: ident) => {{ 82 | let right_col = $RIGHT_COL.as_any().downcast_ref::<$TYPE>().unwrap(); 83 | let left_col = $LEFT_COL.as_any().downcast_ref::<$TYPE>().unwrap(); 84 | 85 | // probe 86 | for i in 0..$RIGHT_BATCH.num_rows() { 87 | let right_val = right_col.value(i); 88 | let mut hasher = XxHash64::default(); 89 | hasher.$WRITE_DT(right_val); 90 | let hash_val = hasher.finish(); 91 | 92 | if let Some(left_pos) = $HASHTABLE.get(&hash_val) { 93 | for idx in left_pos { 94 | // hash val same, but we need to check whether real value equal or not 95 | if left_col.value(*idx) == right_col.value(i) { 96 | $OUTER_POS.append_value(*idx as i64)?; 97 | $INNER_POS.append_value(i as i64)?; 98 | } 99 | } 100 | } 101 | } 102 | }}; 103 | } 104 | 105 | impl HashJoin { 106 | pub fn create( 107 | left: PhysicalPlanRef, 108 | right: PhysicalPlanRef, 109 | on: Vec<(Column, Column)>, 110 | join_type: JoinType, 111 | schema: NaiveSchema, 112 | ) -> PhysicalPlanRef { 113 | Arc::new(Self { 114 | left, 115 | right, 116 | on, 117 | join_type, 118 | schema, 119 | hashtable: Mutex::new(HashMap::new()), 120 | data: Mutex::new(None), 121 | }) 122 | } 123 | 124 | pub fn build(&self) -> Result<Vec<ArrayRef>> { 125 | if self.on.is_empty() { 126 | return Err(ErrorCode::PlanError( 127 | "Inner Join on conditions can't be empty".to_string(), 128 | )); 129 | } 130 | 131 | let left = self.left.execute()?; 132 | let single_batch = concat_batches(&self.left.schema().clone().into(), &left)?; 133 | 134 | let (left_col, _) = &self.on[0]; 135 | let left_col = ColumnExpr::try_create(Some(left_col.name.clone()), None)?; 136 | let left_col = left_col.evaluate(&single_batch)?.into_array(); 137 | 138 | let mut hashtable = self.hashtable.lock().unwrap(); 139 | match left_col.data_type() { 140 | DataType::Int64 => { 141 | build_match!(left_col, Int64Type, single_batch, hashtable, write_i64) 142 | } 143 | DataType::UInt64 => { 144 | build_match!(left_col, UInt64Type, single_batch, hashtable, write_u64) 145 | } 146 | DataType::Utf8 => { 147 | let left_col = left_col.as_any().downcast_ref::<StringArray>().unwrap(); 148 | 149 | // build hashmap 150 | for i in 0..single_batch.num_rows() { 151 | let mut hasher = XxHash64::default(); 152 | hasher.write(left_col.value(i).as_bytes()); 153 | let hash_val = hasher.finish(); 154 | if let Some(vec) = hashtable.get_mut(&hash_val) { 155 | vec.push(i); 156 | } else { 157 | hashtable.insert(hash_val, vec![i]); 158 | } 159 | } 160 | } 161 | _ => return Err(ErrorCode::NotImplemented), 162 | } 163 | 164 | *self.data.lock().unwrap() = Some(single_batch); 165 | Ok(vec![left_col]) 166 | } 167 | 168 | pub fn probe(&self, left_cols: Vec<ArrayRef>) -> Result<Vec<RecordBatch>> { 169 | let right_batches = self.right.execute()?; 170 | 171 | let (_, right_col) = &self.on[0]; 172 | let right_col = ColumnExpr::try_create(Some(right_col.name.clone()), None)?; 173 | let left_col = &left_cols[0]; 174 | 175 | let mut batches = vec![]; 176 | 177 | for right_batch in &right_batches { 178 | let right_col = right_col.evaluate(right_batch)?.into_array(); 179 | 180 | let hashtable = self.hashtable.lock().unwrap(); 181 | 182 | let mut outer_pos = Int64Builder::new(left_col.len()); 183 | let mut inner_pos = Int64Builder::new(right_col.len()); 184 | match right_col.data_type() { 185 | DataType::Int64 => probe_match!( 186 | right_col, 187 | left_col, 188 | PrimitiveArray<Int64Type>, 189 | right_batch, 190 | hashtable, 191 | outer_pos, 192 | inner_pos, 193 | write_i64 194 | ), 195 | DataType::UInt64 => probe_match!( 196 | right_col, 197 | left_col, 198 | PrimitiveArray<UInt64Type>, 199 | right_batch, 200 | hashtable, 201 | outer_pos, 202 | inner_pos, 203 | write_u64 204 | ), 205 | DataType::Utf8 => { 206 | let right_col = right_col.as_any().downcast_ref::<StringArray>().unwrap(); 207 | let left_col = left_col.as_any().downcast_ref::<StringArray>().unwrap(); 208 | 209 | // probe 210 | for i in 0..right_batch.num_rows() { 211 | let mut hasher = XxHash64::default(); 212 | hasher.write(right_col.value(i).as_bytes()); 213 | let hash_val = hasher.finish(); 214 | 215 | if let Some(left_pos) = hashtable.get(&hash_val) { 216 | for idx in left_pos { 217 | // hash val same, but we need to check whether real value equal or not 218 | if left_col.value(*idx) == right_col.value(i) { 219 | outer_pos.append_value(*idx as i64)?; 220 | inner_pos.append_value(i as i64)?; 221 | } 222 | } 223 | } 224 | } 225 | } 226 | _ => return Err(ErrorCode::NotImplemented), 227 | } 228 | 229 | let mut columns = vec![]; 230 | 231 | let outer_pos = outer_pos.finish(); 232 | let inner_pos = inner_pos.finish(); 233 | 234 | // add left columns 235 | let data = self.data.lock().unwrap(); 236 | if let Some(outer_table) = &*data { 237 | for i in 0..self.left.schema().fields().len() { 238 | let array = outer_table.column(i); 239 | columns.push(compute::take(array.as_ref(), &outer_pos, None)?); 240 | } 241 | 242 | // add right columns 243 | for i in 0..self.right.schema().fields().len() { 244 | let array = right_batch.column(i); 245 | columns.push(compute::take(array.as_ref(), &inner_pos, None)?); 246 | } 247 | 248 | let batch = RecordBatch::try_new(SchemaRef::from(self.schema.clone()), columns)?; 249 | batches.push(batch); 250 | } 251 | } 252 | 253 | Ok(batches) 254 | } 255 | } 256 | 257 | /// Concatenates an array of `RecordBatch` into one batch 258 | pub fn concat_batches(schema: &SchemaRef, batches: &[RecordBatch]) -> Result<RecordBatch> { 259 | if batches.is_empty() { 260 | return Ok(RecordBatch::new_empty(schema.clone())); 261 | } 262 | let mut arrays = Vec::with_capacity(schema.fields().len()); 263 | for i in 0..schema.fields().len() { 264 | let array = concat( 265 | &batches 266 | .iter() 267 | .map(|batch| batch.column(i).as_ref()) 268 | .collect::<Vec<_>>(), 269 | )?; 270 | arrays.push(array); 271 | } 272 | Ok(RecordBatch::try_new(schema.clone(), arrays)?) 273 | } 274 | 
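// A small self-contained check of `concat_batches`, assuming two in-memory batches
// that share a single Int64 column; the module is an illustrative sketch, not part
// of the engine.
#[cfg(test)]
mod concat_sketch {
    use super::concat_batches;
    use arrow::array::{ArrayRef, Int64Array};
    use arrow::datatypes::{DataType, Field, Schema};
    use arrow::record_batch::RecordBatch;
    use std::sync::Arc;

    #[test]
    fn concat_keeps_all_rows() -> crate::Result<()> {
        let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int64, false)]));
        let col_a: ArrayRef = Arc::new(Int64Array::from(vec![1, 2]));
        let col_b: ArrayRef = Arc::new(Int64Array::from(vec![3]));
        let a = RecordBatch::try_new(schema.clone(), vec![col_a])?;
        let b = RecordBatch::try_new(schema.clone(), vec![col_b])?;
        // the merged batch holds every row of the inputs, in order
        let merged = concat_batches(&schema, &[a, b])?;
        assert_eq!(merged.num_rows(), 3);
        Ok(())
    }
}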
275 | impl PhysicalPlan for HashJoin { 276 | fn schema(&self) -> &NaiveSchema { 277 | &self.schema 278 | } 279 | 280 | fn execute(&self) -> Result<Vec<RecordBatch>> { 281 | let left_cols = self.build()?; 282 | 283 | self.probe(left_cols) 284 | } 285 | 286 | fn children(&self) -> Result<Vec<PhysicalPlanRef>> { 287 | Ok(vec![self.left.clone(), self.right.clone()]) 288 | } 289 | } 290 | -------------------------------------------------------------------------------- /src/physical_plan/limit.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-17 11:27:29 4 | * @Last Modified by: Veeupup 5 | * @Last Modified time: 2022-05-18 14:45:03 6 | */ 7 | 8 | use super::{PhysicalPlan, PhysicalPlanRef}; 9 | use crate::error::Result; 10 | use crate::logical_plan::schema::NaiveSchema; 11 | 12 | use arrow::record_batch::RecordBatch; 13 | use std::sync::Arc; 14 | 15 | #[derive(Debug, Clone)] 16 | pub struct PhysicalLimitPlan { 17 | input: PhysicalPlanRef, 18 | n: usize, 19 | } 20 | 21 | impl PhysicalLimitPlan { 22 | pub fn create(input: PhysicalPlanRef, n: usize) -> PhysicalPlanRef { 23 | Arc::new(Self { input, n }) 24 | } 25 | } 26 | 27 | impl PhysicalPlan for PhysicalLimitPlan { 28 | fn schema(&self) -> &NaiveSchema { 29 | self.input.schema() 30 | } 31 | 32 | fn execute(&self) -> Result<Vec<RecordBatch>> { 33 | let batches = self.input.execute()?; 34 | let mut n = self.n; 35 | let mut ret = vec![]; 36 | for batch in &batches { 37 | if n == 0 { 38 | break; 39 | } 40 | if batch.num_rows() <= n { 41 | ret.push(batch.clone()); 42 | n -= batch.num_rows(); 43 | } else { 44 | ret.push(batch.slice(0, n)); 45 | n = 0; 46 | }; 47 | } 48 | Ok(ret) 49 | } 50 | 51 | fn children(&self) -> Result<Vec<PhysicalPlanRef>> { 52 | Ok(vec![self.input.clone()]) 53 | } 54 | } 55 | 56 | #[cfg(test)] 57 | mod tests { 58 | use crate::{ 59 | datasource::{CsvConfig, CsvTable}, 60 | physical_plan::ScanPlan, 61 | }; 62 | use arrow::array::{ArrayRef, Float64Array, Int64Array, StringArray}; 63 | 64 | use super::*; 65 | 66 | #[test] 67 | fn test_physical_limit() -> Result<()> { 68 | let source = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?; 69 | 70 | let scan_plan = ScanPlan::create(source, None); 71 | let limit_plan = PhysicalLimitPlan::create(scan_plan, 2); 72 | 73 | let result = limit_plan.execute()?; 74 | 75 | assert_eq!(result.len(), 1); 76 | let record_batch = &result[0]; 77 | assert_eq!(record_batch.columns().len(), 4); 78 | 79 | let id_expected: ArrayRef = Arc::new(Int64Array::from(vec![1, 2])); 80 | let name_expected: ArrayRef = Arc::new(StringArray::from(vec!["veeupup", "alex"])); 81 | let age_expected: ArrayRef = Arc::new(Int64Array::from(vec![23, 20])); 82 | let score_expected: ArrayRef = Arc::new(Float64Array::from(vec![60.0, 90.1])); 83 | 84 | assert_eq!(record_batch.column(0), &id_expected); 85 | assert_eq!(record_batch.column(1), &name_expected); 86 | assert_eq!(record_batch.column(2), &age_expected); 87 | assert_eq!(record_batch.column(3), &score_expected); 88 | 89 | Ok(()) 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /src/physical_plan/mod.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-13 14:07:36 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | mod expression; 8 | mod plan; 9 | 10 | mod aggregate; 11 | mod cross_join; 12 | mod hash_join; 13 | mod limit; 14 | mod nested_loop_join; 15 | mod offset; 16 | mod projection; 17 | mod scan; 18 | mod 
selection; 19 | mod visitor; 20 | 21 | pub use aggregate::*; 22 | pub use cross_join::*; 23 | pub use expression::*; 24 | pub use hash_join::*; 25 | pub use limit::*; 26 | pub use nested_loop_join::*; 27 | pub use offset::*; 28 | pub use plan::*; 29 | pub use projection::*; 30 | pub use scan::*; 31 | pub use selection::*; 32 | -------------------------------------------------------------------------------- /src/physical_plan/nested_loop_join.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-18 16:00:13 4 | * @Last Modified by: Veeupup 5 | * @Last Modified time: 2022-05-19 15:53:24 6 | */ 7 | use super::PhysicalPlan; 8 | use super::PhysicalPlanRef; 9 | use crate::error::ErrorCode; 10 | use crate::logical_plan::expression::Column; 11 | use crate::logical_plan::plan::JoinType; 12 | use crate::logical_plan::schema::NaiveSchema; 13 | use crate::physical_plan::ColumnExpr; 14 | 15 | use crate::Result; 16 | use std::sync::Arc; 17 | 18 | use arrow::array::Array; 19 | use arrow::array::Int64Builder; 20 | use arrow::array::PrimitiveArray; 21 | use arrow::array::StringArray; 22 | use arrow::compute; 23 | use arrow::datatypes::DataType; 24 | use arrow::datatypes::Float64Type; 25 | use arrow::datatypes::Int64Type; 26 | use arrow::datatypes::SchemaRef; 27 | use arrow::datatypes::UInt64Type; 28 | use arrow::record_batch::RecordBatch; 29 | 30 | #[derive(Debug)] 31 | pub struct NestedLoopJoin { 32 | left: PhysicalPlanRef, 33 | right: PhysicalPlanRef, 34 | on: Vec<(Column, Column)>, 35 | #[allow(unused)] 36 | join_type: JoinType, 37 | schema: NaiveSchema, 38 | } 39 | 40 | impl NestedLoopJoin { 41 | #[allow(unused)] 42 | pub fn create( 43 | left: PhysicalPlanRef, 44 | right: PhysicalPlanRef, 45 | on: Vec<(Column, Column)>, 46 | join_type: JoinType, 47 | schema: NaiveSchema, 48 | ) -> PhysicalPlanRef { 49 | Arc::new(Self { 50 | left, 51 | right, 52 | on, 53 | join_type, 54 | schema, 55 | }) 56 | } 57 | } 58 | 59 | macro_rules! 
join_match { 60 | ($DATATYPE: ty, $LEFT_COL: expr, $RIGHT_COL: expr, $OUTER_POS: expr, $INNER_POS: expr) => {{ 61 | let left_col = $LEFT_COL 62 | .as_any() 63 | .downcast_ref::<PrimitiveArray<$DATATYPE>>() 64 | .unwrap(); 65 | let right_col = $RIGHT_COL 66 | .as_any() 67 | .downcast_ref::<PrimitiveArray<$DATATYPE>>() 68 | .unwrap(); 69 | 70 | for (x_pos, x) in left_col.iter().enumerate() { 71 | for (y_pos, y) in right_col.iter().enumerate() { 72 | match (x, y) { 73 | (Some(x), Some(y)) => { 74 | if x == y { 75 | // values are equal, so record the matching row positions 76 | $OUTER_POS.append_value(x_pos as i64)?; 77 | $INNER_POS.append_value(y_pos as i64)?; 78 | } 79 | } 80 | _ => {} 81 | } 82 | } 83 | } 84 | }}; 85 | } 86 | 87 | impl PhysicalPlan for NestedLoopJoin { 88 | fn schema(&self) -> &NaiveSchema { 89 | &self.schema 90 | } 91 | 92 | fn execute(&self) -> Result<Vec<RecordBatch>> { 93 | let outer_table = self.left.execute()?; 94 | let inner_table = self.right.execute()?; 95 | 96 | let mut batches: Vec<RecordBatch> = vec![]; 97 | // TODO(veeupup): support multi on conditions 98 | // Using for loop to combine different conditions 99 | if self.on.is_empty() { 100 | return Err(ErrorCode::PlanError( 101 | "Inner Join on conditions can't be empty".to_string(), 102 | )); 103 | } 104 | 105 | let (left_col, right_col) = &self.on[0]; 106 | // TODO(veeupup): consider make left_col in physical plan and not create when executing 107 | let left_col = ColumnExpr::try_create(Some(left_col.name.clone()), None)?; 108 | let right_col = ColumnExpr::try_create(Some(right_col.name.clone()), None)?; 109 | 110 | for outer in &outer_table { 111 | let left_col = left_col.evaluate(outer)?.into_array(); 112 | 113 | let dt = left_col.data_type(); 114 | for inner in &inner_table { 115 | let right_col = right_col.evaluate(inner)?.into_array(); 116 | 117 | // check if ok 118 | if left_col.data_type() != right_col.data_type() { 119 | return Err(ErrorCode::PlanError(format!( 120 | "Join on left and right data type should be same: left: {:?}, right: {:?}", 121 | left_col.data_type(), 122 | right_col.data_type() 123 | ))); 124 | } 125 | 126 | let mut outer_pos = Int64Builder::new(left_col.len()); 127 | let mut inner_pos = Int64Builder::new(right_col.len()); 128 | match dt { 129 | DataType::Int64 => { 130 | join_match!(Int64Type, left_col, right_col, outer_pos, inner_pos) 131 | } 132 | DataType::UInt64 => { 133 | join_match!(UInt64Type, left_col, right_col, outer_pos, inner_pos) 134 | } 135 | DataType::Float64 => { 136 | join_match!(Float64Type, left_col, right_col, outer_pos, inner_pos) 137 | } 138 | DataType::Utf8 => { 139 | let left_col = left_col.as_any().downcast_ref::<StringArray>().unwrap(); 140 | let right_col = right_col.as_any().downcast_ref::<StringArray>().unwrap(); 141 | 142 | for (x_pos, x) in left_col.iter().enumerate() { 143 | for (y_pos, y) in right_col.iter().enumerate() { 144 | if let (Some(x), Some(y)) = (x, y) { 145 | if x == y { 146 | // values are equal, so record the matching row positions 147 | outer_pos.append_value(x_pos as i64)?; 148 | inner_pos.append_value(y_pos as i64)?; 149 | } 150 | } 151 | } 152 | } 153 | } 154 | _ => unimplemented!(), 155 | } 156 | let mut columns = vec![]; 157 | 158 | let outer_pos = outer_pos.finish(); 159 | let inner_pos = inner_pos.finish(); 160 | 161 | // add left columns 162 | for i in 0..self.left.schema().fields().len() { 163 | let array = outer.column(i); 164 | columns.push(compute::take(array.as_ref(), &outer_pos, None)?); 165 | } 166 | 167 | // add right columns 168 | for i in 0..self.right.schema().fields().len() { 169 | let array = inner.column(i); 170 | columns.push(compute::take(array.as_ref(), &inner_pos, None)?); 171 | } 
172 | 173 | let batch = RecordBatch::try_new(SchemaRef::from(self.schema.clone()), columns)?; 174 | batches.push(batch); 175 | } 176 | } 177 | 178 | Ok(batches) 179 | } 180 | 181 | fn children(&self) -> Result<Vec<PhysicalPlanRef>> { 182 | Ok(vec![self.left.clone(), self.right.clone()]) 183 | } 184 | } 185 | -------------------------------------------------------------------------------- /src/physical_plan/offset.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: GanZiheng 3 | * @Date: 2022-05-25 4 | */ 5 | 6 | use super::{PhysicalPlan, PhysicalPlanRef}; 7 | use crate::error::Result; 8 | use crate::logical_plan::schema::NaiveSchema; 9 | 10 | use arrow::record_batch::RecordBatch; 11 | use std::sync::Arc; 12 | 13 | #[derive(Debug, Clone)] 14 | pub struct PhysicalOffsetPlan { 15 | input: PhysicalPlanRef, 16 | n: usize, 17 | } 18 | 19 | impl PhysicalOffsetPlan { 20 | pub fn create(input: PhysicalPlanRef, n: usize) -> PhysicalPlanRef { 21 | Arc::new(Self { input, n }) 22 | } 23 | } 24 | 25 | impl PhysicalPlan for PhysicalOffsetPlan { 26 | fn schema(&self) -> &NaiveSchema { 27 | self.input.schema() 28 | } 29 | 30 | fn execute(&self) -> Result<Vec<RecordBatch>> { 31 | let batches = self.input.execute()?; 32 | let mut n = self.n; 33 | let mut ret = vec![]; 34 | 35 | for batch in &batches { 36 | if n == 0 { 37 | ret.push(batch.clone()); 38 | continue; 39 | } 40 | 41 | if n >= batch.num_rows() { 42 | n -= batch.num_rows(); 43 | continue; 44 | } 45 | 46 | let remain = batch.num_rows() - n; 47 | ret.push(batch.slice(n, remain)); 48 | n = 0; 49 | } 50 | Ok(ret) 51 | } 52 | 53 | fn children(&self) -> Result<Vec<PhysicalPlanRef>> { 54 | Ok(vec![self.input.clone()]) 55 | } 56 | } 57 | 58 | #[cfg(test)] 59 | mod tests { 60 | use crate::{ 61 | datasource::{CsvConfig, CsvTable}, 62 | physical_plan::ScanPlan, 63 | }; 64 | use arrow::array::{ArrayRef, Float64Array, Int64Array, StringArray}; 65 | 66 | use super::*; 67 | 68 | #[test] 69 | fn test_physical_offset() -> Result<()> { 70 | let source = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?; 71 | 72 | let scan_plan = ScanPlan::create(source, None); 73 | let offset_plan = PhysicalOffsetPlan::create(scan_plan, 5); 74 | 75 | let result = offset_plan.execute()?; 76 | 77 | assert_eq!(result.len(), 1); 78 | let record_batch = &result[0]; 79 | assert_eq!(record_batch.columns().len(), 4); 80 | 81 | let id_expected: ArrayRef = Arc::new(Int64Array::from(vec![7, 8, 9])); 82 | let name_expected: ArrayRef = Arc::new(StringArray::from(vec!["jack", "cock", "primer"])); 83 | let age_expected: ArrayRef = Arc::new(Int64Array::from(vec![21, 22, 23])); 84 | let score_expected: ArrayRef = Arc::new(Float64Array::from(vec![83.3, 84.4, 85.5])); 85 | 86 | assert_eq!(record_batch.column(0), &id_expected); 87 | assert_eq!(record_batch.column(1), &name_expected); 88 | assert_eq!(record_batch.column(2), &age_expected); 89 | assert_eq!(record_batch.column(3), &score_expected); 90 | 91 | Ok(()) 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/physical_plan/plan.rs: -------------------------------------------------------------------------------- 1 | /* 2 | * @Author: Veeupup 3 | * @Date: 2022-05-13 14:23:58 4 | * @Email: code@tanweime.com 5 | */ 6 | 7 | use std::fmt::Debug; 8 | use std::sync::Arc; 9 | 10 | use arrow::record_batch::RecordBatch; 11 | 12 | use crate::{error::Result, logical_plan::schema::NaiveSchema}; 13 | 14 | pub trait PhysicalPlan: Debug { 15 | fn schema(&self) -> &NaiveSchema; 16 | 17 | // 
TODO(veeupup): return by using streaming mode 18 | fn execute(&self) -> Result<Vec<RecordBatch>>; 19 | 20 | fn children(&self) -> Result<Vec<PhysicalPlanRef>>; 21 | } 22 | 23 | pub type PhysicalPlanRef = Arc<dyn PhysicalPlan>; 24 | 
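// A minimal sketch of implementing this trait, assuming a leaf plan that simply
// produces no batches; the type is hypothetical and only illustrates the contract
// (a schema, a one-shot execute, and a list of child plans).
#[allow(dead_code)]
#[derive(Debug)]
struct EmptyExample {
    schema: NaiveSchema,
}

impl PhysicalPlan for EmptyExample {
    fn schema(&self) -> &NaiveSchema {
        &self.schema
    }

    fn execute(&self) -> Result<Vec<RecordBatch>> {
        Ok(vec![]) // no rows to produce
    }

    fn children(&self) -> Result<Vec<PhysicalPlanRef>> {
        Ok(vec![]) // leaf node, no inputs
    }
}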
--------------------------------------------------------------------------------
/src/physical_plan/projection.rs:
--------------------------------------------------------------------------------
/*
 * @Author: Veeupup
 * @Date: 2022-05-13 14:54:33
 * @Email: code@tanweime.com
 */

use std::iter::Iterator;
use std::sync::Arc;

use super::plan::PhysicalPlan;
use crate::error::Result;
use crate::logical_plan::schema::NaiveSchema;
use crate::physical_plan::PhysicalExprRef;
use crate::physical_plan::PhysicalPlanRef;
use arrow::datatypes::SchemaRef;
use arrow::record_batch::RecordBatch;

#[derive(Debug, Clone)]
pub struct ProjectionPlan {
    input: PhysicalPlanRef,
    schema: NaiveSchema,
    expr: Vec<PhysicalExprRef>,
}

impl ProjectionPlan {
    pub fn create(
        input: PhysicalPlanRef,
        schema: NaiveSchema,
        expr: Vec<PhysicalExprRef>,
    ) -> PhysicalPlanRef {
        Arc::new(Self {
            input,
            schema,
            expr,
        })
    }
}

impl PhysicalPlan for ProjectionPlan {
    fn schema(&self) -> &NaiveSchema {
        &self.schema
    }

    fn execute(&self) -> Result<Vec<RecordBatch>> {
        let input = self.input.execute()?;

        // when aggregating, we just pass the input through unchanged
        if self.schema.fields().is_empty() {
            Ok(input)
        } else {
            let batches = input
                .iter()
                .map(|batch| {
                    let columns = self
                        .expr
                        .iter()
                        // TODO(veeupup): remove unwrap
                        .map(|expr| expr.evaluate(batch).unwrap())
                        .collect::<Vec<_>>();
                    let columns = columns
                        .iter()
                        .map(|column| column.clone().into_array())
                        .collect::<Vec<_>>();
                    // TODO(veeupup): remove unwrap
                    // let projection_schema = self.schema.into();
                    RecordBatch::try_new(SchemaRef::from(self.schema.clone()), columns).unwrap()
                })
                .collect::<Vec<_>>();
            Ok(batches)
        }
    }

    fn children(&self) -> Result<Vec<PhysicalPlanRef>> {
        Ok(vec![self.input.clone()])
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::datasource::{CsvConfig, CsvTable};
    use crate::logical_plan::expression::{Operator, ScalarValue};
    use crate::physical_plan::expression::ColumnExpr;
    use crate::physical_plan::scan::ScanPlan;
    use crate::physical_plan::{PhysicalBinaryExpr, PhysicalLiteralExpr};
    use arrow::array::{ArrayRef, Int64Array, StringArray};

    #[test]
    fn test_projection() -> Result<()> {
        let source = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?;
        let schema = NaiveSchema::new(vec![
            source.schema().field(0).clone(),
            source.schema().field(1).clone(),
        ]);
        let scan_plan = ScanPlan::create(source, None);
        let add_expr = PhysicalBinaryExpr::create(
            ColumnExpr::try_create(Some("id".to_string()), None)?,
            Operator::Plus,
            PhysicalLiteralExpr::create(ScalarValue::Int64(Some(1))),
        );
        let expr = vec![
            // ColumnExpr::try_create(None, Some(0))?,
            add_expr,
            ColumnExpr::try_create(Some("name".to_string()), None)?,
        ];
        let proj_plan = ProjectionPlan::create(scan_plan, schema, expr);

        let res = proj_plan.execute()?;

        assert_eq!(res.len(), 1);
        let batch = &res[0];

        // let id_expected: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 4, 5, 6, 7, 8, 9]));
        let name_expected: ArrayRef = Arc::new(StringArray::from(vec![
            "veeupup", "alex", "lynne", "alice", "bob", "jack", "cock", "primer",
        ]));
        let id_expected2: ArrayRef = Arc::new(Int64Array::from(vec![2, 3, 5, 6, 7, 8, 9, 10]));
        assert_eq!(batch.column(0), &id_expected2);
        assert_eq!(batch.column(1), &name_expected);

        Ok(())
    }
}
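
Both TODO(veeupup) comments in `execute` point at the same problem: `unwrap` inside the `map` closures turns an expression failure into a panic. A hedged sketch of how they could be propagated instead, assuming `evaluate` returns this crate's `Result` and that `?` converts `ArrowError` into `ErrorCode` (which the `?` on `RecordBatch::try_new` in selection.rs suggests):

// Sketch only: an error-propagating body for ProjectionPlan::execute.
let input = self.input.execute()?;
let mut batches = Vec::with_capacity(input.len());
for batch in &input {
    // Stop at the first failing expression instead of panicking.
    let columns = self
        .expr
        .iter()
        .map(|expr| Ok(expr.evaluate(batch)?.into_array()))
        .collect::<Result<Vec<_>>>()?;
    batches.push(RecordBatch::try_new(
        SchemaRef::from(self.schema.clone()),
        columns,
    )?);
}
Ok(batches)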
--------------------------------------------------------------------------------
/src/physical_plan/scan.rs:
--------------------------------------------------------------------------------
/*
 * @Author: Veeupup
 * @Date: 2022-05-13 14:26:59
 * @Email: code@tanweime.com
 */

use std::sync::Arc;

use crate::datasource::TableRef;
use crate::error::Result;
use crate::logical_plan::schema::NaiveSchema;
use arrow::record_batch::RecordBatch;

use crate::physical_plan::PhysicalPlan;
use crate::physical_plan::PhysicalPlanRef;

#[derive(Debug, Clone)]
pub struct ScanPlan {
    source: TableRef,
    projection: Option<Vec<usize>>,
}

impl ScanPlan {
    pub fn create(source: TableRef, projection: Option<Vec<usize>>) -> PhysicalPlanRef {
        Arc::new(Self { source, projection })
    }
}

impl PhysicalPlan for ScanPlan {
    fn schema(&self) -> &NaiveSchema {
        self.source.schema()
    }

    fn execute(&self) -> Result<Vec<RecordBatch>> {
        self.source.scan(self.projection.clone())
    }

    fn children(&self) -> Result<Vec<PhysicalPlanRef>> {
        Ok(vec![])
    }
}

#[cfg(test)]
mod tests {
    use crate::datasource::{CsvConfig, CsvTable};
    use arrow::array::{ArrayRef, Float64Array, Int64Array, StringArray};

    use super::*;

    #[test]
    fn test_physical_scan() -> Result<()> {
        let source = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?;

        let scan_plan = ScanPlan::create(source, None);

        let result = scan_plan.execute()?;

        assert_eq!(result.len(), 1);
        let record_batch = &result[0];
        assert_eq!(record_batch.columns().len(), 4);

        let id_expected: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 4, 5, 6, 7, 8, 9]));
        let name_expected: ArrayRef = Arc::new(StringArray::from(vec![
            "veeupup", "alex", "lynne", "alice", "bob", "jack", "cock", "primer",
        ]));
        let age_expected: ArrayRef =
            Arc::new(Int64Array::from(vec![23, 20, 18, 19, 20, 21, 22, 23]));
        let score_expected: ArrayRef = Arc::new(Float64Array::from(vec![
            60.0, 90.1, 99.99, 81.1, 82.2, 83.3, 84.4, 85.5,
        ]));

        assert_eq!(record_batch.column(0), &id_expected);
        assert_eq!(record_batch.column(1), &name_expected);
        assert_eq!(record_batch.column(2), &age_expected);
        assert_eq!(record_batch.column(3), &score_expected);

        Ok(())
    }
}
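
`ScanPlan` is the only leaf operator: `children` returns an empty vector and `execute` delegates to the table source. Its `projection` field is what the projection_push_down optimizer targets; passing a list of column indices restricts the scan to those columns. A small sketch, assuming the source honors the index list and the column order id, name, age, score from data/test_data.csv:

// Sketch: scan only the `id` (index 0) and `age` (index 2) columns.
let source = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?;
let scan = ScanPlan::create(source, Some(vec![0, 2]));
let batches = scan.execute()?;
assert_eq!(batches[0].num_columns(), 2);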
--------------------------------------------------------------------------------
/src/physical_plan/selection.rs:
--------------------------------------------------------------------------------
/*
 * @Author: Veeupup
 * @Date: 2022-05-14 18:20:27
 * @Email: code@tanweime.com
 */

use std::sync::Arc;

use super::{PhysicalExprRef, PhysicalPlan, PhysicalPlanRef};
use crate::logical_plan::schema::NaiveSchema;
use crate::Result;
use arrow::array::{
    Float64Array, Float64Builder, Int64Array, Int64Builder, StringArray, StringBuilder,
    UInt64Array, UInt64Builder,
};
use arrow::record_batch::RecordBatch;
use arrow::{
    array::{Array, BooleanArray, BooleanBuilder},
    datatypes::DataType,
};

#[derive(Debug)]
pub struct SelectionPlan {
    input: PhysicalPlanRef,
    expr: PhysicalExprRef,
}

impl SelectionPlan {
    pub fn create(input: PhysicalPlanRef, expr: PhysicalExprRef) -> PhysicalPlanRef {
        Arc::new(Self { input, expr })
    }
}

macro_rules! build_array_by_predicate {
    ($COLUMN: ident, $PREDICATE: expr, $ARRAY_TYPE: ty, $ARRAY_BUILDER: ty) => {{
        let array = $COLUMN.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap();
        let mut builder = <$ARRAY_BUILDER>::new(array.len());
        let iter = $PREDICATE.iter().zip(array.iter());
        for (valid, val) in iter {
            match valid {
                Some(valid) => {
                    if valid {
                        builder.append_option(val)?;
                    }
                }
                None => builder.append_option(None)?,
            }
        }
        Arc::new(builder.finish())
    }};
}

impl PhysicalPlan for SelectionPlan {
    fn schema(&self) -> &NaiveSchema {
        self.input.schema()
    }

    fn execute(&self) -> Result<Vec<RecordBatch>> {
        let input = self.input.execute()?;

        let mut batches = vec![];

        for batch in &input {
            // Evaluate the predicate against each batch. Evaluating it once
            // against input[0] only would misalign row counts for later batches
            // (and panic on empty input).
            let predicate = self.expr.evaluate(batch)?.into_array();
            let predicate = predicate.as_any().downcast_ref::<BooleanArray>().unwrap();

            let mut columns = vec![];
            for col in batch.columns() {
                let dt = col.data_type();
                let column: Arc<dyn Array> = match dt {
                    DataType::Boolean => {
                        build_array_by_predicate!(col, predicate, BooleanArray, BooleanBuilder)
                    }
                    DataType::UInt64 => {
                        build_array_by_predicate!(col, predicate, UInt64Array, UInt64Builder)
                    }
                    DataType::Int64 => {
                        build_array_by_predicate!(col, predicate, Int64Array, Int64Builder)
                    }
                    DataType::Float64 => {
                        build_array_by_predicate!(col, predicate, Float64Array, Float64Builder)
                    }
                    DataType::Utf8 => {
                        let array = col.as_any().downcast_ref::<StringArray>().unwrap();
                        let mut builder = StringBuilder::new(array.len());
                        let iter = predicate.iter().zip(array.iter());
                        for (valid, val) in iter {
                            match valid {
                                Some(valid) => {
                                    if valid {
                                        builder.append_option(val)?;
                                    }
                                }
                                None => builder.append_option(None::<&str>)?,
                            }
                        }
                        Arc::new(builder.finish())
                    }
                    _ => unimplemented!(),
                };
                columns.push(column);
            }
            let record_batch =
                RecordBatch::try_new(Arc::new(self.schema().clone().into()), columns)?;
            batches.push(record_batch);
        }
        Ok(batches)
    }

    fn children(&self) -> Result<Vec<PhysicalPlanRef>> {
        Ok(vec![self.input.clone()])
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::datasource::{CsvConfig, CsvTable};
    use crate::logical_plan::expression::{Operator, ScalarValue};
    use crate::physical_plan::expression::ColumnExpr;
    use crate::physical_plan::scan::ScanPlan;
    use crate::physical_plan::{PhysicalBinaryExpr, PhysicalLiteralExpr, ProjectionPlan};
    use crate::print_result;
    use arrow::array::{ArrayRef, Int64Array, StringArray};

    #[test]
    fn test_selection() -> Result<()> {
        let source = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?;
        let schema = NaiveSchema::new(vec![
            source.schema().field(0).clone(),
            source.schema().field(1).clone(),
            source.schema().field(2).clone(),
        ]);
        let scan_plan = ScanPlan::create(source, None);

        let expr = vec![
            ColumnExpr::try_create(None, Some(0))?,
            ColumnExpr::try_create(Some("name".to_string()), None)?,
            ColumnExpr::try_create(None, Some(2))?,
        ];
        let proj_plan = ProjectionPlan::create(scan_plan, schema, expr);

        // TODO(veeupup): selection expression

        {
            let add_expr = PhysicalBinaryExpr::create(
                ColumnExpr::try_create(Some("id".to_string()), None)?,
                Operator::Plus,
                PhysicalLiteralExpr::create(ScalarValue::Int64(Some(1))),
            );

            let expr = PhysicalBinaryExpr::create(
                add_expr,
                Operator::Gt,
                PhysicalLiteralExpr::create(ScalarValue::Int64(Some(5))),
            );

            let selection_plan = SelectionPlan::create(proj_plan, expr);

            let res = selection_plan.execute()?;

            assert_eq!(res.len(), 1);
            let batch = &res[0];

            print_result(&res)?;

            let id_expected: ArrayRef = Arc::new(Int64Array::from(vec![5, 6, 7, 8, 9]));
            let name_expected: ArrayRef = Arc::new(StringArray::from(vec![
                "alice", "bob", "jack", "cock", "primer",
            ]));

            assert_eq!(batch.column(0), &id_expected);
            assert_eq!(batch.column(1), &name_expected);
        }

        // TODO(veeupup): add more tests about binary expressions

        Ok(())
    }
}
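
The hand-rolled builder loop above re-implements, per data type, what arrow's filter kernel provides. One behavioral difference worth noting: the kernel drops rows whose predicate is NULL, while `build_array_by_predicate!` keeps them as nulls. A hedged sketch of the kernel-based alternative, assuming arrow 13's `compute` module:

use arrow::compute::filter_record_batch;

// Sketch: filter every column of `batch` by the boolean `predicate` at once.
// NULL predicate entries remove the row here, unlike the macro above.
let filtered = filter_record_batch(batch, predicate)?;
batches.push(filtered);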
--------------------------------------------------------------------------------
/src/physical_plan/visitor.rs:
--------------------------------------------------------------------------------
use super::PhysicalPlan;
use crate::error::Result;

pub trait PhysicalPlanVistor {
    // Invoked before visiting the plan's children
    fn pre_visit(&mut self, plan: &dyn PhysicalPlan) -> Result<()>;

    // Invoked after visiting the plan's children
    fn post_visit(&mut self, plan: &dyn PhysicalPlan) -> Result<()>;
}

pub fn _visit_physical_plan<V: PhysicalPlanVistor>(
    plan: &dyn PhysicalPlan,
    visitor: &mut V,
) -> Result<()> {
    let children = plan.children()?;
    visitor.pre_visit(plan)?;

    for child in children {
        _visit_physical_plan(child.as_ref(), visitor)?;
    }
    visitor.post_visit(plan)?;
    Ok(())
}

#[cfg(test)]
mod tests {
    use crate::{
        datasource::CsvTable,
        physical_plan::{PhysicalPlan, ScanPlan},
        CsvConfig,
    };

    use super::{PhysicalPlanVistor, _visit_physical_plan};
    use crate::error::Result;

    struct TestVisitor {
        v: usize,
    }

    impl PhysicalPlanVistor for TestVisitor {
        fn pre_visit(&mut self, _: &dyn PhysicalPlan) -> Result<()> {
            println!("pre_v: {}", self.v);
            Ok(())
        }

        fn post_visit(&mut self, _: &dyn PhysicalPlan) -> Result<()> {
            println!("post_v: {}", self.v);
            Ok(())
        }
    }

    #[test]
    fn test_visitor() -> Result<()> {
        let source = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?;

        let scan_plan = ScanPlan::create(source, None);
        _visit_physical_plan(scan_plan.as_ref(), &mut TestVisitor { v: 1 })?;
        Ok(())
    }
}
--------------------------------------------------------------------------------
/src/planner/mod.rs:
--------------------------------------------------------------------------------
/*
 * @Author: Veeupup
 * @Date: 2022-05-13 16:56:35
 * @Email: code@tanweime.com
 *
 * Planner: translate the logical plan into the physical plan.
 *
 */

use crate::logical_plan::expression::AggregateFunc;
use crate::logical_plan::schema::NaiveSchema;
use crate::physical_plan::CrossJoin;
use crate::physical_plan::HashJoin;

use crate::physical_plan::avg::Avg;
use crate::physical_plan::count::Count;
use crate::physical_plan::max::Max;
use crate::physical_plan::min::Min;
use crate::physical_plan::sum::Sum;
use crate::physical_plan::PhysicalAggregatePlan;
use crate::physical_plan::PhysicalBinaryExpr;
use crate::physical_plan::PhysicalCastExpr;
use crate::physical_plan::PhysicalExprRef;
use crate::physical_plan::PhysicalLimitPlan;
use crate::physical_plan::PhysicalLiteralExpr;
use crate::physical_plan::PhysicalOffsetPlan;
use crate::physical_plan::PhysicalPlanRef;
use crate::physical_plan::PhysicalUnaryExpr;
use crate::physical_plan::SelectionPlan;
use crate::{
    error::{ErrorCode, Result},
    logical_plan::{
        expression::{Column, LogicalExpr},
        plan::LogicalPlan,
    },
    physical_plan::{ColumnExpr, ProjectionPlan, ScanPlan},
};

pub struct QueryPlanner;

impl QueryPlanner {
    pub fn create_physical_plan(plan: &LogicalPlan) -> Result<PhysicalPlanRef> {
        match plan {
            LogicalPlan::TableScan(table_scan) => Ok(ScanPlan::create(
                table_scan.source.clone(),
                table_scan.projection.clone(),
            )),
            LogicalPlan::Projection(proj) => {
                let input = Self::create_physical_plan(&proj.input)?;
                let proj_expr = proj
                    .exprs
                    .iter()
                    .map(|expr| Self::create_physical_expression(expr, &proj.input).unwrap())
                    .collect::<Vec<_>>();
                let fields = proj
                    .exprs
                    .iter()
                    .map(|expr| expr.data_field(proj.input.as_ref()).unwrap())
                    .collect::<Vec<_>>();
                let proj_schema = NaiveSchema::new(fields);
                Ok(ProjectionPlan::create(input, proj_schema, proj_expr))
            }
            LogicalPlan::Limit(limit) => {
                let plan = Self::create_physical_plan(&limit.input)?;
                Ok(PhysicalLimitPlan::create(plan, limit.n))
            }
            LogicalPlan::Offset(offset) => {
                let plan = Self::create_physical_plan(&offset.input)?;
                Ok(PhysicalOffsetPlan::create(plan, offset.n))
            }
            LogicalPlan::Join(join) => {
                let left = Self::create_physical_plan(&join.left)?;
                let right = Self::create_physical_plan(&join.right)?;
                // We now have two join physical implementations
                // Ok(NestedLoopJoin::new(
                //     left,
                //     right,
                //     join.on.clone(),
                //     join.join_type,
                //     join.schema.clone(),
                // ))
                Ok(HashJoin::create(
                    left,
                    right,
                    join.on.clone(),
                    join.join_type,
                    join.schema.clone(),
                ))
            }
            LogicalPlan::Filter(filter) => {
                let predicate = Self::create_physical_expression(&filter.predicate, plan)?;
                let input = Self::create_physical_plan(&filter.input)?;
                Ok(SelectionPlan::create(input, predicate))
            }
            LogicalPlan::Aggregate(aggr) => {
                let mut group_exprs = vec![];
                for group_expr in &aggr.group_expr {
                    group_exprs.push(Self::create_physical_expression(group_expr, &aggr.input)?);
                }

                let mut aggr_ops = vec![];
                for aggr_expr in &aggr.aggr_expr {
                    // Every aggregate argument must resolve to a plain column expression.
                    let expr = Self::create_physical_expression(&aggr_expr.args, &aggr.input)?;
                    let col_expr = expr
                        .as_any()
                        .downcast_ref::<ColumnExpr>()
                        .ok_or_else(|| {
                            ErrorCode::PlanError(
                                "Aggregate Func should have a column in it".to_string(),
                            )
                        })?
                        .clone();
                    let aggr_op = match aggr_expr.fun {
                        AggregateFunc::Count => Count::create(col_expr),
                        AggregateFunc::Sum => Sum::create(col_expr),
                        AggregateFunc::Avg => Avg::create(col_expr),
                        AggregateFunc::Min => Min::create(col_expr),
                        AggregateFunc::Max => Max::create(col_expr),
                    };
                    aggr_ops.push(aggr_op);
                }

                let input = Self::create_physical_plan(&aggr.input)?;
                Ok(PhysicalAggregatePlan::create(group_exprs, aggr_ops, input))
            }
            LogicalPlan::CrossJoin(join) => {
                let left = Self::create_physical_plan(&join.left)?;
                let right = Self::create_physical_plan(&join.right)?;
                Ok(CrossJoin::create(
                    left,
                    right,
                    join.join_type,
                    join.schema.clone(),
                ))
            }
        }
    }

    pub fn create_physical_expression(
        expr: &LogicalExpr,
        input: &LogicalPlan,
    ) -> Result<PhysicalExprRef> {
        match expr {
            LogicalExpr::Alias(_, _) => todo!(),
            LogicalExpr::Column(Column { name, .. }) => {
                for (idx, field) in input.schema().fields().iter().enumerate() {
                    if field.name() == name {
                        return ColumnExpr::try_create(None, Some(idx));
                    }
                }
                Err(ErrorCode::ColumnNotExists(format!(
                    "column `{}` does not exist",
                    name
                )))
            }
            LogicalExpr::Literal(scalar_val) => Ok(PhysicalLiteralExpr::create(scalar_val.clone())),
            LogicalExpr::BinaryExpr(bin_expr) => {
                let left = Self::create_physical_expression(bin_expr.left.as_ref(), input)?;
                let right = Self::create_physical_expression(bin_expr.right.as_ref(), input)?;
                let phy_bin_expr = PhysicalBinaryExpr::create(left, bin_expr.op.clone(), right);
                Ok(phy_bin_expr)
            }
            LogicalExpr::UnaryExpr(scalar_expr) => {
                let expr = Self::create_physical_expression(scalar_expr.arg.as_ref(), input)?;
                let phy_scalar_expr = PhysicalUnaryExpr::create(
                    expr,
                    scalar_expr.func.clone(),
                    "todo".to_string(),
                    &arrow::datatypes::DataType::Int32,
                );
                Ok(phy_scalar_expr)
            }
            LogicalExpr::Not(_) => todo!(),
            LogicalExpr::Cast(cast_expr) => {
                let expr = Self::create_physical_expression(cast_expr.expr.as_ref(), input)?;
                let phy_cast_expr = PhysicalCastExpr::create(expr, &cast_expr.data_type);
                Ok(phy_cast_expr)
            }
            LogicalExpr::AggregateFunction(_) => todo!(),
            LogicalExpr::Wildcard => todo!(),
        }
    }
}
#[cfg(test)]
mod tests {
    use arrow::array::ArrayRef;
    use arrow::array::Int64Array;
    use arrow::array::StringArray;
    use std::sync::Arc;

    use crate::catalog::Catalog;
    use crate::CsvConfig;

    use super::*;

    #[test]
    fn test_scan_projection() -> Result<()> {
        // construct
        let mut catalog = Catalog::default();
        catalog.add_csv_table("t1", "data/test_data.csv", CsvConfig::default())?;
        let source = catalog.get_table_df("t1")?;
        let exprs = vec![
            LogicalExpr::column(None, "id".to_string()),
            LogicalExpr::column(None, "name".to_string()),
            LogicalExpr::column(None, "age".to_string()),
        ];
        let logical_plan = source.project(exprs)?.logical_plan();
        let physical_plan = QueryPlanner::create_physical_plan(&logical_plan)?;
        let batches = physical_plan.execute()?;

        // test
        assert_eq!(batches.len(), 1);
        let batch = &batches[0];

        let id_expected: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 4, 5, 6, 7, 8, 9]));
        let name_expected: ArrayRef = Arc::new(StringArray::from(vec![
            "veeupup", "alex", "lynne", "alice", "bob", "jack", "cock", "primer",
        ]));
        let age_expected: ArrayRef =
            Arc::new(Int64Array::from(vec![23, 20, 18, 19, 20, 21, 22, 23]));

        assert_eq!(batch.column(0), &id_expected);
        assert_eq!(batch.column(1), &name_expected);
        assert_eq!(batch.column(2), &age_expected);

        Ok(())
    }
}
--------------------------------------------------------------------------------
/src/sql/mod.rs:
--------------------------------------------------------------------------------
/*
 * @Author: Veeupup
 * @Date: 2022-05-13 19:35:34
 * @Email: code@tanweime.com
 */

pub mod parser;
pub mod planner;
--------------------------------------------------------------------------------
/src/sql/parser.rs:
--------------------------------------------------------------------------------
/*
 * @Author: Veeupup
 * @Date: 2022-05-13 19:35:41
 * @Email: code@tanweime.com
 */

use sqlparser::{
    ast::Statement,
    dialect::GenericDialect,
    parser::{Parser, ParserError},
    tokenizer::Tokenizer,
};

/// SQL Parser
pub struct SQLParser;

impl SQLParser {
    /// Parse the given SQL string and return a statement
    pub fn parse(sql: &str) -> Result<Statement, ParserError> {
        let dialect = GenericDialect {};
        let mut tokenizer = Tokenizer::new(&dialect, sql);
        let tokens = tokenizer.tokenize()?;
        let mut parser = Parser::new(tokens, &dialect);
        parser.parse_statement()
    }
}
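
A short usage sketch for the parser, with `SQLParser` in scope; the real call site is the SQL planner in src/sql/planner.rs, and the table name here is illustrative only:

use sqlparser::parser::ParserError;

fn main() -> Result<(), ParserError> {
    // GenericDialect accepts this portable SELECT statement.
    let stmt = SQLParser::parse("SELECT id, name FROM t1 WHERE id > 5")?;
    println!("{:?}", stmt);
    Ok(())
}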
--------------------------------------------------------------------------------
/src/utils.rs:
--------------------------------------------------------------------------------
/*
 * @Author: Veeupup
 * @Date: 2022-05-14 17:33:26
 * @Email: code@tanweime.com
 */

use crate::error::ErrorCode;
use crate::error::Result;
use arrow::{record_batch::RecordBatch, util::pretty};

pub fn print_result(result: &[RecordBatch]) -> Result<()> {
    pretty::print_batches(result).map_err(ErrorCode::ArrowError)
}
--------------------------------------------------------------------------------