├── .github
    └── workflows
    │   └── build.yml
├── .gitignore
├── .mergify.yml
├── Cargo.lock
├── Cargo.toml
├── LICENSE
├── README.md
├── architecture.md
├── clippy.toml
├── data
    ├── department.csv
    ├── employee.csv
    ├── rank.csv
    └── test_data.csv
├── doc
    └── query_engine.jpg
├── makefile
└── src
    ├── catalog.rs
    ├── datasource
        ├── csv.rs
        ├── empty.rs
        ├── memory.rs
        └── mod.rs
    ├── datatype.rs
    ├── db.rs
    ├── error.rs
    ├── lib.rs
    ├── logical_plan
        ├── dataframe.rs
        ├── expression.rs
        ├── literal.rs
        ├── mod.rs
        ├── plan.rs
        └── schema.rs
    ├── main.rs
    ├── optimizer
        ├── mod.rs
        └── projection_push_down.rs
    ├── physical_plan
        ├── aggregate
        │   ├── avg.rs
        │   ├── count.rs
        │   ├── max.rs
        │   ├── min.rs
        │   ├── mod.rs
        │   └── sum.rs
        ├── cross_join.rs
        ├── expression
        │   ├── binary.rs
        │   ├── cast.rs
        │   ├── column.rs
        │   ├── literal.rs
        │   ├── mod.rs
        │   └── unary.rs
        ├── hash_join.rs
        ├── limit.rs
        ├── mod.rs
        ├── nested_loop_join.rs
        ├── offset.rs
        ├── plan.rs
        ├── projection.rs
        ├── scan.rs
        ├── selection.rs
        └── visitor.rs
    ├── planner
        └── mod.rs
    ├── sql
        ├── mod.rs
        ├── parser.rs
        └── planner.rs
    └── utils.rs


/.github/workflows/build.yml:
--------------------------------------------------------------------------------
 1 | name: auto build and test
 2 | on:
 3 |   pull_request:
 4 |     branches: [main]
 5 | env:
 6 |   RUST_TOOLCHAIN: nightly-2022-04-09
 7 |   CARGO_TERM_COLOR: always
 8 | jobs:
 9 |   run-test:
10 |     name: normal check
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - name: Checkout
14 |         uses: actions/checkout@v3
15 |       - name: Install rust toolchain@v1
16 |         uses: actions-rs/toolchain@v1
17 |         with:
18 |           toolchain: ${{ env.RUST_TOOLCHAIN }}
19 |           components: rustfmt, clippy
20 |       - name: Run rust clippy check
21 |         run: |
22 |           # If new CI checks are added, the one with `--locked` must be run first.
23 |           cargo clippy --all-targets --locked -- -D warnings
24 |       - name: Run test
25 |         run: cargo test
26 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /target
2 | lcov.info
3 | .vscode
4 | 


--------------------------------------------------------------------------------
/.mergify.yml:
--------------------------------------------------------------------------------
1 | pull_request_rules:
2 |   - name: Automatic merge on approval
3 |     conditions:
4 |       - "#approved-reviews-by>=1"
5 |     actions:
6 |       merge:
7 |         method: squash
8 | 


--------------------------------------------------------------------------------
/Cargo.lock:
--------------------------------------------------------------------------------
  1 | # This file is automatically @generated by Cargo.
  2 | # It is not intended for manual editing.
  3 | version = 3
  4 | 
  5 | [[package]]
  6 | name = "aho-corasick"
  7 | version = "0.7.18"
  8 | source = "registry+https://github.com/rust-lang/crates.io-index"
  9 | checksum = "1e37cfd5e7657ada45f742d6e99ca5788580b5c529dc78faf11ece6dc702656f"
 10 | dependencies = [
 11 |  "memchr",
 12 | ]
 13 | 
 14 | [[package]]
 15 | name = "arrow"
 16 | version = "13.0.0"
 17 | source = "registry+https://github.com/rust-lang/crates.io-index"
 18 | checksum = "5c6bee230122beb516ead31935a61f683715f987c6f003eff44ad6986624105a"
 19 | dependencies = [
 20 |  "bitflags",
 21 |  "chrono",
 22 |  "comfy-table",
 23 |  "csv",
 24 |  "flatbuffers",
 25 |  "half",
 26 |  "hex",
 27 |  "indexmap",
 28 |  "lazy_static",
 29 |  "lexical-core",
 30 |  "multiversion",
 31 |  "num",
 32 |  "rand",
 33 |  "regex",
 34 |  "serde",
 35 |  "serde_derive",
 36 |  "serde_json",
 37 | ]
 38 | 
 39 | [[package]]
 40 | name = "autocfg"
 41 | version = "1.1.0"
 42 | source = "registry+https://github.com/rust-lang/crates.io-index"
 43 | checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa"
 44 | 
 45 | [[package]]
 46 | name = "bitflags"
 47 | version = "1.3.2"
 48 | source = "registry+https://github.com/rust-lang/crates.io-index"
 49 | checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
 50 | 
 51 | [[package]]
 52 | name = "bstr"
 53 | version = "0.2.17"
 54 | source = "registry+https://github.com/rust-lang/crates.io-index"
 55 | checksum = "ba3569f383e8f1598449f1a423e72e99569137b47740b1da11ef19af3d5c3223"
 56 | dependencies = [
 57 |  "lazy_static",
 58 |  "memchr",
 59 |  "regex-automata",
 60 |  "serde",
 61 | ]
 62 | 
 63 | [[package]]
 64 | name = "cfg-if"
 65 | version = "1.0.0"
 66 | source = "registry+https://github.com/rust-lang/crates.io-index"
 67 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 68 | 
 69 | [[package]]
 70 | name = "chrono"
 71 | version = "0.4.19"
 72 | source = "registry+https://github.com/rust-lang/crates.io-index"
 73 | checksum = "670ad68c9088c2a963aaa298cb369688cf3f9465ce5e2d4ca10e6e0098a1ce73"
 74 | dependencies = [
 75 |  "libc",
 76 |  "num-integer",
 77 |  "num-traits",
 78 |  "winapi",
 79 | ]
 80 | 
 81 | [[package]]
 82 | name = "comfy-table"
 83 | version = "5.0.1"
 84 | source = "registry+https://github.com/rust-lang/crates.io-index"
 85 | checksum = "b103d85ca6e209388771bfb7aa6b68a7aeec4afbf6f0a0264bfbf50360e5212e"
 86 | dependencies = [
 87 |  "strum",
 88 |  "strum_macros",
 89 |  "unicode-width",
 90 | ]
 91 | 
 92 | [[package]]
 93 | name = "csv"
 94 | version = "1.1.6"
 95 | source = "registry+https://github.com/rust-lang/crates.io-index"
 96 | checksum = "22813a6dc45b335f9bade10bf7271dc477e81113e89eb251a0bc2a8a81c536e1"
 97 | dependencies = [
 98 |  "bstr",
 99 |  "csv-core",
100 |  "itoa 0.4.8",
101 |  "ryu",
102 |  "serde",
103 | ]
104 | 
105 | [[package]]
106 | name = "csv-core"
107 | version = "0.1.10"
108 | source = "registry+https://github.com/rust-lang/crates.io-index"
109 | checksum = "2b2466559f260f48ad25fe6317b3c8dac77b5bdb5763ac7d9d6103530663bc90"
110 | dependencies = [
111 |  "memchr",
112 | ]
113 | 
114 | [[package]]
115 | name = "flatbuffers"
116 | version = "2.1.2"
117 | source = "registry+https://github.com/rust-lang/crates.io-index"
118 | checksum = "86b428b715fdbdd1c364b84573b5fdc0f84f8e423661b9f398735278bc7f2b6a"
119 | dependencies = [
120 |  "bitflags",
121 |  "smallvec",
122 |  "thiserror",
123 | ]
124 | 
125 | [[package]]
126 | name = "getrandom"
127 | version = "0.2.6"
128 | source = "registry+https://github.com/rust-lang/crates.io-index"
129 | checksum = "9be70c98951c83b8d2f8f60d7065fa6d5146873094452a1008da8c2f1e4205ad"
130 | dependencies = [
131 |  "cfg-if",
132 |  "libc",
133 |  "wasi",
134 | ]
135 | 
136 | [[package]]
137 | name = "half"
138 | version = "1.8.2"
139 | source = "registry+https://github.com/rust-lang/crates.io-index"
140 | checksum = "eabb4a44450da02c90444cf74558da904edde8fb4e9035a9a6a4e15445af0bd7"
141 | 
142 | [[package]]
143 | name = "hashbrown"
144 | version = "0.11.2"
145 | source = "registry+https://github.com/rust-lang/crates.io-index"
146 | checksum = "ab5ef0d4909ef3724cc8cce6ccc8572c5c817592e9285f5464f8e86f8bd3726e"
147 | 
148 | [[package]]
149 | name = "heck"
150 | version = "0.3.3"
151 | source = "registry+https://github.com/rust-lang/crates.io-index"
152 | checksum = "6d621efb26863f0e9924c6ac577e8275e5e6b77455db64ffa6c65c904e9e132c"
153 | dependencies = [
154 |  "unicode-segmentation",
155 | ]
156 | 
157 | [[package]]
158 | name = "hex"
159 | version = "0.4.3"
160 | source = "registry+https://github.com/rust-lang/crates.io-index"
161 | checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70"
162 | 
163 | [[package]]
164 | name = "indexmap"
165 | version = "1.8.1"
166 | source = "registry+https://github.com/rust-lang/crates.io-index"
167 | checksum = "0f647032dfaa1f8b6dc29bd3edb7bbef4861b8b8007ebb118d6db284fd59f6ee"
168 | dependencies = [
169 |  "autocfg",
170 |  "hashbrown",
171 | ]
172 | 
173 | [[package]]
174 | name = "itoa"
175 | version = "0.4.8"
176 | source = "registry+https://github.com/rust-lang/crates.io-index"
177 | checksum = "b71991ff56294aa922b450139ee08b3bfc70982c6b2c7562771375cf73542dd4"
178 | 
179 | [[package]]
180 | name = "itoa"
181 | version = "1.0.1"
182 | source = "registry+https://github.com/rust-lang/crates.io-index"
183 | checksum = "1aab8fc367588b89dcee83ab0fd66b72b50b72fa1904d7095045ace2b0c81c35"
184 | 
185 | [[package]]
186 | name = "lazy_static"
187 | version = "1.4.0"
188 | source = "registry+https://github.com/rust-lang/crates.io-index"
189 | checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646"
190 | 
191 | [[package]]
192 | name = "lexical-core"
193 | version = "0.8.3"
194 | source = "registry+https://github.com/rust-lang/crates.io-index"
195 | checksum = "92912c4af2e7d9075be3e5e3122c4d7263855fa6cce34fbece4dd08e5884624d"
196 | dependencies = [
197 |  "lexical-parse-float",
198 |  "lexical-parse-integer",
199 |  "lexical-util",
200 |  "lexical-write-float",
201 |  "lexical-write-integer",
202 | ]
203 | 
204 | [[package]]
205 | name = "lexical-parse-float"
206 | version = "0.8.3"
207 | source = "registry+https://github.com/rust-lang/crates.io-index"
208 | checksum = "f518eed87c3be6debe6d26b855c97358d8a11bf05acec137e5f53080f5ad2dd8"
209 | dependencies = [
210 |  "lexical-parse-integer",
211 |  "lexical-util",
212 |  "static_assertions",
213 | ]
214 | 
215 | [[package]]
216 | name = "lexical-parse-integer"
217 | version = "0.8.3"
218 | source = "registry+https://github.com/rust-lang/crates.io-index"
219 | checksum = "afc852ec67c6538bbb2b9911116a385b24510e879a69ab516e6a151b15a79168"
220 | dependencies = [
221 |  "lexical-util",
222 |  "static_assertions",
223 | ]
224 | 
225 | [[package]]
226 | name = "lexical-util"
227 | version = "0.8.3"
228 | source = "registry+https://github.com/rust-lang/crates.io-index"
229 | checksum = "c72a9d52c5c4e62fa2cdc2cb6c694a39ae1382d9c2a17a466f18e272a0930eb1"
230 | dependencies = [
231 |  "static_assertions",
232 | ]
233 | 
234 | [[package]]
235 | name = "lexical-write-float"
236 | version = "0.8.4"
237 | source = "registry+https://github.com/rust-lang/crates.io-index"
238 | checksum = "8a89ec1d062e481210c309b672f73a0567b7855f21e7d2fae636df44d12e97f9"
239 | dependencies = [
240 |  "lexical-util",
241 |  "lexical-write-integer",
242 |  "static_assertions",
243 | ]
244 | 
245 | [[package]]
246 | name = "lexical-write-integer"
247 | version = "0.8.3"
248 | source = "registry+https://github.com/rust-lang/crates.io-index"
249 | checksum = "094060bd2a7c2ff3a16d5304a6ae82727cb3cc9d1c70f813cc73f744c319337e"
250 | dependencies = [
251 |  "lexical-util",
252 |  "static_assertions",
253 | ]
254 | 
255 | [[package]]
256 | name = "libc"
257 | version = "0.2.125"
258 | source = "registry+https://github.com/rust-lang/crates.io-index"
259 | checksum = "5916d2ae698f6de9bfb891ad7a8d65c09d232dc58cc4ac433c7da3b2fd84bc2b"
260 | 
261 | [[package]]
262 | name = "log"
263 | version = "0.4.17"
264 | source = "registry+https://github.com/rust-lang/crates.io-index"
265 | checksum = "abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e"
266 | dependencies = [
267 |  "cfg-if",
268 | ]
269 | 
270 | [[package]]
271 | name = "memchr"
272 | version = "2.5.0"
273 | source = "registry+https://github.com/rust-lang/crates.io-index"
274 | checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
275 | 
276 | [[package]]
277 | name = "multiversion"
278 | version = "0.6.1"
279 | source = "registry+https://github.com/rust-lang/crates.io-index"
280 | checksum = "025c962a3dd3cc5e0e520aa9c612201d127dcdf28616974961a649dca64f5373"
281 | dependencies = [
282 |  "multiversion-macros",
283 | ]
284 | 
285 | [[package]]
286 | name = "multiversion-macros"
287 | version = "0.6.1"
288 | source = "registry+https://github.com/rust-lang/crates.io-index"
289 | checksum = "a8a3e2bde382ebf960c1f3e79689fa5941625fe9bf694a1cb64af3e85faff3af"
290 | dependencies = [
291 |  "proc-macro2",
292 |  "quote",
293 |  "syn",
294 | ]
295 | 
296 | [[package]]
297 | name = "naive-db"
298 | version = "0.1.0"
299 | dependencies = [
300 |  "arrow",
301 |  "log",
302 |  "ordered-float",
303 |  "sqlparser",
304 |  "twox-hash",
305 | ]
306 | 
307 | [[package]]
308 | name = "num"
309 | version = "0.4.0"
310 | source = "registry+https://github.com/rust-lang/crates.io-index"
311 | checksum = "43db66d1170d347f9a065114077f7dccb00c1b9478c89384490a3425279a4606"
312 | dependencies = [
313 |  "num-bigint",
314 |  "num-complex",
315 |  "num-integer",
316 |  "num-iter",
317 |  "num-rational",
318 |  "num-traits",
319 | ]
320 | 
321 | [[package]]
322 | name = "num-bigint"
323 | version = "0.4.3"
324 | source = "registry+https://github.com/rust-lang/crates.io-index"
325 | checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f"
326 | dependencies = [
327 |  "autocfg",
328 |  "num-integer",
329 |  "num-traits",
330 | ]
331 | 
332 | [[package]]
333 | name = "num-complex"
334 | version = "0.4.1"
335 | source = "registry+https://github.com/rust-lang/crates.io-index"
336 | checksum = "97fbc387afefefd5e9e39493299f3069e14a140dd34dc19b4c1c1a8fddb6a790"
337 | dependencies = [
338 |  "num-traits",
339 | ]
340 | 
341 | [[package]]
342 | name = "num-integer"
343 | version = "0.1.45"
344 | source = "registry+https://github.com/rust-lang/crates.io-index"
345 | checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9"
346 | dependencies = [
347 |  "autocfg",
348 |  "num-traits",
349 | ]
350 | 
351 | [[package]]
352 | name = "num-iter"
353 | version = "0.1.43"
354 | source = "registry+https://github.com/rust-lang/crates.io-index"
355 | checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252"
356 | dependencies = [
357 |  "autocfg",
358 |  "num-integer",
359 |  "num-traits",
360 | ]
361 | 
362 | [[package]]
363 | name = "num-rational"
364 | version = "0.4.0"
365 | source = "registry+https://github.com/rust-lang/crates.io-index"
366 | checksum = "d41702bd167c2df5520b384281bc111a4b5efcf7fbc4c9c222c815b07e0a6a6a"
367 | dependencies = [
368 |  "autocfg",
369 |  "num-bigint",
370 |  "num-integer",
371 |  "num-traits",
372 | ]
373 | 
374 | [[package]]
375 | name = "num-traits"
376 | version = "0.2.15"
377 | source = "registry+https://github.com/rust-lang/crates.io-index"
378 | checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
379 | dependencies = [
380 |  "autocfg",
381 | ]
382 | 
383 | [[package]]
384 | name = "ordered-float"
385 | version = "3.0.0"
386 | source = "registry+https://github.com/rust-lang/crates.io-index"
387 | checksum = "96bcbab4bfea7a59c2c0fe47211a1ac4e3e96bea6eb446d704f310bc5c732ae2"
388 | dependencies = [
389 |  "num-traits",
390 | ]
391 | 
392 | [[package]]
393 | name = "ppv-lite86"
394 | version = "0.2.16"
395 | source = "registry+https://github.com/rust-lang/crates.io-index"
396 | checksum = "eb9f9e6e233e5c4a35559a617bf40a4ec447db2e84c20b55a6f83167b7e57872"
397 | 
398 | [[package]]
399 | name = "proc-macro2"
400 | version = "1.0.38"
401 | source = "registry+https://github.com/rust-lang/crates.io-index"
402 | checksum = "9027b48e9d4c9175fa2218adf3557f91c1137021739951d4932f5f8268ac48aa"
403 | dependencies = [
404 |  "unicode-xid",
405 | ]
406 | 
407 | [[package]]
408 | name = "quote"
409 | version = "1.0.18"
410 | source = "registry+https://github.com/rust-lang/crates.io-index"
411 | checksum = "a1feb54ed693b93a84e14094943b84b7c4eae204c512b7ccb95ab0c66d278ad1"
412 | dependencies = [
413 |  "proc-macro2",
414 | ]
415 | 
416 | [[package]]
417 | name = "rand"
418 | version = "0.8.5"
419 | source = "registry+https://github.com/rust-lang/crates.io-index"
420 | checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404"
421 | dependencies = [
422 |  "libc",
423 |  "rand_chacha",
424 |  "rand_core",
425 | ]
426 | 
427 | [[package]]
428 | name = "rand_chacha"
429 | version = "0.3.1"
430 | source = "registry+https://github.com/rust-lang/crates.io-index"
431 | checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88"
432 | dependencies = [
433 |  "ppv-lite86",
434 |  "rand_core",
435 | ]
436 | 
437 | [[package]]
438 | name = "rand_core"
439 | version = "0.6.3"
440 | source = "registry+https://github.com/rust-lang/crates.io-index"
441 | checksum = "d34f1408f55294453790c48b2f1ebbb1c5b4b7563eb1f418bcfcfdbb06ebb4e7"
442 | dependencies = [
443 |  "getrandom",
444 | ]
445 | 
446 | [[package]]
447 | name = "regex"
448 | version = "1.5.5"
449 | source = "registry+https://github.com/rust-lang/crates.io-index"
450 | checksum = "1a11647b6b25ff05a515cb92c365cec08801e83423a235b51e231e1808747286"
451 | dependencies = [
452 |  "aho-corasick",
453 |  "memchr",
454 |  "regex-syntax",
455 | ]
456 | 
457 | [[package]]
458 | name = "regex-automata"
459 | version = "0.1.10"
460 | source = "registry+https://github.com/rust-lang/crates.io-index"
461 | checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132"
462 | 
463 | [[package]]
464 | name = "regex-syntax"
465 | version = "0.6.25"
466 | source = "registry+https://github.com/rust-lang/crates.io-index"
467 | checksum = "f497285884f3fcff424ffc933e56d7cbca511def0c9831a7f9b5f6153e3cc89b"
468 | 
469 | [[package]]
470 | name = "rustversion"
471 | version = "1.0.6"
472 | source = "registry+https://github.com/rust-lang/crates.io-index"
473 | checksum = "f2cc38e8fa666e2de3c4aba7edeb5ffc5246c1c2ed0e3d17e560aeeba736b23f"
474 | 
475 | [[package]]
476 | name = "ryu"
477 | version = "1.0.9"
478 | source = "registry+https://github.com/rust-lang/crates.io-index"
479 | checksum = "73b4b750c782965c211b42f022f59af1fbceabdd026623714f104152f1ec149f"
480 | 
481 | [[package]]
482 | name = "serde"
483 | version = "1.0.137"
484 | source = "registry+https://github.com/rust-lang/crates.io-index"
485 | checksum = "61ea8d54c77f8315140a05f4c7237403bf38b72704d031543aa1d16abbf517d1"
486 | 
487 | [[package]]
488 | name = "serde_derive"
489 | version = "1.0.137"
490 | source = "registry+https://github.com/rust-lang/crates.io-index"
491 | checksum = "1f26faba0c3959972377d3b2d306ee9f71faee9714294e41bb777f83f88578be"
492 | dependencies = [
493 |  "proc-macro2",
494 |  "quote",
495 |  "syn",
496 | ]
497 | 
498 | [[package]]
499 | name = "serde_json"
500 | version = "1.0.81"
501 | source = "registry+https://github.com/rust-lang/crates.io-index"
502 | checksum = "9b7ce2b32a1aed03c558dc61a5cd328f15aff2dbc17daad8fb8af04d2100e15c"
503 | dependencies = [
504 |  "indexmap",
505 |  "itoa 1.0.1",
506 |  "ryu",
507 |  "serde",
508 | ]
509 | 
510 | [[package]]
511 | name = "smallvec"
512 | version = "1.8.0"
513 | source = "registry+https://github.com/rust-lang/crates.io-index"
514 | checksum = "f2dd574626839106c320a323308629dcb1acfc96e32a8cba364ddc61ac23ee83"
515 | 
516 | [[package]]
517 | name = "sqlparser"
518 | version = "0.9.0"
519 | source = "registry+https://github.com/rust-lang/crates.io-index"
520 | checksum = "4fa863a2dfc4879a35647c51dadf495a2ad53745eaf3723fda27006e745fb0ba"
521 | dependencies = [
522 |  "log",
523 | ]
524 | 
525 | [[package]]
526 | name = "static_assertions"
527 | version = "1.1.0"
528 | source = "registry+https://github.com/rust-lang/crates.io-index"
529 | checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
530 | 
531 | [[package]]
532 | name = "strum"
533 | version = "0.23.0"
534 | source = "registry+https://github.com/rust-lang/crates.io-index"
535 | checksum = "cae14b91c7d11c9a851d3fbc80a963198998c2a64eec840477fa92d8ce9b70bb"
536 | 
537 | [[package]]
538 | name = "strum_macros"
539 | version = "0.23.1"
540 | source = "registry+https://github.com/rust-lang/crates.io-index"
541 | checksum = "5bb0dc7ee9c15cea6199cde9a127fa16a4c5819af85395457ad72d68edc85a38"
542 | dependencies = [
543 |  "heck",
544 |  "proc-macro2",
545 |  "quote",
546 |  "rustversion",
547 |  "syn",
548 | ]
549 | 
550 | [[package]]
551 | name = "syn"
552 | version = "1.0.93"
553 | source = "registry+https://github.com/rust-lang/crates.io-index"
554 | checksum = "04066589568b72ec65f42d65a1a52436e954b168773148893c020269563decf2"
555 | dependencies = [
556 |  "proc-macro2",
557 |  "quote",
558 |  "unicode-xid",
559 | ]
560 | 
561 | [[package]]
562 | name = "thiserror"
563 | version = "1.0.31"
564 | source = "registry+https://github.com/rust-lang/crates.io-index"
565 | checksum = "bd829fe32373d27f76265620b5309d0340cb8550f523c1dda251d6298069069a"
566 | dependencies = [
567 |  "thiserror-impl",
568 | ]
569 | 
570 | [[package]]
571 | name = "thiserror-impl"
572 | version = "1.0.31"
573 | source = "registry+https://github.com/rust-lang/crates.io-index"
574 | checksum = "0396bc89e626244658bef819e22d0cc459e795a5ebe878e6ec336d1674a8d79a"
575 | dependencies = [
576 |  "proc-macro2",
577 |  "quote",
578 |  "syn",
579 | ]
580 | 
581 | [[package]]
582 | name = "twox-hash"
583 | version = "1.6.3"
584 | source = "registry+https://github.com/rust-lang/crates.io-index"
585 | checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675"
586 | dependencies = [
587 |  "cfg-if",
588 |  "rand",
589 |  "static_assertions",
590 | ]
591 | 
592 | [[package]]
593 | name = "unicode-segmentation"
594 | version = "1.9.0"
595 | source = "registry+https://github.com/rust-lang/crates.io-index"
596 | checksum = "7e8820f5d777f6224dc4be3632222971ac30164d4a258d595640799554ebfd99"
597 | 
598 | [[package]]
599 | name = "unicode-width"
600 | version = "0.1.9"
601 | source = "registry+https://github.com/rust-lang/crates.io-index"
602 | checksum = "3ed742d4ea2bd1176e236172c8429aaf54486e7ac098db29ffe6529e0ce50973"
603 | 
604 | [[package]]
605 | name = "unicode-xid"
606 | version = "0.2.3"
607 | source = "registry+https://github.com/rust-lang/crates.io-index"
608 | checksum = "957e51f3646910546462e67d5f7599b9e4fb8acdd304b087a6494730f9eebf04"
609 | 
610 | [[package]]
611 | name = "wasi"
612 | version = "0.10.2+wasi-snapshot-preview1"
613 | source = "registry+https://github.com/rust-lang/crates.io-index"
614 | checksum = "fd6fbd9a79829dd1ad0cc20627bf1ed606756a7f77edff7b66b7064f9cb327c6"
615 | 
616 | [[package]]
617 | name = "winapi"
618 | version = "0.3.9"
619 | source = "registry+https://github.com/rust-lang/crates.io-index"
620 | checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419"
621 | dependencies = [
622 |  "winapi-i686-pc-windows-gnu",
623 |  "winapi-x86_64-pc-windows-gnu",
624 | ]
625 | 
626 | [[package]]
627 | name = "winapi-i686-pc-windows-gnu"
628 | version = "0.4.0"
629 | source = "registry+https://github.com/rust-lang/crates.io-index"
630 | checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6"
631 | 
632 | [[package]]
633 | name = "winapi-x86_64-pc-windows-gnu"
634 | version = "0.4.0"
635 | source = "registry+https://github.com/rust-lang/crates.io-index"
636 | checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f"
637 | 


--------------------------------------------------------------------------------
/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "naive-db"
 3 | version = "0.1.0"
 4 | edition = "2021"
 5 | 
 6 | [dependencies]
 7 | arrow = { version = "13", features = ["prettyprint"] }
 8 | sqlparser = "0.9.0"
 9 | log = "0.4"
10 | twox-hash = "1.6.3"
11 | ordered-float = "3.0.0"
12 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Naive Query Engine (Toy for Learning) 😄
  2 | 
  3 | This is a Query Engine which support `SQL` interface. And it is only a Toy for learn query engine only. You can check [TODO](https://github.com/Veeupup/naive-query-engine#todo) to check the progress now.
  4 | 
  5 | Simple enough to learn (Although it is simple...but with so much work to finish.. TAT 😭) and Now it only has a basic architecture and most operators and planners have not implemented (will be done in the future).
  6 | 
  7 | This is inspired(and most ideas come) by [how-query-engines-work](https://github.com/andygrove/how-query-engines-work) and it is just for learning purpose. And many ideas inspired by [arrow-datafusion](https://github.com/apache/arrow-datafusion).
  8 | 
  9 | Use [arrow](https://github.com/apache/arrow-rs) to express in-memory columnar format and use [sqlparser](https://github.com/sqlparser-rs/sqlparser-rs) as SQL parser.
 10 | 
 11 | ## architecture
 12 | 
 13 | ![query_engine](./doc/query_engine.jpg)
 14 | 
 15 | ## how to use
 16 | 
 17 | for now, we can use `NaiveDB` like below, we can use csv as table storage.
 18 | 
 19 | ```rust
 20 | use naive_db::print_result;
 21 | use naive_db::CsvConfig;
 22 | use naive_db::NaiveDB;
 23 | use naive_db::Result;
 24 | 
 25 | fn main() -> Result<()> {
 26 |     let mut db = NaiveDB::default();
 27 | 
 28 |     db.create_csv_table("t1", "data/test_data.csv", CsvConfig::default())?;
 29 | 
 30 |     // select
 31 |     let ret = db.run_sql("select id, name, age + 100 from t1 where id < 9 limit 3 offset 2")?;
 32 |     print_result(&ret)?;
 33 | 
 34 |     // Inner Join
 35 |     db.create_csv_table("employee", "data/employee.csv", CsvConfig::default())?;
 36 |     db.create_csv_table("rank", "data/rank.csv", CsvConfig::default())?;
 37 |     db.create_csv_table("department", "data/department.csv", CsvConfig::default())?;
 38 | 
 39 |     let ret = db.run_sql(
 40 |         "
 41 |         select id, name, rank_name, department_name
 42 |         from employee
 43 |         join rank on 
 44 |             employee.rank = rank.id  
 45 |         join department on
 46 |             employee.department_id = department.id
 47 |     ",
 48 |     )?;
 49 |     print_result(&ret)?;
 50 | 
 51 |     // cross join
 52 |     let ret = db.run_sql("select * from employee join rank")?;
 53 |     print_result(&ret)?;
 54 | 
 55 |     // aggregate
 56 |     let ret = db.run_sql(
 57 |         "
 58 |         select count(id), sum(age), sum(score), avg(score), max(score), min(score) 
 59 |         from t1 group by id % 3",
 60 |     )?;
 61 |     print_result(&ret)?;
 62 | 
 63 |     Ok(())
 64 | }
 65 | ```
 66 | 
 67 | output will be:
 68 | 
 69 | ```
 70 | +----+-------+-----------+
 71 | | id | name  | age + 100 |
 72 | +----+-------+-----------+
 73 | | 4  | lynne | 118       |
 74 | | 5  | alice | 119       |
 75 | | 6  | bob   | 120       |
 76 | +----+-------+-----------+
 77 | +----+-------+-------------+-----------------+
 78 | | id | name  | rank_name   | department_name |
 79 | +----+-------+-------------+-----------------+
 80 | | 2  | lynne | master      | IT              |
 81 | | 1  | vee   | diamond     | IT              |
 82 | | 3  | Alex  | master      | Marketing       |
 83 | | 4  | jack  | diamond     | Marketing       |
 84 | | 5  | mike  | grandmaster | Human Resource  |
 85 | +----+-------+-------------+-----------------+
 86 | +----+-------+---------------+------+----+-------------+
 87 | | id | name  | department_id | rank | id | rank_name   |
 88 | +----+-------+---------------+------+----+-------------+
 89 | | 1  | vee   | 1             | 1    | 1  | master      |
 90 | | 2  | lynne | 1             | 0    | 2  | diamond     |
 91 | | 3  | Alex  | 2             | 0    | 3  | grandmaster |
 92 | | 4  | jack  | 2             | 1    | 4  | master      |
 93 | | 5  | mike  | 3             | 2    | 5  | diamond     |
 94 | | 1  | vee   | 1             | 1    | 1  | grandmaster |
 95 | | 2  | lynne | 1             | 0    | 2  | master      |
 96 | | 3  | Alex  | 2             | 0    | 3  | diamond     |
 97 | | 4  | jack  | 2             | 1    | 4  | grandmaster |
 98 | | 5  | mike  | 3             | 2    | 5  | master      |
 99 | | 1  | vee   | 1             | 1    | 1  | diamond     |
100 | | 2  | lynne | 1             | 0    | 2  | grandmaster |
101 | | 3  | Alex  | 2             | 0    | 3  | master      |
102 | | 4  | jack  | 2             | 1    | 4  | diamond     |
103 | | 5  | mike  | 3             | 2    | 5  | grandmaster |
104 | +----+-------+---------------+------+----+-------------+
105 | +-----------+----------+--------------------+-------------------+------------+------------+
106 | | count(id) | sum(age) | sum(score)         | avg(score)        | max(score) | min(score) |
107 | +-----------+----------+--------------------+-------------------+------------+------------+
108 | | 3         | 61       | 255.6              | 85.2              | 90.1       | 81.1       |
109 | | 3         | 62       | 243.29000000000002 | 81.09666666666668 | 99.99      | 60         |
110 | | 2         | 43       | 167.7              | 83.85             | 85.5       | 82.2       |
111 | +-----------+----------+--------------------+-------------------+------------+------------+
112 | ```
113 | 
114 | ## architecture
115 | 
116 | The NaiveDB is just simple and has clear progress just like:
117 | 
118 | ```rust
119 | impl NaiveDB {
120 |     pub fn run_sql(&self, sql: &str) -> Result<Vec<RecordBatch>> {
121 |         // 1. sql -> statement
122 |         let statement = SQLParser::parse(sql)?;
123 |         // 2. statement -> logical plan
124 |         let sql_planner = SQLPlanner::new(&self.catalog);
125 |         let logical_plan = sql_planner.statement_to_plan(statement)?;
126 |         // 3. optimize
127 |         let optimizer = Optimizer::default();
128 |         let logical_plan = optimizer.optimize(logical_plan);
129 |         // 4. logical plan -> physical plan
130 |         let physical_plan = QueryPlanner::create_physical_plan(&logical_plan)?;
131 |         // 5. execute
132 |         physical_plan.execute()
133 |     }
134 | }
135 | ```
136 | 
137 | 
138 | ## TODO
139 | 
140 | - [x] type system
141 | - [x] datasource
142 |     - [x] mem source
143 |     - [x] csv as datasource
144 |     - [x] empty datasource
145 | - [x] logical plan & expressions
146 | - [ ] build logical plans
147 |     - [x] projection
148 |     - [x] filter
149 |     - [x] aggregate
150 |     - [x] limit
151 |     - [x] join
152 | - [x] physical plan & expressions
153 |     - [x] physical scan
154 |     - [x] physical projection
155 |     - [x] physical filter
156 |     - [x] physical limit
157 |     - [x] join
158 |         - algorithms
159 |             - [x] (dumb😊) nested loop join
160 |             - [x] hash join
161 |             - [ ] sort-merge join
162 |         - [x] inner join
163 |         - [x] cross join
164 |     - [ ] physical expression
165 |         - [x] column expr
166 |         - [x] binary operation expr(add/sub/mul/div/and/or...)
167 |         - [x] literal expr
168 |         - [x] unary expr
169 |         - [x] aggr expr
170 |         - [ ] so many work to do... TAT
171 | - [ ] query planner
172 |     - [x] scan
173 |     - [x] limit
174 |     - [x] join
175 |     - [x] aggregate
176 |     - [ ] ...
177 | - [ ] query optimization
178 |     - [ ] more rules needed
179 | - [ ] sql support
180 |     - [x] parser
181 |     - [ ] SQL planner: statement -> logical plan
182 |         - [x] scan
183 |         - [x] projection
184 |         - [x] selection
185 |         - [x] limit
186 |         - [x] join
187 |         - [x] aggregate
188 |             - [x] group by
189 |         - [ ] scalar function
190 | 


--------------------------------------------------------------------------------
/architecture.md:
--------------------------------------------------------------------------------
 1 | # Architecture
 2 | 
 3 | ## datatype
 4 | 
 5 | use `arrow` as datatype system.
 6 | 
 7 | ## datasource
 8 | 
 9 | we can return scan result by using async streaming trait after
10 | 
11 | ## 
12 | 
13 | ```rust
14 | pub struct NaiveDB {
15 |     catalog: HashMap<String, TableRef>
16 | };
17 | 
18 | impl NaiveDB {
19 |     pub run_sql(sql: &str) -> Result<RecordBatch> {
20 |         let statement = Parser::parse_sql();
21 |         let logical_plan = SqlPlanner::sql_to_plan(statement, catalog)?;
22 |         let optimizer = Optimizer::new();
23 |         let plan = optimizer.optimize(plan);
24 |         let physical_plan = QueryPlanner::create_physical_plan(plan);
25 |         plan.execute()
26 |     }
27 | }
28 | 
29 | fn main() {
30 |     
31 |     DB::run_sql("select a, b from t where a > 1")
32 | }
33 | 
34 | 
35 | ```
36 | 


--------------------------------------------------------------------------------
/clippy.toml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Veeupup/naive-query-engine/b942af6a8505d4083b384ef8a5969988f97d330f/clippy.toml


--------------------------------------------------------------------------------
/data/department.csv:
--------------------------------------------------------------------------------
1 | id,department_name
2 | 1,IT
3 | 2,Marketing
4 | 3,Human Resource


--------------------------------------------------------------------------------
/data/employee.csv:
--------------------------------------------------------------------------------
1 | id,name,department_id,rank
2 | 1,vee,1,1
3 | 2,lynne,1,0
4 | 3,Alex,2,0
5 | 4,jack,2,1
6 | 5,mike,3,2


--------------------------------------------------------------------------------
/data/rank.csv:
--------------------------------------------------------------------------------
1 | id,rank_name
2 | 0,master
3 | 1,diamond
4 | 2,grandmaster


--------------------------------------------------------------------------------
/data/test_data.csv:
--------------------------------------------------------------------------------
1 | id,name,age,score
2 | 1,veeupup,23,60.0
3 | 2,alex,20,90.1
4 | 4,lynne,18,99.99
5 | 5,alice,19,81.1
6 | 6,bob,20,82.2
7 | 7,jack,21,83.3
8 | 8,cock,22,84.4
9 | 9,primer,23,85.5


--------------------------------------------------------------------------------
/doc/query_engine.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Veeupup/naive-query-engine/b942af6a8505d4083b384ef8a5969988f97d330f/doc/query_engine.jpg


--------------------------------------------------------------------------------
/makefile:
--------------------------------------------------------------------------------
1 | lint:
2 | 	cargo fmt
3 | 	cargo clippy --all-targets --all-features -- -D warnings
4 | 
5 | fix:
6 | 	cargo fix --allow-dirty
7 | 


--------------------------------------------------------------------------------
/src/catalog.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-14 16:00:32
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | 
 7 | use std::collections::HashMap;
 8 | 
 9 | use crate::datasource::{EmptyTable, MemTable};
10 | use crate::error::ErrorCode;
11 | use crate::logical_plan::plan::{LogicalPlan, TableScan};
12 | use crate::logical_plan::schema::NaiveSchema;
13 | use crate::logical_plan::DataFrame;
14 | use crate::{
15 |     datasource::{CsvConfig, CsvTable, TableRef},
16 |     error::Result,
17 | };
18 | use arrow::record_batch::RecordBatch;
19 | 
20 | #[derive(Default)]
21 | pub struct Catalog {
22 |     tables: HashMap<String, TableRef>,
23 | }
24 | 
25 | impl Catalog {
26 |     /// add csv table
27 |     pub fn add_csv_table(
28 |         &mut self,
29 |         table: &str,
30 |         csv_file: &str,
31 |         csv_conf: CsvConfig,
32 |     ) -> Result<()> {
33 |         let source = CsvTable::try_create(csv_file, csv_conf)?;
34 |         self.tables.insert(table.to_string(), source);
35 |         Ok(())
36 |     }
37 | 
38 |     #[allow(unused)]
39 |     /// add memory table
40 |     pub fn add_memory_table(
41 |         &mut self,
42 |         table: &str,
43 |         schema: NaiveSchema,
44 |         batches: Vec<RecordBatch>,
45 |     ) -> Result<()> {
46 |         let source = MemTable::try_create(schema, batches)?;
47 |         self.tables.insert(table.to_string(), source);
48 |         Ok(())
49 |     }
50 | 
51 |     #[allow(unused)]
52 |     /// add empty table
53 |     pub fn add_empty_table(&mut self, table: &str, schema: NaiveSchema) -> Result<()> {
54 |         let source = EmptyTable::try_create(schema)?;
55 |         self.tables.insert(table.to_string(), source);
56 |         Ok(())
57 |     }
58 | 
59 |     /// get table
60 |     pub fn get_table(&self, table: &str) -> Result<TableRef> {
61 |         self.tables
62 |             .get(table)
63 |             .cloned()
64 |             .ok_or_else(|| ErrorCode::NoSuchTable(format!("No table name: {}", table)))
65 |     }
66 | 
67 |     #[allow(unused)]
68 |     /// get dataframe by table name
69 |     pub fn get_table_df(&self, table: &str) -> Result<DataFrame> {
70 |         let source = self
71 |             .tables
72 |             .get(table)
73 |             .cloned()
74 |             .ok_or_else(|| ErrorCode::NoSuchTable(format!("No table name: {}", table)))?;
75 |         let plan = LogicalPlan::TableScan(TableScan {
76 |             source,
77 |             projection: None,
78 |         });
79 |         Ok(DataFrame { plan })
80 |     }
81 | }
82 | 


--------------------------------------------------------------------------------
/src/datasource/csv.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * @Author: Veeupup
  3 |  * @Date: 2022-05-12 16:45:18
  4 |  * @Email: code@tanweime.com
  5 | */
  6 | 
  7 | use std::env;
  8 | use std::fs::File;
  9 | use std::iter::Iterator;
 10 | use std::path::Path;
 11 | use std::sync::Arc;
 12 | 
 13 | use crate::error::Result;
 14 | use crate::logical_plan::schema::NaiveSchema;
 15 | 
 16 | use arrow::csv;
 17 | use arrow::datatypes::Schema;
 18 | use arrow::record_batch::RecordBatch;
 19 | 
 20 | use super::TableSource;
 21 | use crate::datasource::TableRef;
 22 | 
 23 | pub struct CsvConfig {
 24 |     pub has_header: bool,
 25 |     pub delimiter: u8,
 26 |     pub max_read_records: Option<usize>,
 27 |     pub batch_size: usize,
 28 |     pub file_projection: Option<Vec<usize>>,
 29 |     pub datetime_format: Option<String>,
 30 | }
 31 | 
 32 | impl Default for CsvConfig {
 33 |     fn default() -> Self {
 34 |         Self {
 35 |             has_header: true,
 36 |             delimiter: b',',
 37 |             max_read_records: Some(3),
 38 |             batch_size: 1_000_000,
 39 |             file_projection: None,
 40 |             datetime_format: None,
 41 |         }
 42 |     }
 43 | }
 44 | 
 45 | #[derive(Debug, Clone)]
 46 | pub struct CsvTable {
 47 |     schema: NaiveSchema,
 48 |     batches: Vec<RecordBatch>,
 49 | }
 50 | 
 51 | impl CsvTable {
 52 |     #[allow(unused, clippy::iter_next_loop)]
 53 |     pub fn try_create(filename: &str, csv_config: CsvConfig) -> Result<TableRef> {
 54 |         let orig_schema = Self::infer_schema_from_csv(filename, &csv_config)?;
 55 |         let schema = NaiveSchema::from_unqualified(&orig_schema);
 56 | 
 57 |         let mut file = File::open(env::current_dir()?.join(Path::new(filename)))?;
 58 |         let mut reader = csv::Reader::new(
 59 |             file,
 60 |             Arc::new(orig_schema),
 61 |             csv_config.has_header,
 62 |             Some(csv_config.delimiter),
 63 |             csv_config.batch_size,
 64 |             None,
 65 |             csv_config.file_projection.clone(),
 66 |             csv_config.datetime_format,
 67 |         );
 68 |         let mut batches = vec![];
 69 | 
 70 |         for record in reader.next() {
 71 |             batches.push(record?);
 72 |         }
 73 | 
 74 |         Ok(Arc::new(Self { schema, batches }))
 75 |     }
 76 | 
 77 |     fn infer_schema_from_csv(filename: &str, csv_config: &CsvConfig) -> Result<Schema> {
 78 |         let mut file = File::open(env::current_dir()?.join(Path::new(filename)))?;
 79 |         let (schema, _) = arrow::csv::reader::infer_reader_schema(
 80 |             &mut file,
 81 |             csv_config.delimiter,
 82 |             csv_config.max_read_records,
 83 |             csv_config.has_header,
 84 |         )?;
 85 |         Ok(schema)
 86 |     }
 87 | }
 88 | 
 89 | impl TableSource for CsvTable {
 90 |     fn schema(&self) -> &NaiveSchema {
 91 |         &self.schema
 92 |     }
 93 | 
 94 |     fn scan(&self, _projection: Option<Vec<usize>>) -> Result<Vec<RecordBatch>> {
 95 |         Ok(self.batches.clone())
 96 |     }
 97 | 
 98 |     fn source_name(&self) -> String {
 99 |         "CsvTable".into()
100 |     }
101 | }
102 | 
103 | #[cfg(test)]
104 | mod tests {
105 |     use super::*;
106 |     use arrow::{
107 |         array::{ArrayRef, Float64Array, Int64Array, StringArray},
108 |         datatypes::{DataType, Field, Schema},
109 |     };
110 | 
111 |     #[test]
112 |     fn test_infer_schema() -> Result<()> {
113 |         let table = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?;
114 |         let schema = table.schema();
115 | 
116 |         let excepted = Arc::new(Schema::new(vec![
117 |             Field::new("id", DataType::Int64, false),
118 |             Field::new("name", DataType::Utf8, false),
119 |             Field::new("age", DataType::Int64, false),
120 |             Field::new("score", DataType::Float64, false),
121 |         ]));
122 | 
123 |         assert_eq!(schema.fields().len(), excepted.fields().len());
124 | 
125 |         let iter = schema.fields().iter().zip(excepted.fields().iter());
126 |         for (field, excepted) in iter {
127 |             assert_eq!(field.name(), excepted.name());
128 |             assert_eq!(field.data_type(), excepted.data_type());
129 |             assert_eq!(field.is_nullable(), excepted.is_nullable());
130 |         }
131 | 
132 |         Ok(())
133 |     }
134 | 
135 |     #[test]
136 |     fn test_read_from_csv() -> Result<()> {
137 |         let table = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?;
138 | 
139 |         let batches = table.scan(None)?;
140 | 
141 |         assert_eq!(batches.len(), 1);
142 |         let record_batch = &batches[0];
143 |         assert_eq!(record_batch.columns().len(), 4);
144 | 
145 |         let id_excepted: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 4, 5, 6, 7, 8, 9]));
146 |         let name_excepted: ArrayRef = Arc::new(StringArray::from(vec![
147 |             "veeupup", "alex", "lynne", "alice", "bob", "jack", "cock", "primer",
148 |         ]));
149 |         let age_excepted: ArrayRef =
150 |             Arc::new(Int64Array::from(vec![23, 20, 18, 19, 20, 21, 22, 23]));
151 |         let score_excepted: ArrayRef = Arc::new(Float64Array::from(vec![
152 |             60.0, 90.1, 99.99, 81.1, 82.2, 83.3, 84.4, 85.5,
153 |         ]));
154 | 
155 |         assert_eq!(record_batch.column(0), &id_excepted);
156 |         assert_eq!(record_batch.column(1), &name_excepted);
157 |         assert_eq!(record_batch.column(2), &age_excepted);
158 |         assert_eq!(record_batch.column(3), &score_excepted);
159 | 
160 |         Ok(())
161 |     }
162 | }
163 | 


--------------------------------------------------------------------------------
/src/datasource/empty.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-12 16:16:58
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | 
 7 | use super::TableSource;
 8 | use crate::datasource::TableRef;
 9 | use crate::error::Result;
10 | use crate::logical_plan::schema::NaiveSchema;
11 | 
12 | use arrow::record_batch::RecordBatch;
13 | use std::sync::Arc;
14 | 
15 | /// Empty Table with schema but no data
16 | #[derive(Debug, Clone)]
17 | pub struct EmptyTable {
18 |     schema: NaiveSchema,
19 | }
20 | 
21 | impl EmptyTable {
22 |     #[allow(unused)]
23 |     pub fn try_create(schema: NaiveSchema) -> Result<TableRef> {
24 |         Ok(Arc::new(Self { schema }))
25 |     }
26 | }
27 | 
28 | impl TableSource for EmptyTable {
29 |     fn schema(&self) -> &NaiveSchema {
30 |         &self.schema
31 |     }
32 | 
33 |     fn scan(&self, _projection: Option<Vec<usize>>) -> Result<Vec<RecordBatch>> {
34 |         Ok(vec![])
35 |     }
36 | 
37 |     fn source_name(&self) -> String {
38 |         "EmptyTable".into()
39 |     }
40 | }
41 | 
42 | #[cfg(test)]
43 | mod tests {
44 |     use super::*;
45 |     use arrow::datatypes::{DataType, Field, Schema};
46 | 
47 |     #[test]
48 |     fn test_empty_table() -> Result<()> {
49 |         let schema = Schema::new(vec![
50 |             Field::new("a", DataType::Int32, false),
51 |             Field::new("b", DataType::Int32, false),
52 |         ]);
53 |         let schema = NaiveSchema::from_qualified("t1", &schema);
54 | 
55 |         let table = EmptyTable::try_create(schema)?;
56 |         let batches = table.scan(None)?;
57 | 
58 |         assert!(batches.is_empty());
59 | 
60 |         Ok(())
61 |     }
62 | }
63 | 


--------------------------------------------------------------------------------
/src/datasource/memory.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-12 16:14:35
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | 
 7 | use arrow::record_batch::RecordBatch;
 8 | use std::sync::Arc;
 9 | 
10 | use super::{TableRef, TableSource};
11 | use crate::{error::Result, logical_plan::schema::NaiveSchema};
12 | 
13 | #[derive(Debug, Clone)]
14 | pub struct MemTable {
15 |     schema: NaiveSchema,
16 |     batches: Vec<RecordBatch>,
17 | }
18 | 
19 | impl MemTable {
20 |     #[allow(unused)]
21 |     pub fn try_create(schema: NaiveSchema, batches: Vec<RecordBatch>) -> Result<TableRef> {
22 |         Ok(Arc::new(Self { schema, batches }))
23 |     }
24 | }
25 | 
26 | impl TableSource for MemTable {
27 |     fn schema(&self) -> &NaiveSchema {
28 |         &self.schema
29 |     }
30 | 
31 |     fn scan(&self, projection: Option<Vec<usize>>) -> Result<Vec<RecordBatch>> {
32 |         if let Some(projection) = projection {
33 |             let batches = self
34 |                 .batches
35 |                 .iter()
36 |                 .map(|record_batch| record_batch.project(projection.as_ref()).unwrap())
37 |                 .collect::<Vec<_>>();
38 |             return Ok(batches);
39 |         }
40 |         Ok(self.batches.clone())
41 |     }
42 | 
43 |     fn source_name(&self) -> String {
44 |         "MemTable".into()
45 |     }
46 | }
47 | 
48 | #[cfg(test)]
49 | mod tests {
50 |     use super::MemTable;
51 |     use crate::error::Result;
52 |     use crate::logical_plan::schema::NaiveSchema;
53 |     use arrow::array::Int32Array;
54 |     use arrow::datatypes::{DataType, Field, Schema};
55 |     use arrow::record_batch::RecordBatch;
56 |     use std::sync::Arc;
57 | 
58 |     #[test]
59 |     fn mem_table_test() -> Result<()> {
60 |         let schema = Arc::new(Schema::new(vec![
61 |             Field::new("a", DataType::Int32, false),
62 |             Field::new("b", DataType::Int32, false),
63 |             Field::new("c", DataType::Int32, false),
64 |             Field::new("d", DataType::Int32, true),
65 |         ]));
66 |         let schema = NaiveSchema::from_qualified("t1", &schema);
67 | 
68 |         let batch = RecordBatch::try_new(
69 |             schema.clone().into(),
70 |             vec![
71 |                 Arc::new(Int32Array::from(vec![1, 2, 3])),
72 |                 Arc::new(Int32Array::from(vec![4, 5, 6])),
73 |                 Arc::new(Int32Array::from(vec![7, 8, 9])),
74 |                 Arc::new(Int32Array::from(vec![None, None, Some(9)])),
75 |             ],
76 |         )?;
77 | 
78 |         let mem_table = MemTable::try_create(schema, vec![batch])?;
79 | 
80 |         // scan
81 |         let batches = mem_table.scan(Some(vec![2, 1]))?;
82 |         let batch2 = &batches[0];
83 | 
84 |         assert_eq!(2, batch2.schema().fields().len());
85 |         assert_eq!("t1.c", batch2.schema().field(0).name());
86 |         assert_eq!("t1.b", batch2.schema().field(1).name());
87 |         assert_eq!(2, batch2.num_columns());
88 | 
89 |         Ok(())
90 |     }
91 | }
92 | 


--------------------------------------------------------------------------------
/src/datasource/mod.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-12 16:08:23
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | 
 7 | mod csv;
 8 | mod empty;
 9 | mod memory;
10 | 
11 | use std::fmt::Debug;
12 | use std::sync::Arc;
13 | 
14 | use crate::error::Result;
15 | use crate::logical_plan::schema::NaiveSchema;
16 | use arrow::record_batch::RecordBatch;
17 | 
18 | pub type TableRef = Arc<dyn TableSource>;
19 | 
20 | pub trait TableSource: Debug {
21 |     fn schema(&self) -> &NaiveSchema;
22 | 
23 |     // TODO(veeupup): return async stream record batch
24 |     /// for scan
25 |     fn scan(&self, projection: Option<Vec<usize>>) -> Result<Vec<RecordBatch>>;
26 | 
27 |     fn source_name(&self) -> String;
28 | }
29 | 
30 | pub use csv::CsvConfig;
31 | pub use csv::CsvTable;
32 | pub use empty::EmptyTable;
33 | pub use memory::MemTable;
34 | 


--------------------------------------------------------------------------------
/src/datatype.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-13 15:10:51
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | 
 7 | use arrow::{
 8 |     array::{Array, ArrayRef},
 9 |     datatypes::DataType,
10 | };
11 | 
12 | use crate::logical_plan::expression::ScalarValue;
13 | 
14 | #[derive(Debug, Clone)]
15 | pub enum ColumnValue {
16 |     /// Array of values
17 |     Array(ArrayRef),
18 |     /// A single value,
19 |     Const(ScalarValue, usize),
20 | }
21 | 
22 | impl ColumnValue {
23 |     pub fn data_type(&self) -> DataType {
24 |         match self {
25 |             ColumnValue::Array(array) => array.data_type().clone(),
26 |             ColumnValue::Const(scalar, _) => scalar.data_field().data_type().clone(),
27 |         }
28 |     }
29 | 
30 |     pub fn into_array(self) -> ArrayRef {
31 |         match self {
32 |             ColumnValue::Array(array) => array,
33 |             ColumnValue::Const(scalar, num_rows) => scalar.into_array(num_rows),
34 |         }
35 |     }
36 | }
37 | 


--------------------------------------------------------------------------------
/src/db.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-14 15:26:40
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | 
 7 | use arrow::record_batch::RecordBatch;
 8 | 
 9 | use crate::catalog::Catalog;
10 | use crate::datasource::CsvConfig;
11 | use crate::error::Result;
12 | 
13 | use crate::optimizer::Optimizer;
14 | use crate::planner::QueryPlanner;
15 | use crate::sql::parser::SQLParser;
16 | use crate::sql::planner::SQLPlanner;
17 | 
18 | #[derive(Default)]
19 | pub struct NaiveDB {
20 |     catalog: Catalog,
21 | }
22 | 
23 | impl NaiveDB {
24 |     pub fn run_sql(&self, sql: &str) -> Result<Vec<RecordBatch>> {
25 |         // 1. sql -> statement
26 |         let statement = SQLParser::parse(sql)?;
27 |         // 2. statement -> logical plan
28 |         let sql_planner = SQLPlanner::new(&self.catalog);
29 |         let logical_plan = sql_planner.statement_to_plan(statement)?;
30 |         // 3. optimize
31 |         let optimizer = Optimizer::default();
32 |         let logical_plan = optimizer.optimize(logical_plan);
33 |         // 4. logical plan -> physical plan
34 |         let physical_plan = QueryPlanner::create_physical_plan(&logical_plan)?;
35 |         // 5. execute
36 |         physical_plan.execute()
37 |     }
38 | 
39 |     pub fn create_csv_table(
40 |         &mut self,
41 |         table: &str,
42 |         csv_file: &str,
43 |         csv_conf: CsvConfig,
44 |     ) -> Result<()> {
45 |         self.catalog.add_csv_table(table, csv_file, csv_conf)
46 |     }
47 | }
48 | 


--------------------------------------------------------------------------------
/src/error.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-12 16:45:51
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | use arrow::error::ArrowError;
 7 | use sqlparser::parser::ParserError;
 8 | use std::io;
 9 | 
10 | pub type Result<T> = std::result::Result<T, ErrorCode>;
11 | 
12 | #[derive(Debug)]
13 | pub enum ErrorCode {
14 |     /// Error return by arrow
15 |     ArrowError(ArrowError),
16 | 
17 |     IoError(io::Error),
18 | 
19 |     NoSuchField,
20 | 
21 |     ColumnNotExists(String),
22 | 
23 |     LogicalError(String),
24 | 
25 |     NoSuchTable(String),
26 | 
27 |     ParserError(ParserError),
28 | 
29 |     IntervalError(String),
30 | 
31 |     PlanError(String),
32 | 
33 |     NoMatchFunction(String),
34 | 
35 |     NotSupported(String),
36 | 
37 |     NotImplemented,
38 |     #[allow(unused)]
39 |     Others,
40 | }
41 | 
42 | impl From<ArrowError> for ErrorCode {
43 |     fn from(e: ArrowError) -> Self {
44 |         ErrorCode::ArrowError(e)
45 |     }
46 | }
47 | 
48 | impl From<io::Error> for ErrorCode {
49 |     fn from(e: io::Error) -> Self {
50 |         ErrorCode::IoError(e)
51 |     }
52 | }
53 | impl From<ParserError> for ErrorCode {
54 |     fn from(e: ParserError) -> Self {
55 |         ErrorCode::ParserError(e)
56 |     }
57 | }
58 | 


--------------------------------------------------------------------------------
/src/lib.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-12 16:08:43
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | 
 7 | mod catalog;
 8 | mod datasource;
 9 | mod datatype;
10 | mod db;
11 | mod error;
12 | mod logical_plan;
13 | mod optimizer;
14 | mod physical_plan;
15 | mod planner;
16 | mod sql;
17 | mod utils;
18 | 
19 | pub use datasource::CsvConfig;
20 | pub use db::NaiveDB;
21 | pub use error::Result;
22 | pub use utils::*;
23 | 


--------------------------------------------------------------------------------
/src/logical_plan/dataframe.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * @Author: Veeupup
  3 |  * @Date: 2022-05-12 22:52:47
  4 |  * @Email: code@tanweime.com
  5 | */
  6 | 
  7 | use std::sync::Arc;
  8 | 
  9 | use crate::logical_plan::expression::LogicalExpr;
 10 | use crate::logical_plan::plan::{Aggregate, Filter, LogicalPlan, Projection};
 11 | 
 12 | use super::expression::{AggregateFunction, Column};
 13 | use super::plan::{Join, JoinType, Limit, Offset};
 14 | use super::schema::NaiveSchema;
 15 | use crate::error::{ErrorCode, Result};
 16 | 
 17 | #[derive(Clone)]
 18 | pub struct DataFrame {
 19 |     pub plan: LogicalPlan,
 20 | }
 21 | 
 22 | impl DataFrame {
 23 |     pub fn new(plan: LogicalPlan) -> Self {
 24 |         Self { plan }
 25 |     }
 26 | 
 27 |     pub fn project(self, exprs: Vec<LogicalExpr>) -> Result<Self> {
 28 |         // TODO(veeupup): Ambiguous reference of field
 29 |         let mut fields = vec![];
 30 |         for expr in &exprs {
 31 |             fields.push(expr.data_field(&self.plan)?);
 32 |         }
 33 |         let schema = NaiveSchema::new(fields);
 34 |         Ok(Self {
 35 |             plan: LogicalPlan::Projection(Projection {
 36 |                 input: Arc::new(self.plan),
 37 |                 exprs,
 38 |                 schema,
 39 |             }),
 40 |         })
 41 |     }
 42 | 
 43 |     pub fn filter(self, expr: LogicalExpr) -> Self {
 44 |         Self {
 45 |             plan: LogicalPlan::Filter(Filter {
 46 |                 input: Arc::new(self.plan),
 47 |                 predicate: expr,
 48 |             }),
 49 |         }
 50 |     }
 51 | 
 52 |     #[allow(unused)]
 53 |     pub fn aggregate(
 54 |         self,
 55 |         group_expr: Vec<LogicalExpr>,
 56 |         aggr_expr: Vec<AggregateFunction>,
 57 |     ) -> Self {
 58 |         let mut group_fields = group_expr
 59 |             .iter()
 60 |             .map(|expr| expr.data_field(&self.plan).unwrap())
 61 |             .collect::<Vec<_>>();
 62 |         let mut aggr_fields = aggr_expr
 63 |             .iter()
 64 |             .map(|expr| expr.data_field(&self.plan).unwrap())
 65 |             .collect::<Vec<_>>();
 66 |         group_fields.append(&mut aggr_fields);
 67 |         let schema = NaiveSchema::new(group_fields);
 68 |         Self {
 69 |             plan: LogicalPlan::Aggregate(Aggregate {
 70 |                 input: Arc::new(self.plan),
 71 |                 group_expr,
 72 |                 aggr_expr,
 73 |                 schema,
 74 |             }),
 75 |         }
 76 |     }
 77 | 
 78 |     pub fn limit(self, n: usize) -> DataFrame {
 79 |         Self {
 80 |             plan: LogicalPlan::Limit(Limit {
 81 |                 input: Arc::new(self.plan),
 82 |                 n,
 83 |             }),
 84 |         }
 85 |     }
 86 | 
 87 |     pub fn offset(self, n: usize) -> DataFrame {
 88 |         Self {
 89 |             plan: LogicalPlan::Offset(Offset {
 90 |                 input: Arc::new(self.plan),
 91 |                 n,
 92 |             }),
 93 |         }
 94 |     }
 95 | 
 96 |     pub fn join(
 97 |         &self,
 98 |         right: &LogicalPlan,
 99 |         join_type: JoinType,
100 |         join_keys: (Vec<Column>, Vec<Column>),
101 |     ) -> Result<DataFrame> {
102 |         if join_keys.0.len() != join_keys.1.len() {
103 |             return Err(ErrorCode::PlanError(
104 |                 "left_keys length must be equal to right_keys length".to_string(),
105 |             ));
106 |         }
107 | 
108 |         let (left_keys, right_keys) = join_keys;
109 |         let on: Vec<(_, _)> = left_keys.into_iter().zip(right_keys.into_iter()).collect();
110 | 
111 |         let left_schema = self.plan.schema();
112 |         let join_schema = left_schema.join(right.schema());
113 |         // TODO(ywq) test on it.
114 |         if on.is_empty() {
115 |             return Ok(Self::new(LogicalPlan::CrossJoin(Join {
116 |                 left: Arc::new(self.plan.clone()),
117 |                 right: Arc::new(right.clone()),
118 |                 on,
119 |                 join_type,
120 |                 schema: join_schema,
121 |             })));
122 |         }
123 |         Ok(Self::new(LogicalPlan::Join(Join {
124 |             left: Arc::new(self.plan.clone()),
125 |             right: Arc::new(right.clone()),
126 |             on,
127 |             join_type,
128 |             schema: join_schema,
129 |         })))
130 |     }
131 | 
132 |     #[allow(unused)]
133 |     pub fn schema(&self) -> &NaiveSchema {
134 |         self.plan.schema()
135 |     }
136 | 
137 |     pub fn logical_plan(self) -> LogicalPlan {
138 |         self.plan
139 |     }
140 | }
141 | 
142 | #[cfg(test)]
143 | mod tests {
144 | 
145 |     use super::*;
146 |     use crate::{catalog::Catalog, logical_plan::schema::NaiveField};
147 | 
148 |     use crate::error::Result;
149 |     use crate::logical_plan::expression::*;
150 |     use arrow::datatypes::DataType;
151 | 
152 |     #[test]
153 |     fn create_logical_plan() -> Result<()> {
154 |         let schema = NaiveSchema::new(vec![
155 |             NaiveField::new(None, "state", DataType::Int64, true),
156 |             NaiveField::new(None, "id", DataType::Int64, true),
157 |             NaiveField::new(None, "first_name", DataType::Utf8, true),
158 |             NaiveField::new(None, "last_name", DataType::Utf8, true),
159 |             NaiveField::new(None, "salary", DataType::Int64, true),
160 |         ]);
161 |         let mut catalog = Catalog::default();
162 |         catalog.add_empty_table("empty", schema)?;
163 | 
164 |         let _plan = catalog
165 |             .get_table_df("empty")?
166 |             .filter(LogicalExpr::BinaryExpr(BinaryExpr {
167 |                 left: Box::new(LogicalExpr::column(None, "state".to_string())),
168 |                 op: Operator::Eq,
169 |                 right: Box::new(LogicalExpr::Literal(ScalarValue::Utf8(Some(
170 |                     "CO".to_string(),
171 |                 )))),
172 |             }))
173 |             .project(vec![
174 |                 LogicalExpr::column(None, "id".to_string()),
175 |                 LogicalExpr::column(None, "first_name".to_string()),
176 |                 LogicalExpr::column(None, "last_name".to_string()),
177 |                 LogicalExpr::column(None, "state".to_string()),
178 |                 LogicalExpr::column(None, "salary".to_string()),
179 |             ]);
180 | 
181 |         Ok(())
182 |     }
183 | }
184 | 


--------------------------------------------------------------------------------
/src/logical_plan/expression.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * @Author: Veeupup
  3 |  * @Date: 2022-05-12 20:28:35
  4 |  * @Email: code@tanweime.com
  5 | */
  6 | 
  7 | use std::iter::repeat;
  8 | 
  9 | use arrow::array::StringArray;
 10 | use arrow::array::{new_null_array, ArrayRef, BooleanArray, Float64Array, Int64Array, UInt64Array};
 11 | 
 12 | use arrow::datatypes::DataType;
 13 | use std::sync::Arc;
 14 | 
 15 | use crate::error::{ErrorCode, Result};
 16 | 
 17 | use crate::logical_plan::plan::LogicalPlan;
 18 | 
 19 | use super::schema::NaiveField;
 20 | 
 21 | #[derive(Clone, Debug)]
 22 | pub enum LogicalExpr {
 23 |     #[allow(unused)]
 24 |     /// An expression with a specific name.
 25 |     Alias(Box<LogicalExpr>, String),
 26 |     /// A named reference to a qualified filed in a schema.
 27 |     Column(Column),
 28 |     /// A constant value.
 29 |     Literal(ScalarValue),
 30 |     /// A binary expression such as "age > 21"
 31 |     BinaryExpr(BinaryExpr),
 32 |     /// A unary expression such as "-id"
 33 |     UnaryExpr(UnaryExpr),
 34 |     #[allow(unused)]
 35 |     /// Negation of an expression. The expression's type must be a boolean to make sense.
 36 |     Not(Box<LogicalExpr>),
 37 |     #[allow(unused)]
 38 |     /// Casts the expression to a given type and will return a runtime error if the expression cannot be cast.
 39 |     /// This expression is guaranteed to have a fixed type.
 40 |     Cast(CastExpr),
 41 |     #[allow(unused)]
 42 |     /// Represents the call of an aggregate built-in function with arguments.
 43 |     AggregateFunction(AggregateFunction),
 44 |     // Represents a reference to all fields in a schema.
 45 |     Wildcard,
 46 |     // TODO(veeupup): add more expresssions
 47 | }
 48 | 
 49 | impl LogicalExpr {
 50 |     pub fn column(table: Option<String>, name: String) -> LogicalExpr {
 51 |         LogicalExpr::Column(Column { table, name })
 52 |     }
 53 | 
 54 |     /// TODO(veeupup): consider return Vec<Field>
 55 |     pub fn data_field(&self, input: &LogicalPlan) -> Result<NaiveField> {
 56 |         match self {
 57 |             LogicalExpr::Alias(expr, alias) => {
 58 |                 let field = expr.data_field(input)?;
 59 |                 Ok(NaiveField::new(
 60 |                     None,
 61 |                     alias,
 62 |                     field.data_type().clone(),
 63 |                     field.is_nullable(),
 64 |                 ))
 65 |             }
 66 |             LogicalExpr::Column(Column { name, table }) => match table {
 67 |                 Some(table) => input.schema().field_with_qualified_name(table, name),
 68 |                 None => input.schema().field_with_unqualified_name(name),
 69 |             },
 70 |             LogicalExpr::Literal(scalar_val) => Ok(scalar_val.data_field()),
 71 |             LogicalExpr::BinaryExpr(expr) => expr.data_field(input),
 72 |             LogicalExpr::Not(expr) => Ok(NaiveField::new(
 73 |                 None,
 74 |                 format!("Not {}", expr.data_field(input)?.name()).as_str(),
 75 |                 DataType::Boolean,
 76 |                 true,
 77 |             )),
 78 |             LogicalExpr::Cast(expr) => Ok(NaiveField::new(
 79 |                 None,
 80 |                 expr.data_field(input)?.name(),
 81 |                 expr.data_type.clone(),
 82 |                 true,
 83 |             )),
 84 |             LogicalExpr::UnaryExpr(scalar_func) => scalar_func.data_field(input),
 85 |             LogicalExpr::AggregateFunction(aggr_func) => aggr_func.data_field(input),
 86 |             LogicalExpr::Wildcard => Err(ErrorCode::IntervalError(
 87 |                 "Wildcard not supported in logical plan".to_string(),
 88 |             )),
 89 |         }
 90 |     }
 91 | 
 92 |     pub fn and(self, other: LogicalExpr) -> LogicalExpr {
 93 |         binary_expr(self, Operator::And, other)
 94 |     }
 95 | 
 96 |     pub fn try_create_scalar_func(func_name: &str, exprs: &[LogicalExpr]) -> Result<LogicalExpr> {
 97 |         if exprs.len() != 1 {
 98 |             return Err(ErrorCode::PlanError(
 99 |                 "Scalar Func only has one parameter".to_string(),
100 |             ));
101 |         }
102 |         match func_name {
103 |             "abs" => Ok(LogicalExpr::UnaryExpr(UnaryExpr {
104 |                 func: UnaryOperator::Abs,
105 |                 arg: Box::new(exprs[0].clone()),
106 |             })),
107 |             _ => {
108 |                 return Err(ErrorCode::NoMatchFunction(format!(
109 |                     "Not match scalar func: {}",
110 |                     func_name
111 |                 )));
112 |             }
113 |         }
114 |     }
115 | 
116 |     pub fn try_create_aggregate_func(
117 |         func_name: &str,
118 |         exprs: &[LogicalExpr],
119 |     ) -> Result<LogicalExpr> {
120 |         if exprs.len() != 1 {
121 |             return Err(ErrorCode::PlanError(
122 |                 "Aggregate Func Now only Support One parameter".to_string(),
123 |             ));
124 |         }
125 |         match func_name {
126 |             "count" => Ok(LogicalExpr::AggregateFunction(AggregateFunction {
127 |                 fun: AggregateFunc::Count,
128 |                 args: Box::new(exprs[0].clone()),
129 |             })),
130 |             "sum" => Ok(LogicalExpr::AggregateFunction(AggregateFunction {
131 |                 fun: AggregateFunc::Sum,
132 |                 args: Box::new(exprs[0].clone()),
133 |             })),
134 |             "avg" => Ok(LogicalExpr::AggregateFunction(AggregateFunction {
135 |                 fun: AggregateFunc::Avg,
136 |                 args: Box::new(exprs[0].clone()),
137 |             })),
138 |             "min" => Ok(LogicalExpr::AggregateFunction(AggregateFunction {
139 |                 fun: AggregateFunc::Min,
140 |                 args: Box::new(exprs[0].clone()),
141 |             })),
142 |             "max" => Ok(LogicalExpr::AggregateFunction(AggregateFunction {
143 |                 fun: AggregateFunc::Max,
144 |                 args: Box::new(exprs[0].clone()),
145 |             })),
146 |             _ => {
147 |                 return Err(ErrorCode::NoMatchFunction(format!(
148 |                     "Not match aggregate func: {}",
149 |                     func_name
150 |                 )));
151 |             }
152 |         }
153 |     }
154 | }
155 | 
156 | /// return a new expression l <op> r
157 | pub fn binary_expr(l: LogicalExpr, op: Operator, r: LogicalExpr) -> LogicalExpr {
158 |     LogicalExpr::BinaryExpr(BinaryExpr {
159 |         left: Box::new(l),
160 |         op,
161 |         right: Box::new(r),
162 |     })
163 | }
164 | 
165 | /// A named reference to a qualified field in a schema.
166 | #[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)]
167 | pub struct Column {
168 |     pub table: Option<String>,
169 |     pub name: String,
170 | }
171 | 
172 | #[derive(Debug, Clone)]
173 | 
174 | pub enum ScalarValue {
175 |     /// represents `DataType::Null` (castable to/from any other type)
176 |     Null,
177 |     /// true or false value
178 |     Boolean(Option<bool>),
179 |     /// 64bit float
180 |     Float64(Option<f64>),
181 |     /// signed 64bit int
182 |     Int64(Option<i64>),
183 |     /// unsigned 64bit int
184 |     UInt64(Option<u64>),
185 |     /// utf-8 encoded string.
186 |     Utf8(Option<String>),
187 | }
188 | 
189 | macro_rules! build_array_from_option {
190 |     ($DATA_TYPE:ident, $ARRAY_TYPE:ident, $EXPR:expr, $SIZE:expr) => {{
191 |         match $EXPR {
192 |             Some(value) => Arc::new($ARRAY_TYPE::from_value(value, $SIZE)),
193 |             None => new_null_array(&DataType::$DATA_TYPE, $SIZE),
194 |         }
195 |     }};
196 | }
197 | 
198 | impl ScalarValue {
199 |     pub fn data_field(&self) -> NaiveField {
200 |         match self {
201 |             ScalarValue::Null => NaiveField::new(None, "Null", DataType::Null, true),
202 |             ScalarValue::Boolean(_) => NaiveField::new(None, "bool", DataType::Boolean, true),
203 |             ScalarValue::Float64(_) => NaiveField::new(None, "f64", DataType::Float64, true),
204 |             ScalarValue::Int64(_) => NaiveField::new(None, "i64", DataType::Int64, true),
205 |             ScalarValue::UInt64(_) => NaiveField::new(None, "u64", DataType::UInt64, true),
206 |             ScalarValue::Utf8(_) => NaiveField::new(None, "string", DataType::Utf8, true),
207 |         }
208 |     }
209 | 
210 |     pub fn into_array(self, size: usize) -> ArrayRef {
211 |         match self {
212 |             ScalarValue::Null => new_null_array(&DataType::Null, size),
213 |             ScalarValue::Boolean(e) => Arc::new(BooleanArray::from(vec![e; size])) as ArrayRef,
214 |             ScalarValue::Float64(e) => build_array_from_option!(Float64, Float64Array, e, size),
215 |             ScalarValue::Int64(e) => build_array_from_option!(Int64, Int64Array, e, size),
216 |             ScalarValue::UInt64(e) => build_array_from_option!(UInt64, UInt64Array, e, size),
217 |             ScalarValue::Utf8(e) => match e {
218 |                 Some(value) => Arc::new(StringArray::from_iter_values(repeat(value).take(size))),
219 |                 None => new_null_array(&DataType::Utf8, size),
220 |             },
221 |         }
222 |     }
223 | }
224 | 
225 | #[derive(Debug, Clone)]
226 | pub struct BinaryExpr {
227 |     /// Left-hand side of the expression
228 |     pub left: Box<LogicalExpr>,
229 |     /// The comparison operator
230 |     pub op: Operator,
231 |     /// Right-hand side of the expression
232 |     pub right: Box<LogicalExpr>,
233 | }
234 | 
235 | impl BinaryExpr {
236 |     pub fn data_field(&self, input: &LogicalPlan) -> Result<NaiveField> {
237 |         let left = self.left.data_field(input)?;
238 |         let left = left.name();
239 |         let right = match &*self.right {
240 |             LogicalExpr::Literal(scalar_val) => match scalar_val {
241 |                 ScalarValue::Boolean(Some(val)) => val.to_string(),
242 |                 ScalarValue::Int64(Some(val)) => val.to_string(),
243 |                 ScalarValue::UInt64(Some(val)) => val.to_string(),
244 |                 ScalarValue::Float64(Some(val)) => val.to_string(),
245 |                 ScalarValue::Utf8(Some(val)) => val.to_string(),
246 |                 _ => "null".to_string(),
247 |             },
248 |             _ => self.right.data_field(input)?.name().clone(),
249 |         };
250 |         let field = match self.op {
251 |             Operator::Eq => NaiveField::new(
252 |                 None,
253 |                 format!("{} = {}", left, right).as_str(),
254 |                 DataType::Boolean,
255 |                 true,
256 |             ),
257 |             Operator::NotEq => NaiveField::new(
258 |                 None,
259 |                 format!("{} != {}", left, right).as_str(),
260 |                 DataType::Boolean,
261 |                 true,
262 |             ),
263 |             Operator::Lt => NaiveField::new(
264 |                 None,
265 |                 format!("{} < {}", left, right).as_str(),
266 |                 DataType::Boolean,
267 |                 true,
268 |             ),
269 |             Operator::LtEq => NaiveField::new(
270 |                 None,
271 |                 format!("{} <= {}", left, right).as_str(),
272 |                 DataType::Boolean,
273 |                 true,
274 |             ),
275 |             Operator::Gt => NaiveField::new(
276 |                 None,
277 |                 format!("{} > {}", left, right).as_str(),
278 |                 DataType::Boolean,
279 |                 true,
280 |             ),
281 |             Operator::GtEq => NaiveField::new(
282 |                 None,
283 |                 format!("{} >= {}", left, right).as_str(),
284 |                 DataType::Boolean,
285 |                 true,
286 |             ),
287 |             Operator::Plus => NaiveField::new(
288 |                 None,
289 |                 format!("{} + {}", left, right).as_str(),
290 |                 self.left.data_field(input)?.data_type().clone(),
291 |                 true,
292 |             ),
293 |             Operator::Minus => NaiveField::new(
294 |                 None,
295 |                 format!("{} - {}", left, right).as_str(),
296 |                 self.left.data_field(input)?.data_type().clone(),
297 |                 true,
298 |             ),
299 |             Operator::Multiply => NaiveField::new(
300 |                 None,
301 |                 format!("{} * {}", left, right).as_str(),
302 |                 self.left.data_field(input)?.data_type().clone(),
303 |                 true,
304 |             ),
305 |             Operator::Divide => NaiveField::new(
306 |                 None,
307 |                 format!("{} / {}", left, right).as_str(),
308 |                 self.left.data_field(input)?.data_type().clone(),
309 |                 true,
310 |             ),
311 |             Operator::Modulos => NaiveField::new(
312 |                 None,
313 |                 format!("{} % {}", left, right).as_str(),
314 |                 self.left.data_field(input)?.data_type().clone(),
315 |                 true,
316 |             ),
317 |             Operator::And => NaiveField::new(
318 |                 None,
319 |                 format!("{} and {}", left, right).as_str(),
320 |                 DataType::Boolean,
321 |                 true,
322 |             ),
323 |             Operator::Or => NaiveField::new(
324 |                 None,
325 |                 format!("{} or {}", left, right).as_str(),
326 |                 DataType::Boolean,
327 |                 true,
328 |             ),
329 |         };
330 |         Ok(field)
331 |     }
332 | }
333 | 
334 | #[derive(Debug, Clone)]
335 | pub enum Operator {
336 |     /// Expressions are equal
337 |     Eq,
338 |     /// Expressions are not equal
339 |     NotEq,
340 |     /// Left side is smaller than right side
341 |     Lt,
342 |     /// Left side is smaller or equal to right side
343 |     LtEq,
344 |     /// Left side is greater than right side
345 |     Gt,
346 |     /// Left side is greater or equal to right side
347 |     GtEq,
348 |     /// Addition
349 |     Plus,
350 |     /// Subtraction
351 |     Minus,
352 |     /// Multiplication operator, like `*`
353 |     Multiply,
354 |     /// Division operator, like `/`
355 |     Divide,
356 |     /// Remainder operator, like `%`
357 |     Modulos,
358 |     /// Logical AND, like `&&`
359 |     And,
360 |     /// Logical OR, like `||`
361 |     Or,
362 | }
363 | 
364 | #[derive(Debug, Clone)]
365 | pub struct UnaryExpr {
366 |     /// The function
367 |     pub func: UnaryOperator,
368 |     /// List of expressions to feed to the functions as arguments
369 |     /// TODO(veeupup): we should check the args' type and nums
370 |     pub arg: Box<LogicalExpr>,
371 | }
372 | 
373 | impl UnaryExpr {
374 |     pub fn data_field(&self, input: &LogicalPlan) -> Result<NaiveField> {
375 |         // TODO(veeupup): we should make unary func more specific and should check if valid before creating them
376 |         let field = self.arg.data_field(input)?;
377 |         // TODO(ywq): add more exprs
378 |         let field = match self.func {
379 |             UnaryOperator::Abs => NaiveField::new(
380 |                 None,
381 |                 format!("abs({})", field.name()).as_str(),
382 |                 DataType::Int64,
383 |                 true,
384 |             ),
385 |             _ => unimplemented!(),
386 |         };
387 |         Ok(field)
388 |     }
389 | }
390 | 
391 | #[derive(Debug, Clone)]
392 | pub enum UnaryOperator {
393 |     // Math functions
394 |     Abs,
395 |     #[allow(unused)]
396 |     Sin,
397 |     #[allow(unused)]
398 |     Cos,
399 |     #[allow(unused)]
400 |     Tan,
401 |     // String functions
402 |     #[allow(unused)]
403 |     Trim,
404 |     #[allow(unused)]
405 |     LTrim,
406 |     #[allow(unused)]
407 |     RTrim,
408 |     #[allow(unused)]
409 |     CharacterLength,
410 |     #[allow(unused)]
411 |     Lower,
412 |     #[allow(unused)]
413 |     Upper,
414 |     #[allow(unused)]
415 |     Repeat,
416 |     #[allow(unused)]
417 |     Replace,
418 |     #[allow(unused)]
419 |     Reverse,
420 |     #[allow(unused)]
421 |     Substr,
422 | }
423 | 
424 | #[derive(Debug, Clone)]
425 | pub struct CastExpr {
426 |     /// The expression being cast
427 |     pub expr: Box<LogicalExpr>,
428 |     /// The `DataType` the expression will yield
429 |     pub data_type: DataType,
430 | }
431 | 
432 | impl CastExpr {
433 |     pub fn data_field(&self, input: &LogicalPlan) -> Result<NaiveField> {
434 |         Ok(NaiveField::new(
435 |             None,
436 |             self.expr.data_field(input)?.name(),
437 |             self.data_type.clone(),
438 |             true,
439 |         ))
440 |     }
441 | }
442 | 
443 | #[derive(Debug, Clone)]
444 | pub struct AggregateFunction {
445 |     /// Name of the function
446 |     pub fun: AggregateFunc,
447 |     /// List of expressions to feed to the functions as arguments
448 |     pub args: Box<LogicalExpr>,
449 | }
450 | 
451 | impl AggregateFunction {
452 |     pub fn data_field(&self, input: &LogicalPlan) -> Result<NaiveField> {
453 |         let dt = self.args.data_field(input)?;
454 |         let field = match self.fun {
455 |             AggregateFunc::Count => NaiveField::new(
456 |                 None,
457 |                 format!("count({})", dt.name()).as_str(),
458 |                 dt.data_type().clone(),
459 |                 true,
460 |             ),
461 |             AggregateFunc::Sum => NaiveField::new(
462 |                 None,
463 |                 format!("sum({})", dt.name()).as_str(),
464 |                 dt.data_type().clone(),
465 |                 true,
466 |             ),
467 |             AggregateFunc::Min => NaiveField::new(
468 |                 None,
469 |                 format!("min({})", dt.name()).as_str(),
470 |                 dt.data_type().clone(),
471 |                 true,
472 |             ),
473 |             AggregateFunc::Max => NaiveField::new(
474 |                 None,
475 |                 format!("max({})", dt.name()).as_str(),
476 |                 dt.data_type().clone(),
477 |                 true,
478 |             ),
479 |             AggregateFunc::Avg => NaiveField::new(
480 |                 None,
481 |                 format!("avg({})", dt.name()).as_str(),
482 |                 dt.data_type().clone(),
483 |                 true,
484 |             ),
485 |         };
486 |         Ok(field)
487 |     }
488 | }
489 | 
490 | #[derive(Debug, Clone)]
491 | pub enum AggregateFunc {
492 |     #[allow(unused)]
493 |     Count,
494 |     #[allow(unused)]
495 |     Sum,
496 |     #[allow(unused)]
497 |     Min,
498 |     #[allow(unused)]
499 |     Max,
500 |     #[allow(unused)]
501 |     Avg,
502 | }
503 | 


--------------------------------------------------------------------------------
/src/logical_plan/literal.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-16 23:36:56
 4 |  * @Last Modified by: Veeupup
 5 |  * @Last Modified time: 2022-05-16 23:45:02
 6 |  */
 7 | 
 8 | use super::expression::{LogicalExpr, ScalarValue};
 9 | 
10 | pub fn lit<T: Literal>(n: T) -> LogicalExpr {
11 |     n.lit()
12 | }
13 | 
14 | pub trait Literal {
15 |     fn lit(&self) -> LogicalExpr;
16 | }
17 | 
18 | impl Literal for String {
19 |     fn lit(&self) -> LogicalExpr {
20 |         LogicalExpr::Literal(ScalarValue::Utf8(Some(self.clone())))
21 |     }
22 | }
23 | 
24 | impl Literal for &str {
25 |     fn lit(&self) -> LogicalExpr {
26 |         LogicalExpr::Literal(ScalarValue::Utf8(Some((*self).to_owned())))
27 |     }
28 | }
29 | 
30 | macro_rules! impl_literal {
31 |     ($TYPE: ty, $SCALAR: ident) => {
32 |         impl Literal for $TYPE {
33 |             fn lit(&self) -> LogicalExpr {
34 |                 LogicalExpr::Literal(ScalarValue::$SCALAR(Some(self.clone())))
35 |             }
36 |         }
37 |     };
38 | }
39 | 
40 | impl_literal!(bool, Boolean);
41 | impl_literal!(i64, Int64);
42 | impl_literal!(u64, UInt64);
43 | impl_literal!(f64, Float64);
44 | 


--------------------------------------------------------------------------------
/src/logical_plan/mod.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-12 20:15:59
 4 |  * @Email: code@tanweime.com
 5 |  *
 6 |  * A logical plan represents a relation (a set of tuples) with a known schema. Each logical plan can
 7 |  * have zero or more logical plans as inputs. It is convenient for a logical plan to expose its child plans
 8 |  * so that a visitor pattern can be used to walk through the plan.
 9 |  *
10 | */
11 | 
12 | mod dataframe;
13 | pub mod expression;
14 | pub mod literal;
15 | pub mod plan;
16 | pub mod schema;
17 | 
18 | pub use dataframe::DataFrame;
19 | 


--------------------------------------------------------------------------------
/src/logical_plan/plan.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * @Author: Veeupup
  3 |  * @Date: 2022-05-13 14:09:04
  4 |  * @Email: code@tanweime.com
  5 | */
  6 | 
  7 | use crate::datasource::TableRef;
  8 | use crate::logical_plan::expression::{Column, LogicalExpr};
  9 | 
 10 | use std::fmt::{Debug, Display, Formatter, Result};
 11 | use std::sync::Arc;
 12 | 
 13 | use super::expression::AggregateFunction;
 14 | use super::schema::NaiveSchema;
 15 | 
 16 | #[derive(Clone)]
 17 | pub enum LogicalPlan {
 18 |     /// Evaluates an arbitrary list of expressions (essentially a
 19 |     /// SELECT with an expression list) on its input.
 20 |     Projection(Projection),
 21 | 
 22 |     /// Filters rows from its input that do not match an
 23 |     /// expression (essentially a WHERE clause with a predicate
 24 |     /// expression).
 25 |     ///
 26 |     /// Semantically, `<predicate>` is evaluated for each row of the input;
 27 |     /// If the value of `<predicate>` is true, the input row is passed to
 28 |     /// the output. If the value of `<predicate>` is false, the row is
 29 |     /// discarded.
 30 |     Filter(Filter),
 31 | 
 32 |     #[allow(unused)]
 33 |     /// Aggregates its input based on a set of grouping and aggregate
 34 |     /// expressions (e.g. SUM).
 35 |     Aggregate(Aggregate),
 36 | 
 37 |     /// Join two logical plans on one or more join columns
 38 |     Join(Join),
 39 | 
 40 |     CrossJoin(Join),
 41 | 
 42 |     /// Produces the first `n` tuples from its input and discards the rest.
 43 |     Limit(Limit),
 44 | 
 45 |     /// Adjusts the starting point at which the rest of the expressions begin to effect.
 46 |     Offset(Offset),
 47 | 
 48 |     /// Produces rows from a table provider by reference or from the context
 49 |     TableScan(TableScan),
 50 | }
 51 | 
 52 | impl LogicalPlan {
 53 |     pub fn schema(&self) -> &NaiveSchema {
 54 |         match self {
 55 |             LogicalPlan::Projection(Projection { schema, .. }) => schema,
 56 |             LogicalPlan::Filter(Filter { input, .. }) => input.schema(),
 57 |             LogicalPlan::Aggregate(Aggregate { schema, .. }) => schema,
 58 |             LogicalPlan::Join(Join { schema, .. }) => schema,
 59 |             LogicalPlan::Limit(Limit { input, .. }) => input.schema(),
 60 |             LogicalPlan::Offset(Offset { input, .. }) => input.schema(),
 61 |             LogicalPlan::TableScan(TableScan { source, .. }) => source.schema(),
 62 |             LogicalPlan::CrossJoin(Join { schema, .. }) => schema,
 63 |         }
 64 |     }
 65 | 
 66 |     #[allow(unused)]
 67 |     pub fn children(&self) -> Vec<Arc<LogicalPlan>> {
 68 |         match self {
 69 |             LogicalPlan::Projection(Projection { input, .. }) => vec![input.clone()],
 70 |             LogicalPlan::Filter(Filter { input, .. }) => vec![input.clone()],
 71 |             LogicalPlan::Aggregate(Aggregate { input, .. }) => vec![input.clone()],
 72 |             LogicalPlan::Join(Join { left, right, .. }) => vec![left.clone(), right.clone()],
 73 |             LogicalPlan::Limit(Limit { input, .. }) => vec![input.clone()],
 74 |             LogicalPlan::Offset(Offset { input, .. }) => vec![input.clone()],
 75 |             LogicalPlan::TableScan(_) => vec![],
 76 |             LogicalPlan::CrossJoin(Join { left, right, .. }) => vec![left.clone(), right.clone()],
 77 |         }
 78 |     }
 79 | }
 80 | 
 81 | impl Display for LogicalPlan {
 82 |     fn fmt(&self, f: &mut Formatter<'_>) -> Result {
 83 |         Debug::fmt(&self, f)
 84 |     }
 85 | }
 86 | 
 87 | impl Debug for LogicalPlan {
 88 |     fn fmt(&self, f: &mut Formatter<'_>) -> Result {
 89 |         do_pretty_print(self, f, 0)
 90 |     }
 91 | }
 92 | 
 93 | #[derive(Debug, Clone)]
 94 | pub struct Projection {
 95 |     /// The list of expressions
 96 |     pub exprs: Vec<LogicalExpr>,
 97 |     /// The incoming logical plan
 98 |     pub input: Arc<LogicalPlan>,
 99 |     /// The schema description of the output
100 |     pub schema: NaiveSchema,
101 | }
102 | 
103 | #[derive(Debug, Clone)]
104 | pub struct Filter {
105 |     /// The predicate expression, which must have Boolean type.
106 |     pub predicate: LogicalExpr,
107 |     /// The incoming logical plan
108 |     pub input: Arc<LogicalPlan>,
109 | }
110 | 
111 | #[derive(Debug, Clone)]
112 | pub struct TableScan {
113 |     /// The source of the table
114 |     pub source: TableRef,
115 |     /// Optional column indices to use as a projection
116 |     pub projection: Option<Vec<usize>>,
117 | }
118 | 
119 | /// Aggregates its input based on a set of grouping and aggregate
120 | /// expressions (e.g. SUM).
121 | #[derive(Debug, Clone)]
122 | pub struct Aggregate {
123 |     /// The incoming logical plan
124 |     pub input: Arc<LogicalPlan>,
125 |     /// Grouping expressions
126 |     pub group_expr: Vec<LogicalExpr>,
127 |     /// Aggregate expressions
128 |     pub aggr_expr: Vec<AggregateFunction>,
129 |     /// The schema description of the aggregate output
130 |     pub schema: NaiveSchema,
131 | }
132 | 
133 | #[derive(Debug, Clone, Copy, Eq, PartialEq)]
134 | pub enum JoinType {
135 |     Inner,
136 |     Left,
137 |     Right,
138 |     Cross,
139 | }
140 | 
141 | /// Join two logical plans on one or more join columns
142 | #[derive(Debug, Clone)]
143 | pub struct Join {
144 |     /// Left input
145 |     pub left: Arc<LogicalPlan>,
146 |     /// Right input
147 |     pub right: Arc<LogicalPlan>,
148 |     /// Equijoin clause expressed as pairs of (left, right) join columns, cross join don't have on conditions
149 |     pub on: Vec<(Column, Column)>,
150 |     /// Join type
151 |     pub join_type: JoinType,
152 |     /// The output schema, containing fields from the left and right inputs
153 |     pub schema: NaiveSchema,
154 | }
155 | 
156 | /// Produces the first `n` tuples from its input and discards the rest.
157 | #[derive(Debug, Clone)]
158 | pub struct Limit {
159 |     /// The limit
160 |     pub n: usize,
161 |     /// The logical plan
162 |     pub input: Arc<LogicalPlan>,
163 | }
164 | 
165 | /// Adjusts the starting point at which the rest of the expressions begin to effect.
166 | #[derive(Debug, Clone)]
167 | pub struct Offset {
168 |     /// The offset.
169 |     pub n: usize,
170 |     /// The logical plan.
171 |     pub input: Arc<LogicalPlan>,
172 | }
173 | 
174 | fn do_pretty_print(plan: &LogicalPlan, f: &mut Formatter<'_>, depth: usize) -> Result {
175 |     write!(f, "{}", "  ".repeat(depth))?;
176 | 
177 |     match plan {
178 |         LogicalPlan::Projection(Projection {
179 |             exprs,
180 |             input,
181 |             schema,
182 |         }) => {
183 |             writeln!(f, "Projection:")?;
184 | 
185 |             write!(f, "{}", "  ".repeat(depth + 1))?;
186 |             writeln!(f, "exprs: {:?}", exprs)?;
187 | 
188 |             write!(f, "{}", "  ".repeat(depth + 1))?;
189 |             writeln!(f, "input:")?;
190 |             do_pretty_print(input.as_ref(), f, depth + 2)?;
191 | 
192 |             write!(f, "{}", "  ".repeat(depth + 1))?;
193 |             writeln!(f, "schema: {:?}", schema)
194 |         }
195 |         LogicalPlan::Filter(Filter { predicate, input }) => {
196 |             writeln!(f, "Filter:")?;
197 | 
198 |             write!(f, "{}", "  ".repeat(depth + 1))?;
199 |             writeln!(f, "predicate: {:?}", predicate)?;
200 | 
201 |             write!(f, "{}", "  ".repeat(depth + 1))?;
202 |             writeln!(f, "input:")?;
203 |             do_pretty_print(input.as_ref(), f, depth + 2)
204 |         }
205 |         LogicalPlan::Aggregate(Aggregate {
206 |             input,
207 |             group_expr,
208 |             aggr_expr,
209 |             schema,
210 |         }) => {
211 |             writeln!(f, "Aggregate:")?;
212 | 
213 |             write!(f, "{}", "  ".repeat(depth + 1))?;
214 |             writeln!(f, "input:")?;
215 |             do_pretty_print(input.as_ref(), f, depth + 2)?;
216 | 
217 |             write!(f, "{}", "  ".repeat(depth + 1))?;
218 |             writeln!(f, "group_expr: {:?}", group_expr)?;
219 | 
220 |             write!(f, "{}", "  ".repeat(depth + 1))?;
221 |             writeln!(f, "aggr_expr: {:?}", aggr_expr)?;
222 | 
223 |             write!(f, "{}", "  ".repeat(depth + 1))?;
224 |             writeln!(f, "schema: {:?}", schema)
225 |         }
226 |         LogicalPlan::Join(Join {
227 |             left,
228 |             right,
229 |             on,
230 |             join_type,
231 |             schema,
232 |         }) => {
233 |             writeln!(f, "Join:")?;
234 | 
235 |             write!(f, "{}", "  ".repeat(depth + 1))?;
236 |             writeln!(f, "left:")?;
237 |             do_pretty_print(left.as_ref(), f, depth + 2)?;
238 | 
239 |             write!(f, "{}", "  ".repeat(depth + 1))?;
240 |             writeln!(f, "right:")?;
241 |             do_pretty_print(right.as_ref(), f, depth + 2)?;
242 | 
243 |             write!(f, "{}", "  ".repeat(depth + 1))?;
244 |             writeln!(f, "on: {:?}", on)?;
245 | 
246 |             write!(f, "{}", "  ".repeat(depth + 1))?;
247 |             writeln!(f, "join_type: {:?}", join_type)?;
248 | 
249 |             write!(f, "{}", "  ".repeat(depth + 1))?;
250 |             writeln!(f, "schema: {:?}", schema)
251 |         }
252 |         LogicalPlan::Limit(Limit { n, input }) => {
253 |             writeln!(f, "Limit:")?;
254 | 
255 |             write!(f, "{}", "  ".repeat(depth + 1))?;
256 |             writeln!(f, "n: {}", n)?;
257 | 
258 |             write!(f, "{}", "  ".repeat(depth + 1))?;
259 |             writeln!(f, "input:")?;
260 |             do_pretty_print(input.as_ref(), f, depth + 2)
261 |         }
262 |         LogicalPlan::Offset(Offset { n, input }) => {
263 |             writeln!(f, "Offset:")?;
264 | 
265 |             write!(f, "{}", "  ".repeat(depth + 1))?;
266 |             writeln!(f, "n: {}", n)?;
267 | 
268 |             write!(f, "{}", "  ".repeat(depth + 1))?;
269 |             writeln!(f, "input:")?;
270 |             do_pretty_print(input.as_ref(), f, depth + 2)
271 |         }
272 |         LogicalPlan::TableScan(TableScan { source, projection }) => {
273 |             writeln!(f, "TableScan:")?;
274 | 
275 |             write!(f, "{}", "  ".repeat(depth + 1))?;
276 |             writeln!(f, "source: {:?}", source.source_name())?;
277 | 
278 |             write!(f, "{}", "  ".repeat(depth + 1))?;
279 |             writeln!(f, "projection: {:?}", projection)
280 |         }
281 |         LogicalPlan::CrossJoin(Join {
282 |             left,
283 |             right,
284 |             on: _,
285 |             join_type,
286 |             schema,
287 |         }) => {
288 |             writeln!(f, "Join:")?;
289 | 
290 |             write!(f, "{}", "  ".repeat(depth + 1))?;
291 |             writeln!(f, "left:")?;
292 |             do_pretty_print(left.as_ref(), f, depth + 2)?;
293 | 
294 |             write!(f, "{}", "  ".repeat(depth + 1))?;
295 |             writeln!(f, "right:")?;
296 |             do_pretty_print(right.as_ref(), f, depth + 2)?;
297 | 
298 |             write!(f, "{}", "  ".repeat(depth + 1))?;
299 |             writeln!(f, "join_type: {:?}", join_type)?;
300 | 
301 |             write!(f, "{}", "  ".repeat(depth + 1))?;
302 |             writeln!(f, "schema: {:?}", schema)
303 |         }
304 |     }
305 | }
306 | 
307 | #[cfg(test)]
308 | mod tests {
309 |     use super::*;
310 |     use crate::datasource::EmptyTable;
311 | 
312 |     use crate::error::Result;
313 |     use crate::logical_plan::expression::*;
314 | 
315 |     /// Create LogicalPlan
316 |     #[test]
317 |     fn create_logical_plan() -> Result<()> {
318 |         let schema = NaiveSchema::empty();
319 |         let source = EmptyTable::try_create(schema)?;
320 | 
321 |         let scan = LogicalPlan::TableScan(TableScan {
322 |             source,
323 |             projection: None,
324 |         });
325 | 
326 |         let filter_expr = LogicalExpr::BinaryExpr(BinaryExpr {
327 |             left: Box::new(LogicalExpr::column(None, "state".to_string())),
328 |             op: Operator::Eq,
329 |             right: Box::new(LogicalExpr::Literal(ScalarValue::Utf8(Some(
330 |                 "CO".to_string(),
331 |             )))),
332 |         });
333 | 
334 |         let _selection = LogicalPlan::Filter(Filter {
335 |             predicate: filter_expr,
336 |             input: Arc::new(scan),
337 |         });
338 | 
339 |         let _projection = vec![
340 |             LogicalExpr::column(None, "id".to_string()),
341 |             LogicalExpr::column(None, "first_name".to_string()),
342 |             LogicalExpr::column(None, "last_name".to_string()),
343 |             LogicalExpr::column(None, "state".to_string()),
344 |             LogicalExpr::column(None, "salary".to_string()),
345 |         ];
346 | 
347 |         Ok(())
348 |     }
349 | 
350 |     #[test]
351 |     fn print_logical_plan() {
352 |         let schema = NaiveSchema::empty();
353 |         let source = EmptyTable::try_create(schema.clone()).unwrap();
354 | 
355 |         let scan = LogicalPlan::TableScan(TableScan {
356 |             source,
357 |             projection: None,
358 |         });
359 | 
360 |         assert_eq!(
361 |             "TableScan:\
362 |             \n  source: \"EmptyTable\"\
363 |             \n  projection: None\n",
364 |             format!("{}", scan)
365 |         );
366 | 
367 |         let scan = Arc::new(scan);
368 | 
369 |         let limit = LogicalPlan::Limit(Limit {
370 |             n: 233,
371 |             input: scan.clone(),
372 |         });
373 | 
374 |         assert_eq!(
375 |             "Limit:\
376 |             \n  n: 233\
377 |             \n  input:\
378 |             \n    TableScan:\
379 |             \n      source: \"EmptyTable\"\
380 |             \n      projection: None\n",
381 |             format!("{}", limit)
382 |         );
383 | 
384 |         let join = LogicalPlan::Join(Join {
385 |             left: scan.clone(),
386 |             right: scan,
387 |             on: vec![],
388 |             join_type: JoinType::Inner,
389 |             schema,
390 |         });
391 | 
392 |         assert_eq!(
393 |             "Join:\
394 |             \n  left:\
395 |             \n    TableScan:\
396 |             \n      source: \"EmptyTable\"\
397 |             \n      projection: None\
398 |             \n  right:\
399 |             \n    TableScan:\
400 |             \n      source: \"EmptyTable\"\
401 |             \n      projection: None\
402 |             \n  on: []\
403 |             \n  join_type: Inner\
404 |             \n  schema: NaiveSchema { fields: [] }\n",
405 |             format!("{}", join)
406 |         );
407 |     }
408 | }
409 | 


--------------------------------------------------------------------------------
/src/logical_plan/schema.rs:
--------------------------------------------------------------------------------
  1 | // Licensed to the Apache Software Foundation (ASF) under one
  2 | // or more contributor license agreements.  See the NOTICE file
  3 | // distributed with this work for additional information
  4 | // regarding copyright ownership.  The ASF licenses this file
  5 | // to you under the Apache License, Version 2.0 (the
  6 | // "License"); you may not use this file except in compliance
  7 | // with the License.  You may obtain a copy of the License at
  8 | //
  9 | //   http://www.apache.org/licenses/LICENSE-2.0
 10 | //
 11 | // Unless required by applicable law or agreed to in writing,
 12 | // software distributed under the License is distributed on an
 13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 14 | // KIND, either express or implied.  See the License for the
 15 | // specific language governing permissions and limitations
 16 | // under the License.
 17 | 
 18 | /*
 19 |  * @Author: Veeupup
 20 |  * @Date: 2022-05-18 13:45:10
 21 |  * @Last Modified by: Veeupup
 22 |  * @Last Modified time: 2022-05-18 17:30:21
 23 |  *
 24 |  * Arrow Field does not have table/relation name as its proroties
 25 |  * So we need a Schema to define inner schema with table name
 26 |  *
 27 |  * Code Ideas come from https://github.com/apache/arrow-datafusion/
 28 |  *
 29 |  */
 30 | 
 31 | use arrow::datatypes::{DataType, SchemaRef};
 32 | use arrow::datatypes::{Field, Schema};
 33 | 
 34 | use crate::error::ErrorCode;
 35 | use crate::error::Result;
 36 | 
 37 | #[derive(Debug, Clone)]
 38 | pub struct NaiveSchema {
 39 |     pub fields: Vec<NaiveField>,
 40 | }
 41 | 
 42 | impl NaiveSchema {
 43 |     #[allow(unused)]
 44 |     pub fn empty() -> Self {
 45 |         Self { fields: vec![] }
 46 |     }
 47 | 
 48 |     pub fn new(fields: Vec<NaiveField>) -> Self {
 49 |         // TODO(veeupup): check if we have duplicated name field
 50 |         Self { fields }
 51 |     }
 52 | 
 53 |     #[allow(unused)]
 54 |     pub fn from_qualified(qualifier: &str, schema: &Schema) -> Self {
 55 |         Self::new(
 56 |             schema
 57 |                 .fields()
 58 |                 .iter()
 59 |                 .map(|field| NaiveField {
 60 |                     field: field.clone(),
 61 |                     qualifier: Some(qualifier.to_owned()),
 62 |                 })
 63 |                 .collect(),
 64 |         )
 65 |     }
 66 | 
 67 |     pub fn from_unqualified(schema: &Schema) -> Self {
 68 |         Self::new(
 69 |             schema
 70 |                 .fields()
 71 |                 .iter()
 72 |                 .map(|field| NaiveField {
 73 |                     field: field.clone(),
 74 |                     qualifier: None,
 75 |                 })
 76 |                 .collect(),
 77 |         )
 78 |     }
 79 | 
 80 |     /// join two schema
 81 |     pub fn join(&self, schema: &NaiveSchema) -> Self {
 82 |         let mut fields = self.fields.clone();
 83 |         fields.extend_from_slice(schema.fields().as_slice());
 84 |         Self::new(fields)
 85 |     }
 86 | 
 87 |     pub fn fields(&self) -> &Vec<NaiveField> {
 88 |         &self.fields
 89 |     }
 90 | 
 91 |     #[allow(unused)]
 92 |     pub fn field(&self, i: usize) -> &NaiveField {
 93 |         &self.fields[i]
 94 |     }
 95 | 
 96 |     #[allow(unused)]
 97 |     pub fn index_of(&self, name: &str) -> Result<usize> {
 98 |         for i in 0..self.fields().len() {
 99 |             if self.fields[i].name() == name {
100 |                 return Ok(i);
101 |             }
102 |         }
103 |         Err(ErrorCode::NoSuchField)
104 |     }
105 | 
106 |     #[allow(unused)]
107 |     /// Find the field with the given name
108 |     pub fn field_with_name(&self, relation_name: Option<&str>, name: &str) -> Result<NaiveField> {
109 |         if let Some(relation_name) = relation_name {
110 |             self.field_with_qualified_name(relation_name, name)
111 |         } else {
112 |             self.field_with_unqualified_name(name)
113 |         }
114 |     }
115 | 
116 |     pub fn field_with_unqualified_name(&self, name: &str) -> Result<NaiveField> {
117 |         let matches = self
118 |             .fields
119 |             .iter()
120 |             .filter(|field| field.name() == name)
121 |             .collect::<Vec<_>>();
122 |         match matches.len() {
123 |             0 => Err(ErrorCode::PlanError(format!("No field named '{}'", name))),
124 |             _ => Ok(matches[0].to_owned()),
125 |             // TODO(veeupup): multi same name, and we need to return Error
126 |             // _ => Err(ErrorCode::PlanError(format!(
127 |             //     "Ambiguous reference to field named '{}'",
128 |             //     name
129 |             // ))),
130 |         }
131 |     }
132 | 
133 |     pub fn field_with_qualified_name(&self, relation_name: &str, name: &str) -> Result<NaiveField> {
134 |         let matches = self
135 |             .fields
136 |             .iter()
137 |             .filter(|field| {
138 |                 field.qualifier == Some(relation_name.to_owned()) && field.name() == name
139 |             })
140 |             .collect::<Vec<_>>();
141 |         match matches.len() {
142 |             0 => Err(ErrorCode::PlanError(format!("No field named '{}'", name))),
143 |             _ => Ok(matches[0].to_owned()),
144 |             // TODO(veeupup): multi same name, and we need to return Error
145 |             // _ => Err(ErrorCode::PlanError(format!(
146 |             //     "Ambiguous reference to field named '{}'",
147 |             //     name
148 |             // ))),
149 |         }
150 |     }
151 | }
152 | 
153 | impl From<NaiveSchema> for Schema {
154 |     fn from(schema: NaiveSchema) -> Self {
155 |         Schema::new(
156 |             schema
157 |                 .fields
158 |                 .into_iter()
159 |                 .map(|f| {
160 |                     if f.qualifier().is_some() {
161 |                         Field::new(
162 |                             f.qualified_name().as_str(),
163 |                             f.data_type().to_owned(),
164 |                             f.is_nullable(),
165 |                         )
166 |                     } else {
167 |                         f.field
168 |                     }
169 |                 })
170 |                 .collect(),
171 |         )
172 |     }
173 | }
174 | 
175 | // impl Into<Schema> for NaiveSchema {
176 | //     /// Convert a schema into a DFSchema
177 | //     fn into(self) -> Schema {
178 | //         Schema::new(
179 | //             self.fields
180 | //                 .into_iter()
181 | //                 .map(|f| {
182 | //                     if f.qualifier().is_some() {
183 | //                         Field::new(
184 | //                             f.qualified_name().as_str(),
185 | //                             f.data_type().to_owned(),
186 | //                             f.is_nullable(),
187 | //                         )
188 | //                     } else {
189 | //                         f.field
190 | //                     }
191 | //                 })
192 | //                 .collect(),
193 | //         )
194 | //     }
195 | // }
196 | 
197 | impl From<NaiveSchema> for SchemaRef {
198 |     fn from(schema: NaiveSchema) -> Self {
199 |         SchemaRef::new(schema.into())
200 |     }
201 | }
202 | 
203 | // impl Into<SchemaRef> for NaiveSchema {
204 | //     fn into(self) -> SchemaRef {
205 | //         SchemaRef::new(self.into())
206 | //     }
207 | // }
208 | 
209 | /// NaiveField wraps an Arrow field and adds an optional qualifier
210 | #[derive(Debug, Clone, PartialEq, Eq)]
211 | pub struct NaiveField {
212 |     /// Optional qualifier (usually a table or relation name)
213 |     qualifier: Option<String>,
214 |     /// Arrow field definition
215 |     field: Field,
216 | }
217 | 
218 | impl NaiveField {
219 |     pub fn new(qualifier: Option<&str>, name: &str, data_type: DataType, nullable: bool) -> Self {
220 |         Self {
221 |             qualifier: qualifier.map(|s| s.to_owned()),
222 |             field: Field::new(name, data_type, nullable),
223 |         }
224 |     }
225 | 
226 |     #[allow(unused)]
227 |     pub fn from(field: Field) -> Self {
228 |         Self {
229 |             qualifier: None,
230 |             field,
231 |         }
232 |     }
233 | 
234 |     #[allow(unused)]
235 |     pub fn from_qualified(qualifier: &str, field: Field) -> Self {
236 |         Self {
237 |             qualifier: Some(qualifier.to_owned()),
238 |             field,
239 |         }
240 |     }
241 | 
242 |     pub fn name(&self) -> &String {
243 |         self.field.name()
244 |     }
245 | 
246 |     /// Returns an immutable reference to the `NaiveField`'s data-type
247 |     pub fn data_type(&self) -> &DataType {
248 |         self.field.data_type()
249 |     }
250 | 
251 |     /// Indicates whether this `NaiveField` supports null values
252 |     pub fn is_nullable(&self) -> bool {
253 |         self.field.is_nullable()
254 |     }
255 | 
256 |     /// Returns a reference to the `NaiveField`'s qualified name
257 |     pub fn qualified_name(&self) -> String {
258 |         if let Some(relation_name) = &self.qualifier {
259 |             format!("{}.{}", relation_name, self.field.name())
260 |         } else {
261 |             self.field.name().to_owned()
262 |         }
263 |     }
264 | 
265 |     /// Get the optional qualifier
266 |     pub fn qualifier(&self) -> Option<&String> {
267 |         self.qualifier.as_ref()
268 |     }
269 | }
270 | 
271 | impl From<NaiveField> for Field {
272 |     fn from(field: NaiveField) -> Self {
273 |         Field::new(field.name(), field.data_type().clone(), field.is_nullable())
274 |     }
275 | }
276 | 


--------------------------------------------------------------------------------
/src/main.rs:
--------------------------------------------------------------------------------
 1 | use naive_db::print_result;
 2 | use naive_db::CsvConfig;
 3 | use naive_db::NaiveDB;
 4 | use naive_db::Result;
 5 | 
 6 | fn main() -> Result<()> {
 7 |     let mut db = NaiveDB::default();
 8 | 
 9 |     db.create_csv_table("t1", "data/test_data.csv", CsvConfig::default())?;
10 | 
11 |     // select
12 |     let ret = db.run_sql("select id, name, age + 100 from t1 where id < 9 limit 3 offset 2")?;
13 |     print_result(&ret)?;
14 | 
15 |     // Join
16 |     db.create_csv_table("employee", "data/employee.csv", CsvConfig::default())?;
17 |     db.create_csv_table("rank", "data/rank.csv", CsvConfig::default())?;
18 |     db.create_csv_table("department", "data/department.csv", CsvConfig::default())?;
19 | 
20 |     let ret = db.run_sql(
21 |         "
22 |         select id, name, rank_name, department_name
23 |         from employee
24 |         join rank on 
25 |             employee.rank = rank.id  
26 |         join department on
27 |                 employee.department_id = department.id
28 |     ",
29 |     )?;
30 |     print_result(&ret)?;
31 | 
32 |     let ret = db.run_sql("select * from employee join rank")?;
33 |     print_result(&ret)?;
34 | 
35 |     // aggregate
36 |     let ret = db.run_sql(
37 |         "
38 |         select count(id), sum(age), sum(score), avg(score), max(score), min(score) 
39 |         from t1 group by id % 3",
40 |     )?;
41 |     print_result(&ret)?;
42 | 
43 |     Ok(())
44 | }
45 | 


--------------------------------------------------------------------------------
/src/optimizer/mod.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-13 17:59:40
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | 
 7 | mod projection_push_down;
 8 | 
 9 | use crate::logical_plan::plan::LogicalPlan;
10 | use std::sync::Arc;
11 | 
12 | #[derive(Default)]
13 | pub struct Optimizer {
14 |     rules: Vec<Arc<dyn OptimizerRule>>,
15 | }
16 | 
17 | pub trait OptimizerRule {
18 |     fn optimize(&self, plan: &LogicalPlan) -> LogicalPlan;
19 | }
20 | 
21 | impl Optimizer {
22 |     pub fn optimize(&self, plan: LogicalPlan) -> LogicalPlan {
23 |         let mut plan = plan;
24 |         for rule in &self.rules {
25 |             plan = rule.optimize(&plan);
26 |         }
27 |         plan
28 |     }
29 | }
30 | 


--------------------------------------------------------------------------------
/src/optimizer/projection_push_down.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-13 18:54:33
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | 
 7 | use super::OptimizerRule;
 8 | use crate::logical_plan::plan::LogicalPlan;
 9 | 
10 | pub struct ProjectionPushDown;
11 | 
12 | impl OptimizerRule for ProjectionPushDown {
13 |     fn optimize(&self, plan: &LogicalPlan) -> LogicalPlan {
14 |         // TODO(veeupup): do projection push down
15 |         plan.clone()
16 |     }
17 | }
18 | 


--------------------------------------------------------------------------------
/src/physical_plan/aggregate/avg.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * @Author: Veeupup
  3 |  * @Date: 2022-05-20 19:09:44
  4 |  * @Last Modified by: Veeupup
  5 |  * @Last Modified time: 2022-05-20 21:19:45
  6 |  */
  7 | 
  8 | use arrow::array::Array;
  9 | use arrow::array::PrimitiveArray;
 10 | use arrow::datatypes::DataType;
 11 | 
 12 | use arrow::datatypes::Float64Type;
 13 | use arrow::datatypes::Int64Type;
 14 | use arrow::datatypes::UInt64Type;
 15 | use arrow::record_batch::RecordBatch;
 16 | 
 17 | use super::AggregateOperator;
 18 | use crate::error::ErrorCode;
 19 | use crate::logical_plan::expression::ScalarValue;
 20 | use crate::logical_plan::schema::NaiveField;
 21 | use crate::logical_plan::schema::NaiveSchema;
 22 | use crate::physical_plan::ColumnExpr;
 23 | use crate::physical_plan::PhysicalExpr;
 24 | use crate::Result;
 25 | 
 26 | #[derive(Debug, Clone)]
 27 | pub struct Avg {
 28 |     sum: f64,
 29 |     cnt: u32,
 30 |     // physical column
 31 |     col_expr: ColumnExpr,
 32 | }
 33 | 
 34 | impl Avg {
 35 |     pub fn create(col_expr: ColumnExpr) -> Box<dyn AggregateOperator> {
 36 |         Box::new(Self {
 37 |             sum: 0.0,
 38 |             cnt: 0,
 39 |             col_expr,
 40 |         })
 41 |     }
 42 | }
 43 | 
 44 | macro_rules! update_match {
 45 |     ($COL: expr, $DT: ty, $SELF: expr) => {{
 46 |         let col = $COL.as_any().downcast_ref::<PrimitiveArray<$DT>>().unwrap();
 47 |         for val in col.into_iter().flatten() {
 48 |             $SELF.sum += val as f64;
 49 |             $SELF.cnt += 1;
 50 |         }
 51 |     }};
 52 | }
 53 | 
 54 | macro_rules! update_value {
 55 |     ($COL: expr, $DT: ty, $IDX: expr, $SELF: expr) => {{
 56 |         let col = $COL.as_any().downcast_ref::<PrimitiveArray<$DT>>().unwrap();
 57 |         if !col.is_null($IDX) {
 58 |             $SELF.sum += col.value($IDX) as f64;
 59 |             $SELF.cnt += 1;
 60 |         }
 61 |     }};
 62 | }
 63 | 
 64 | impl AggregateOperator for Avg {
 65 |     fn data_field(&self, schema: &NaiveSchema) -> Result<NaiveField> {
 66 |         // find by name
 67 |         if let Some(name) = &self.col_expr.name {
 68 |             let field = schema.field_with_unqualified_name(name)?;
 69 |             return Ok(NaiveField::new(
 70 |                 None,
 71 |                 format!("avg({})", field.name()).as_str(),
 72 |                 DataType::Float64,
 73 |                 false,
 74 |             ));
 75 |         }
 76 | 
 77 |         if let Some(idx) = &self.col_expr.idx {
 78 |             let field = schema.field(*idx);
 79 |             return Ok(NaiveField::new(
 80 |                 None,
 81 |                 format!("avg({})", field.name()).as_str(),
 82 |                 DataType::Float64,
 83 |                 false,
 84 |             ));
 85 |         }
 86 | 
 87 |         Err(ErrorCode::LogicalError(
 88 |             "ColumnExpr must has name or idx".to_string(),
 89 |         ))
 90 |     }
 91 | 
 92 |     fn update_batch(&mut self, data: &RecordBatch) -> Result<()> {
 93 |         let col = self.col_expr.evaluate(data)?.into_array();
 94 |         match col.data_type() {
 95 |             DataType::Int64 => update_match!(col, Int64Type, self),
 96 |             DataType::UInt64 => update_match!(col, UInt64Type, self),
 97 |             DataType::Float64 => update_match!(col, Float64Type, self),
 98 |             _ => {
 99 |                 return Err(ErrorCode::NotSupported(format!(
100 |                     "Avg func for {:?} is not supported",
101 |                     col.data_type()
102 |                 )))
103 |             }
104 |         }
105 | 
106 |         Ok(())
107 |     }
108 | 
109 |     fn update(&mut self, data: &RecordBatch, idx: usize) -> Result<()> {
110 |         let col = self.col_expr.evaluate(data)?.into_array();
111 |         match col.data_type() {
112 |             DataType::Int64 => update_value!(col, Int64Type, idx, self),
113 |             DataType::UInt64 => update_value!(col, UInt64Type, idx, self),
114 |             DataType::Float64 => update_value!(col, Float64Type, idx, self),
115 |             _ => unimplemented!(),
116 |         }
117 |         Ok(())
118 |     }
119 | 
120 |     fn evaluate(&self) -> Result<ScalarValue> {
121 |         Ok(ScalarValue::Float64(Some(self.sum / self.cnt as f64)))
122 |     }
123 | 
124 |     fn clear_state(&mut self) {
125 |         self.sum = 0.0;
126 |         self.cnt = 0;
127 |     }
128 | }
129 | 


--------------------------------------------------------------------------------
/src/physical_plan/aggregate/count.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-20 19:06:45
 4 |  * @Last Modified by: Veeupup
 5 |  * @Last Modified time: 2022-05-20 21:13:03
 6 |  */
 7 | 
 8 | use arrow::datatypes::DataType;
 9 | 
10 | use arrow::record_batch::RecordBatch;
11 | 
12 | use super::AggregateOperator;
13 | use crate::error::ErrorCode;
14 | use crate::logical_plan::expression::ScalarValue;
15 | use crate::logical_plan::schema::NaiveField;
16 | use crate::physical_plan::aggregate::NaiveSchema;
17 | use crate::physical_plan::ColumnExpr;
18 | use crate::physical_plan::PhysicalExpr;
19 | use crate::Result;
20 | 
21 | #[derive(Debug, Clone)]
22 | pub struct Count {
23 |     cnt: u64,
24 |     col_expr: ColumnExpr,
25 | }
26 | 
27 | impl Count {
28 |     pub fn create(col_expr: ColumnExpr) -> Box<dyn AggregateOperator> {
29 |         Box::new(Self { cnt: 0, col_expr })
30 |     }
31 | }
32 | 
33 | impl AggregateOperator for Count {
34 |     fn data_field(&self, schema: &NaiveSchema) -> Result<NaiveField> {
35 |         // find by name
36 |         if let Some(name) = &self.col_expr.name {
37 |             let field = schema.field_with_unqualified_name(name)?;
38 |             return Ok(NaiveField::new(
39 |                 None,
40 |                 format!("count({})", field.name()).as_str(),
41 |                 DataType::UInt64,
42 |                 false,
43 |             ));
44 |         }
45 | 
46 |         if let Some(idx) = &self.col_expr.idx {
47 |             let field = schema.field(*idx);
48 |             return Ok(NaiveField::new(
49 |                 None,
50 |                 format!("count({})", field.name()).as_str(),
51 |                 DataType::UInt64,
52 |                 false,
53 |             ));
54 |         }
55 | 
56 |         Err(ErrorCode::LogicalError(
57 |             "ColumnExpr must has name or idx".to_string(),
58 |         ))
59 |     }
60 | 
61 |     fn update_batch(&mut self, data: &RecordBatch) -> Result<()> {
62 |         let col = self.col_expr.evaluate(data)?.into_array();
63 |         self.cnt += (col.len() - col.null_count()) as u64;
64 |         Ok(())
65 |     }
66 | 
67 |     fn update(&mut self, data: &RecordBatch, idx: usize) -> Result<()> {
68 |         let col = self.col_expr.evaluate(data)?.into_array();
69 |         if !col.is_null(idx) {
70 |             self.cnt += 1;
71 |         }
72 |         Ok(())
73 |     }
74 | 
75 |     fn evaluate(&self) -> Result<ScalarValue> {
76 |         Ok(ScalarValue::UInt64(Some(self.cnt)))
77 |     }
78 | 
79 |     fn clear_state(&mut self) {
80 |         self.cnt = 0;
81 |     }
82 | }
83 | 


--------------------------------------------------------------------------------
/src/physical_plan/aggregate/max.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * @Author: Veeupup
  3 |  * @Date: 2022-05-20 19:09:44
  4 |  * @Last Modified by: Veeupup
  5 |  * @Last Modified time: 2022-05-20 21:19:45
  6 |  */
  7 | 
  8 | use arrow::array::Array;
  9 | use arrow::array::PrimitiveArray;
 10 | use arrow::datatypes::DataType;
 11 | 
 12 | use arrow::datatypes::Float64Type;
 13 | use arrow::datatypes::Int64Type;
 14 | use arrow::datatypes::UInt64Type;
 15 | use arrow::record_batch::RecordBatch;
 16 | use ordered_float::OrderedFloat;
 17 | 
 18 | use super::AggregateOperator;
 19 | use crate::error::ErrorCode;
 20 | use crate::logical_plan::expression::ScalarValue;
 21 | use crate::logical_plan::schema::NaiveField;
 22 | use crate::logical_plan::schema::NaiveSchema;
 23 | use crate::physical_plan::ColumnExpr;
 24 | use crate::physical_plan::PhysicalExpr;
 25 | use crate::Result;
 26 | 
 27 | #[derive(Debug, Clone)]
 28 | pub struct Max {
 29 |     // TODO(veeupup): should use generic type for Int64, UInt Float64
 30 |     val: OrderedFloat<f64>,
 31 |     // physical column
 32 |     col_expr: ColumnExpr,
 33 | }
 34 | 
 35 | impl Max {
 36 |     pub fn create(col_expr: ColumnExpr) -> Box<dyn AggregateOperator> {
 37 |         Box::new(Self {
 38 |             val: OrderedFloat::from(f64::MIN),
 39 |             col_expr,
 40 |         })
 41 |     }
 42 | }
 43 | 
 44 | macro_rules! update_match {
 45 |     ($COL: expr, $DT: ty, $SELF: expr) => {{
 46 |         let col = $COL.as_any().downcast_ref::<PrimitiveArray<$DT>>().unwrap();
 47 |         for val in col.into_iter().flatten() {
 48 |             let val = OrderedFloat::from(val as f64);
 49 |             if val > $SELF.val {
 50 |                 $SELF.val = val;
 51 |             }
 52 |         }
 53 |     }};
 54 | }
 55 | 
 56 | macro_rules! update_value {
 57 |     ($COL: expr, $DT: ty, $IDX: expr, $SELF: expr) => {{
 58 |         let col = $COL.as_any().downcast_ref::<PrimitiveArray<$DT>>().unwrap();
 59 |         if !col.is_null($IDX) {
 60 |             let val = OrderedFloat::from(col.value($IDX) as f64);
 61 |             if val > $SELF.val {
 62 |                 $SELF.val = val;
 63 |             }
 64 |         }
 65 |     }};
 66 | }
 67 | 
 68 | impl AggregateOperator for Max {
 69 |     fn data_field(&self, schema: &NaiveSchema) -> Result<NaiveField> {
 70 |         // find by name
 71 |         if let Some(name) = &self.col_expr.name {
 72 |             let field = schema.field_with_unqualified_name(name)?;
 73 |             return Ok(NaiveField::new(
 74 |                 None,
 75 |                 format!("max({})", field.name()).as_str(),
 76 |                 DataType::Float64,
 77 |                 false,
 78 |             ));
 79 |         }
 80 | 
 81 |         if let Some(idx) = &self.col_expr.idx {
 82 |             let field = schema.field(*idx);
 83 |             return Ok(NaiveField::new(
 84 |                 None,
 85 |                 format!("max({})", field.name()).as_str(),
 86 |                 DataType::Float64,
 87 |                 false,
 88 |             ));
 89 |         }
 90 | 
 91 |         Err(ErrorCode::LogicalError(
 92 |             "ColumnExpr must has name or idx".to_string(),
 93 |         ))
 94 |     }
 95 | 
 96 |     fn update_batch(&mut self, data: &RecordBatch) -> Result<()> {
 97 |         let col = self.col_expr.evaluate(data)?.into_array();
 98 |         match col.data_type() {
 99 |             DataType::Int64 => update_match!(col, Int64Type, self),
100 |             DataType::UInt64 => update_match!(col, UInt64Type, self),
101 |             DataType::Float64 => update_match!(col, Float64Type, self),
102 |             _ => {
103 |                 return Err(ErrorCode::NotSupported(format!(
104 |                     "Max func for {:?} is not supported",
105 |                     col.data_type()
106 |                 )))
107 |             }
108 |         }
109 | 
110 |         Ok(())
111 |     }
112 | 
113 |     fn update(&mut self, data: &RecordBatch, idx: usize) -> Result<()> {
114 |         let col = self.col_expr.evaluate(data)?.into_array();
115 |         match col.data_type() {
116 |             DataType::Int64 => update_value!(col, Int64Type, idx, self),
117 |             DataType::UInt64 => update_value!(col, UInt64Type, idx, self),
118 |             DataType::Float64 => update_value!(col, Float64Type, idx, self),
119 |             _ => unimplemented!(),
120 |         }
121 |         Ok(())
122 |     }
123 | 
124 |     fn evaluate(&self) -> Result<ScalarValue> {
125 |         Ok(ScalarValue::Float64(Some(self.val.into())))
126 |     }
127 | 
128 |     fn clear_state(&mut self) {
129 |         self.val = OrderedFloat::from(f64::MIN);
130 |     }
131 | }
132 | 


--------------------------------------------------------------------------------
/src/physical_plan/aggregate/min.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * @Author: Veeupup
  3 |  * @Date: 2022-05-20 19:09:44
  4 |  * @Last Modified by: Veeupup
  5 |  * @Last Modified time: 2022-05-20 21:19:45
  6 |  */
  7 | 
  8 | use arrow::array::Array;
  9 | use arrow::array::PrimitiveArray;
 10 | use arrow::datatypes::DataType;
 11 | 
 12 | use arrow::datatypes::Float64Type;
 13 | use arrow::datatypes::Int64Type;
 14 | use arrow::datatypes::UInt64Type;
 15 | use arrow::record_batch::RecordBatch;
 16 | use ordered_float::OrderedFloat;
 17 | 
 18 | use super::AggregateOperator;
 19 | use crate::error::ErrorCode;
 20 | use crate::logical_plan::expression::ScalarValue;
 21 | use crate::logical_plan::schema::NaiveField;
 22 | use crate::logical_plan::schema::NaiveSchema;
 23 | use crate::physical_plan::ColumnExpr;
 24 | use crate::physical_plan::PhysicalExpr;
 25 | use crate::Result;
 26 | 
 27 | #[derive(Debug, Clone)]
 28 | pub struct Min {
 29 |     // TODO(veeupup): should use generic type for Int64, UInt Float64
 30 |     val: OrderedFloat<f64>,
 31 |     // physical column
 32 |     col_expr: ColumnExpr,
 33 | }
 34 | 
 35 | impl Min {
 36 |     pub fn create(col_expr: ColumnExpr) -> Box<dyn AggregateOperator> {
 37 |         Box::new(Self {
 38 |             val: OrderedFloat::from(f64::MAX),
 39 |             col_expr,
 40 |         })
 41 |     }
 42 | }
 43 | 
 44 | macro_rules! update_match {
 45 |     ($COL: expr, $DT: ty, $SELF: expr) => {{
 46 |         let col = $COL.as_any().downcast_ref::<PrimitiveArray<$DT>>().unwrap();
 47 |         for val in col.into_iter().flatten() {
 48 |             let val = OrderedFloat::from(val as f64);
 49 |             if val < $SELF.val {
 50 |                 $SELF.val = val;
 51 |             }
 52 |         }
 53 |     }};
 54 | }
 55 | 
 56 | macro_rules! update_value {
 57 |     ($COL: expr, $DT: ty, $IDX: expr, $SELF: expr) => {{
 58 |         let col = $COL.as_any().downcast_ref::<PrimitiveArray<$DT>>().unwrap();
 59 |         if !col.is_null($IDX) {
 60 |             let val = OrderedFloat::from(col.value($IDX) as f64);
 61 |             if val < $SELF.val {
 62 |                 $SELF.val = val;
 63 |             }
 64 |         }
 65 |     }};
 66 | }
 67 | 
 68 | impl AggregateOperator for Min {
 69 |     fn data_field(&self, schema: &NaiveSchema) -> Result<NaiveField> {
 70 |         // find by name
 71 |         if let Some(name) = &self.col_expr.name {
 72 |             let field = schema.field_with_unqualified_name(name)?;
 73 |             return Ok(NaiveField::new(
 74 |                 None,
 75 |                 format!("min({})", field.name()).as_str(),
 76 |                 DataType::Float64,
 77 |                 false,
 78 |             ));
 79 |         }
 80 | 
 81 |         if let Some(idx) = &self.col_expr.idx {
 82 |             let field = schema.field(*idx);
 83 |             return Ok(NaiveField::new(
 84 |                 None,
 85 |                 format!("min({})", field.name()).as_str(),
 86 |                 DataType::Float64,
 87 |                 false,
 88 |             ));
 89 |         }
 90 | 
 91 |         Err(ErrorCode::LogicalError(
 92 |             "ColumnExpr must has name or idx".to_string(),
 93 |         ))
 94 |     }
 95 | 
 96 |     fn update_batch(&mut self, data: &RecordBatch) -> Result<()> {
 97 |         let col = self.col_expr.evaluate(data)?.into_array();
 98 |         match col.data_type() {
 99 |             DataType::Int64 => update_match!(col, Int64Type, self),
100 |             DataType::UInt64 => update_match!(col, UInt64Type, self),
101 |             DataType::Float64 => update_match!(col, Float64Type, self),
102 |             _ => {
103 |                 return Err(ErrorCode::NotSupported(format!(
104 |                     "min func for {:?} is not supported",
105 |                     col.data_type()
106 |                 )))
107 |             }
108 |         }
109 | 
110 |         Ok(())
111 |     }
112 | 
113 |     fn update(&mut self, data: &RecordBatch, idx: usize) -> Result<()> {
114 |         let col = self.col_expr.evaluate(data)?.into_array();
115 |         match col.data_type() {
116 |             DataType::Int64 => update_value!(col, Int64Type, idx, self),
117 |             DataType::UInt64 => update_value!(col, UInt64Type, idx, self),
118 |             DataType::Float64 => update_value!(col, Float64Type, idx, self),
119 |             _ => unimplemented!(),
120 |         }
121 |         Ok(())
122 |     }
123 | 
124 |     fn evaluate(&self) -> Result<ScalarValue> {
125 |         Ok(ScalarValue::Float64(Some(self.val.into())))
126 |     }
127 | 
128 |     fn clear_state(&mut self) {
129 |         self.val = OrderedFloat::from(f64::MAX);
130 |     }
131 | }
132 | 


--------------------------------------------------------------------------------
/src/physical_plan/aggregate/mod.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * @Author: Veeupup
  3 |  * @Date: 2022-05-20 14:12:40
  4 |  * @Last Modified by: Veeupup
  5 |  * @Last Modified time: 2022-05-20 21:24:37
  6 |  */
  7 | 
  8 | pub mod avg;
  9 | pub mod count;
 10 | pub mod max;
 11 | pub mod min;
 12 | pub mod sum;
 13 | 
 14 | use std::collections::HashMap;
 15 | use std::fmt::Debug;
 16 | use std::sync::{Arc, Mutex};
 17 | 
 18 | use crate::error::ErrorCode;
 19 | use crate::logical_plan::schema::NaiveField;
 20 | use crate::logical_plan::{expression::ScalarValue, schema::NaiveSchema};
 21 | 
 22 | use super::{concat_batches, PhysicalPlan, PhysicalPlanRef};
 23 | 
 24 | use crate::physical_plan::PhysicalExprRef;
 25 | use crate::Result;
 26 | use arrow::array::{PrimitiveArray, StringArray};
 27 | use arrow::datatypes::{DataType, Field, Int64Type, Schema, UInt64Type};
 28 | use arrow::record_batch::RecordBatch;
 29 | 
 30 | #[derive(Debug)]
 31 | pub struct PhysicalAggregatePlan {
 32 |     pub group_expr: Vec<PhysicalExprRef>,
 33 |     pub aggr_ops: Mutex<Vec<Box<dyn AggregateOperator>>>,
 34 |     pub input: PhysicalPlanRef,
 35 |     pub schema: NaiveSchema,
 36 | }
 37 | 
 38 | impl PhysicalAggregatePlan {
 39 |     pub fn create(
 40 |         group_expr: Vec<PhysicalExprRef>,
 41 |         aggr_ops: Vec<Box<dyn AggregateOperator>>,
 42 |         input: PhysicalPlanRef,
 43 |     ) -> PhysicalPlanRef {
 44 |         let schema = input.schema().clone();
 45 |         Arc::new(Self {
 46 |             group_expr,
 47 |             aggr_ops: Mutex::new(aggr_ops),
 48 |             input,
 49 |             schema,
 50 |         })
 51 |     }
 52 | }
 53 | 
 54 | macro_rules! group_by_datatype {
 55 |     ($VAL: expr, $DT: ty, $GROUP_DT: ty, $GROUP_IDXS: expr, $AGGR_OPS: expr, $SINGLE_BATCH: expr, $SCHEMA: expr, $LEN: expr) => {{
 56 |         let group_val = $VAL.as_any().downcast_ref::<PrimitiveArray<$DT>>().unwrap();
 57 |         // group val -> Vec<index>
 58 |         // such as group by number % 3, then we will have group_idxs like
 59 |         // 0 -> [0,3,6], 1 -> [1,2,5] ...
 60 |         let mut group_idxs = HashMap::<$GROUP_DT, Vec<usize>>::new();
 61 | 
 62 |         // split into different groups
 63 |         for (idx, val) in group_val.iter().enumerate() {
 64 |             if let Some(val) = val {
 65 |                 if let Some(idxs) = group_idxs.get_mut(&val) {
 66 |                     idxs.push(idx);
 67 |                 } else {
 68 |                     group_idxs.insert(val, vec![idx]);
 69 |                 }
 70 |             }
 71 |         }
 72 | 
 73 |         // for each group, calculate aggregating value
 74 |         let mut batches = vec![];
 75 | 
 76 |         for group_idx in group_idxs.values() {
 77 |             for idx in group_idx {
 78 |                 for i in 0..$LEN {
 79 |                     $AGGR_OPS.get_mut(i).unwrap().update(&$SINGLE_BATCH, *idx)?;
 80 |                 }
 81 |             }
 82 | 
 83 |             let mut arrays = vec![];
 84 |             // let aggr_ops = self.aggr_ops.lock().unwrap();
 85 |             for aggr_op in $AGGR_OPS.iter() {
 86 |                 let x = aggr_op.evaluate()?;
 87 |                 arrays.push(x.into_array(1));
 88 |             }
 89 | 
 90 |             let record_batch = RecordBatch::try_new($SCHEMA.clone(), arrays)?;
 91 |             batches.push(record_batch);
 92 | 
 93 |             // for next group aggregate usage
 94 |             for i in 0..$LEN {
 95 |                 $AGGR_OPS.get_mut(i).unwrap().clear_state();
 96 |             }
 97 |         }
 98 | 
 99 |         let single_batch = concat_batches(&$SCHEMA, &batches)?;
100 |         Ok(vec![single_batch])
101 |     }};
102 | }
103 | 
104 | impl PhysicalPlan for PhysicalAggregatePlan {
105 |     fn schema(&self) -> &NaiveSchema {
106 |         &self.schema
107 |     }
108 | 
109 |     fn children(&self) -> Result<Vec<PhysicalPlanRef>> {
110 |         Ok(vec![self.input.clone()])
111 |     }
112 | 
113 |     fn execute(&self) -> Result<Vec<RecordBatch>> {
114 |         // output schema
115 |         let mut aggr_ops = self.aggr_ops.lock().unwrap();
116 |         let len = aggr_ops.len();
117 |         let mut fields: Vec<Field> = vec![];
118 |         for aggr_op in aggr_ops.iter() {
119 |             fields.push(aggr_op.data_field(self.schema())?.into());
120 |         }
121 |         let schema = Arc::new(Schema::new(fields));
122 | 
123 |         if self.group_expr.is_empty() {
124 |             let batches = self.input.execute()?;
125 | 
126 |             for batch in &batches {
127 |                 for i in 0..len {
128 |                     aggr_ops.get_mut(i).unwrap().update_batch(batch)?;
129 |                 }
130 |             }
131 | 
132 |             let mut arrays = vec![];
133 |             for aggr_op in aggr_ops.iter() {
134 |                 let x = aggr_op.evaluate()?;
135 |                 arrays.push(x.into_array(1));
136 |             }
137 | 
138 |             let record_batch = RecordBatch::try_new(schema, arrays)?;
139 |             Ok(vec![record_batch])
140 |         } else {
141 |             // TODO(veeupup): support multi group by expr
142 |             // such as `select sum(id) from t1 group by id % 3, age % 2`
143 |             let batches = self.input.execute()?;
144 |             let single_batch = concat_batches(&self.input.schema().clone().into(), &batches)?;
145 | 
146 |             let group_by_expr = &self.group_expr[0];
147 | 
148 |             let val = group_by_expr.evaluate(&single_batch)?.into_array();
149 |             match val.data_type() {
150 |                 DataType::Int64 => group_by_datatype!(
151 |                     val,
152 |                     Int64Type,
153 |                     i64,
154 |                     group_idxs,
155 |                     aggr_ops,
156 |                     single_batch,
157 |                     schema,
158 |                     len
159 |                 ),
160 |                 DataType::UInt64 => group_by_datatype!(
161 |                     val,
162 |                     UInt64Type,
163 |                     u64,
164 |                     group_idxs,
165 |                     aggr_ops,
166 |                     single_batch,
167 |                     schema,
168 |                     len
169 |                 ),
170 |                 DataType::Utf8 => {
171 |                     let group_val = val.as_any().downcast_ref::<StringArray>().unwrap();
172 |                     // group val -> Vec<index>
173 |                     // such as group by number % 3, then we will have group_idxs like
174 |                     // 0 -> [0,3,6], 1 -> [1,2,5] ...
175 |                     let mut group_idxs = HashMap::<String, Vec<usize>>::new();
176 | 
177 |                     // split into different groups
178 |                     for (idx, val) in group_val.iter().enumerate() {
179 |                         if let Some(val) = val {
180 |                             if let Some(idxs) = group_idxs.get_mut(val) {
181 |                                 idxs.push(idx);
182 |                             } else {
183 |                                 group_idxs.insert(val.to_string(), vec![idx]);
184 |                             }
185 |                         }
186 |                     }
187 | 
188 |                     // for each group, calculate aggregating value
189 |                     let mut batches = vec![];
190 | 
191 |                     for group_idx in group_idxs.values() {
192 |                         for idx in group_idx {
193 |                             for i in 0..len {
194 |                                 aggr_ops.get_mut(i).unwrap().update(&single_batch, *idx)?;
195 |                             }
196 |                         }
197 | 
198 |                         let mut arrays = vec![];
199 |                         // let aggr_ops = self.aggr_ops.lock().unwrap();
200 |                         for aggr_op in aggr_ops.iter() {
201 |                             let x = aggr_op.evaluate()?;
202 |                             arrays.push(x.into_array(1));
203 |                         }
204 | 
205 |                         let record_batch = RecordBatch::try_new(schema.clone(), arrays)?;
206 |                         batches.push(record_batch);
207 | 
208 |                         // for next group aggregate usage
209 |                         for i in 0..len {
210 |                             aggr_ops.get_mut(i).unwrap().clear_state();
211 |                         }
212 |                     }
213 | 
214 |                     let single_batch = concat_batches(&schema, &batches)?;
215 |                     Ok(vec![single_batch])
216 |                 }
217 |                 _ => Err(ErrorCode::NotSupported(
218 |                     "group by only support by `Int64`, `UInt64`, `String`".to_string(),
219 |                 )),
220 |             }
221 |         }
222 |     }
223 | }
224 | 
225 | pub trait AggregateOperator: Debug {
226 |     fn data_field(&self, schema: &NaiveSchema) -> Result<NaiveField>;
227 | 
228 |     fn update_batch(&mut self, data: &RecordBatch) -> Result<()>;
229 | 
230 |     fn update(&mut self, data: &RecordBatch, idx: usize) -> Result<()>;
231 | 
232 |     fn evaluate(&self) -> Result<ScalarValue>;
233 | 
234 |     fn clear_state(&mut self);
235 | }
236 | 


--------------------------------------------------------------------------------
/src/physical_plan/aggregate/sum.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * @Author: Veeupup
  3 |  * @Date: 2022-05-20 19:09:44
  4 |  * @Last Modified by: Veeupup
  5 |  * @Last Modified time: 2022-05-20 21:19:45
  6 |  */
  7 | 
  8 | use arrow::array::Array;
  9 | use arrow::array::PrimitiveArray;
 10 | use arrow::datatypes::DataType;
 11 | 
 12 | use arrow::datatypes::Float64Type;
 13 | use arrow::datatypes::Int64Type;
 14 | use arrow::datatypes::UInt64Type;
 15 | use arrow::record_batch::RecordBatch;
 16 | 
 17 | use super::AggregateOperator;
 18 | use crate::error::ErrorCode;
 19 | use crate::logical_plan::expression::ScalarValue;
 20 | use crate::logical_plan::schema::NaiveField;
 21 | use crate::logical_plan::schema::NaiveSchema;
 22 | use crate::physical_plan::ColumnExpr;
 23 | use crate::physical_plan::PhysicalExpr;
 24 | use crate::Result;
 25 | 
 26 | #[derive(Debug, Clone)]
 27 | pub struct Sum {
 28 |     // TODO(veeupup): should use generic type for Int64, UInt Float64
 29 |     sum: f64,
 30 |     // physical column
 31 |     col_expr: ColumnExpr,
 32 | }
 33 | 
 34 | impl Sum {
 35 |     pub fn create(col_expr: ColumnExpr) -> Box<dyn AggregateOperator> {
 36 |         Box::new(Self { sum: 0.0, col_expr })
 37 |     }
 38 | }
 39 | 
 40 | macro_rules! update_match {
 41 |     ($COL: expr, $DT: ty, $SELF: expr) => {{
 42 |         let col = $COL.as_any().downcast_ref::<PrimitiveArray<$DT>>().unwrap();
 43 |         for val in col.into_iter().flatten() {
 44 |             $SELF.sum += val as f64;
 45 |         }
 46 |     }};
 47 | }
 48 | 
 49 | macro_rules! update_value {
 50 |     ($COL: expr, $DT: ty, $IDX: expr, $SELF: expr) => {{
 51 |         let col = $COL.as_any().downcast_ref::<PrimitiveArray<$DT>>().unwrap();
 52 |         if !col.is_null($IDX) {
 53 |             $SELF.sum += col.value($IDX) as f64;
 54 |         }
 55 |     }};
 56 | }
 57 | 
 58 | impl AggregateOperator for Sum {
 59 |     fn data_field(&self, schema: &NaiveSchema) -> Result<NaiveField> {
 60 |         // find by name
 61 |         if let Some(name) = &self.col_expr.name {
 62 |             let field = schema.field_with_unqualified_name(name)?;
 63 |             return Ok(NaiveField::new(
 64 |                 None,
 65 |                 format!("sum({})", field.name()).as_str(),
 66 |                 DataType::Float64,
 67 |                 false,
 68 |             ));
 69 |         }
 70 | 
 71 |         if let Some(idx) = &self.col_expr.idx {
 72 |             let field = schema.field(*idx);
 73 |             return Ok(NaiveField::new(
 74 |                 None,
 75 |                 format!("sum({})", field.name()).as_str(),
 76 |                 DataType::Float64,
 77 |                 false,
 78 |             ));
 79 |         }
 80 | 
 81 |         Err(ErrorCode::LogicalError(
 82 |             "ColumnExpr must has name or idx".to_string(),
 83 |         ))
 84 |     }
 85 | 
 86 |     fn update_batch(&mut self, data: &RecordBatch) -> Result<()> {
 87 |         let col = self.col_expr.evaluate(data)?.into_array();
 88 |         match col.data_type() {
 89 |             DataType::Int64 => update_match!(col, Int64Type, self),
 90 |             DataType::UInt64 => update_match!(col, UInt64Type, self),
 91 |             DataType::Float64 => update_match!(col, Float64Type, self),
 92 |             _ => {
 93 |                 return Err(ErrorCode::NotSupported(format!(
 94 |                     "Sum func for {:?} is not supported",
 95 |                     col.data_type()
 96 |                 )))
 97 |             }
 98 |         }
 99 | 
100 |         Ok(())
101 |     }
102 | 
103 |     fn update(&mut self, data: &RecordBatch, idx: usize) -> Result<()> {
104 |         let col = self.col_expr.evaluate(data)?.into_array();
105 |         match col.data_type() {
106 |             DataType::Int64 => update_value!(col, Int64Type, idx, self),
107 |             DataType::UInt64 => update_value!(col, UInt64Type, idx, self),
108 |             DataType::Float64 => update_value!(col, Float64Type, idx, self),
109 |             _ => unimplemented!(),
110 |         }
111 |         Ok(())
112 |     }
113 | 
114 |     fn evaluate(&self) -> Result<ScalarValue> {
115 |         Ok(ScalarValue::Float64(Some(self.sum)))
116 |     }
117 | 
118 |     fn clear_state(&mut self) {
119 |         self.sum = 0.0;
120 |     }
121 | }
122 | 


--------------------------------------------------------------------------------
/src/physical_plan/cross_join.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * @Author: ywq
  3 |  * @Date: 2022-05-24
  4 |  */
  5 | use super::PhysicalPlan;
  6 | use super::PhysicalPlanRef;
  7 | use crate::logical_plan::plan::JoinType;
  8 | use crate::logical_plan::schema::NaiveSchema;
  9 | 
 10 | use crate::Result;
 11 | use arrow::array::Array;
 12 | use arrow::array::Float64Array;
 13 | use arrow::array::Int64Array;
 14 | use arrow::array::PrimitiveArray;
 15 | use arrow::array::StringArray;
 16 | use arrow::array::UInt64Array;
 17 | use arrow::datatypes::DataType;
 18 | use arrow::datatypes::Float64Type;
 19 | use arrow::datatypes::Int64Type;
 20 | use arrow::datatypes::SchemaRef;
 21 | use arrow::datatypes::UInt64Type;
 22 | use arrow::record_batch::RecordBatch;
 23 | use std::sync::Arc;
 24 | 
 25 | #[derive(Debug)]
 26 | pub struct CrossJoin {
 27 |     left: PhysicalPlanRef,
 28 |     right: PhysicalPlanRef,
 29 |     #[allow(unused)]
 30 |     join_type: JoinType,
 31 |     schema: NaiveSchema,
 32 | }
 33 | 
 34 | impl CrossJoin {
 35 |     #[allow(unused)]
 36 |     pub fn create(
 37 |         left: PhysicalPlanRef,
 38 |         right: PhysicalPlanRef,
 39 |         join_type: JoinType,
 40 |         schema: NaiveSchema,
 41 |     ) -> PhysicalPlanRef {
 42 |         Arc::new(Self {
 43 |             left,
 44 |             right,
 45 |             join_type,
 46 |             schema,
 47 |         })
 48 |     }
 49 | }
 50 | 
 51 | impl PhysicalPlan for CrossJoin {
 52 |     fn schema(&self) -> &NaiveSchema {
 53 |         &self.schema
 54 |     }
 55 | 
 56 |     fn execute(&self) -> Result<Vec<RecordBatch>> {
 57 |         // TODO(ywq)
 58 |         let outer_table = self.left.execute()?;
 59 |         let inner_table = self.right.execute()?;
 60 | 
 61 |         let mut batches: Vec<RecordBatch> = vec![];
 62 | 
 63 |         for outer in &outer_table {
 64 |             for inner in &inner_table {
 65 |                 let mut columns: Vec<Arc<dyn Array>> = vec![];
 66 |                 let left_rows = outer.num_rows();
 67 |                 let right_rows = inner.num_rows();
 68 |                 for i in 0..self.left.schema().fields().len() {
 69 |                     let array = outer.column(i);
 70 |                     let dt = self.left.schema().field(i).data_type();
 71 |                     match dt {
 72 |                         // TODO(ywq reafctor with macro)
 73 |                         DataType::Int64 => {
 74 |                             let mut t_vec = vec![];
 75 |                             let left_col = array
 76 |                                 .as_any()
 77 |                                 .downcast_ref::<PrimitiveArray<Int64Type>>()
 78 |                                 .unwrap();
 79 |                             for _ in 0..right_rows {
 80 |                                 for k in 0..left_col.len() {
 81 |                                     t_vec.push(left_col.value(k))
 82 |                                 }
 83 |                             }
 84 |                             columns.push(Arc::new(Int64Array::from(t_vec)));
 85 |                         }
 86 |                         DataType::UInt64 => {
 87 |                             let mut t_vec = vec![];
 88 |                             let left_col = array
 89 |                                 .as_any()
 90 |                                 .downcast_ref::<PrimitiveArray<UInt64Type>>()
 91 |                                 .unwrap();
 92 |                             for _ in 0..right_rows {
 93 |                                 for k in 0..left_col.len() {
 94 |                                     t_vec.push(left_col.value(k))
 95 |                                 }
 96 |                             }
 97 |                             columns.push(Arc::new(UInt64Array::from(t_vec)));
 98 |                         }
 99 |                         DataType::Float64 => {
100 |                             let mut t_vec = vec![];
101 |                             let left_col = array
102 |                                 .as_any()
103 |                                 .downcast_ref::<PrimitiveArray<Float64Type>>()
104 |                                 .unwrap();
105 |                             for _ in 0..right_rows {
106 |                                 for k in 0..left_col.len() {
107 |                                     t_vec.push(left_col.value(k))
108 |                                 }
109 |                             }
110 |                             columns.push(Arc::new(Float64Array::from(t_vec)));
111 |                         }
112 |                         DataType::Utf8 => {
113 |                             let mut t_vec = vec![];
114 |                             let left_col = array.as_any().downcast_ref::<StringArray>().unwrap();
115 |                             for _ in 0..right_rows {
116 |                                 for k in 0..left_col.len() {
117 |                                     t_vec.push(left_col.value(k))
118 |                                 }
119 |                             }
120 |                             columns.push(Arc::new(StringArray::from(t_vec)));
121 |                         }
122 |                         _ => unimplemented!(),
123 |                     }
124 |                 }
125 |                 for i in 0..self.right.schema().fields().len() {
126 |                     let array = inner.column(i);
127 |                     let dt = self.right.schema().field(i).data_type();
128 |                     match dt {
129 |                         DataType::Int64 => {
130 |                             let mut t_vec = vec![];
131 |                             let left_col = array
132 |                                 .as_any()
133 |                                 .downcast_ref::<PrimitiveArray<Int64Type>>()
134 |                                 .unwrap();
135 |                             for _ in 0..left_rows {
136 |                                 for k in 0..left_col.len() {
137 |                                     t_vec.push(left_col.value(k))
138 |                                 }
139 |                             }
140 |                             columns.push(Arc::new(Int64Array::from(t_vec)));
141 |                         }
142 |                         DataType::UInt64 => {
143 |                             let mut t_vec = vec![];
144 |                             let left_col = array
145 |                                 .as_any()
146 |                                 .downcast_ref::<PrimitiveArray<UInt64Type>>()
147 |                                 .unwrap();
148 |                             for _ in 0..left_rows {
149 |                                 for k in 0..left_col.len() {
150 |                                     t_vec.push(left_col.value(k))
151 |                                 }
152 |                             }
153 |                             columns.push(Arc::new(UInt64Array::from(t_vec)));
154 |                         }
155 |                         DataType::Float64 => {
156 |                             let mut t_vec = vec![];
157 |                             let left_col = array
158 |                                 .as_any()
159 |                                 .downcast_ref::<PrimitiveArray<Float64Type>>()
160 |                                 .unwrap();
161 |                             for _ in 0..left_rows {
162 |                                 for k in 0..left_col.len() {
163 |                                     t_vec.push(left_col.value(k))
164 |                                 }
165 |                             }
166 |                             columns.push(Arc::new(Float64Array::from(t_vec)));
167 |                         }
168 |                         DataType::Utf8 => {
169 |                             let mut t_vec = vec![];
170 |                             let left_col = array.as_any().downcast_ref::<StringArray>().unwrap();
171 |                             for _ in 0..left_rows {
172 |                                 for k in 0..left_col.len() {
173 |                                     t_vec.push(left_col.value(k))
174 |                                 }
175 |                             }
176 |                             columns.push(Arc::new(StringArray::from(t_vec)));
177 |                         }
178 |                         _ => unimplemented!(),
179 |                     }
180 |                 }
181 |                 // new batch
182 |                 let batch = RecordBatch::try_new(SchemaRef::from(self.schema.clone()), columns)?;
183 |                 batches.push(batch);
184 |             }
185 |         }
186 |         Ok(batches)
187 |     }
188 | 
189 |     fn children(&self) -> Result<Vec<PhysicalPlanRef>> {
190 |         Ok(vec![self.left.clone(), self.right.clone()])
191 |     }
192 | }
193 | 


--------------------------------------------------------------------------------
/src/physical_plan/expression/binary.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * @Author: Veeupup
  3 |  * @Date: 2022-05-14 20:26:01
  4 |  * @Email: code@tanweime.com
  5 | */
  6 | use arrow::{
  7 |     array::{BooleanArray, PrimitiveArray},
  8 |     compute::{
  9 |         and_kleene, eq_dyn, gt_dyn, gt_eq_dyn,
 10 |         kernels::arithmetic::{add, divide, modulus, multiply, subtract},
 11 |         lt_dyn, lt_eq_dyn, neq_dyn, or_kleene,
 12 |     },
 13 |     datatypes::{DataType, Float64Type, Int64Type, UInt64Type},
 14 |     record_batch::RecordBatch,
 15 | };
 16 | use std::any::Any;
 17 | use std::sync::Arc;
 18 | 
 19 | use super::{PhysicalExpr, PhysicalExprRef};
 20 | use crate::{datatype::ColumnValue, error::ErrorCode, logical_plan::expression::Operator};
 21 | 
 22 | macro_rules! compare_bin {
 23 |     ($OP:expr, $LEFT: expr, $RIGHT: expr) => {
 24 |         $OP($LEFT, $RIGHT)
 25 |             .map_err(|e| e.into())
 26 |             .map(|a| ColumnValue::Array(Arc::new(a)))
 27 |     };
 28 | }
 29 | 
 30 | macro_rules! binary_op {
 31 |     ($OP:expr, $LEFT_DT: expr, $RIGHT_DT: expr, $LEFT: expr, $RIGHT: expr, $SELF_OP: expr) => {{
 32 |         if $LEFT_DT == DataType::Boolean && $RIGHT_DT == DataType::Boolean {
 33 |             let left = $LEFT.as_any().downcast_ref::<BooleanArray>().unwrap();
 34 |             let right = $RIGHT.as_any().downcast_ref::<BooleanArray>().unwrap();
 35 |             let ret = $OP(left, right)?;
 36 |             Ok(ColumnValue::Array(Arc::new(ret)))
 37 |         } else {
 38 |             Err(ErrorCode::IntervalError(format!(
 39 |                 "Cannot evaluate binary expression {:?} with types {:?} and {:?}",
 40 |                 $SELF_OP, $LEFT_DT, $RIGHT_DT
 41 |             )))
 42 |         }
 43 |     }};
 44 | }
 45 | 
 46 | macro_rules! arithemic_op {
 47 |     ($OP:expr, $LEFT_DT: expr, $LEFT: expr, $RIGHT: expr) => {{
 48 |         match $LEFT_DT {
 49 |             DataType::Int64 => {
 50 |                 let left = $LEFT
 51 |                     .as_any()
 52 |                     .downcast_ref::<PrimitiveArray<Int64Type>>()
 53 |                     .unwrap();
 54 |                 let right = $RIGHT
 55 |                     .as_any()
 56 |                     .downcast_ref::<PrimitiveArray<Int64Type>>()
 57 |                     .unwrap();
 58 |                 let x = $OP(left, right)?;
 59 |                 Ok(ColumnValue::Array(Arc::new(x)))
 60 |             }
 61 |             DataType::UInt64 => {
 62 |                 let left = $LEFT
 63 |                     .as_any()
 64 |                     .downcast_ref::<PrimitiveArray<UInt64Type>>()
 65 |                     .unwrap();
 66 |                 let right = $RIGHT
 67 |                     .as_any()
 68 |                     .downcast_ref::<PrimitiveArray<UInt64Type>>()
 69 |                     .unwrap();
 70 |                 let x = $OP(left, right)?;
 71 |                 Ok(ColumnValue::Array(Arc::new(x)))
 72 |             }
 73 |             DataType::Float64 => {
 74 |                 let left = $LEFT
 75 |                     .as_any()
 76 |                     .downcast_ref::<PrimitiveArray<Float64Type>>()
 77 |                     .unwrap();
 78 |                 let right = $RIGHT
 79 |                     .as_any()
 80 |                     .downcast_ref::<PrimitiveArray<Float64Type>>()
 81 |                     .unwrap();
 82 |                 let x = $OP(left, right)?;
 83 |                 Ok(ColumnValue::Array(Arc::new(x)))
 84 |             }
 85 |             _ => unimplemented!(),
 86 |         }
 87 |     }};
 88 | }
 89 | 
 90 | #[derive(Debug)]
 91 | pub struct PhysicalBinaryExpr {
 92 |     left: PhysicalExprRef,
 93 |     op: Operator,
 94 |     right: PhysicalExprRef,
 95 | }
 96 | 
 97 | impl PhysicalBinaryExpr {
 98 |     pub fn create(left: PhysicalExprRef, op: Operator, right: PhysicalExprRef) -> PhysicalExprRef {
 99 |         Arc::new(Self { left, op, right })
100 |     }
101 | }
102 | 
103 | impl PhysicalExpr for PhysicalBinaryExpr {
104 |     fn as_any(&self) -> &dyn Any {
105 |         self
106 |     }
107 | 
108 |     fn evaluate(&self, input: &RecordBatch) -> crate::Result<ColumnValue> {
109 |         let left_value = self.left.evaluate(input)?;
110 |         let right_value = self.right.evaluate(input)?;
111 | 
112 |         let left_data_type = left_value.data_type();
113 |         let right_data_type = right_value.data_type();
114 |         if left_value.data_type() != right_value.data_type() {
115 |             return Err(ErrorCode::IntervalError(format!(
116 |                 "Cannot evaluate binary expression {:?} with types {:?} and {:?}",
117 |                 self.op, left_data_type, right_data_type
118 |             )));
119 |         }
120 | 
121 |         // TODO(veeupup): speed up if left_value or right_value is scalar
122 | 
123 |         let left_array = left_value.into_array();
124 |         let right_array = right_value.into_array();
125 | 
126 |         match self.op {
127 |             Operator::Eq => compare_bin!(eq_dyn, &left_array, &right_array),
128 |             Operator::NotEq => compare_bin!(neq_dyn, &left_array, &right_array),
129 |             Operator::Lt => compare_bin!(lt_dyn, &left_array, &right_array),
130 |             Operator::LtEq => compare_bin!(lt_eq_dyn, &left_array, &right_array),
131 |             Operator::Gt => compare_bin!(gt_dyn, &left_array, &right_array),
132 |             Operator::GtEq => compare_bin!(gt_eq_dyn, &left_array, &right_array),
133 |             Operator::And => binary_op!(
134 |                 and_kleene,
135 |                 left_data_type,
136 |                 right_data_type,
137 |                 left_array,
138 |                 right_array,
139 |                 Operator::And
140 |             ),
141 |             Operator::Or => binary_op!(
142 |                 or_kleene,
143 |                 left_data_type,
144 |                 right_data_type,
145 |                 left_array,
146 |                 right_array,
147 |                 Operator::Or
148 |             ),
149 |             Operator::Plus => arithemic_op!(add, left_data_type, left_array, right_array),
150 |             Operator::Minus => arithemic_op!(subtract, left_data_type, left_array, right_array),
151 |             Operator::Multiply => arithemic_op!(multiply, left_data_type, left_array, right_array),
152 |             Operator::Divide => arithemic_op!(divide, left_data_type, left_array, right_array),
153 |             Operator::Modulos => arithemic_op!(modulus, left_data_type, left_array, right_array),
154 |         }
155 |     }
156 | }
157 | 


--------------------------------------------------------------------------------
/src/physical_plan/expression/cast.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: ywqzzy
 3 |  * @Date: 2022-05-20
 4 | */
 5 | use arrow::{datatypes::DataType, record_batch::RecordBatch};
 6 | use core::fmt;
 7 | use std::any::Any;
 8 | use std::{
 9 |     fmt::{Debug, Formatter},
10 |     sync::Arc,
11 | };
12 | 
13 | use super::{PhysicalExpr, PhysicalExprRef};
14 | use crate::datatype::ColumnValue;
15 | 
16 | pub struct PhysicalCastExpr {
17 |     #[allow(unused)]
18 |     expr: PhysicalExprRef,
19 |     data_type: DataType,
20 | }
21 | 
22 | impl Debug for PhysicalCastExpr {
23 |     fn fmt(&self, f: &mut Formatter) -> fmt::Result {
24 |         f.debug_struct("CastExpr")
25 |             .field("name", &"CAST")
26 |             .field("return_type", &self.data_type)
27 |             .finish()
28 |     }
29 | }
30 | 
31 | impl PhysicalCastExpr {
32 |     pub fn create(expr: PhysicalExprRef, data_type: &DataType) -> PhysicalExprRef {
33 |         Arc::new(Self {
34 |             expr,
35 |             data_type: data_type.clone(),
36 |         })
37 |     }
38 | }
39 | 
40 | impl PhysicalExpr for PhysicalCastExpr {
41 |     fn as_any(&self) -> &dyn Any {
42 |         self
43 |     }
44 | 
45 |     fn evaluate(&self, _input: &RecordBatch) -> crate::Result<ColumnValue> {
46 |         // let value = self.expr.evaluate(input)?;
47 | 
48 |         // let from_data_type = value.data_type();
49 |         // let value_array = value.into_array();
50 |         // let field_array_builder = arrow::array::make_builder(&self.data_type, input.num_rows());
51 | 
52 |         match self.data_type {
53 |             DataType::Null => todo!(),
54 |             DataType::Boolean => todo!(),
55 |             DataType::Int8 => todo!(),
56 |             DataType::Int16 => todo!(),
57 |             DataType::Int32 => todo!(),
58 |             DataType::Int64 => todo!(),
59 |             DataType::UInt8 => todo!(),
60 |             DataType::UInt16 => todo!(),
61 |             DataType::UInt32 => todo!(),
62 |             DataType::UInt64 => todo!(),
63 |             DataType::Float16 => todo!(),
64 |             DataType::Float32 => todo!(),
65 |             DataType::Float64 => todo!(),
66 |             DataType::Timestamp(_, _) => todo!(),
67 |             DataType::Date32 => todo!(),
68 |             DataType::Date64 => todo!(),
69 |             DataType::Time32(_) => todo!(),
70 |             DataType::Time64(_) => todo!(),
71 |             DataType::Duration(_) => todo!(),
72 |             DataType::Interval(_) => todo!(),
73 |             DataType::Binary => todo!(),
74 |             DataType::FixedSizeBinary(_) => todo!(),
75 |             DataType::LargeBinary => todo!(),
76 |             DataType::Utf8 => todo!(),
77 |             DataType::LargeUtf8 => todo!(),
78 |             DataType::List(_) => todo!(),
79 |             DataType::FixedSizeList(_, _) => todo!(),
80 |             DataType::LargeList(_) => todo!(),
81 |             DataType::Struct(_) => todo!(),
82 |             DataType::Union(_, _) => todo!(),
83 |             DataType::Dictionary(_, _) => todo!(),
84 |             DataType::Decimal(_, _) => todo!(),
85 |             DataType::Map(_, _) => todo!(),
86 |         }
87 |     }
88 | }
89 | 


--------------------------------------------------------------------------------
/src/physical_plan/expression/column.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-13 14:56:36
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | 
 7 | use std::any::Any;
 8 | use std::sync::Arc;
 9 | 
10 | use arrow::record_batch::RecordBatch;
11 | 
12 | use super::PhysicalExpr;
13 | use crate::datatype::ColumnValue;
14 | use crate::error::{ErrorCode, Result};
15 | use crate::physical_plan::PhysicalExprRef;
16 | 
17 | #[derive(Debug, Clone)]
18 | pub struct ColumnExpr {
19 |     pub name: Option<String>,
20 |     pub idx: Option<usize>,
21 | }
22 | 
23 | impl ColumnExpr {
24 |     pub fn try_create(name: Option<String>, idx: Option<usize>) -> Result<PhysicalExprRef> {
25 |         if name.is_none() && idx.is_none() {
26 |             return Err(ErrorCode::LogicalError(
27 |                 "ColumnExpr must has name or idx".to_string(),
28 |             ));
29 |         }
30 |         Ok(Arc::new(Self { name, idx }))
31 |     }
32 | }
33 | 
34 | impl PhysicalExpr for ColumnExpr {
35 |     fn as_any(&self) -> &dyn Any {
36 |         self
37 |     }
38 | 
39 |     fn evaluate(&self, input: &RecordBatch) -> Result<ColumnValue> {
40 |         // prefer idx first
41 |         if let Some(idx) = self.idx {
42 |             let column = input.column(idx).clone();
43 |             return Ok(ColumnValue::Array(column));
44 |         }
45 |         // then name
46 |         if let Some(name) = &self.name {
47 |             for (idx, field) in input.schema().fields().iter().enumerate() {
48 |                 if field.name() == name {
49 |                     let column = input.column(idx).clone();
50 |                     return Ok(ColumnValue::Array(column));
51 |                 }
52 |             }
53 |         }
54 |         Err(ErrorCode::LogicalError(
55 |             "ColumnExpr must has name or idx".to_string(),
56 |         ))
57 |     }
58 | }
59 | 


--------------------------------------------------------------------------------
/src/physical_plan/expression/literal.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-14 21:30:10
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | 
 7 | use crate::logical_plan::expression::ScalarValue;
 8 | use std::any::Any;
 9 | use std::sync::Arc;
10 | 
11 | use super::{PhysicalExpr, PhysicalExprRef};
12 | use crate::datatype::ColumnValue;
13 | use crate::Result;
14 | use arrow::record_batch::RecordBatch;
15 | 
16 | #[derive(Debug)]
17 | pub struct PhysicalLiteralExpr {
18 |     pub literal: ScalarValue,
19 | }
20 | 
21 | impl PhysicalLiteralExpr {
22 |     pub fn create(literal: ScalarValue) -> PhysicalExprRef {
23 |         Arc::new(Self { literal })
24 |     }
25 | }
26 | 
27 | impl PhysicalExpr for PhysicalLiteralExpr {
28 |     fn as_any(&self) -> &dyn Any {
29 |         self
30 |     }
31 | 
32 |     fn evaluate(&self, input: &RecordBatch) -> Result<ColumnValue> {
33 |         Ok(ColumnValue::Const(self.literal.clone(), input.num_rows()))
34 |     }
35 | }
36 | 


--------------------------------------------------------------------------------
/src/physical_plan/expression/mod.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-13 14:26:45
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | 
 7 | mod binary;
 8 | mod cast;
 9 | mod column;
10 | mod literal;
11 | mod unary;
12 | 
13 | pub use binary::PhysicalBinaryExpr;
14 | pub use cast::PhysicalCastExpr;
15 | pub use column::ColumnExpr;
16 | pub use literal::PhysicalLiteralExpr;
17 | pub use unary::PhysicalUnaryExpr;
18 | 
19 | use crate::{datatype::ColumnValue, error::Result};
20 | use arrow::record_batch::RecordBatch;
21 | use std::any::Any;
22 | use std::fmt::Debug;
23 | use std::sync::Arc;
24 | 
25 | pub trait PhysicalExpr: Debug {
26 |     fn as_any(&self) -> &dyn Any;
27 | 
28 |     fn evaluate(&self, input: &RecordBatch) -> Result<ColumnValue>;
29 | }
30 | 
31 | pub type PhysicalExprRef = Arc<dyn PhysicalExpr>;
32 | 


--------------------------------------------------------------------------------
/src/physical_plan/expression/unary.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * @Author: ywqzzy
  3 |  * @Date: 2022-05-19
  4 | */
  5 | use arrow::{
  6 |     array::PrimitiveArray,
  7 |     datatypes::{DataType, Float32Type, Float64Type},
  8 |     record_batch::RecordBatch,
  9 | };
 10 | use core::fmt;
 11 | use std::any::Any;
 12 | use std::{
 13 |     fmt::{Debug, Formatter},
 14 |     sync::Arc,
 15 | };
 16 | 
 17 | use super::{PhysicalExpr, PhysicalExprRef};
 18 | use crate::{datatype::ColumnValue, logical_plan::expression::UnaryOperator};
 19 | 
 20 | macro_rules! unary_arith_op {
 21 |     ($OP:ident, $DT: expr, $COL: expr) => {{
 22 |         match $DT {
 23 |             DataType::Float64 => {
 24 |                 let value = $COL
 25 |                     .as_any()
 26 |                     .downcast_ref::<PrimitiveArray<Float64Type>>()
 27 |                     .unwrap();
 28 |                 let res: PrimitiveArray<Float64Type> =
 29 |                     arrow::compute::kernels::arity::unary(value, |x| x.$OP());
 30 |                 Ok(ColumnValue::Array(Arc::new(res)))
 31 |             }
 32 |             DataType::Float32 => {
 33 |                 let value = $COL
 34 |                     .as_any()
 35 |                     .downcast_ref::<PrimitiveArray<Float32Type>>()
 36 |                     .unwrap();
 37 |                 let res: PrimitiveArray<Float32Type> =
 38 |                     arrow::compute::kernels::arity::unary(value, |x| x.$OP());
 39 |                 Ok(ColumnValue::Array(Arc::new(res)))
 40 |             }
 41 |             _ => unimplemented!(),
 42 |         }
 43 |     }};
 44 | }
 45 | 
 46 | pub struct PhysicalUnaryExpr {
 47 |     expr: PhysicalExprRef,
 48 |     func: UnaryOperator,
 49 |     name: String,
 50 |     return_type: DataType,
 51 | }
 52 | 
 53 | impl Debug for PhysicalUnaryExpr {
 54 |     fn fmt(&self, f: &mut Formatter) -> fmt::Result {
 55 |         f.debug_struct("UnaryExpr")
 56 |             .field("func", &"<FUNC>")
 57 |             .field("name", &self.name)
 58 |             .field("expr", &self.expr)
 59 |             .field("return_type", &self.return_type)
 60 |             .finish()
 61 |     }
 62 | }
 63 | 
 64 | impl PhysicalUnaryExpr {
 65 |     pub fn create(
 66 |         expr: PhysicalExprRef,
 67 |         func: UnaryOperator,
 68 |         name: String,
 69 |         return_type: &DataType,
 70 |     ) -> PhysicalExprRef {
 71 |         Arc::new(Self {
 72 |             expr,
 73 |             func,
 74 |             name,
 75 |             return_type: return_type.clone(),
 76 |         })
 77 |     }
 78 | }
 79 | 
 80 | impl PhysicalExpr for PhysicalUnaryExpr {
 81 |     fn as_any(&self) -> &dyn Any {
 82 |         self
 83 |     }
 84 | 
 85 |     fn evaluate(&self, input: &RecordBatch) -> crate::Result<ColumnValue> {
 86 |         let value = self.expr.evaluate(input)?;
 87 | 
 88 |         let data_type = value.data_type();
 89 | 
 90 |         let value_array = value.into_array();
 91 | 
 92 |         match self.func {
 93 |             UnaryOperator::Abs => unary_arith_op!(abs, data_type, value_array),
 94 |             UnaryOperator::Sin => unary_arith_op!(sin, data_type, value_array),
 95 |             UnaryOperator::Cos => unary_arith_op!(cos, data_type, value_array),
 96 |             UnaryOperator::Tan => unary_arith_op!(cos, data_type, value_array),
 97 |             UnaryOperator::Trim => todo!(),
 98 |             UnaryOperator::LTrim => todo!(),
 99 |             UnaryOperator::RTrim => todo!(),
100 |             UnaryOperator::CharacterLength => todo!(),
101 |             UnaryOperator::Lower => todo!(),
102 |             UnaryOperator::Upper => todo!(),
103 |             UnaryOperator::Repeat => todo!(),
104 |             UnaryOperator::Replace => todo!(),
105 |             UnaryOperator::Reverse => todo!(),
106 |             UnaryOperator::Substr => todo!(),
107 |         }
108 |     }
109 | }
110 | 
111 | #[cfg(test)]
112 | mod tests {
113 |     use super::*;
114 |     use crate::datasource::{CsvConfig, CsvTable};
115 |     use crate::error::Result;
116 |     use crate::logical_plan::expression::UnaryOperator;
117 |     use crate::physical_plan::expression::ColumnExpr;
118 |     use crate::physical_plan::PhysicalUnaryExpr;
119 |     use arrow::array::{ArrayRef, Float64Array};
120 |     use arrow::datatypes::DataType;
121 | 
122 |     #[test]
123 |     fn test_abs_expression() -> Result<()> {
124 |         let table = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?;
125 |         let abs_expr = PhysicalUnaryExpr::create(
126 |             ColumnExpr::try_create(Some("score".to_string()), None)?,
127 |             UnaryOperator::Abs,
128 |             "abs".to_string(),
129 |             &DataType::Float64,
130 |         );
131 |         let batches = table.scan(Some(vec![3]))?;
132 |         let res = abs_expr.evaluate(&batches[0])?.into_array();
133 |         let score_expected: ArrayRef = Arc::new(Float64Array::from(vec![
134 |             Some(60.0),
135 |             Some(90.1),
136 |             Some(99.99),
137 |             Some(81.1),
138 |             Some(82.2),
139 |             Some(83.3),
140 |             Some(84.4),
141 |             Some(85.5),
142 |         ]));
143 |         assert_eq!(&res, &score_expected);
144 |         Ok(())
145 |     }
146 | 
147 |     #[test]
148 |     fn test_sin_expression() -> Result<()> {
149 |         let table = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?;
150 |         let abs_expr = PhysicalUnaryExpr::create(
151 |             ColumnExpr::try_create(Some("score".to_string()), None)?,
152 |             UnaryOperator::Sin,
153 |             "sin".to_string(),
154 |             &DataType::Float64,
155 |         );
156 |         let batches = table.scan(Some(vec![3]))?;
157 |         let res = abs_expr.evaluate(&batches[0])?.into_array();
158 |         let score_expected: ArrayRef = Arc::new(Float64Array::from(vec![
159 |             Some(-0.3048106211022167),
160 |             Some(0.8447976840197418),
161 |             Some(-0.5149633680424761),
162 |             Some(-0.5492019627147913),
163 |             Some(0.49565689358989423),
164 |             Some(0.9988580516952367),
165 |             Some(0.4104993826174394),
166 |             Some(-0.6264561960895026),
167 |         ]));
168 |         assert_eq!(&res, &score_expected);
169 |         Ok(())
170 |     }
171 | }
172 | 


--------------------------------------------------------------------------------
/src/physical_plan/hash_join.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * @Author: Veeupup
  3 |  * @Date: 2022-05-19 14:17:29
  4 |  * @Last Modified by: Veeupup
  5 |  * @Last Modified time: 2022-05-19 17:28:50
  6 | */
  7 | 
  8 | use arrow::array::ArrayRef;
  9 | use arrow::array::Int64Builder;
 10 | use arrow::array::PrimitiveArray;
 11 | use arrow::array::StringArray;
 12 | use arrow::compute;
 13 | use arrow::compute::concat;
 14 | use arrow::datatypes::DataType;
 15 | 
 16 | use arrow::datatypes::Int64Type;
 17 | use arrow::datatypes::SchemaRef;
 18 | use arrow::datatypes::UInt64Type;
 19 | use arrow::record_batch::RecordBatch;
 20 | 
 21 | use twox_hash::XxHash64;
 22 | 
 23 | use super::PhysicalPlan;
 24 | use super::PhysicalPlanRef;
 25 | use crate::error::ErrorCode;
 26 | use crate::logical_plan::expression::Column;
 27 | use crate::logical_plan::plan::JoinType;
 28 | use crate::logical_plan::schema::NaiveSchema;
 29 | use crate::physical_plan::ColumnExpr;
 30 | 
 31 | use crate::Result;
 32 | use std::collections::HashMap;
 33 | 
 34 | use std::hash::Hasher;
 35 | 
 36 | use std::sync::Arc;
 37 | use std::sync::Mutex;
 38 | 
 39 | /// HashJoin has two phase for join
 40 | /// 1. build phase will build HashMap about outer table using on column as hashval
 41 | ///     hashmap: col hash val -> vec<row id>
 42 | /// 2. probe phase will probe all inner table by using on col to check
 43 | #[derive(Debug)]
 44 | pub struct HashJoin {
 45 |     left: PhysicalPlanRef,
 46 |     right: PhysicalPlanRef,
 47 |     on: Vec<(Column, Column)>,
 48 |     #[allow(unused)]
 49 |     join_type: JoinType,
 50 |     schema: NaiveSchema,
 51 |     /// on col hash val and row id
 52 |     /// chain hash table
 53 |     hashtable: Mutex<HashMap<u64, Vec<usize>>>,
 54 |     /// data, combine all data in one record batch
 55 |     data: Mutex<Option<RecordBatch>>,
 56 | }
 57 | 
 58 | macro_rules! build_match {
 59 |     ($LEFT_COL: expr, $TYPE: ty, $SINGLE_BATCH: expr, $HASHTABLE: expr, $WRITE_DT: ident) => {{
 60 |         let left_col = $LEFT_COL
 61 |             .as_any()
 62 |             .downcast_ref::<PrimitiveArray<$TYPE>>()
 63 |             .unwrap();
 64 | 
 65 |         // build hashmap
 66 |         for i in 0..$SINGLE_BATCH.num_rows() {
 67 |             let left_val = left_col.value(i);
 68 |             let mut hasher = XxHash64::default();
 69 |             hasher.$WRITE_DT(left_val);
 70 |             let hash_val = hasher.finish();
 71 |             if let Some(vec) = $HASHTABLE.get_mut(&hash_val) {
 72 |                 vec.push(i);
 73 |             } else {
 74 |                 $HASHTABLE.insert(hash_val, vec![i]);
 75 |             }
 76 |         }
 77 |     }};
 78 | }
 79 | 
 80 | macro_rules! probe_match {
 81 |     ($RIGHT_COL: expr, $LEFT_COL: expr, $TYPE: ty, $RIGHT_BATCH: expr, $HASHTABLE: expr, $OUTER_POS: expr, $INNER_POS: expr, $WRITE_DT: ident) => {{
 82 |         let right_col = $RIGHT_COL.as_any().downcast_ref::<$TYPE>().unwrap();
 83 |         let left_col = $LEFT_COL.as_any().downcast_ref::<$TYPE>().unwrap();
 84 | 
 85 |         // probe
 86 |         for i in 0..$RIGHT_BATCH.num_rows() {
 87 |             let right_val = right_col.value(i);
 88 |             let mut hasher = XxHash64::default();
 89 |             hasher.$WRITE_DT(right_val);
 90 |             let hash_val = hasher.finish();
 91 | 
 92 |             if let Some(left_pos) = $HASHTABLE.get(&hash_val) {
 93 |                 for idx in left_pos {
 94 |                     // hash val same, but we need to check whether real value equal or not
 95 |                     if left_col.value(*idx) == right_col.value(i) {
 96 |                         $OUTER_POS.append_value(*idx as i64)?;
 97 |                         $INNER_POS.append_value(i as i64)?;
 98 |                     }
 99 |                 }
100 |             }
101 |         }
102 |     }};
103 | }
104 | 
105 | impl HashJoin {
106 |     pub fn create(
107 |         left: PhysicalPlanRef,
108 |         right: PhysicalPlanRef,
109 |         on: Vec<(Column, Column)>,
110 |         join_type: JoinType,
111 |         schema: NaiveSchema,
112 |     ) -> PhysicalPlanRef {
113 |         Arc::new(Self {
114 |             left,
115 |             right,
116 |             on,
117 |             join_type,
118 |             schema,
119 |             hashtable: Mutex::new(HashMap::new()),
120 |             data: Mutex::new(None),
121 |         })
122 |     }
123 | 
124 |     pub fn build(&self) -> Result<Vec<ArrayRef>> {
125 |         if self.on.is_empty() {
126 |             return Err(ErrorCode::PlanError(
127 |                 "Inner Join on Conditions can't not be empty".to_string(),
128 |             ));
129 |         }
130 | 
131 |         let left = self.left.execute()?;
132 |         let single_batch = concat_batches(&self.left.schema().clone().into(), &left)?;
133 | 
134 |         let (left_col, _) = &self.on[0];
135 |         let left_col = ColumnExpr::try_create(Some(left_col.name.clone()), None)?;
136 |         let left_col = left_col.evaluate(&single_batch)?.into_array();
137 | 
138 |         let mut hashtable = self.hashtable.lock().unwrap();
139 |         match left_col.data_type() {
140 |             DataType::Int64 => {
141 |                 build_match!(left_col, Int64Type, single_batch, hashtable, write_i64)
142 |             }
143 |             DataType::UInt64 => {
144 |                 build_match!(left_col, UInt64Type, single_batch, hashtable, write_u64)
145 |             }
146 |             DataType::Utf8 => {
147 |                 let left_col = left_col.as_any().downcast_ref::<StringArray>().unwrap();
148 | 
149 |                 // build hashmap
150 |                 for i in 0..single_batch.num_rows() {
151 |                     let mut hasher = XxHash64::default();
152 |                     hasher.write(left_col.value(i).as_bytes());
153 |                     let hash_val = hasher.finish();
154 |                     if let Some(vec) = hashtable.get_mut(&hash_val) {
155 |                         vec.push(i);
156 |                     } else {
157 |                         hashtable.insert(hash_val, vec![i]);
158 |                     }
159 |                 }
160 |             }
161 |             _ => return Err(ErrorCode::NotImplemented),
162 |         }
163 | 
164 |         *self.data.lock().unwrap() = Some(single_batch);
165 |         Ok(vec![left_col])
166 |     }
167 | 
168 |     pub fn probe(&self, left_cols: Vec<ArrayRef>) -> Result<Vec<RecordBatch>> {
169 |         let right_batches = self.right.execute()?;
170 | 
171 |         let (_, right_col) = &self.on[0];
172 |         let right_col = ColumnExpr::try_create(Some(right_col.name.clone()), None)?;
173 |         let left_col = &left_cols[0];
174 | 
175 |         let mut batches = vec![];
176 | 
177 |         for right_batch in &right_batches {
178 |             let right_col = right_col.evaluate(right_batch)?.into_array();
179 | 
180 |             let hashtable = self.hashtable.lock().unwrap();
181 | 
182 |             let mut outer_pos = Int64Builder::new(left_col.len());
183 |             let mut inner_pos = Int64Builder::new(right_col.len());
184 |             match right_col.data_type() {
185 |                 DataType::Int64 => probe_match!(
186 |                     right_col,
187 |                     left_col,
188 |                     PrimitiveArray<Int64Type>,
189 |                     right_batch,
190 |                     hashtable,
191 |                     outer_pos,
192 |                     inner_pos,
193 |                     write_i64
194 |                 ),
195 |                 DataType::UInt64 => probe_match!(
196 |                     right_col,
197 |                     left_col,
198 |                     PrimitiveArray<UInt64Type>,
199 |                     right_batch,
200 |                     hashtable,
201 |                     outer_pos,
202 |                     inner_pos,
203 |                     write_u64
204 |                 ),
205 |                 DataType::Utf8 => {
206 |                     let right_col = right_col.as_any().downcast_ref::<StringArray>().unwrap();
207 |                     let left_col = left_col.as_any().downcast_ref::<StringArray>().unwrap();
208 | 
209 |                     // probe
210 |                     for i in 0..right_batch.num_rows() {
211 |                         let mut hasher = XxHash64::default();
212 |                         hasher.write(right_col.value(i).as_bytes());
213 |                         let hash_val = hasher.finish();
214 | 
215 |                         if let Some(left_pos) = hashtable.get(&hash_val) {
216 |                             for idx in left_pos {
217 |                                 // hash val same, but we need to check whether real value equal or not
218 |                                 if left_col.value(*idx) == right_col.value(i) {
219 |                                     outer_pos.append_value(*idx as i64)?;
220 |                                     inner_pos.append_value(i as i64)?;
221 |                                 }
222 |                             }
223 |                         }
224 |                     }
225 |                 }
226 |                 _ => return Err(ErrorCode::NotImplemented),
227 |             }
228 | 
229 |             let mut columns = vec![];
230 | 
231 |             let outer_pos = outer_pos.finish();
232 |             let inner_pos = inner_pos.finish();
233 | 
234 |             // add left columns
235 |             let data = self.data.lock().unwrap();
236 |             if let Some(outer_table) = &*data {
237 |                 for i in 0..self.left.schema().fields().len() {
238 |                     let array = outer_table.column(i);
239 |                     columns.push(compute::take(array.as_ref(), &outer_pos, None)?);
240 |                 }
241 | 
242 |                 // add right columns
243 |                 for i in 0..self.right.schema().fields().len() {
244 |                     let array = right_batch.column(i);
245 |                     columns.push(compute::take(array.as_ref(), &inner_pos, None)?);
246 |                 }
247 | 
248 |                 let batch = RecordBatch::try_new(SchemaRef::from(self.schema.clone()), columns)?;
249 |                 batches.push(batch);
250 |             }
251 |         }
252 | 
253 |         Ok(batches)
254 |     }
255 | }
256 | 
257 | /// Concatenates an array of `RecordBatch` into one batch
258 | pub fn concat_batches(schema: &SchemaRef, batches: &[RecordBatch]) -> Result<RecordBatch> {
259 |     if batches.is_empty() {
260 |         return Ok(RecordBatch::new_empty(schema.clone()));
261 |     }
262 |     let mut arrays = Vec::with_capacity(schema.fields().len());
263 |     for i in 0..schema.fields().len() {
264 |         let array = concat(
265 |             &batches
266 |                 .iter()
267 |                 .map(|batch| batch.column(i).as_ref())
268 |                 .collect::<Vec<_>>(),
269 |         )?;
270 |         arrays.push(array);
271 |     }
272 |     Ok(RecordBatch::try_new(schema.clone(), arrays)?)
273 | }
274 | 
275 | impl PhysicalPlan for HashJoin {
276 |     fn schema(&self) -> &NaiveSchema {
277 |         &self.schema
278 |     }
279 | 
280 |     fn execute(&self) -> Result<Vec<RecordBatch>> {
281 |         let left_cols = self.build()?;
282 | 
283 |         self.probe(left_cols)
284 |     }
285 | 
286 |     fn children(&self) -> Result<Vec<PhysicalPlanRef>> {
287 |         Ok(vec![self.left.clone(), self.right.clone()])
288 |     }
289 | }
290 | 


--------------------------------------------------------------------------------
/src/physical_plan/limit.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-17 11:27:29
 4 |  * @Last Modified by: Veeupup
 5 |  * @Last Modified time: 2022-05-18 14:45:03
 6 |  */
 7 | 
 8 | use super::{PhysicalPlan, PhysicalPlanRef};
 9 | use crate::error::Result;
10 | use crate::logical_plan::schema::NaiveSchema;
11 | 
12 | use arrow::record_batch::RecordBatch;
13 | use std::sync::Arc;
14 | 
15 | #[derive(Debug, Clone)]
16 | pub struct PhysicalLimitPlan {
17 |     input: PhysicalPlanRef,
18 |     n: usize,
19 | }
20 | 
21 | impl PhysicalLimitPlan {
22 |     pub fn create(input: PhysicalPlanRef, n: usize) -> PhysicalPlanRef {
23 |         Arc::new(Self { input, n })
24 |     }
25 | }
26 | 
27 | impl PhysicalPlan for PhysicalLimitPlan {
28 |     fn schema(&self) -> &NaiveSchema {
29 |         self.input.schema()
30 |     }
31 | 
32 |     fn execute(&self) -> Result<Vec<RecordBatch>> {
33 |         let batches = self.input.execute()?;
34 |         let mut n = self.n;
35 |         let mut ret = vec![];
36 |         for batch in &batches {
37 |             if n == 0 {
38 |                 break;
39 |             }
40 |             if batch.num_rows() <= n {
41 |                 ret.push(batch.clone());
42 |                 n -= batch.num_rows();
43 |             } else {
44 |                 ret.push(batch.slice(0, n));
45 |                 n = 0;
46 |             };
47 |         }
48 |         Ok(ret)
49 |     }
50 | 
51 |     fn children(&self) -> Result<Vec<PhysicalPlanRef>> {
52 |         Ok(vec![self.input.clone()])
53 |     }
54 | }
55 | 
56 | #[cfg(test)]
57 | mod tests {
58 |     use crate::{
59 |         datasource::{CsvConfig, CsvTable},
60 |         physical_plan::ScanPlan,
61 |     };
62 |     use arrow::array::{ArrayRef, Float64Array, Int64Array, StringArray};
63 | 
64 |     use super::*;
65 | 
66 |     #[test]
67 |     fn test_physical_limit() -> Result<()> {
68 |         let source = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?;
69 | 
70 |         let scan_plan = ScanPlan::create(source, None);
71 |         let limit_plan = PhysicalLimitPlan::create(scan_plan, 2);
72 | 
73 |         let result = limit_plan.execute()?;
74 | 
75 |         assert_eq!(result.len(), 1);
76 |         let record_batch = &result[0];
77 |         assert_eq!(record_batch.columns().len(), 4);
78 | 
79 |         let id_excepted: ArrayRef = Arc::new(Int64Array::from(vec![1, 2]));
80 |         let name_excepted: ArrayRef = Arc::new(StringArray::from(vec!["veeupup", "alex"]));
81 |         let age_excepted: ArrayRef = Arc::new(Int64Array::from(vec![23, 20]));
82 |         let score_excepted: ArrayRef = Arc::new(Float64Array::from(vec![60.0, 90.1]));
83 | 
84 |         assert_eq!(record_batch.column(0), &id_excepted);
85 |         assert_eq!(record_batch.column(1), &name_excepted);
86 |         assert_eq!(record_batch.column(2), &age_excepted);
87 |         assert_eq!(record_batch.column(3), &score_excepted);
88 | 
89 |         Ok(())
90 |     }
91 | }
92 | 


--------------------------------------------------------------------------------
/src/physical_plan/mod.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-13 14:07:36
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | 
 7 | mod expression;
 8 | mod plan;
 9 | 
10 | mod aggregate;
11 | mod cross_join;
12 | mod hash_join;
13 | mod limit;
14 | mod nested_loop_join;
15 | mod offset;
16 | mod projection;
17 | mod scan;
18 | mod selection;
19 | mod visitor;
20 | 
21 | pub use aggregate::*;
22 | pub use cross_join::*;
23 | pub use expression::*;
24 | pub use hash_join::*;
25 | pub use limit::*;
26 | pub use nested_loop_join::*;
27 | pub use offset::*;
28 | pub use plan::*;
29 | pub use projection::*;
30 | pub use scan::*;
31 | pub use selection::*;
32 | 


--------------------------------------------------------------------------------
/src/physical_plan/nested_loop_join.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * @Author: Veeupup
  3 |  * @Date: 2022-05-18 16:00:13
  4 |  * @Last Modified by: Veeupup
  5 |  * @Last Modified time: 2022-05-19 15:53:24
  6 |  */
  7 | use super::PhysicalPlan;
  8 | use super::PhysicalPlanRef;
  9 | use crate::error::ErrorCode;
 10 | use crate::logical_plan::expression::Column;
 11 | use crate::logical_plan::plan::JoinType;
 12 | use crate::logical_plan::schema::NaiveSchema;
 13 | use crate::physical_plan::ColumnExpr;
 14 | 
 15 | use crate::Result;
 16 | use std::sync::Arc;
 17 | 
 18 | use arrow::array::Array;
 19 | use arrow::array::Int64Builder;
 20 | use arrow::array::PrimitiveArray;
 21 | use arrow::array::StringArray;
 22 | use arrow::compute;
 23 | use arrow::datatypes::DataType;
 24 | use arrow::datatypes::Float64Type;
 25 | use arrow::datatypes::Int64Type;
 26 | use arrow::datatypes::SchemaRef;
 27 | use arrow::datatypes::UInt64Type;
 28 | use arrow::record_batch::RecordBatch;
 29 | 
 30 | #[derive(Debug)]
 31 | pub struct NestedLoopJoin {
 32 |     left: PhysicalPlanRef,
 33 |     right: PhysicalPlanRef,
 34 |     on: Vec<(Column, Column)>,
 35 |     #[allow(unused)]
 36 |     join_type: JoinType,
 37 |     schema: NaiveSchema,
 38 | }
 39 | 
 40 | impl NestedLoopJoin {
 41 |     #[allow(unused)]
 42 |     pub fn create(
 43 |         left: PhysicalPlanRef,
 44 |         right: PhysicalPlanRef,
 45 |         on: Vec<(Column, Column)>,
 46 |         join_type: JoinType,
 47 |         schema: NaiveSchema,
 48 |     ) -> PhysicalPlanRef {
 49 |         Arc::new(Self {
 50 |             left,
 51 |             right,
 52 |             on,
 53 |             join_type,
 54 |             schema,
 55 |         })
 56 |     }
 57 | }
 58 | 
 59 | macro_rules! join_match {
 60 |     ($DATATYPE: ty, $LEFT_COL: expr, $RIGHT_COL: expr, $OUTER_POS: expr, $INNER_POS: expr) => {{
 61 |         let left_col = $LEFT_COL
 62 |             .as_any()
 63 |             .downcast_ref::<PrimitiveArray<$DATATYPE>>()
 64 |             .unwrap();
 65 |         let right_col = $RIGHT_COL
 66 |             .as_any()
 67 |             .downcast_ref::<PrimitiveArray<$DATATYPE>>()
 68 |             .unwrap();
 69 | 
 70 |         for (x_pos, x) in left_col.iter().enumerate() {
 71 |             for (y_pos, y) in right_col.iter().enumerate() {
 72 |                 match (x, y) {
 73 |                     (Some(x), Some(y)) => {
 74 |                         if x == y {
 75 |                             // equal and we should
 76 |                             $OUTER_POS.append_value(x_pos as i64)?;
 77 |                             $INNER_POS.append_value(y_pos as i64)?;
 78 |                         }
 79 |                     }
 80 |                     _ => {}
 81 |                 }
 82 |             }
 83 |         }
 84 |     }};
 85 | }
 86 | 
 87 | impl PhysicalPlan for NestedLoopJoin {
 88 |     fn schema(&self) -> &NaiveSchema {
 89 |         &self.schema
 90 |     }
 91 | 
 92 |     fn execute(&self) -> Result<Vec<RecordBatch>> {
 93 |         let outer_table = self.left.execute()?;
 94 |         let inner_table = self.right.execute()?;
 95 | 
 96 |         let mut batches: Vec<RecordBatch> = vec![];
 97 |         // TODO(veeupup): support multi on conditions
 98 |         // Using for loop to combine different conditions
 99 |         if self.on.is_empty() {
100 |             return Err(ErrorCode::PlanError(
101 |                 "Inner Join on Conditions can't not be empty".to_string(),
102 |             ));
103 |         }
104 | 
105 |         let (left_col, right_col) = &self.on[0];
106 |         // TODO(veeupup): consider make left_col in physical plan and not create when executing
107 |         let left_col = ColumnExpr::try_create(Some(left_col.name.clone()), None)?;
108 |         let right_col = ColumnExpr::try_create(Some(right_col.name.clone()), None)?;
109 | 
110 |         for outer in &outer_table {
111 |             let left_col = left_col.evaluate(outer)?.into_array();
112 | 
113 |             let dt = left_col.data_type();
114 |             for inner in &inner_table {
115 |                 let right_col = right_col.evaluate(inner)?.into_array();
116 | 
117 |                 // check if ok
118 |                 if left_col.data_type() != right_col.data_type() {
119 |                     return Err(ErrorCode::PlanError(format!(
120 |                         "Join on left and right data type should be same: left: {:?}, right: {:?}",
121 |                         left_col.data_type(),
122 |                         right_col.data_type()
123 |                     )));
124 |                 }
125 | 
126 |                 let mut outer_pos = Int64Builder::new(left_col.len());
127 |                 let mut inner_pos = Int64Builder::new(right_col.len());
128 |                 match dt {
129 |                     DataType::Int64 => {
130 |                         join_match!(Int64Type, left_col, right_col, outer_pos, inner_pos)
131 |                     }
132 |                     DataType::UInt64 => {
133 |                         join_match!(UInt64Type, left_col, right_col, outer_pos, inner_pos)
134 |                     }
135 |                     DataType::Float64 => {
136 |                         join_match!(Float64Type, left_col, right_col, outer_pos, inner_pos)
137 |                     }
138 |                     DataType::Utf8 => {
139 |                         let left_col = left_col.as_any().downcast_ref::<StringArray>().unwrap();
140 |                         let right_col = right_col.as_any().downcast_ref::<StringArray>().unwrap();
141 | 
142 |                         for (x_pos, x) in left_col.iter().enumerate() {
143 |                             for (y_pos, y) in right_col.iter().enumerate() {
144 |                                 if let (Some(x), Some(y)) = (x, y) {
145 |                                     if x == y {
146 |                                         // equal and we should
147 |                                         outer_pos.append_value(x_pos as i64)?;
148 |                                         inner_pos.append_value(y_pos as i64)?;
149 |                                     }
150 |                                 }
151 |                             }
152 |                         }
153 |                     }
154 |                     _ => unimplemented!(),
155 |                 }
156 |                 let mut columns = vec![];
157 | 
158 |                 let outer_pos = outer_pos.finish();
159 |                 let inner_pos = inner_pos.finish();
160 | 
161 |                 // add left columns
162 |                 for i in 0..self.left.schema().fields().len() {
163 |                     let array = outer.column(i);
164 |                     columns.push(compute::take(array.as_ref(), &outer_pos, None)?);
165 |                 }
166 | 
167 |                 // add right columns
168 |                 for i in 0..self.right.schema().fields().len() {
169 |                     let array = inner.column(i);
170 |                     columns.push(compute::take(array.as_ref(), &inner_pos, None)?);
171 |                 }
172 | 
173 |                 let batch = RecordBatch::try_new(SchemaRef::from(self.schema.clone()), columns)?;
174 |                 batches.push(batch);
175 |             }
176 |         }
177 | 
178 |         Ok(batches)
179 |     }
180 | 
181 |     fn children(&self) -> Result<Vec<PhysicalPlanRef>> {
182 |         Ok(vec![self.left.clone(), self.right.clone()])
183 |     }
184 | }
185 | 


--------------------------------------------------------------------------------
/src/physical_plan/offset.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: GanZiheng
 3 |  * @Date: 2022-05-25
 4 |  */
 5 | 
 6 | use super::{PhysicalPlan, PhysicalPlanRef};
 7 | use crate::error::Result;
 8 | use crate::logical_plan::schema::NaiveSchema;
 9 | 
10 | use arrow::record_batch::RecordBatch;
11 | use std::sync::Arc;
12 | 
13 | #[derive(Debug, Clone)]
14 | pub struct PhysicalOffsetPlan {
15 |     input: PhysicalPlanRef,
16 |     n: usize,
17 | }
18 | 
19 | impl PhysicalOffsetPlan {
20 |     pub fn create(input: PhysicalPlanRef, n: usize) -> PhysicalPlanRef {
21 |         Arc::new(Self { input, n })
22 |     }
23 | }
24 | 
25 | impl PhysicalPlan for PhysicalOffsetPlan {
26 |     fn schema(&self) -> &NaiveSchema {
27 |         self.input.schema()
28 |     }
29 | 
30 |     fn execute(&self) -> Result<Vec<RecordBatch>> {
31 |         let batches = self.input.execute()?;
32 |         let mut n = self.n;
33 |         let mut ret = vec![];
34 | 
35 |         for batch in &batches {
36 |             if n == 0 {
37 |                 ret.push(batch.clone());
38 |                 continue;
39 |             }
40 | 
41 |             if n >= batch.num_rows() {
42 |                 n -= batch.num_rows();
43 |                 continue;
44 |             }
45 | 
46 |             let remain = batch.num_rows() - n;
47 |             ret.push(batch.slice(n, remain));
48 |             n = 0;
49 |         }
50 |         Ok(ret)
51 |     }
52 | 
53 |     fn children(&self) -> Result<Vec<PhysicalPlanRef>> {
54 |         Ok(vec![self.input.clone()])
55 |     }
56 | }
57 | 
58 | #[cfg(test)]
59 | mod tests {
60 |     use crate::{
61 |         datasource::{CsvConfig, CsvTable},
62 |         physical_plan::ScanPlan,
63 |     };
64 |     use arrow::array::{ArrayRef, Float64Array, Int64Array, StringArray};
65 | 
66 |     use super::*;
67 | 
68 |     #[test]
69 |     fn test_physical_offset() -> Result<()> {
70 |         let source = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?;
71 | 
72 |         let scan_plan = ScanPlan::create(source, None);
73 |         let offset_plan = PhysicalOffsetPlan::create(scan_plan, 5);
74 | 
75 |         let result = offset_plan.execute()?;
76 | 
77 |         assert_eq!(result.len(), 1);
78 |         let record_batch = &result[0];
79 |         assert_eq!(record_batch.columns().len(), 4);
80 | 
81 |         let id_excepted: ArrayRef = Arc::new(Int64Array::from(vec![7, 8, 9]));
82 |         let name_excepted: ArrayRef = Arc::new(StringArray::from(vec!["jack", "cock", "primer"]));
83 |         let age_excepted: ArrayRef = Arc::new(Int64Array::from(vec![21, 22, 23]));
84 |         let score_excepted: ArrayRef = Arc::new(Float64Array::from(vec![83.3, 84.4, 85.5]));
85 | 
86 |         assert_eq!(record_batch.column(0), &id_excepted);
87 |         assert_eq!(record_batch.column(1), &name_excepted);
88 |         assert_eq!(record_batch.column(2), &age_excepted);
89 |         assert_eq!(record_batch.column(3), &score_excepted);
90 | 
91 |         Ok(())
92 |     }
93 | }
94 | 


--------------------------------------------------------------------------------
/src/physical_plan/plan.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-13 14:23:58
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | 
 7 | use std::fmt::Debug;
 8 | use std::sync::Arc;
 9 | 
10 | use arrow::record_batch::RecordBatch;
11 | 
12 | use crate::{error::Result, logical_plan::schema::NaiveSchema};
13 | 
14 | pub trait PhysicalPlan: Debug {
15 |     fn schema(&self) -> &NaiveSchema;
16 | 
17 |     // TODO(veeupup): return by using streaming mode
18 |     fn execute(&self) -> Result<Vec<RecordBatch>>;
19 | 
20 |     fn children(&self) -> Result<Vec<PhysicalPlanRef>>;
21 | }
22 | 
23 | pub type PhysicalPlanRef = Arc<dyn PhysicalPlan>;
24 | 


--------------------------------------------------------------------------------
/src/physical_plan/projection.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * @Author: Veeupup
  3 |  * @Date: 2022-05-13 14:54:33
  4 |  * @Email: code@tanweime.com
  5 | */
  6 | 
  7 | use std::iter::Iterator;
  8 | use std::sync::Arc;
  9 | 
 10 | use super::plan::PhysicalPlan;
 11 | use crate::error::Result;
 12 | use crate::logical_plan::schema::NaiveSchema;
 13 | use crate::physical_plan::PhysicalExprRef;
 14 | use crate::physical_plan::PhysicalPlanRef;
 15 | use arrow::datatypes::SchemaRef;
 16 | use arrow::record_batch::RecordBatch;
 17 | #[derive(Debug, Clone)]
 18 | pub struct ProjectionPlan {
 19 |     input: PhysicalPlanRef,
 20 |     schema: NaiveSchema,
 21 |     expr: Vec<PhysicalExprRef>,
 22 | }
 23 | 
 24 | impl ProjectionPlan {
 25 |     pub fn create(
 26 |         input: PhysicalPlanRef,
 27 |         schema: NaiveSchema,
 28 |         expr: Vec<PhysicalExprRef>,
 29 |     ) -> PhysicalPlanRef {
 30 |         Arc::new(Self {
 31 |             input,
 32 |             schema,
 33 |             expr,
 34 |         })
 35 |     }
 36 | }
 37 | 
 38 | impl PhysicalPlan for ProjectionPlan {
 39 |     fn schema(&self) -> &NaiveSchema {
 40 |         &self.schema
 41 |     }
 42 | 
 43 |     fn execute(&self) -> Result<Vec<RecordBatch>> {
 44 |         let input = self.input.execute()?;
 45 | 
 46 |         // when aggragating, we just output what input does
 47 |         if self.schema.fields().is_empty() {
 48 |             Ok(input)
 49 |         } else {
 50 |             let batches = input
 51 |                 .iter()
 52 |                 .map(|batch| {
 53 |                     let columns = self
 54 |                         .expr
 55 |                         .iter()
 56 |                         // TODO(veeupup): remove unwrap
 57 |                         .map(|expr| expr.evaluate(batch).unwrap())
 58 |                         .collect::<Vec<_>>();
 59 |                     let columns = columns
 60 |                         .iter()
 61 |                         .map(|column| column.clone().into_array())
 62 |                         .collect::<Vec<_>>();
 63 |                     // TODO(veeupup): remove unwrap
 64 |                     // let projection_schema = self.schema.into();
 65 |                     RecordBatch::try_new(SchemaRef::from(self.schema.clone()), columns).unwrap()
 66 |                 })
 67 |                 .collect::<Vec<_>>();
 68 |             Ok(batches)
 69 |         }
 70 |     }
 71 | 
 72 |     fn children(&self) -> Result<Vec<PhysicalPlanRef>> {
 73 |         Ok(vec![self.input.clone()])
 74 |     }
 75 | }
 76 | 
 77 | #[cfg(test)]
 78 | mod tests {
 79 |     use super::*;
 80 |     use crate::datasource::{CsvConfig, CsvTable};
 81 |     use crate::logical_plan::expression::{Operator, ScalarValue};
 82 |     use crate::physical_plan::expression::ColumnExpr;
 83 |     use crate::physical_plan::scan::ScanPlan;
 84 |     use crate::physical_plan::{PhysicalBinaryExpr, PhysicalLiteralExpr};
 85 |     use arrow::array::{ArrayRef, Int64Array, StringArray};
 86 | 
 87 |     #[test]
 88 |     fn test_projection() -> Result<()> {
 89 |         let source = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?;
 90 |         let schema = NaiveSchema::new(vec![
 91 |             source.schema().field(0).clone(),
 92 |             source.schema().field(1).clone(),
 93 |         ]);
 94 |         let scan_plan = ScanPlan::create(source, None);
 95 |         let add_expr = PhysicalBinaryExpr::create(
 96 |             ColumnExpr::try_create(Some("id".to_string()), None)?,
 97 |             Operator::Plus,
 98 |             PhysicalLiteralExpr::create(ScalarValue::Int64(Some(1))),
 99 |         );
100 |         let expr = vec![
101 |             // ColumnExpr::try_create(None, Some(0))?,
102 |             add_expr,
103 |             ColumnExpr::try_create(Some("name".to_string()), None)?,
104 |         ];
105 |         let proj_plan = ProjectionPlan::create(scan_plan, schema, expr);
106 | 
107 |         let res = proj_plan.execute()?;
108 | 
109 |         assert_eq!(res.len(), 1);
110 |         let batch = &res[0];
111 | 
112 |         // let id_excepted: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 4, 5, 6, 7, 8, 9]));
113 |         let name_excepted: ArrayRef = Arc::new(StringArray::from(vec![
114 |             "veeupup", "alex", "lynne", "alice", "bob", "jack", "cock", "primer",
115 |         ]));
116 |         let id_excepted2: ArrayRef = Arc::new(Int64Array::from(vec![2, 3, 5, 6, 7, 8, 9, 10]));
117 |         assert_eq!(batch.column(0), &id_excepted2);
118 |         assert_eq!(batch.column(1), &name_excepted);
119 | 
120 |         Ok(())
121 |     }
122 | }
123 | 


--------------------------------------------------------------------------------
/src/physical_plan/scan.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-13 14:26:59
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | 
 7 | use std::sync::Arc;
 8 | 
 9 | use crate::datasource::TableRef;
10 | use crate::error::Result;
11 | use crate::logical_plan::schema::NaiveSchema;
12 | use arrow::record_batch::RecordBatch;
13 | 
14 | use crate::physical_plan::PhysicalPlan;
15 | use crate::physical_plan::PhysicalPlanRef;
16 | 
17 | #[derive(Debug, Clone)]
18 | pub struct ScanPlan {
19 |     source: TableRef,
20 |     projection: Option<Vec<usize>>,
21 | }
22 | 
23 | impl ScanPlan {
24 |     pub fn create(source: TableRef, projection: Option<Vec<usize>>) -> PhysicalPlanRef {
25 |         Arc::new(Self { source, projection })
26 |     }
27 | }
28 | 
29 | impl PhysicalPlan for ScanPlan {
30 |     fn schema(&self) -> &NaiveSchema {
31 |         self.source.schema()
32 |     }
33 | 
34 |     fn execute(&self) -> Result<Vec<RecordBatch>> {
35 |         self.source.scan(self.projection.clone())
36 |     }
37 | 
38 |     fn children(&self) -> Result<Vec<PhysicalPlanRef>> {
39 |         Ok(vec![])
40 |     }
41 | }
42 | 
43 | #[cfg(test)]
44 | mod tests {
45 |     use crate::datasource::{CsvConfig, CsvTable};
46 |     use arrow::array::{ArrayRef, Float64Array, Int64Array, StringArray};
47 | 
48 |     use super::*;
49 | 
50 |     #[test]
51 |     fn test_physical_scan() -> Result<()> {
52 |         let source = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?;
53 | 
54 |         let scan_plan = ScanPlan::create(source, None);
55 | 
56 |         let result = scan_plan.execute()?;
57 | 
58 |         assert_eq!(result.len(), 1);
59 |         let record_batch = &result[0];
60 |         assert_eq!(record_batch.columns().len(), 4);
61 | 
62 |         let id_excepted: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 4, 5, 6, 7, 8, 9]));
63 |         let name_excepted: ArrayRef = Arc::new(StringArray::from(vec![
64 |             "veeupup", "alex", "lynne", "alice", "bob", "jack", "cock", "primer",
65 |         ]));
66 |         let age_excepted: ArrayRef =
67 |             Arc::new(Int64Array::from(vec![23, 20, 18, 19, 20, 21, 22, 23]));
68 |         let score_excepted: ArrayRef = Arc::new(Float64Array::from(vec![
69 |             60.0, 90.1, 99.99, 81.1, 82.2, 83.3, 84.4, 85.5,
70 |         ]));
71 | 
72 |         assert_eq!(record_batch.column(0), &id_excepted);
73 |         assert_eq!(record_batch.column(1), &name_excepted);
74 |         assert_eq!(record_batch.column(2), &age_excepted);
75 |         assert_eq!(record_batch.column(3), &score_excepted);
76 | 
77 |         Ok(())
78 |     }
79 | }
80 | 


--------------------------------------------------------------------------------
/src/physical_plan/selection.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * @Author: Veeupup
  3 |  * @Date: 2022-05-14 18:20:27
  4 |  * @Email: code@tanweime.com
  5 | */
  6 | 
  7 | use std::sync::Arc;
  8 | 
  9 | use super::{PhysicalExprRef, PhysicalPlan, PhysicalPlanRef};
 10 | use crate::logical_plan::schema::NaiveSchema;
 11 | use crate::Result;
 12 | use arrow::array::{
 13 |     Float64Array, Float64Builder, Int64Array, Int64Builder, StringArray, StringBuilder,
 14 |     UInt64Array, UInt64Builder,
 15 | };
 16 | use arrow::record_batch::RecordBatch;
 17 | use arrow::{
 18 |     array::{Array, BooleanArray, BooleanBuilder},
 19 |     datatypes::DataType,
 20 | };
 21 | 
 22 | #[derive(Debug)]
 23 | pub struct SelectionPlan {
 24 |     input: PhysicalPlanRef,
 25 |     expr: PhysicalExprRef,
 26 | }
 27 | 
 28 | impl SelectionPlan {
 29 |     pub fn create(input: PhysicalPlanRef, expr: PhysicalExprRef) -> PhysicalPlanRef {
 30 |         Arc::new(Self { input, expr })
 31 |     }
 32 | }
 33 | 
 34 | macro_rules! build_array_by_predicate {
 35 |     ($COLUMN: ident, $PREDICATE: expr, $ARRAY_TYPE: ty, $ARRAY_BUILDER: ty) => {{
 36 |         let array = $COLUMN.as_any().downcast_ref::<$ARRAY_TYPE>().unwrap();
 37 |         let mut builder = <$ARRAY_BUILDER>::new(array.len());
 38 |         let iter = $PREDICATE.iter().zip(array.iter());
 39 |         for (valid, val) in iter {
 40 |             match valid {
 41 |                 Some(valid) => {
 42 |                     if valid {
 43 |                         builder.append_option(val)?;
 44 |                     }
 45 |                 }
 46 |                 None => builder.append_option(None)?,
 47 |             }
 48 |         }
 49 |         Arc::new(builder.finish())
 50 |     }};
 51 | }
 52 | 
 53 | impl PhysicalPlan for SelectionPlan {
 54 |     fn schema(&self) -> &NaiveSchema {
 55 |         self.input.schema()
 56 |     }
 57 | 
 58 |     fn execute(&self) -> Result<Vec<RecordBatch>> {
 59 |         let input = self.input.execute()?;
 60 |         let predicate = self.expr.evaluate(&input[0])?.into_array();
 61 |         let predicate = predicate.as_any().downcast_ref::<BooleanArray>().unwrap();
 62 | 
 63 |         let mut batches = vec![];
 64 | 
 65 |         for batch in &input {
 66 |             let mut columns = vec![];
 67 |             for col in batch.columns() {
 68 |                 let dt = col.data_type();
 69 |                 let column: Arc<dyn Array> = match dt {
 70 |                     DataType::Boolean => {
 71 |                         build_array_by_predicate!(col, predicate, BooleanArray, BooleanBuilder)
 72 |                     }
 73 |                     DataType::UInt64 => {
 74 |                         build_array_by_predicate!(col, predicate, UInt64Array, UInt64Builder)
 75 |                     }
 76 |                     DataType::Int64 => {
 77 |                         build_array_by_predicate!(col, predicate, Int64Array, Int64Builder)
 78 |                     }
 79 |                     DataType::Float64 => {
 80 |                         build_array_by_predicate!(col, predicate, Float64Array, Float64Builder)
 81 |                     }
 82 |                     DataType::Utf8 => {
 83 |                         let array = col.as_any().downcast_ref::<StringArray>().unwrap();
 84 |                         let mut builder = StringBuilder::new(array.len());
 85 |                         let iter = predicate.iter().zip(array.iter());
 86 |                         for (valid, val) in iter {
 87 |                             match valid {
 88 |                                 Some(valid) => {
 89 |                                     if valid {
 90 |                                         builder.append_option(val)?;
 91 |                                     }
 92 |                                 }
 93 |                                 None => builder.append_option(None::<&str>)?,
 94 |                             }
 95 |                         }
 96 |                         Arc::new(builder.finish())
 97 |                     }
 98 |                     _ => unimplemented!(),
 99 |                 };
100 |                 columns.push(column);
101 |             }
102 |             let record_batch =
103 |                 RecordBatch::try_new(Arc::new(self.schema().clone().into()), columns)?;
104 |             batches.push(record_batch);
105 |         }
106 |         Ok(batches)
107 |     }
108 | 
109 |     fn children(&self) -> Result<Vec<PhysicalPlanRef>> {
110 |         Ok(vec![self.input.clone()])
111 |     }
112 | }
113 | 
114 | #[cfg(test)]
115 | mod tests {
116 |     use super::*;
117 |     use crate::datasource::{CsvConfig, CsvTable};
118 |     use crate::logical_plan::expression::{Operator, ScalarValue};
119 |     use crate::physical_plan::expression::ColumnExpr;
120 |     use crate::physical_plan::scan::ScanPlan;
121 |     use crate::physical_plan::{PhysicalBinaryExpr, PhysicalLiteralExpr, ProjectionPlan};
122 |     use crate::print_result;
123 |     use arrow::array::{ArrayRef, Int64Array, StringArray};
124 | 
125 |     #[test]
126 |     fn test_selection() -> Result<()> {
127 |         let source = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?;
128 |         let schema = NaiveSchema::new(vec![
129 |             source.schema().field(0).clone(),
130 |             source.schema().field(1).clone(),
131 |             source.schema().field(2).clone(),
132 |         ]);
133 |         let scan_plan = ScanPlan::create(source, None);
134 | 
135 |         let expr = vec![
136 |             ColumnExpr::try_create(None, Some(0))?,
137 |             ColumnExpr::try_create(Some("name".to_string()), None)?,
138 |             ColumnExpr::try_create(None, Some(2))?,
139 |         ];
140 |         let proj_plan = ProjectionPlan::create(scan_plan, schema, expr);
141 | 
142 |         // TODO(veeupup): selection expression
143 | 
144 |         {
145 |             let add_expr = PhysicalBinaryExpr::create(
146 |                 ColumnExpr::try_create(Some("id".to_string()), None)?,
147 |                 Operator::Plus,
148 |                 PhysicalLiteralExpr::create(ScalarValue::Int64(Some(1))),
149 |             );
150 | 
151 |             let expr = PhysicalBinaryExpr::create(
152 |                 add_expr,
153 |                 Operator::Gt,
154 |                 PhysicalLiteralExpr::create(ScalarValue::Int64(Some(5))),
155 |             );
156 | 
157 |             let selection_plan = SelectionPlan::create(proj_plan, expr);
158 | 
159 |             let res = selection_plan.execute()?;
160 | 
161 |             assert_eq!(res.len(), 1);
162 |             let batch = &res[0];
163 | 
164 |             print_result(&res)?;
165 | 
166 |             let id_excepted: ArrayRef = Arc::new(Int64Array::from(vec![5, 6, 7, 8, 9]));
167 |             let name_excepted: ArrayRef = Arc::new(StringArray::from(vec![
168 |                 "alice", "bob", "jack", "cock", "primer",
169 |             ]));
170 | 
171 |             assert_eq!(batch.column(0), &id_excepted);
172 |             assert_eq!(batch.column(1), &name_excepted);
173 |         }
174 | 
175 |         // TODO(veeupup): add more test about binary expression
176 | 
177 |         Ok(())
178 |     }
179 | }
180 | 


--------------------------------------------------------------------------------
/src/physical_plan/visitor.rs:
--------------------------------------------------------------------------------
 1 | use super::PhysicalPlan;
 2 | use crate::error::Result;
 3 | 
 4 | pub trait PhysicalPlanVistor {
 5 |     // Invoke before visit PhysicalPlan
 6 |     fn pre_visit(&mut self, plan: &dyn PhysicalPlan) -> Result<()>;
 7 | 
 8 |     // Invoke before after PhysicalPlan
 9 |     fn post_visit(&mut self, plan: &dyn PhysicalPlan) -> Result<()>;
10 | }
11 | 
12 | pub fn _visit_physical_plan<V: PhysicalPlanVistor>(
13 |     plan: &dyn PhysicalPlan,
14 |     visitor: &mut V,
15 | ) -> Result<()> {
16 |     let children = plan.children()?;
17 |     visitor.pre_visit(plan)?;
18 | 
19 |     for child in children {
20 |         _visit_physical_plan(child.as_ref(), visitor)?;
21 |     }
22 |     visitor.post_visit(plan)?;
23 |     Ok(())
24 | }
25 | 
26 | #[cfg(test)]
27 | mod tests {
28 |     use crate::{
29 |         datasource::CsvTable,
30 |         physical_plan::{PhysicalPlan, ScanPlan},
31 |         CsvConfig,
32 |     };
33 | 
34 |     use super::{PhysicalPlanVistor, _visit_physical_plan};
35 |     use crate::error::Result;
36 | 
37 |     struct TestVisitor {
38 |         v: usize,
39 |     }
40 | 
41 |     impl PhysicalPlanVistor for TestVisitor {
42 |         fn pre_visit(&mut self, _: &dyn PhysicalPlan) -> Result<()> {
43 |             println!("pre_v: {}", self.v);
44 |             Ok(())
45 |         }
46 | 
47 |         fn post_visit(&mut self, _: &dyn PhysicalPlan) -> Result<()> {
48 |             println!("post_v: {}", self.v);
49 |             Ok(())
50 |         }
51 |     }
52 | 
53 |     #[test]
54 |     fn test_visitor() -> Result<()> {
55 |         let source = CsvTable::try_create("data/test_data.csv", CsvConfig::default())?;
56 | 
57 |         let scan_plan = ScanPlan::create(source, None);
58 |         _visit_physical_plan(scan_plan.as_ref(), &mut TestVisitor { v: 1 })?;
59 |         Ok(())
60 |     }
61 | }
62 | 


--------------------------------------------------------------------------------
/src/planner/mod.rs:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * @Author: Veeupup
  3 |  * @Date: 2022-05-13 16:56:35
  4 |  * @Email: code@tanweime.com
  5 |  *
  6 |  * Planner: translate the logical plan into the physical plan.
  7 |  *
  8 | */
  9 | 
 10 | use crate::logical_plan::expression::AggregateFunc;
 11 | use crate::logical_plan::schema::NaiveSchema;
 12 | use crate::physical_plan::CrossJoin;
 13 | use crate::physical_plan::HashJoin;
 14 | 
 15 | use crate::physical_plan::avg::Avg;
 16 | use crate::physical_plan::count::Count;
 17 | use crate::physical_plan::max::Max;
 18 | use crate::physical_plan::min::Min;
 19 | use crate::physical_plan::sum::Sum;
 20 | use crate::physical_plan::PhysicalAggregatePlan;
 21 | use crate::physical_plan::PhysicalBinaryExpr;
 22 | use crate::physical_plan::PhysicalCastExpr;
 23 | use crate::physical_plan::PhysicalExprRef;
 24 | use crate::physical_plan::PhysicalLimitPlan;
 25 | use crate::physical_plan::PhysicalLiteralExpr;
 26 | use crate::physical_plan::PhysicalOffsetPlan;
 27 | use crate::physical_plan::PhysicalPlanRef;
 28 | use crate::physical_plan::PhysicalUnaryExpr;
 29 | use crate::physical_plan::SelectionPlan;
 30 | use crate::{
 31 |     error::{ErrorCode, Result},
 32 |     logical_plan::{
 33 |         expression::{Column, LogicalExpr},
 34 |         plan::LogicalPlan,
 35 |     },
 36 |     physical_plan::{ColumnExpr, ProjectionPlan, ScanPlan},
 37 | };
 38 | 
 39 | pub struct QueryPlanner;
 40 | 
 41 | impl QueryPlanner {
 42 |     pub fn create_physical_plan(plan: &LogicalPlan) -> Result<PhysicalPlanRef> {
 43 |         match plan {
 44 |             LogicalPlan::TableScan(table_scan) => Ok(ScanPlan::create(
 45 |                 table_scan.source.clone(),
 46 |                 table_scan.projection.clone(),
 47 |             )),
 48 |             LogicalPlan::Projection(proj) => {
 49 |                 let input = Self::create_physical_plan(&proj.input)?;
 50 |                 let proj_expr = proj
 51 |                     .exprs
 52 |                     .iter()
 53 |                     .map(|expr| Self::create_physical_expression(expr, &proj.input).unwrap())
 54 |                     .collect::<Vec<_>>();
 55 |                 let fields = proj
 56 |                     .exprs
 57 |                     .iter()
 58 |                     .map(|expr| expr.data_field(proj.input.as_ref()).unwrap())
 59 |                     .collect::<Vec<_>>();
 60 |                 let proj_schema = NaiveSchema::new(fields);
 61 |                 Ok(ProjectionPlan::create(input, proj_schema, proj_expr))
 62 |             }
 63 |             LogicalPlan::Limit(limit) => {
 64 |                 let plan = Self::create_physical_plan(&limit.input)?;
 65 |                 Ok(PhysicalLimitPlan::create(plan, limit.n))
 66 |             }
 67 |             LogicalPlan::Offset(offset) => {
 68 |                 let plan = Self::create_physical_plan(&offset.input)?;
 69 |                 Ok(PhysicalOffsetPlan::create(plan, offset.n))
 70 |             }
 71 |             LogicalPlan::Join(join) => {
 72 |                 let left = Self::create_physical_plan(&join.left)?;
 73 |                 let right = Self::create_physical_plan(&join.right)?;
 74 |                 // We now have two join physical implementation
 75 |                 // Ok(NestedLoopJoin::new(
 76 |                 //     left,
 77 |                 //     right,
 78 |                 //     join.on.clone(),
 79 |                 //     join.join_type,
 80 |                 //     join.schema.clone(),
 81 |                 // ))
 82 |                 Ok(HashJoin::create(
 83 |                     left,
 84 |                     right,
 85 |                     join.on.clone(),
 86 |                     join.join_type,
 87 |                     join.schema.clone(),
 88 |                 ))
 89 |             }
 90 |             LogicalPlan::Filter(filter) => {
 91 |                 let predicate = Self::create_physical_expression(&filter.predicate, plan)?;
 92 |                 let input = Self::create_physical_plan(&filter.input)?;
 93 |                 Ok(SelectionPlan::create(input, predicate))
 94 |             }
 95 |             LogicalPlan::Aggregate(aggr) => {
 96 |                 let mut group_exprs = vec![];
 97 |                 for group_expr in &aggr.group_expr {
 98 |                     group_exprs.push(Self::create_physical_expression(group_expr, &aggr.input)?);
 99 |                 }
100 | 
101 |                 let mut aggr_ops = vec![];
102 |                 for aggr_expr in &aggr.aggr_expr {
103 |                     let aggr_op = match aggr_expr.fun {
104 |                         AggregateFunc::Count => {
105 |                             let expr =
106 |                                 Self::create_physical_expression(&aggr_expr.args, &aggr.input)?;
107 |                             let col_expr = expr.as_any().downcast_ref::<ColumnExpr>();
108 |                             if let Some(col_expr) = col_expr {
109 |                                 Count::create(col_expr.clone())
110 |                             } else {
111 |                                 return Err(ErrorCode::PlanError(
112 |                                     "Aggregate Func should have a column in it".to_string(),
113 |                                 ));
114 |                             }
115 |                         }
116 |                         AggregateFunc::Sum => {
117 |                             let expr =
118 |                                 Self::create_physical_expression(&aggr_expr.args, &aggr.input)?;
119 |                             let col_expr = expr.as_any().downcast_ref::<ColumnExpr>();
120 |                             if let Some(col_expr) = col_expr {
121 |                                 Sum::create(col_expr.clone())
122 |                             } else {
123 |                                 return Err(ErrorCode::PlanError(
124 |                                     "Aggregate Func should have a column in it".to_string(),
125 |                                 ));
126 |                             }
127 |                         }
128 |                         AggregateFunc::Avg => {
129 |                             let expr =
130 |                                 Self::create_physical_expression(&aggr_expr.args, &aggr.input)?;
131 |                             let col_expr = expr.as_any().downcast_ref::<ColumnExpr>();
132 |                             if let Some(col_expr) = col_expr {
133 |                                 Avg::create(col_expr.clone())
134 |                             } else {
135 |                                 return Err(ErrorCode::PlanError(
136 |                                     "Aggregate Func should have a column in it".to_string(),
137 |                                 ));
138 |                             }
139 |                         }
140 |                         AggregateFunc::Min => {
141 |                             let expr =
142 |                                 Self::create_physical_expression(&aggr_expr.args, &aggr.input)?;
143 |                             let col_expr = expr.as_any().downcast_ref::<ColumnExpr>();
144 |                             if let Some(col_expr) = col_expr {
145 |                                 Min::create(col_expr.clone())
146 |                             } else {
147 |                                 return Err(ErrorCode::PlanError(
148 |                                     "Aggregate Func should have a column in it".to_string(),
149 |                                 ));
150 |                             }
151 |                         }
152 |                         AggregateFunc::Max => {
153 |                             let expr =
154 |                                 Self::create_physical_expression(&aggr_expr.args, &aggr.input)?;
155 |                             let col_expr = expr.as_any().downcast_ref::<ColumnExpr>();
156 |                             if let Some(col_expr) = col_expr {
157 |                                 Max::create(col_expr.clone())
158 |                             } else {
159 |                                 return Err(ErrorCode::PlanError(
160 |                                     "Aggregate Func should have a column in it".to_string(),
161 |                                 ));
162 |                             }
163 |                         }
164 |                     };
165 |                     aggr_ops.push(aggr_op);
166 |                 }
167 | 
168 |                 let input = Self::create_physical_plan(&aggr.input)?;
169 |                 Ok(PhysicalAggregatePlan::create(group_exprs, aggr_ops, input))
170 |             }
171 |             LogicalPlan::CrossJoin(join) => {
172 |                 let left = Self::create_physical_plan(&join.left)?;
173 |                 let right = Self::create_physical_plan(&join.right)?;
174 |                 Ok(CrossJoin::create(
175 |                     left,
176 |                     right,
177 |                     join.join_type,
178 |                     join.schema.clone(),
179 |                 ))
180 |             }
181 |         }
182 |     }
183 | 
184 |     pub fn create_physical_expression(
185 |         expr: &LogicalExpr,
186 |         input: &LogicalPlan,
187 |     ) -> Result<PhysicalExprRef> {
188 |         match expr {
189 |             LogicalExpr::Alias(_, _) => todo!(),
190 |             LogicalExpr::Column(Column { name, .. }) => {
191 |                 for (idx, field) in input.schema().fields().iter().enumerate() {
192 |                     if field.name() == name {
193 |                         return ColumnExpr::try_create(None, Some(idx));
194 |                     }
195 |                 }
196 |                 Err(ErrorCode::ColumnNotExists(format!(
197 |                     "column `{}` not exists",
198 |                     name
199 |                 )))
200 |             }
201 |             LogicalExpr::Literal(scalar_val) => Ok(PhysicalLiteralExpr::create(scalar_val.clone())),
202 |             LogicalExpr::BinaryExpr(bin_expr) => {
203 |                 let left = Self::create_physical_expression(bin_expr.left.as_ref(), input)?;
204 |                 let right = Self::create_physical_expression(bin_expr.right.as_ref(), input)?;
205 |                 let phy_bin_expr = PhysicalBinaryExpr::create(left, bin_expr.op.clone(), right);
206 |                 Ok(phy_bin_expr)
207 |             }
208 |             LogicalExpr::UnaryExpr(scalar_expr) => {
209 |                 let expr = Self::create_physical_expression(scalar_expr.arg.as_ref(), input)?;
210 |                 let phy_scalar_expr = PhysicalUnaryExpr::create(
211 |                     expr,
212 |                     scalar_expr.func.clone(),
213 |                     "todo".to_string(),
214 |                     &arrow::datatypes::DataType::Int32,
215 |                 );
216 |                 Ok(phy_scalar_expr)
217 |             }
218 |             LogicalExpr::Not(_) => todo!(),
219 |             LogicalExpr::Cast(cast_expr) => {
220 |                 let expr = Self::create_physical_expression(cast_expr.expr.as_ref(), input)?;
221 |                 let phy_cast_expr = PhysicalCastExpr::create(expr, &cast_expr.data_type);
222 |                 Ok(phy_cast_expr)
223 |             }
224 |             LogicalExpr::AggregateFunction(_) => todo!(),
225 |             LogicalExpr::Wildcard => todo!(),
226 |         }
227 |     }
228 | }
229 | 
230 | #[cfg(test)]
231 | mod tests {
232 |     use arrow::array::ArrayRef;
233 |     use arrow::array::Int64Array;
234 |     use arrow::array::StringArray;
235 |     use std::sync::Arc;
236 | 
237 |     use crate::catalog::Catalog;
238 |     use crate::CsvConfig;
239 | 
240 |     use super::*;
241 | 
242 |     #[test]
243 |     fn test_scan_projection() -> Result<()> {
244 |         // construct
245 |         let mut catalog = Catalog::default();
246 |         catalog.add_csv_table("t1", "data/test_data.csv", CsvConfig::default())?;
247 |         let source = catalog.get_table_df("t1")?;
248 |         let exprs = vec![
249 |             LogicalExpr::column(None, "id".to_string()),
250 |             LogicalExpr::column(None, "name".to_string()),
251 |             LogicalExpr::column(None, "age".to_string()),
252 |         ];
253 |         let logical_plan = source.project(exprs)?.logical_plan();
254 |         let physical_plan = QueryPlanner::create_physical_plan(&logical_plan)?;
255 |         let batches = physical_plan.execute()?;
256 | 
257 |         // test
258 |         assert_eq!(batches.len(), 1);
259 |         let batch = &batches[0];
260 | 
261 |         let id_excepted: ArrayRef = Arc::new(Int64Array::from(vec![1, 2, 4, 5, 6, 7, 8, 9]));
262 |         let name_excepted: ArrayRef = Arc::new(StringArray::from(vec![
263 |             "veeupup", "alex", "lynne", "alice", "bob", "jack", "cock", "primer",
264 |         ]));
265 |         let age_excepted: ArrayRef =
266 |             Arc::new(Int64Array::from(vec![23, 20, 18, 19, 20, 21, 22, 23]));
267 | 
268 |         assert_eq!(batch.column(0), &id_excepted);
269 |         assert_eq!(batch.column(1), &name_excepted);
270 |         assert_eq!(batch.column(2), &age_excepted);
271 | 
272 |         Ok(())
273 |     }
274 | }
275 | 


--------------------------------------------------------------------------------
/src/sql/mod.rs:
--------------------------------------------------------------------------------
1 | /*
2 |  * @Author: Veeupup
3 |  * @Date: 2022-05-13 19:35:34
4 |  * @Email: code@tanweime.com
5 | */
6 | 
7 | pub mod parser;
8 | pub mod planner;
9 | 


--------------------------------------------------------------------------------
/src/sql/parser.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-13 19:35:41
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | 
 7 | use sqlparser::{
 8 |     ast::Statement,
 9 |     dialect::GenericDialect,
10 |     parser::{Parser, ParserError},
11 |     tokenizer::Tokenizer,
12 | };
13 | 
14 | /// SQL Parser
15 | pub struct SQLParser;
16 | 
17 | impl SQLParser {
18 |     /// Parse the specified tokens and return statement
19 |     pub fn parse(sql: &str) -> Result<Statement, ParserError> {
20 |         let dialect = GenericDialect {};
21 |         let mut tokenizer = Tokenizer::new(&dialect, sql);
22 |         let tokens = tokenizer.tokenize()?;
23 |         let mut parser = Parser::new(tokens, &dialect);
24 |         parser.parse_statement()
25 |     }
26 | }
27 | 


--------------------------------------------------------------------------------
/src/utils.rs:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * @Author: Veeupup
 3 |  * @Date: 2022-05-14 17:33:26
 4 |  * @Email: code@tanweime.com
 5 | */
 6 | 
 7 | use crate::error::ErrorCode;
 8 | use crate::error::Result;
 9 | use arrow::{record_batch::RecordBatch, util::pretty};
10 | 
11 | pub fn print_result(result: &[RecordBatch]) -> Result<()> {
12 |     pretty::print_batches(result).map_err(ErrorCode::ArrowError)
13 | }
14 | 


--------------------------------------------------------------------------------