├── .github └── workflows │ ├── release.yml │ └── rust.yml ├── .gitignore ├── Cargo.lock ├── Cargo.toml ├── Changes.md ├── Documentation.md ├── LICENSE ├── README.md ├── argo-workflow ├── Cargo.toml └── src │ ├── lib.rs │ └── schema.rs ├── dashtool-common ├── Cargo.toml └── src │ └── lib.rs ├── dashtool ├── Cargo.toml └── src │ ├── build │ ├── build_dag.rs │ ├── mod.rs │ └── update_dag.rs │ ├── dag │ ├── identifier.rs │ └── mod.rs │ ├── error.rs │ ├── git.rs │ ├── lib.rs │ ├── main.rs │ ├── plugins │ ├── dashbook │ │ ├── mod.rs │ │ └── openid.rs │ ├── file │ │ └── mod.rs │ ├── mod.rs │ └── sql │ │ └── mod.rs │ ├── state │ └── mod.rs │ └── workflow │ ├── mod.rs │ └── template.rs ├── dashtool_concept.svg └── dashtool_dbt.svg /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: rust release action 2 | on: 3 | push: 4 | tags: 5 | - "v*" 6 | jobs: 7 | release: 8 | name: Release - ${{ matrix.platform.os-name }} 9 | strategy: 10 | matrix: 11 | platform: 12 | - os-name: Linux-x86_64 13 | runs-on: ubuntu-20.04 14 | target: x86_64-unknown-linux-gnu 15 | 16 | - os-name: Windows-x86_64 17 | runs-on: windows-latest 18 | target: x86_64-pc-windows-msvc 19 | 20 | - os-name: macOS-x86_64 21 | runs-on: macOS-latest 22 | target: x86_64-apple-darwin 23 | 24 | runs-on: ${{ matrix.platform.runs-on }} 25 | steps: 26 | - name: Checkout 27 | uses: actions/checkout@v3 28 | - name: Build binary 29 | uses: houseabsolute/actions-rust-cross@v0 30 | with: 31 | command: "build" 32 | target: ${{ matrix.platform.target }} 33 | args: "--locked --release" 34 | working-directory: dashtool 35 | - name: Publish artifacts and release 36 | uses: houseabsolute/actions-rust-release@v0 37 | with: 38 | executable-name: dashtool 39 | target: ${{ matrix.platform.target }} 40 | extra-files: "" 41 | -------------------------------------------------------------------------------- /.github/workflows/rust.yml: -------------------------------------------------------------------------------- 1 | name: Rust 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | env: 10 | CARGO_TERM_COLOR: always 11 | 12 | jobs: 13 | build: 14 | strategy: 15 | matrix: 16 | runner: [ubuntu-latest, windows-latest, macos-latest] 17 | 18 | runs-on: ${{ matrix.runner }} 19 | 20 | steps: 21 | - uses: actions/checkout@v3 22 | - name: Build 23 | run: cargo build --verbose 24 | - name: Run tests 25 | run: cargo test --verbose 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target 2 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | members = ["dashtool", "dashtool-common", "argo-workflow"] 3 | 4 | resolver = "2" 5 | 6 | [workspace.dependencies] 7 | serde = { version = "1", features = ["derive"] } 8 | serde_json = "1" 9 | 10 | [patch.crates-io] 11 | object_store = { version = "0.11.1", git = "https://github.com/apache/arrow-rs", rev = "c60ce14" } 12 | 13 | -------------------------------------------------------------------------------- /Changes.md: -------------------------------------------------------------------------------- 1 | # v0.1.0 2 | - initial release 3 | -------------------------------------------------------------------------------- /Documentation.md: 
-------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | ## Commands 4 | 5 | ### Build 6 | 7 | The `build` command analyzes all `.sql` files in the subdirectories of the current directory and creates the corresponding Iceberg Materialized Views in the catalog. 8 | 9 | ```shell 10 | dashtool build 11 | ``` 12 | 13 | ### Create Workflow 14 | 15 | The `workflow` command creates a lineage DAG based on the `.sql` files and constructs an Argo workflow based on it. It stores the Workflow configuration file in `argo/workflow.yaml`. 16 | 17 | ```shell 18 | dashtool workflow 19 | ``` 20 | 21 | ### Apply Workflow to the Kubernetes cluster 22 | 23 | To apply the latest version of the workflow to the Kubernetes cluster, run the following command: 24 | 25 | ```shell 26 | kubectl apply -f argo/workflow.yaml 27 | ``` 28 | 29 | ## Configuration 30 | 31 | Dashtool uses the `dashtool.json` file to store connection and authentication parameters for the current project. 32 | It uses a plugin system to support different Iceberg catalogs and cloud providers. The "plugin" field specifies which plugin to use. 33 | 34 | | Field | Type | Description | 35 | | --- | --- | --- | 36 | | **plugin** | String | Name of the plugin. Can be: "sql" | 37 | 38 | ### Sql plugin 39 | 40 | The configuration file for the Sql plugin has two sections, one for the Iceberg catalog and one for the cloud provider. 41 | 42 | #### Sql catalog 43 | 44 | | Field | Type | Description | 45 | | --- | --- | --- | 46 | | **catalogUrl** | String | Connection string to the database. Substitutes any variable $VAR with the corresponding environment variable. For example: "postgres://username:$PASSWORD@host:5432/database" | 47 | | **bucket** | String | The bucket to store the data in. For example: "s3://bucket" | 48 | | **secrets** | "{ String: { String: String }}" | A nested map that maps a Kubernetes secret name to a map from a secret key to the environment variable. Defines which secrets should be injected into the containers. For example: { "postgres-secret": { "password": "POSTGRES_PASSWORD" } } | 49 | | **env** | "{ String : String }" | A map from environment variable name to value. For example: { "POSTGRES_PASSWORD": "postgres" } | 50 | 51 | ### Object Storage 52 | 53 | #### AWS S3 54 | 55 | | Field | Type | Description | 56 | | --- | --- | --- | 57 | | **awsAccessKeyId** | String | AWS_ACCESS_KEY_ID | 58 | | **awsSecretAccessKey** | String | AWS_SECRET_ACCESS_KEY | 59 | | **awsRegion** | String | AWS_REGION | 60 | | **awsEndpoint** | String | AWS_ENDPOINT | 61 | | **awsAllowHttp** | String | Allow an HTTP connection | 62 | 63 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | GNU AFFERO GENERAL PUBLIC LICENSE 2 | Version 3, 19 November 2007 3 | 4 | Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/> 5 | Everyone is permitted to copy and distribute verbatim copies 6 | of this license document, but changing it is not allowed. 7 | 8 | Preamble 9 | 10 | The GNU Affero General Public License is a free, copyleft license for 11 | software and other kinds of works, specifically designed to ensure 12 | cooperation with the community in the case of network server software. 13 | 14 | The licenses for most software and other practical works are designed 15 | to take away your freedom to share and change the works.
By contrast, 16 | our General Public Licenses are intended to guarantee your freedom to 17 | share and change all versions of a program--to make sure it remains free 18 | software for all its users. 19 | 20 | When we speak of free software, we are referring to freedom, not 21 | price. Our General Public Licenses are designed to make sure that you 22 | have the freedom to distribute copies of free software (and charge for 23 | them if you wish), that you receive source code or can get it if you 24 | want it, that you can change the software or use pieces of it in new 25 | free programs, and that you know you can do these things. 26 | 27 | Developers that use our General Public Licenses protect your rights 28 | with two steps: (1) assert copyright on the software, and (2) offer 29 | you this License which gives you legal permission to copy, distribute 30 | and/or modify the software. 31 | 32 | A secondary benefit of defending all users' freedom is that 33 | improvements made in alternate versions of the program, if they 34 | receive widespread use, become available for other developers to 35 | incorporate. Many developers of free software are heartened and 36 | encouraged by the resulting cooperation. However, in the case of 37 | software used on network servers, this result may fail to come about. 38 | The GNU General Public License permits making a modified version and 39 | letting the public access it on a server without ever releasing its 40 | source code to the public. 41 | 42 | The GNU Affero General Public License is designed specifically to 43 | ensure that, in such cases, the modified source code becomes available 44 | to the community. It requires the operator of a network server to 45 | provide the source code of the modified version running there to the 46 | users of that server. Therefore, public use of a modified version, on 47 | a publicly accessible server, gives the public access to the source 48 | code of the modified version. 49 | 50 | An older license, called the Affero General Public License and 51 | published by Affero, was designed to accomplish similar goals. This is 52 | a different license, not a version of the Affero GPL, but Affero has 53 | released a new version of the Affero GPL which permits relicensing under 54 | this license. 55 | 56 | The precise terms and conditions for copying, distribution and 57 | modification follow. 58 | 59 | TERMS AND CONDITIONS 60 | 61 | 0. Definitions. 62 | 63 | "This License" refers to version 3 of the GNU Affero General Public License. 64 | 65 | "Copyright" also means copyright-like laws that apply to other kinds of 66 | works, such as semiconductor masks. 67 | 68 | "The Program" refers to any copyrightable work licensed under this 69 | License. Each licensee is addressed as "you". "Licensees" and 70 | "recipients" may be individuals or organizations. 71 | 72 | To "modify" a work means to copy from or adapt all or part of the work 73 | in a fashion requiring copyright permission, other than the making of an 74 | exact copy. The resulting work is called a "modified version" of the 75 | earlier work or a work "based on" the earlier work. 76 | 77 | A "covered work" means either the unmodified Program or a work based 78 | on the Program. 79 | 80 | To "propagate" a work means to do anything with it that, without 81 | permission, would make you directly or secondarily liable for 82 | infringement under applicable copyright law, except executing it on a 83 | computer or modifying a private copy. 
Propagation includes copying, 84 | distribution (with or without modification), making available to the 85 | public, and in some countries other activities as well. 86 | 87 | To "convey" a work means any kind of propagation that enables other 88 | parties to make or receive copies. Mere interaction with a user through 89 | a computer network, with no transfer of a copy, is not conveying. 90 | 91 | An interactive user interface displays "Appropriate Legal Notices" 92 | to the extent that it includes a convenient and prominently visible 93 | feature that (1) displays an appropriate copyright notice, and (2) 94 | tells the user that there is no warranty for the work (except to the 95 | extent that warranties are provided), that licensees may convey the 96 | work under this License, and how to view a copy of this License. If 97 | the interface presents a list of user commands or options, such as a 98 | menu, a prominent item in the list meets this criterion. 99 | 100 | 1. Source Code. 101 | 102 | The "source code" for a work means the preferred form of the work 103 | for making modifications to it. "Object code" means any non-source 104 | form of a work. 105 | 106 | A "Standard Interface" means an interface that either is an official 107 | standard defined by a recognized standards body, or, in the case of 108 | interfaces specified for a particular programming language, one that 109 | is widely used among developers working in that language. 110 | 111 | The "System Libraries" of an executable work include anything, other 112 | than the work as a whole, that (a) is included in the normal form of 113 | packaging a Major Component, but which is not part of that Major 114 | Component, and (b) serves only to enable use of the work with that 115 | Major Component, or to implement a Standard Interface for which an 116 | implementation is available to the public in source code form. A 117 | "Major Component", in this context, means a major essential component 118 | (kernel, window system, and so on) of the specific operating system 119 | (if any) on which the executable work runs, or a compiler used to 120 | produce the work, or an object code interpreter used to run it. 121 | 122 | The "Corresponding Source" for a work in object code form means all 123 | the source code needed to generate, install, and (for an executable 124 | work) run the object code and to modify the work, including scripts to 125 | control those activities. However, it does not include the work's 126 | System Libraries, or general-purpose tools or generally available free 127 | programs which are used unmodified in performing those activities but 128 | which are not part of the work. For example, Corresponding Source 129 | includes interface definition files associated with source files for 130 | the work, and the source code for shared libraries and dynamically 131 | linked subprograms that the work is specifically designed to require, 132 | such as by intimate data communication or control flow between those 133 | subprograms and other parts of the work. 134 | 135 | The Corresponding Source need not include anything that users 136 | can regenerate automatically from other parts of the Corresponding 137 | Source. 138 | 139 | The Corresponding Source for a work in source code form is that 140 | same work. 141 | 142 | 2. Basic Permissions. 143 | 144 | All rights granted under this License are granted for the term of 145 | copyright on the Program, and are irrevocable provided the stated 146 | conditions are met. 
This License explicitly affirms your unlimited 147 | permission to run the unmodified Program. The output from running a 148 | covered work is covered by this License only if the output, given its 149 | content, constitutes a covered work. This License acknowledges your 150 | rights of fair use or other equivalent, as provided by copyright law. 151 | 152 | You may make, run and propagate covered works that you do not 153 | convey, without conditions so long as your license otherwise remains 154 | in force. You may convey covered works to others for the sole purpose 155 | of having them make modifications exclusively for you, or provide you 156 | with facilities for running those works, provided that you comply with 157 | the terms of this License in conveying all material for which you do 158 | not control copyright. Those thus making or running the covered works 159 | for you must do so exclusively on your behalf, under your direction 160 | and control, on terms that prohibit them from making any copies of 161 | your copyrighted material outside their relationship with you. 162 | 163 | Conveying under any other circumstances is permitted solely under 164 | the conditions stated below. Sublicensing is not allowed; section 10 165 | makes it unnecessary. 166 | 167 | 3. Protecting Users' Legal Rights From Anti-Circumvention Law. 168 | 169 | No covered work shall be deemed part of an effective technological 170 | measure under any applicable law fulfilling obligations under article 171 | 11 of the WIPO copyright treaty adopted on 20 December 1996, or 172 | similar laws prohibiting or restricting circumvention of such 173 | measures. 174 | 175 | When you convey a covered work, you waive any legal power to forbid 176 | circumvention of technological measures to the extent such circumvention 177 | is effected by exercising rights under this License with respect to 178 | the covered work, and you disclaim any intention to limit operation or 179 | modification of the work as a means of enforcing, against the work's 180 | users, your or third parties' legal rights to forbid circumvention of 181 | technological measures. 182 | 183 | 4. Conveying Verbatim Copies. 184 | 185 | You may convey verbatim copies of the Program's source code as you 186 | receive it, in any medium, provided that you conspicuously and 187 | appropriately publish on each copy an appropriate copyright notice; 188 | keep intact all notices stating that this License and any 189 | non-permissive terms added in accord with section 7 apply to the code; 190 | keep intact all notices of the absence of any warranty; and give all 191 | recipients a copy of this License along with the Program. 192 | 193 | You may charge any price or no price for each copy that you convey, 194 | and you may offer support or warranty protection for a fee. 195 | 196 | 5. Conveying Modified Source Versions. 197 | 198 | You may convey a work based on the Program, or the modifications to 199 | produce it from the Program, in the form of source code under the 200 | terms of section 4, provided that you also meet all of these conditions: 201 | 202 | a) The work must carry prominent notices stating that you modified 203 | it, and giving a relevant date. 204 | 205 | b) The work must carry prominent notices stating that it is 206 | released under this License and any conditions added under section 207 | 7. This requirement modifies the requirement in section 4 to 208 | "keep intact all notices". 
209 | 210 | c) You must license the entire work, as a whole, under this 211 | License to anyone who comes into possession of a copy. This 212 | License will therefore apply, along with any applicable section 7 213 | additional terms, to the whole of the work, and all its parts, 214 | regardless of how they are packaged. This License gives no 215 | permission to license the work in any other way, but it does not 216 | invalidate such permission if you have separately received it. 217 | 218 | d) If the work has interactive user interfaces, each must display 219 | Appropriate Legal Notices; however, if the Program has interactive 220 | interfaces that do not display Appropriate Legal Notices, your 221 | work need not make them do so. 222 | 223 | A compilation of a covered work with other separate and independent 224 | works, which are not by their nature extensions of the covered work, 225 | and which are not combined with it such as to form a larger program, 226 | in or on a volume of a storage or distribution medium, is called an 227 | "aggregate" if the compilation and its resulting copyright are not 228 | used to limit the access or legal rights of the compilation's users 229 | beyond what the individual works permit. Inclusion of a covered work 230 | in an aggregate does not cause this License to apply to the other 231 | parts of the aggregate. 232 | 233 | 6. Conveying Non-Source Forms. 234 | 235 | You may convey a covered work in object code form under the terms 236 | of sections 4 and 5, provided that you also convey the 237 | machine-readable Corresponding Source under the terms of this License, 238 | in one of these ways: 239 | 240 | a) Convey the object code in, or embodied in, a physical product 241 | (including a physical distribution medium), accompanied by the 242 | Corresponding Source fixed on a durable physical medium 243 | customarily used for software interchange. 244 | 245 | b) Convey the object code in, or embodied in, a physical product 246 | (including a physical distribution medium), accompanied by a 247 | written offer, valid for at least three years and valid for as 248 | long as you offer spare parts or customer support for that product 249 | model, to give anyone who possesses the object code either (1) a 250 | copy of the Corresponding Source for all the software in the 251 | product that is covered by this License, on a durable physical 252 | medium customarily used for software interchange, for a price no 253 | more than your reasonable cost of physically performing this 254 | conveying of source, or (2) access to copy the 255 | Corresponding Source from a network server at no charge. 256 | 257 | c) Convey individual copies of the object code with a copy of the 258 | written offer to provide the Corresponding Source. This 259 | alternative is allowed only occasionally and noncommercially, and 260 | only if you received the object code with such an offer, in accord 261 | with subsection 6b. 262 | 263 | d) Convey the object code by offering access from a designated 264 | place (gratis or for a charge), and offer equivalent access to the 265 | Corresponding Source in the same way through the same place at no 266 | further charge. You need not require recipients to copy the 267 | Corresponding Source along with the object code. 
If the place to 268 | copy the object code is a network server, the Corresponding Source 269 | may be on a different server (operated by you or a third party) 270 | that supports equivalent copying facilities, provided you maintain 271 | clear directions next to the object code saying where to find the 272 | Corresponding Source. Regardless of what server hosts the 273 | Corresponding Source, you remain obligated to ensure that it is 274 | available for as long as needed to satisfy these requirements. 275 | 276 | e) Convey the object code using peer-to-peer transmission, provided 277 | you inform other peers where the object code and Corresponding 278 | Source of the work are being offered to the general public at no 279 | charge under subsection 6d. 280 | 281 | A separable portion of the object code, whose source code is excluded 282 | from the Corresponding Source as a System Library, need not be 283 | included in conveying the object code work. 284 | 285 | A "User Product" is either (1) a "consumer product", which means any 286 | tangible personal property which is normally used for personal, family, 287 | or household purposes, or (2) anything designed or sold for incorporation 288 | into a dwelling. In determining whether a product is a consumer product, 289 | doubtful cases shall be resolved in favor of coverage. For a particular 290 | product received by a particular user, "normally used" refers to a 291 | typical or common use of that class of product, regardless of the status 292 | of the particular user or of the way in which the particular user 293 | actually uses, or expects or is expected to use, the product. A product 294 | is a consumer product regardless of whether the product has substantial 295 | commercial, industrial or non-consumer uses, unless such uses represent 296 | the only significant mode of use of the product. 297 | 298 | "Installation Information" for a User Product means any methods, 299 | procedures, authorization keys, or other information required to install 300 | and execute modified versions of a covered work in that User Product from 301 | a modified version of its Corresponding Source. The information must 302 | suffice to ensure that the continued functioning of the modified object 303 | code is in no case prevented or interfered with solely because 304 | modification has been made. 305 | 306 | If you convey an object code work under this section in, or with, or 307 | specifically for use in, a User Product, and the conveying occurs as 308 | part of a transaction in which the right of possession and use of the 309 | User Product is transferred to the recipient in perpetuity or for a 310 | fixed term (regardless of how the transaction is characterized), the 311 | Corresponding Source conveyed under this section must be accompanied 312 | by the Installation Information. But this requirement does not apply 313 | if neither you nor any third party retains the ability to install 314 | modified object code on the User Product (for example, the work has 315 | been installed in ROM). 316 | 317 | The requirement to provide Installation Information does not include a 318 | requirement to continue to provide support service, warranty, or updates 319 | for a work that has been modified or installed by the recipient, or for 320 | the User Product in which it has been modified or installed. 
Access to a 321 | network may be denied when the modification itself materially and 322 | adversely affects the operation of the network or violates the rules and 323 | protocols for communication across the network. 324 | 325 | Corresponding Source conveyed, and Installation Information provided, 326 | in accord with this section must be in a format that is publicly 327 | documented (and with an implementation available to the public in 328 | source code form), and must require no special password or key for 329 | unpacking, reading or copying. 330 | 331 | 7. Additional Terms. 332 | 333 | "Additional permissions" are terms that supplement the terms of this 334 | License by making exceptions from one or more of its conditions. 335 | Additional permissions that are applicable to the entire Program shall 336 | be treated as though they were included in this License, to the extent 337 | that they are valid under applicable law. If additional permissions 338 | apply only to part of the Program, that part may be used separately 339 | under those permissions, but the entire Program remains governed by 340 | this License without regard to the additional permissions. 341 | 342 | When you convey a copy of a covered work, you may at your option 343 | remove any additional permissions from that copy, or from any part of 344 | it. (Additional permissions may be written to require their own 345 | removal in certain cases when you modify the work.) You may place 346 | additional permissions on material, added by you to a covered work, 347 | for which you have or can give appropriate copyright permission. 348 | 349 | Notwithstanding any other provision of this License, for material you 350 | add to a covered work, you may (if authorized by the copyright holders of 351 | that material) supplement the terms of this License with terms: 352 | 353 | a) Disclaiming warranty or limiting liability differently from the 354 | terms of sections 15 and 16 of this License; or 355 | 356 | b) Requiring preservation of specified reasonable legal notices or 357 | author attributions in that material or in the Appropriate Legal 358 | Notices displayed by works containing it; or 359 | 360 | c) Prohibiting misrepresentation of the origin of that material, or 361 | requiring that modified versions of such material be marked in 362 | reasonable ways as different from the original version; or 363 | 364 | d) Limiting the use for publicity purposes of names of licensors or 365 | authors of the material; or 366 | 367 | e) Declining to grant rights under trademark law for use of some 368 | trade names, trademarks, or service marks; or 369 | 370 | f) Requiring indemnification of licensors and authors of that 371 | material by anyone who conveys the material (or modified versions of 372 | it) with contractual assumptions of liability to the recipient, for 373 | any liability that these contractual assumptions directly impose on 374 | those licensors and authors. 375 | 376 | All other non-permissive additional terms are considered "further 377 | restrictions" within the meaning of section 10. If the Program as you 378 | received it, or any part of it, contains a notice stating that it is 379 | governed by this License along with a term that is a further 380 | restriction, you may remove that term. 
If a license document contains 381 | a further restriction but permits relicensing or conveying under this 382 | License, you may add to a covered work material governed by the terms 383 | of that license document, provided that the further restriction does 384 | not survive such relicensing or conveying. 385 | 386 | If you add terms to a covered work in accord with this section, you 387 | must place, in the relevant source files, a statement of the 388 | additional terms that apply to those files, or a notice indicating 389 | where to find the applicable terms. 390 | 391 | Additional terms, permissive or non-permissive, may be stated in the 392 | form of a separately written license, or stated as exceptions; 393 | the above requirements apply either way. 394 | 395 | 8. Termination. 396 | 397 | You may not propagate or modify a covered work except as expressly 398 | provided under this License. Any attempt otherwise to propagate or 399 | modify it is void, and will automatically terminate your rights under 400 | this License (including any patent licenses granted under the third 401 | paragraph of section 11). 402 | 403 | However, if you cease all violation of this License, then your 404 | license from a particular copyright holder is reinstated (a) 405 | provisionally, unless and until the copyright holder explicitly and 406 | finally terminates your license, and (b) permanently, if the copyright 407 | holder fails to notify you of the violation by some reasonable means 408 | prior to 60 days after the cessation. 409 | 410 | Moreover, your license from a particular copyright holder is 411 | reinstated permanently if the copyright holder notifies you of the 412 | violation by some reasonable means, this is the first time you have 413 | received notice of violation of this License (for any work) from that 414 | copyright holder, and you cure the violation prior to 30 days after 415 | your receipt of the notice. 416 | 417 | Termination of your rights under this section does not terminate the 418 | licenses of parties who have received copies or rights from you under 419 | this License. If your rights have been terminated and not permanently 420 | reinstated, you do not qualify to receive new licenses for the same 421 | material under section 10. 422 | 423 | 9. Acceptance Not Required for Having Copies. 424 | 425 | You are not required to accept this License in order to receive or 426 | run a copy of the Program. Ancillary propagation of a covered work 427 | occurring solely as a consequence of using peer-to-peer transmission 428 | to receive a copy likewise does not require acceptance. However, 429 | nothing other than this License grants you permission to propagate or 430 | modify any covered work. These actions infringe copyright if you do 431 | not accept this License. Therefore, by modifying or propagating a 432 | covered work, you indicate your acceptance of this License to do so. 433 | 434 | 10. Automatic Licensing of Downstream Recipients. 435 | 436 | Each time you convey a covered work, the recipient automatically 437 | receives a license from the original licensors, to run, modify and 438 | propagate that work, subject to this License. You are not responsible 439 | for enforcing compliance by third parties with this License. 440 | 441 | An "entity transaction" is a transaction transferring control of an 442 | organization, or substantially all assets of one, or subdividing an 443 | organization, or merging organizations. 
If propagation of a covered 444 | work results from an entity transaction, each party to that 445 | transaction who receives a copy of the work also receives whatever 446 | licenses to the work the party's predecessor in interest had or could 447 | give under the previous paragraph, plus a right to possession of the 448 | Corresponding Source of the work from the predecessor in interest, if 449 | the predecessor has it or can get it with reasonable efforts. 450 | 451 | You may not impose any further restrictions on the exercise of the 452 | rights granted or affirmed under this License. For example, you may 453 | not impose a license fee, royalty, or other charge for exercise of 454 | rights granted under this License, and you may not initiate litigation 455 | (including a cross-claim or counterclaim in a lawsuit) alleging that 456 | any patent claim is infringed by making, using, selling, offering for 457 | sale, or importing the Program or any portion of it. 458 | 459 | 11. Patents. 460 | 461 | A "contributor" is a copyright holder who authorizes use under this 462 | License of the Program or a work on which the Program is based. The 463 | work thus licensed is called the contributor's "contributor version". 464 | 465 | A contributor's "essential patent claims" are all patent claims 466 | owned or controlled by the contributor, whether already acquired or 467 | hereafter acquired, that would be infringed by some manner, permitted 468 | by this License, of making, using, or selling its contributor version, 469 | but do not include claims that would be infringed only as a 470 | consequence of further modification of the contributor version. For 471 | purposes of this definition, "control" includes the right to grant 472 | patent sublicenses in a manner consistent with the requirements of 473 | this License. 474 | 475 | Each contributor grants you a non-exclusive, worldwide, royalty-free 476 | patent license under the contributor's essential patent claims, to 477 | make, use, sell, offer for sale, import and otherwise run, modify and 478 | propagate the contents of its contributor version. 479 | 480 | In the following three paragraphs, a "patent license" is any express 481 | agreement or commitment, however denominated, not to enforce a patent 482 | (such as an express permission to practice a patent or covenant not to 483 | sue for patent infringement). To "grant" such a patent license to a 484 | party means to make such an agreement or commitment not to enforce a 485 | patent against the party. 486 | 487 | If you convey a covered work, knowingly relying on a patent license, 488 | and the Corresponding Source of the work is not available for anyone 489 | to copy, free of charge and under the terms of this License, through a 490 | publicly available network server or other readily accessible means, 491 | then you must either (1) cause the Corresponding Source to be so 492 | available, or (2) arrange to deprive yourself of the benefit of the 493 | patent license for this particular work, or (3) arrange, in a manner 494 | consistent with the requirements of this License, to extend the patent 495 | license to downstream recipients. "Knowingly relying" means you have 496 | actual knowledge that, but for the patent license, your conveying the 497 | covered work in a country, or your recipient's use of the covered work 498 | in a country, would infringe one or more identifiable patents in that 499 | country that you have reason to believe are valid. 
500 | 501 | If, pursuant to or in connection with a single transaction or 502 | arrangement, you convey, or propagate by procuring conveyance of, a 503 | covered work, and grant a patent license to some of the parties 504 | receiving the covered work authorizing them to use, propagate, modify 505 | or convey a specific copy of the covered work, then the patent license 506 | you grant is automatically extended to all recipients of the covered 507 | work and works based on it. 508 | 509 | A patent license is "discriminatory" if it does not include within 510 | the scope of its coverage, prohibits the exercise of, or is 511 | conditioned on the non-exercise of one or more of the rights that are 512 | specifically granted under this License. You may not convey a covered 513 | work if you are a party to an arrangement with a third party that is 514 | in the business of distributing software, under which you make payment 515 | to the third party based on the extent of your activity of conveying 516 | the work, and under which the third party grants, to any of the 517 | parties who would receive the covered work from you, a discriminatory 518 | patent license (a) in connection with copies of the covered work 519 | conveyed by you (or copies made from those copies), or (b) primarily 520 | for and in connection with specific products or compilations that 521 | contain the covered work, unless you entered into that arrangement, 522 | or that patent license was granted, prior to 28 March 2007. 523 | 524 | Nothing in this License shall be construed as excluding or limiting 525 | any implied license or other defenses to infringement that may 526 | otherwise be available to you under applicable patent law. 527 | 528 | 12. No Surrender of Others' Freedom. 529 | 530 | If conditions are imposed on you (whether by court order, agreement or 531 | otherwise) that contradict the conditions of this License, they do not 532 | excuse you from the conditions of this License. If you cannot convey a 533 | covered work so as to satisfy simultaneously your obligations under this 534 | License and any other pertinent obligations, then as a consequence you may 535 | not convey it at all. For example, if you agree to terms that obligate you 536 | to collect a royalty for further conveying from those to whom you convey 537 | the Program, the only way you could satisfy both those terms and this 538 | License would be to refrain entirely from conveying the Program. 539 | 540 | 13. Remote Network Interaction; Use with the GNU General Public License. 541 | 542 | Notwithstanding any other provision of this License, if you modify the 543 | Program, your modified version must prominently offer all users 544 | interacting with it remotely through a computer network (if your version 545 | supports such interaction) an opportunity to receive the Corresponding 546 | Source of your version by providing access to the Corresponding Source 547 | from a network server at no charge, through some standard or customary 548 | means of facilitating copying of software. This Corresponding Source 549 | shall include the Corresponding Source for any work covered by version 3 550 | of the GNU General Public License that is incorporated pursuant to the 551 | following paragraph. 552 | 553 | Notwithstanding any other provision of this License, you have 554 | permission to link or combine any covered work with a work licensed 555 | under version 3 of the GNU General Public License into a single 556 | combined work, and to convey the resulting work. 
The terms of this 557 | License will continue to apply to the part which is the covered work, 558 | but the work with which it is combined will remain governed by version 559 | 3 of the GNU General Public License. 560 | 561 | 14. Revised Versions of this License. 562 | 563 | The Free Software Foundation may publish revised and/or new versions of 564 | the GNU Affero General Public License from time to time. Such new versions 565 | will be similar in spirit to the present version, but may differ in detail to 566 | address new problems or concerns. 567 | 568 | Each version is given a distinguishing version number. If the 569 | Program specifies that a certain numbered version of the GNU Affero General 570 | Public License "or any later version" applies to it, you have the 571 | option of following the terms and conditions either of that numbered 572 | version or of any later version published by the Free Software 573 | Foundation. If the Program does not specify a version number of the 574 | GNU Affero General Public License, you may choose any version ever published 575 | by the Free Software Foundation. 576 | 577 | If the Program specifies that a proxy can decide which future 578 | versions of the GNU Affero General Public License can be used, that proxy's 579 | public statement of acceptance of a version permanently authorizes you 580 | to choose that version for the Program. 581 | 582 | Later license versions may give you additional or different 583 | permissions. However, no additional obligations are imposed on any 584 | author or copyright holder as a result of your choosing to follow a 585 | later version. 586 | 587 | 15. Disclaimer of Warranty. 588 | 589 | THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY 590 | APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT 591 | HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY 592 | OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, 593 | THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 594 | PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM 595 | IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF 596 | ALL NECESSARY SERVICING, REPAIR OR CORRECTION. 597 | 598 | 16. Limitation of Liability. 599 | 600 | IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 601 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS 602 | THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY 603 | GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE 604 | USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF 605 | DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD 606 | PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), 607 | EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF 608 | SUCH DAMAGES. 609 | 610 | 17. Interpretation of Sections 15 and 16. 611 | 612 | If the disclaimer of warranty and limitation of liability provided 613 | above cannot be given local legal effect according to their terms, 614 | reviewing courts shall apply local law that most closely approximates 615 | an absolute waiver of all civil liability in connection with the 616 | Program, unless a warranty or assumption of liability accompanies a 617 | copy of the Program in return for a fee. 
618 | 619 | END OF TERMS AND CONDITIONS 620 | 621 | How to Apply These Terms to Your New Programs 622 | 623 | If you develop a new program, and you want it to be of the greatest 624 | possible use to the public, the best way to achieve this is to make it 625 | free software which everyone can redistribute and change under these terms. 626 | 627 | To do so, attach the following notices to the program. It is safest 628 | to attach them to the start of each source file to most effectively 629 | state the exclusion of warranty; and each file should have at least 630 | the "copyright" line and a pointer to where the full notice is found. 631 | 632 | <one line to give the program's name and a brief idea of what it does.> 633 | Copyright (C) <year> <name of author> 634 | 635 | This program is free software: you can redistribute it and/or modify 636 | it under the terms of the GNU Affero General Public License as published 637 | by the Free Software Foundation, either version 3 of the License, or 638 | (at your option) any later version. 639 | 640 | This program is distributed in the hope that it will be useful, 641 | but WITHOUT ANY WARRANTY; without even the implied warranty of 642 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 643 | GNU Affero General Public License for more details. 644 | 645 | You should have received a copy of the GNU Affero General Public License 646 | along with this program. If not, see <https://www.gnu.org/licenses/>. 647 | 648 | Also add information on how to contact you by electronic and paper mail. 649 | 650 | If your software can interact with users remotely through a computer 651 | network, you should also make sure that it provides a way for users to 652 | get its source. For example, if your program is a web application, its 653 | interface could display a "Source" link that leads users to an archive 654 | of the code. There are many ways you could offer source, and different 655 | solutions will be better for different programs; see section 13 for the 656 | specific requirements. 657 | 658 | You should also get your employer (if you work as a programmer) or school, 659 | if any, to sign a "copyright disclaimer" for the program, if necessary. 660 | For more information on this, and how to apply and follow the GNU AGPL, see 661 | <https://www.gnu.org/licenses/>. 662 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Dashtool 2 | 3 | Dashtool is a Lakehouse build tool that builds Iceberg tables from declarative SQL statements and generates Kubernetes workflows to keep these tables up-to-date. 4 | It handles Ingestion, Transformation and Orchestration. 5 | 6 | ## Features 7 | 8 | - Uses declarative SQL select statements as input 9 | - Git-inspired data version control 10 | - Interoperability through the [Apache Iceberg](https://iceberg.apache.org/) Table format 11 | - Data ingestion through [Airbyte](https://github.com/dashbook/airbyte) 12 | - Data processing based on [Datafusion](https://arrow.apache.org/datafusion/) 13 | - Workflow orchestration in Kubernetes through [Argo Workflows](https://argoproj.github.io/workflows/) 14 | 15 | ## How it works 16 | 17 | ![dashtool](dashtool_concept.svg) 18 | 19 | Dashtool constructs a DAG by analyzing all `.sql` files in a directory structure and creates an Iceberg Materialized View for every file. 20 | Each file contains a `SELECT` statement for the Materialized View definition. 21 | Additionally, dashtool can use the DAG to create an Argo Workflow that refreshes the Materialized Views.
22 | During the workflow execution Argo starts Docker containers that run Datafusion to perform the refresh operation. 23 | 24 | ## Examples 25 | 26 | - [Postgres example](https://killercoda.com/dashbook/scenario/dashtool-postgres) 27 | - [Mysql example](https://killercoda.com/dashbook/scenario/dashtool-mysql) 28 | - [Kafka example](https://killercoda.com/dashbook/scenario/dashtool-kafka) 29 | 30 | ## Usage 31 | 32 | Dashtool goes through a build, a workflow and an apply step to turn the declarative input files into an automatically refreshing data pipeline. This is shown in the following diagram: 33 | 34 | ```mermaid 35 | graph LR 36 | git[Files in 37 | git repo] 38 | catalog[Iceberg tables 39 | in catalog] 40 | workflows[Argo 41 | workflow] 42 | data[Data] 43 | git -->|dashtool build|catalog 44 | catalog -->|dashtool workflow|workflows 45 | workflows -->|dashtool apply|data 46 | ``` 47 | 48 | Check out the [Documentation](Documentation.md) for a detailed description. 49 | 50 | ### Build 51 | 52 | The `build` command analyzes all `.sql` files in the subdirectories of the current directory and creates the corresponding Iceberg Materialized Views in the catalog. 53 | 54 | ```shell 55 | dashtool build 56 | ``` 57 | 58 | ### Workflow 59 | 60 | The `workflow` command creates a lineage DAG based on the `.sql` files and constructs an Argo workflow based on it. It stores the Workflow configuration file in `argo/workflow.yaml`. 61 | 62 | ```shell 63 | dashtool workflow 64 | ``` 65 | 66 | ### Apply 67 | 68 | To apply the latest version of the workflow to the Kubernetes cluster run the following command: 69 | 70 | ```shell 71 | dashtool apply 72 | ``` 73 | 74 | ## Installation 75 | 76 | ### Cargo 77 | 78 | ```shell 79 | cargo install dashtool 80 | ``` 81 | -------------------------------------------------------------------------------- /argo-workflow/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "argo-workflow" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | description = "Argo Workflow Models" 7 | 8 | license = "Apache-2.0" 9 | 10 | repository = "https://github.com/dashbook/dashtool" 11 | 12 | [dependencies] 13 | chrono = { version = "0.4", features = ["serde"] } 14 | derive_builder = "0.12.0" 15 | serde = { workspace = true } 16 | serde_json = { workspace = true } 17 | -------------------------------------------------------------------------------- /argo-workflow/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod schema; 2 | -------------------------------------------------------------------------------- /dashtool-common/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "dashtool-common" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | description = "Common functionality for dashtool" 7 | 8 | license = "Apache-2.0" 9 | 10 | repository = "https://github.com/dashbook/dashtool" 11 | 12 | [dependencies] 13 | serde = { workspace = true, features = ["derive"] } 14 | -------------------------------------------------------------------------------- /dashtool-common/src/lib.rs: -------------------------------------------------------------------------------- 1 | use serde::{Deserialize, Serialize}; 2 | 3 | #[derive(Debug, Serialize, Deserialize, Clone)] 4 | #[serde( 5 | from = "Option", 6 | into = "Option" 7 | )] 8 | pub enum ObjectStoreConfig { 9 | S3(S3Config), 10 | Memory, 11 | } 12 | 13 | /// Config for the 
s3 object-store. The secret_access_key is read from the environment variable 14 | /// AWS_SECRET_ACCESS_KEY 15 | #[derive(Debug, Serialize, Deserialize, Clone)] 16 | #[serde(rename_all = "camelCase")] 17 | pub struct S3Config { 18 | pub aws_access_key_id: String, 19 | pub aws_region: String, 20 | pub aws_secret_access_key: Option, 21 | pub aws_endpoint: Option, 22 | pub aws_allow_http: Option, 23 | } 24 | 25 | impl From> for ObjectStoreConfig { 26 | fn from(value: Option) -> Self { 27 | match value { 28 | None => ObjectStoreConfig::Memory, 29 | Some(value) => match value { 30 | ObjectStoreConfigSerde::S3(value) => ObjectStoreConfig::S3(value), 31 | }, 32 | } 33 | } 34 | } 35 | 36 | impl From for Option { 37 | fn from(value: ObjectStoreConfig) -> Self { 38 | match value { 39 | ObjectStoreConfig::Memory => None, 40 | ObjectStoreConfig::S3(value) => Some(ObjectStoreConfigSerde::S3(value)), 41 | } 42 | } 43 | } 44 | 45 | #[derive(Debug, Serialize, Deserialize)] 46 | #[serde(untagged)] 47 | pub enum ObjectStoreConfigSerde { 48 | S3(S3Config), 49 | } 50 | -------------------------------------------------------------------------------- /dashtool/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "dashtool" 3 | version = "0.4.0" 4 | edition = "2021" 5 | 6 | description = "Lakehouse build tool" 7 | 8 | license = "Apache-2.0" 9 | 10 | repository = "https://github.com/dashbook/dashtool" 11 | 12 | [dependencies] 13 | argo-workflow = "0.1" 14 | clap = { version = "=4.4", features = ["derive"] } 15 | dashtool-common = "0.1" 16 | datafusion-iceberg-sql = { git = "https://github.com/jankaul/iceberg-rust", version = "0.5", branch = "copy-if-not-exists" } 17 | datafusion-sql = "43.0.0" 18 | datafusion-common = "43.0.0" 19 | iceberg-rust = { git = "https://github.com/jankaul/iceberg-rust", version = "0.5", branch = "copy-if-not-exists" } 20 | iceberg-rest-catalog = { git = "https://github.com/jankaul/iceberg-rust", version = "0.5", branch = "copy-if-not-exists" } 21 | iceberg-sql-catalog = { git = "https://github.com/jankaul/iceberg-rust", version = "0.5", branch = "copy-if-not-exists" } 22 | iceberg-file-catalog = { git = "https://github.com/jankaul/iceberg-rust", version = "0.5", branch = "copy-if-not-exists" } 23 | iceberg-glue-catalog = { git = "https://github.com/jankaul/iceberg-rust", version = "0.5", branch = "copy-if-not-exists" } 24 | serde = { workspace = true } 25 | serde_json = { workspace = true } 26 | sqlparser = { version = "0.51", features = ["visitor"] } 27 | sqlx = { version = "0.8", features = ["runtime-tokio", "tls-rustls", "any", "sqlite", "postgres", "mysql"], default-features = false } 28 | object_store = "0.11.1" 29 | openidconnect = "3.3" 30 | openssl = { version = "0.10", features = ["vendored"] } 31 | tokio = { version = "1", features = ["macros", "rt-multi-thread"] } 32 | futures = "0.3.30" 33 | thiserror = "1" 34 | reqwest = { version = "0.11", features = ["rustls"], default-features = false } 35 | url = "2.4" 36 | anyhow = "1" 37 | petgraph = { version = "0.6", features = ["serde-1"] } 38 | arrow-schema = "53.2.0" 39 | serde_yaml = "0.9.25" 40 | async-trait = "0.1" 41 | derive-getters = "0.3.0" 42 | k8s-openapi = { version = "0.20.0", features = ["v1_24"] } 43 | shellexpand = "3.1.0" 44 | gix = { version = "0.58.0", features = ["serde","verbose-object-parsing-errors"] } 45 | itertools = "0.13.0" 46 | 47 | [target.'cfg(not(target_arch = "wasm32"))'.dependencies] 48 | dirs = { version = "5.0.1" } 49 | 50 | 
[target.'cfg(target_os = "macos")'.dependencies] 51 | getrandom = "=0.2.10" 52 | 53 | [dev-dependencies] 54 | tempfile = "3.8.1" 55 | 56 | -------------------------------------------------------------------------------- /dashtool/src/build/build_dag.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::HashMap, fs, path::Path, sync::Arc}; 2 | 3 | use datafusion_iceberg_sql::schema::get_schema; 4 | use futures::{channel::mpsc::unbounded, stream, SinkExt, StreamExt, TryStreamExt}; 5 | use gix::diff::tree::recorder::Change; 6 | use iceberg_rust::catalog::namespace::Namespace; 7 | use iceberg_rust::spec::view_metadata::{Version, ViewRepresentation}; 8 | use iceberg_rust::{ 9 | catalog::tabular::Tabular as IcebergTabular, error::Error as IcebergError, sql::find_relations, 10 | }; 11 | use iceberg_rust::{ 12 | materialized_view::MaterializedView, 13 | spec::{ 14 | schema::SchemaBuilder, 15 | snapshot::{SnapshotReference, SnapshotRetention}, 16 | view_metadata::REF_PREFIX, 17 | }, 18 | }; 19 | use itertools::Itertools; 20 | 21 | use crate::{ 22 | dag::{identifier::FullIdentifier, Dag, Ingest, IngestConfig, Node, Tabular}, 23 | error::Error, 24 | plugins::Plugin, 25 | }; 26 | 27 | // Converts commits into a dag and performs the according transactions on the tables 28 | pub(super) async fn build_dag<'repo>( 29 | dag: &mut Dag, 30 | diff: &[Change], 31 | plugin: Arc, 32 | branch: &str, 33 | merged_branch: Option<&str>, 34 | ) -> Result<(), Error> { 35 | let (tabular_sender, tabular_reciever) = unbounded(); 36 | let (ingest_sender, ingest_reciever) = unbounded(); 37 | 38 | let catalog_list = plugin.catalog_list().await?; 39 | 40 | stream::iter(diff) 41 | .map(Ok::<_, Error>) 42 | .try_for_each(|change| { 43 | let plugin = plugin.clone(); 44 | let catalog_list = catalog_list.clone(); 45 | 46 | let mut tabular_sender = tabular_sender.clone(); 47 | let mut ingest_sender = ingest_sender.clone(); 48 | async move { 49 | let path = match &change { 50 | Change::Addition { 51 | entry_mode: _, 52 | oid: _, 53 | path, 54 | } => path, 55 | Change::Deletion { 56 | entry_mode: _, 57 | oid: _, 58 | path, 59 | } => path, 60 | Change::Modification { 61 | previous_entry_mode: _, 62 | previous_oid: _, 63 | entry_mode: _, 64 | oid: _, 65 | path, 66 | } => path, 67 | } 68 | .to_string(); 69 | let is_tabular = if path.ends_with(".sql") { 70 | Some(true) 71 | } else if path.ends_with(".ingest.json") { 72 | Some(false) 73 | } else { 74 | None 75 | }; 76 | match is_tabular { 77 | Some(true) => { 78 | let identifier = FullIdentifier::parse_path(&Path::new(&path))?; 79 | 80 | let catalog_name = identifier.catalog_name(); 81 | 82 | let catalog = 83 | catalog_list 84 | .catalog(catalog_name) 85 | .ok_or(IcebergError::NotFound( 86 | "Catalog".to_string(), 87 | catalog_name.to_string(), 88 | ))?; 89 | 90 | let sql = fs::read_to_string(&path)?; 91 | let relations = find_relations(&sql)?; 92 | 93 | match (change, merged_branch) { 94 | ( 95 | Change::Addition { 96 | entry_mode: _, 97 | oid: _, 98 | path: _, 99 | } 100 | | Change::Modification { 101 | previous_entry_mode: _, 102 | previous_oid: _, 103 | entry_mode: _, 104 | oid: _, 105 | path: _, 106 | }, 107 | Some(merged_branch), 108 | ) => { 109 | let tabular = 110 | catalog.load_tabular(&identifier.identifier()?).await?; 111 | let mut matview = 112 | if let IcebergTabular::MaterializedView(matview) = tabular { 113 | Ok(matview) 114 | } else { 115 | Err(Error::Iceberg(IcebergError::Type( 116 | "Entity".to_string(), 117 | 
"not materialized view".to_string(), 118 | ))) 119 | }?; 120 | let version_id = matview.metadata().current_version_id; 121 | let mut storage_table = matview.storage_table().await?; 122 | let snapshot_id = *storage_table 123 | .metadata() 124 | .current_snapshot(Some(merged_branch))? 125 | .ok_or(Error::Iceberg(IcebergError::NotFound( 126 | "Snapshot id".to_string(), 127 | "branch".to_string() + merged_branch, 128 | )))? 129 | .snapshot_id(); 130 | storage_table 131 | .new_transaction(Some(merged_branch)) 132 | .set_snapshot_ref(( 133 | branch.to_string(), 134 | SnapshotReference { 135 | snapshot_id, 136 | retention: SnapshotRetention::default(), 137 | }, 138 | )) 139 | .commit() 140 | .await?; 141 | matview 142 | .new_transaction(Some(merged_branch)) 143 | .update_properties(vec![( 144 | REF_PREFIX.to_string() + branch, 145 | version_id.to_string(), 146 | )]) 147 | .commit() 148 | .await?; 149 | } 150 | ( 151 | Change::Addition { 152 | entry_mode: _, 153 | oid: _, 154 | path: _, 155 | }, 156 | None, 157 | ) => { 158 | let relations = relations 159 | .iter() 160 | .map(|x| { 161 | FullIdentifier::parse(x).map(|y| { 162 | ( 163 | y.catalog_name().clone(), 164 | y.namespace_name().clone(), 165 | y.table_name().clone(), 166 | ) 167 | }) 168 | }) 169 | .collect::, _>>()?; 170 | 171 | let fields = get_schema( 172 | &sql, 173 | &relations, 174 | catalog_list.clone(), 175 | Some(branch), 176 | ) 177 | .await?; 178 | 179 | let schema = SchemaBuilder::default() 180 | .with_fields(fields) 181 | .build() 182 | .map_err(iceberg_rust::spec::error::Error::from)?; 183 | 184 | let base_path = plugin 185 | .bucket(catalog_name) 186 | .trim_end_matches('/') 187 | .to_string() 188 | + "/" 189 | + path 190 | .as_str() 191 | .trim_start_matches('/') 192 | .trim_end_matches(".sql"); 193 | 194 | MaterializedView::builder() 195 | .with_name(identifier.table_name()) 196 | .with_location(&base_path) 197 | .with_schema(schema) 198 | .with_view_version( 199 | Version::builder() 200 | .with_representation(ViewRepresentation::sql( 201 | &sql, None, 202 | )) 203 | .build() 204 | .map_err(IcebergError::from)?, 205 | ) 206 | .with_property((REF_PREFIX.to_string() + branch, 0.to_string())) 207 | .build(identifier.identifier()?.namespace(), catalog) 208 | .await?; 209 | } 210 | _ => (), 211 | } 212 | 213 | tabular_sender.send((identifier, (sql, relations))).await?; 214 | } 215 | Some(false) => { 216 | let ingest_json: IngestConfig = 217 | serde_json::from_str(&fs::read_to_string(&path)?)?; 218 | let source_json = ingest_json.source; 219 | let mut destination_json = ingest_json.destination; 220 | 221 | let image = ingest_json.image; 222 | 223 | destination_json["branch"] = branch.to_string().into(); 224 | 225 | let (catalog_name, namespace_name) = 226 | Path::new(&path).iter().next_tuple().ok_or(Error::Text( 227 | "File path doesn't contain catalog name and namespace".to_owned(), 228 | ))?; 229 | 230 | let catalog = catalog_list 231 | .catalog( 232 | catalog_name 233 | .to_str() 234 | .ok_or(Error::Text("Catalog name not present.".to_owned()))?, 235 | ) 236 | .ok_or(Error::Text(format!("Catalog not found in catalog list",)))?; 237 | 238 | let namespace = Namespace::try_new(&[namespace_name 239 | .to_str() 240 | .ok_or(Error::Text("Catalog name not a string".to_owned()))? 
241 | .to_string()])?; 242 | 243 | let identifiers = catalog.list_tabulars(&namespace).await?; 244 | 245 | for identifier in identifiers.iter() { 246 | if let Some(merged_branch) = merged_branch { 247 | let tabular = catalog.clone().load_tabular(&identifier).await?; 248 | 249 | let mut table = if let IcebergTabular::Table(table) = tabular { 250 | Ok(table) 251 | } else { 252 | Err(Error::Iceberg(IcebergError::Type( 253 | "Entity".to_string(), 254 | "not table".to_string(), 255 | ))) 256 | }?; 257 | let snapshot_id = *table 258 | .metadata() 259 | .current_snapshot(Some(merged_branch))? 260 | .ok_or(Error::Iceberg(IcebergError::NotFound( 261 | "Snapshot id".to_string(), 262 | "branch".to_string() + merged_branch, 263 | )))? 264 | .snapshot_id(); 265 | table 266 | .new_transaction(Some(merged_branch)) 267 | .set_snapshot_ref(( 268 | branch.to_string(), 269 | SnapshotReference { 270 | snapshot_id, 271 | retention: SnapshotRetention::default(), 272 | }, 273 | )) 274 | .commit() 275 | .await?; 276 | } 277 | } 278 | let ingest_key = FullIdentifier::parse_path(Path::new(&path))?; 279 | ingest_sender 280 | .send(Node::Ingest(Ingest::new( 281 | &ingest_key, 282 | &image, 283 | source_json.clone(), 284 | destination_json, 285 | branch, 286 | ))) 287 | .await?; 288 | } 289 | _ => (), 290 | }; 291 | Ok(()) 292 | } 293 | }) 294 | .await?; 295 | 296 | tabular_sender.close_channel(); 297 | ingest_sender.close_channel(); 298 | 299 | let ingests = ingest_reciever.collect::>().await; 300 | 301 | let tabs: HashMap)> = 302 | HashMap::from_iter(tabular_reciever.collect::>().await); 303 | 304 | for ingest in ingests { 305 | dag.add_node(ingest)?; 306 | } 307 | 308 | for (node, (sql, _)) in &tabs { 309 | dag.add_node(Node::Tabular(Tabular::new(node, branch, sql)))?; 310 | } 311 | 312 | for (node, (_, children)) in tabs { 313 | for child in children { 314 | dag.add_edge(&node.to_string(), &child)? 
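// Note on the edge wiring above: `Dag::add_edge` (see dag/mod.rs) resolves `child` before
// adding anything. If the "catalog.namespace" prefix of `child` has registered ingest nodes,
// an edge is drawn from every ingest in that namespace to this tabular; otherwise a single
// edge is drawn from the node named `child`. Edges therefore point from source to consumer.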
315 | } 316 | } 317 | 318 | Ok(()) 319 | } 320 | 321 | #[cfg(test)] 322 | mod tests { 323 | use core::panic; 324 | use std::{ 325 | env, 326 | fs::{self, File}, 327 | io::Write, 328 | path::Path, 329 | sync::Arc, 330 | }; 331 | 332 | use gix::{diff::tree::recorder::Change, objs::tree::EntryKind, ObjectId}; 333 | use iceberg_rust::{ 334 | catalog::bucket::ObjectStoreBuilder, 335 | spec::{ 336 | partition::{PartitionField, PartitionSpecBuilder, Transform}, 337 | schema::SchemaBuilder, 338 | types::{PrimitiveType, StructField, StructTypeBuilder, Type}, 339 | }, 340 | }; 341 | use iceberg_rust::{ 342 | catalog::{identifier::Identifier, tabular::Tabular, CatalogList}, 343 | table::Table, 344 | }; 345 | use iceberg_sql_catalog::SqlCatalogList; 346 | 347 | use tempfile::TempDir; 348 | 349 | use crate::{ 350 | build::{build_dag::build_dag, update_dag::update_dag}, 351 | dag::Node, 352 | plugins::{sql::SqlPlugin, Config, Plugin}, 353 | }; 354 | 355 | #[tokio::test] 356 | async fn add_ingest() { 357 | let temp_dir = TempDir::new().unwrap(); 358 | 359 | env::set_current_dir(temp_dir.path()).expect("Failed to set current work dir"); 360 | std::env::current_dir().expect("Failed to sync workdir"); 361 | 362 | let bronze_path = temp_dir.path().join("bronze"); 363 | fs::create_dir(&bronze_path).expect("Failed to create directory"); 364 | 365 | let bronze_inventory_path = bronze_path.join("inventory"); 366 | fs::create_dir(&bronze_inventory_path).expect("Failed to create directory"); 367 | 368 | let config_path = bronze_inventory_path.join(Path::new("postgres.ingest.json")); 369 | File::create(&config_path) 370 | .expect("Failed to create file") 371 | .write_all( 372 | r#" 373 | { 374 | "image":"dashbook/source-postgres:sql", 375 | "source":{ 376 | "host": "172.17.0.2", 377 | "port": 5432, 378 | "user": "postgres", 379 | "password": "$POSTGRES_PASSWORD", 380 | "dbname": "postgres", 381 | "filter_schemas": "inventory", 382 | "default_replication_method": "LOG_BASED" 383 | }, 384 | "destination":{ 385 | "catalog": "https://api.dashbook.dev/nessie/cat-1w0qookj", 386 | "bucket": "s3://example-postgres/", 387 | "access_token": "$ACCESS_TOKEN", 388 | "id_token": "$ID_TOKEN" 389 | } 390 | } 391 | "# 392 | .as_bytes(), 393 | ) 394 | .expect("Failed to write to file"); 395 | 396 | let changes = vec![Change::Addition { 397 | entry_mode: EntryKind::Tree 398 | .try_into() 399 | .expect("Failed to create git entry"), 400 | oid: ObjectId::null(gix::hash::Kind::Sha1), 401 | path: config_path.to_str().unwrap().into(), 402 | }]; 403 | 404 | let mut dag = update_dag(&vec![], None, "main").expect("Failed to create dag"); 405 | 406 | let config_json = r#" 407 | { 408 | "plugin": "sql", 409 | "catalogUrl": "sqlite://", 410 | "secrets": {}, 411 | "bucket": "" 412 | } 413 | "#; 414 | 415 | let config = match serde_json::from_str(&config_json).expect("Failed to parse sql config") { 416 | Config::Sql(config) => config, 417 | Config::File(_) => panic!("Wrong config"), 418 | }; 419 | 420 | let plugin = Arc::new( 421 | SqlPlugin::new(config) 422 | .await 423 | .expect("Failed to create plugin"), 424 | ); 425 | 426 | build_dag(&mut dag, &changes, plugin, "main", None) 427 | .await 428 | .expect("Failed to build dag"); 429 | 430 | assert_eq!(dag.ingests.len(), 1); 431 | assert_eq!(dag.map.len(), 1); 432 | 433 | let orders = dag 434 | .ingests 435 | .get("bronze.inventory") 436 | .expect("Failed to get graph index"); 437 | 438 | assert_eq!(orders[0], "bronze.inventory.postgres"); 439 | 440 | let ingest = 
&dag.dag[*dag.map.get(&orders[0]).expect("Failed to get graph index")]; 441 | 442 | let Node::Ingest(ingest) = ingest else { 443 | panic!("Node is not an ingest") 444 | }; 445 | 446 | assert_eq!(ingest.image, "dashbook/source-postgres:sql"); 447 | assert_eq!(ingest.destination["branch"], "main"); 448 | } 449 | 450 | #[tokio::test] 451 | async fn add_tabular() { 452 | let temp_dir = TempDir::new().unwrap(); 453 | 454 | env::set_current_dir(temp_dir.path()).expect("Failed to set current work dir"); 455 | std::env::current_dir().expect("Failed to sync workdir"); 456 | 457 | let bronze_path = temp_dir.path().join("bronze"); 458 | fs::create_dir(&bronze_path).expect("Failed to create directory"); 459 | 460 | let bronze_inventory_path = bronze_path.join("inventory"); 461 | fs::create_dir(&bronze_inventory_path).expect("Failed to create directory"); 462 | 463 | let config_path = bronze_inventory_path.join(Path::new("postgres.ingest.json")); 464 | File::create(&config_path) 465 | .expect("Failed to create file") 466 | .write_all( 467 | r#" 468 | { 469 | "image":"dashbook/source-postgres:sql", 470 | "source":{ 471 | "host": "172.17.0.2", 472 | "port": 5432, 473 | "user": "postgres", 474 | "password": "$POSTGRES_PASSWORD", 475 | "dbname": "postgres", 476 | "filter_schemas": "inventory", 477 | "default_replication_method": "LOG_BASED" 478 | }, 479 | "destination":{ 480 | "catalog": "https://api.dashbook.dev/nessie/cat-1w0qookj", 481 | "bucket": "s3://example-postgres/", 482 | "access_token": "$ACCESS_TOKEN", 483 | "id_token": "$ID_TOKEN" 484 | } 485 | } 486 | "# 487 | .as_bytes(), 488 | ) 489 | .expect("Failed to write to file"); 490 | 491 | let silver_path = temp_dir.path().join("silver"); 492 | fs::create_dir(&silver_path).expect("Failed to create directory"); 493 | 494 | let silver_inventory_path = silver_path.join("inventory"); 495 | fs::create_dir(&silver_inventory_path).expect("Failed to create directory"); 496 | 497 | let tabular_path = silver_inventory_path.join(Path::new("factOrder.sql")); 498 | File::create(&tabular_path) 499 | .expect("Failed to create file") 500 | .write_all( 501 | r#" 502 | select product_id, sum(amount) from bronze.inventory.orders group by product_id; 503 | "# 504 | .as_bytes(), 505 | ) 506 | .expect("Failed to write to file"); 507 | 508 | let changes = vec![ 509 | Change::Addition { 510 | entry_mode: EntryKind::Tree 511 | .try_into() 512 | .expect("Failed to create git entry"), 513 | oid: ObjectId::null(gix::hash::Kind::Sha1), 514 | path: config_path.to_str().unwrap().into(), 515 | }, 516 | Change::Addition { 517 | entry_mode: EntryKind::Tree 518 | .try_into() 519 | .expect("Failed to create git entry"), 520 | oid: ObjectId::null(gix::hash::Kind::Sha1), 521 | path: tabular_path.to_str().unwrap().into(), 522 | }, 523 | ]; 524 | 525 | let mut dag = update_dag(&vec![], None, "main").expect("Failed to create dag"); 526 | 527 | let object_store = ObjectStoreBuilder::memory(); 528 | 529 | let catalog_list = Arc::new( 530 | SqlCatalogList::new("sqlite://", object_store) 531 | .await 532 | .expect("Failed to create catalog list"), 533 | ); 534 | 535 | let bronze_catalog = catalog_list 536 | .catalog("bronze") 537 | .expect("Failed to create catalog"); 538 | 539 | let schema = SchemaBuilder::default() 540 | .with_fields( 541 | StructTypeBuilder::default() 542 | .with_struct_field(StructField { 543 | id: 1, 544 | name: "id".to_string(), 545 | required: true, 546 | field_type: Type::Primitive(PrimitiveType::Long), 547 | doc: None, 548 | }) 549 | .with_struct_field(StructField { 
550 | id: 2, 551 | name: "customer_id".to_string(), 552 | required: true, 553 | field_type: Type::Primitive(PrimitiveType::Long), 554 | doc: None, 555 | }) 556 | .with_struct_field(StructField { 557 | id: 3, 558 | name: "product_id".to_string(), 559 | required: true, 560 | field_type: Type::Primitive(PrimitiveType::Long), 561 | doc: None, 562 | }) 563 | .with_struct_field(StructField { 564 | id: 4, 565 | name: "date".to_string(), 566 | required: true, 567 | field_type: Type::Primitive(PrimitiveType::Date), 568 | doc: None, 569 | }) 570 | .with_struct_field(StructField { 571 | id: 5, 572 | name: "amount".to_string(), 573 | required: true, 574 | field_type: Type::Primitive(PrimitiveType::Int), 575 | doc: None, 576 | }) 577 | .build() 578 | .unwrap(), 579 | ) 580 | .build() 581 | .unwrap(); 582 | 583 | let partition_spec = PartitionSpecBuilder::default() 584 | .with_spec_id(1) 585 | .with_partition_field(PartitionField::new(4, 1000, "day", Transform::Day)) 586 | .build() 587 | .expect("Failed to create partition spec"); 588 | 589 | Table::builder() 590 | .with_name("orders") 591 | .with_location("/bronze/inventory/orders") 592 | .with_schema(schema) 593 | .with_partition_spec(partition_spec) 594 | .build(&["inventory".to_owned()], bronze_catalog.clone()) 595 | .await 596 | .expect("Failed to creat table."); 597 | 598 | let config = match serde_json::from_str( 599 | r#" 600 | { 601 | "plugin": "sql", 602 | "catalogUrl": "sqlite://", 603 | "bucket": "test", 604 | "secrets": {} 605 | } 606 | "#, 607 | ) 608 | .expect("Failed to parse sql config") 609 | { 610 | Config::Sql(config) => config, 611 | Config::File(_) => panic!("Wrong config"), 612 | }; 613 | 614 | let plugin = Arc::new( 615 | SqlPlugin::new_with_catalog(config, catalog_list).expect("Failed to create plugin"), 616 | ); 617 | 618 | build_dag(&mut dag, &changes, plugin.clone(), "main", None) 619 | .await 620 | .expect("Failed to build dag"); 621 | 622 | assert_eq!(dag.ingests.len(), 1); 623 | assert_eq!(dag.map.len(), 2); 624 | 625 | let tabular = &dag.dag[*dag 626 | .map 627 | .get("silver.inventory.factOrder") 628 | .expect("Failed to get graph index")]; 629 | 630 | let Node::Tabular(tabular) = tabular else { 631 | panic!("Node is not a tabular") 632 | }; 633 | 634 | assert_eq!( 635 | &tabular.identifier.to_string(), 636 | "silver.inventory.factOrder" 637 | ); 638 | 639 | let catalog_list = plugin 640 | .catalog_list() 641 | .await 642 | .expect("Failed to get catalog list."); 643 | 644 | let catalog = catalog_list 645 | .catalog("silver") 646 | .expect("Failed to get catalog"); 647 | 648 | let matview = if let Tabular::MaterializedView(matview) = catalog 649 | .load_tabular( 650 | &Identifier::parse("inventory.factOrder", None) 651 | .expect("Failed to parse identifier"), 652 | ) 653 | .await 654 | .expect("Failed to load table") 655 | { 656 | matview 657 | } else { 658 | panic!("Result is not a materialized view") 659 | }; 660 | 661 | let schema = matview 662 | .metadata() 663 | .current_schema(None) 664 | .expect("Failed to get schema"); 665 | 666 | assert_eq!(schema.fields()[0].name, "product_id"); 667 | assert_eq!(schema.fields()[0].field_type.to_string(), "long"); 668 | 669 | assert_eq!( 670 | schema.fields()[1].name, 671 | "sum(bronze.inventory.orders.amount)" 672 | ); 673 | assert_eq!(schema.fields()[1].field_type.to_string(), "long"); 674 | } 675 | } 676 | -------------------------------------------------------------------------------- /dashtool/src/build/mod.rs: 
-------------------------------------------------------------------------------- 1 | use std::{fs, sync::Arc}; 2 | 3 | use crate::{ 4 | error::Error, 5 | git::{branch, diff}, 6 | plugins::Plugin, 7 | state::State, 8 | }; 9 | 10 | use self::{build_dag::build_dag, update_dag::update_dag}; 11 | 12 | mod build_dag; 13 | mod update_dag; 14 | 15 | pub async fn build(plugin: Arc) -> Result<(), Error> { 16 | let mut state: State = fs::read_to_string(".dashtool/state.json") 17 | .ok() 18 | .and_then(|x| serde_json::from_str(&x).ok()) 19 | .unwrap_or_default(); 20 | 21 | // Inspect git repo 22 | let repo = gix::discover(".")?; 23 | let db = gix::odb::at(".git/objects")?; 24 | 25 | // Name of the currently selected branch in the git repo 26 | let current_branch = branch(&repo)?; 27 | 28 | let current_id = repo 29 | .try_find_reference(¤t_branch)? 30 | .map(|x| x.target().into_owned().try_into_id()) 31 | .transpose() 32 | .unwrap(); 33 | 34 | let main_id = repo 35 | .try_find_reference("main")? 36 | .map(|x| x.target().into_owned().try_into_id()) 37 | .transpose() 38 | .unwrap(); 39 | 40 | // Id of the last commit on the current brranch that was built with dashtool 41 | let last_id = state.commits.get(¤t_branch).cloned(); 42 | 43 | // Id of the last commit on the main branch that was built with dashtool 44 | let last_main_id = state.commits.get("main").cloned(); 45 | 46 | // Check if dashtool built a branch with the same commit as the current main branch to see if the branch was merged 47 | let merged_branch = state 48 | .commits 49 | .iter() 50 | .find(|(k, v)| { 51 | if let Some(y) = &main_id { 52 | *v == y && *k != "main" 53 | } else { 54 | false 55 | } 56 | }) 57 | .map(|(k, _)| k) 58 | .cloned(); 59 | 60 | let last_main_diff = diff(&db, &None, &last_main_id)?; 61 | 62 | let mut dag = update_dag(&last_main_diff, None, "main")?; 63 | 64 | let main_diff = diff(&db, &last_main_id, &main_id)?; 65 | 66 | build_dag( 67 | &mut dag, 68 | &main_diff, 69 | plugin.clone(), 70 | "main", 71 | merged_branch.as_deref(), 72 | ) 73 | .await?; 74 | 75 | // Only apply other changes if not on main branch 76 | let dag = if current_id != main_id { 77 | let last_diff = diff(&db, &main_id, &last_id)?; 78 | 79 | let mut dag = update_dag(&last_diff, Some(dag), ¤t_branch)?; 80 | 81 | let diff = diff(&db, &last_id.or(main_id), ¤t_id)?; 82 | 83 | build_dag(&mut dag, &diff, plugin.clone(), ¤t_branch, None).await?; 84 | 85 | dag 86 | } else { 87 | dag 88 | }; 89 | 90 | let json = serde_json::to_string(&dag)?; 91 | 92 | fs::write( 93 | ".dashtool/dags/".to_string() + ¤t_branch + ".json", 94 | json, 95 | )?; 96 | 97 | if let Some(current_id) = current_id { 98 | state.commits.insert(current_branch, current_id); 99 | } 100 | 101 | if let Some(main_id) = main_id { 102 | state.commits.insert("main".to_owned(), main_id); 103 | } 104 | 105 | let state_json = serde_json::to_string(&state)?; 106 | 107 | fs::write(".dashtool/state.json", state_json)?; 108 | 109 | println!("Building successful."); 110 | 111 | Ok(()) 112 | } 113 | -------------------------------------------------------------------------------- /dashtool/src/build/update_dag.rs: -------------------------------------------------------------------------------- 1 | use std::{fs, path::Path}; 2 | 3 | use gix::diff::tree::recorder::Change; 4 | use iceberg_rust::sql::find_relations; 5 | 6 | use crate::{ 7 | dag::{identifier::FullIdentifier, Dag, Ingest, IngestConfig, Node, Tabular}, 8 | error::Error, 9 | }; 10 | 11 | // Converts the commits into a dag without performing any 
operations on the tables 12 | pub(super) fn update_dag(diff: &[Change], dag: Option, branch: &str) -> Result { 13 | let mut dag = dag.unwrap_or(Dag::new()); 14 | 15 | let mut ingests = Vec::new(); 16 | let mut tabulars = Vec::new(); 17 | 18 | for delta in diff { 19 | match delta { 20 | Change::Addition { 21 | entry_mode: _, 22 | oid: _, 23 | path, 24 | } => { 25 | if path.ends_with(b".sql") { 26 | tabulars.push(String::from_utf8(path.as_slice().to_owned())?) 27 | } else if path.ends_with(b".ingest.json") { 28 | ingests.push(String::from_utf8(path.as_slice().to_owned())?) 29 | }; 30 | } 31 | _ => (), 32 | } 33 | } 34 | 35 | for path in ingests { 36 | let identifier = FullIdentifier::parse_path(Path::new(&path))?; 37 | 38 | let ingest_json: IngestConfig = serde_json::from_str(&fs::read_to_string(&path)?)?; 39 | let source_json = ingest_json.source; 40 | let mut destination_json = ingest_json.destination; 41 | 42 | let image = ingest_json.image; 43 | 44 | destination_json["branch"] = branch.to_string().into(); 45 | 46 | dag.add_node(Node::Ingest(Ingest::new( 47 | &identifier, 48 | &image, 49 | source_json.clone(), 50 | destination_json, 51 | branch, 52 | )))?; 53 | } 54 | 55 | for path in tabulars { 56 | let sql = fs::read_to_string(&path)?; 57 | 58 | let children = find_relations(&sql)?; 59 | 60 | let identifier = FullIdentifier::parse_path(Path::new(&path))?; 61 | 62 | dag.add_node(Node::Tabular(Tabular::new(&identifier, branch, &sql)))?; 63 | 64 | for child in children { 65 | dag.add_edge(&identifier.to_string(), &child)? 66 | } 67 | } 68 | Ok(dag) 69 | } 70 | 71 | #[cfg(test)] 72 | mod tests { 73 | use std::{ 74 | env, 75 | fs::{self, File}, 76 | io::Write, 77 | path::Path, 78 | }; 79 | 80 | use gix::{diff::tree::recorder::Change, objs::tree::EntryKind, ObjectId}; 81 | use tempfile::TempDir; 82 | 83 | use crate::{build::update_dag::update_dag, dag::Node}; 84 | 85 | #[test] 86 | fn add_ingest() { 87 | let temp_dir = TempDir::new().unwrap(); 88 | 89 | env::set_current_dir(temp_dir.path()).expect("Failed to set current work dir"); 90 | std::env::current_dir().expect("Failed to sync workdir"); 91 | 92 | let bronze_path = temp_dir.path().join("bronze"); 93 | fs::create_dir(&bronze_path).expect("Failed to create directory"); 94 | 95 | let bronze_inventory_path = bronze_path.join("inventory"); 96 | fs::create_dir(&bronze_inventory_path).expect("Failed to create directory"); 97 | 98 | let config_path = bronze_inventory_path.join(Path::new("postgres.ingest.json")); 99 | File::create(&config_path) 100 | .expect("Failed to create file") 101 | .write_all( 102 | r#" 103 | { 104 | "image":"dashbook/source-postgres:sql", 105 | "source":{ 106 | "host": "172.17.0.2", 107 | "port": 5432, 108 | "user": "postgres", 109 | "password": "$POSTGRES_PASSWORD", 110 | "dbname": "postgres", 111 | "filter_schemas": "inventory", 112 | "default_replication_method": "LOG_BASED" 113 | }, 114 | "destination":{ 115 | "catalog": "https://api.dashbook.dev/nessie/cat-1w0qookj", 116 | "bucket": "s3://example-postgres/", 117 | "access_token": "$ACCESS_TOKEN", 118 | "id_token": "$ID_TOKEN" 119 | } 120 | } 121 | "# 122 | .as_bytes(), 123 | ) 124 | .expect("Failed to write to file"); 125 | 126 | let changes = vec![Change::Addition { 127 | entry_mode: EntryKind::Tree 128 | .try_into() 129 | .expect("Failed to create git entry"), 130 | oid: ObjectId::null(gix::hash::Kind::Sha1), 131 | path: config_path.to_str().unwrap().into(), 132 | }]; 133 | 134 | let dag = update_dag(&changes, None, "main").expect("Failed to create dag"); 135 | 
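        // Sketch of the expected outcome, matching the assertions below: the single
        // `.ingest.json` addition becomes one ingest node, so the DAG should hold
        // roughly `dag.ingests == { "bronze.inventory": ["bronze.inventory.postgres"] }`,
        // with `dag.map` mapping "bronze.inventory.postgres" to its graph index.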
136 | assert_eq!(dag.ingests.len(), 1); 137 | 138 | let orders = dag 139 | .ingests 140 | .get("bronze.inventory") 141 | .expect("Failed to get graph index"); 142 | 143 | assert_eq!(orders[0], "bronze.inventory.postgres"); 144 | 145 | let ingest = &dag.dag[*dag.map.get(&orders[0]).expect("Failed to get graph index")]; 146 | 147 | let Node::Ingest(ingest) = ingest else { 148 | panic!("Node is not an ingest") 149 | }; 150 | 151 | assert_eq!(ingest.image, "dashbook/source-postgres:sql") 152 | } 153 | 154 | #[test] 155 | fn add_tabular() { 156 | let temp_dir = TempDir::new().unwrap(); 157 | 158 | env::set_current_dir(temp_dir.path()).expect("Failed to set current work dir"); 159 | std::env::current_dir().expect("Failed to sync workdir"); 160 | 161 | let bronze_path = temp_dir.path().join("bronze"); 162 | fs::create_dir(&bronze_path).expect("Failed to create directory"); 163 | 164 | let bronze_inventory_path = bronze_path.join("inventory"); 165 | fs::create_dir(&bronze_inventory_path).expect("Failed to create directory"); 166 | 167 | let config_path = bronze_inventory_path.join(Path::new("postgres.ingest.json")); 168 | File::create(&config_path) 169 | .expect("Failed to create file") 170 | .write_all( 171 | r#" 172 | { 173 | "image":"dashbook/source-postgres:sql", 174 | "source":{ 175 | "host": "172.17.0.2", 176 | "port": 5432, 177 | "user": "postgres", 178 | "password": "$POSTGRES_PASSWORD", 179 | "dbname": "postgres", 180 | "filter_schemas": "inventory", 181 | "default_replication_method": "LOG_BASED" 182 | }, 183 | "destination":{ 184 | "catalog": "https://api.dashbook.dev/nessie/cat-1w0qookj", 185 | "bucket": "s3://example-postgres/", 186 | "access_token": "$ACCESS_TOKEN", 187 | "id_token": "$ID_TOKEN" 188 | } 189 | } 190 | "# 191 | .as_bytes(), 192 | ) 193 | .expect("Failed to write to file"); 194 | 195 | let silver_path = temp_dir.path().join("silver"); 196 | fs::create_dir(&silver_path).expect("Failed to create directory"); 197 | 198 | let silver_inventory_path = silver_path.join("inventory"); 199 | fs::create_dir(&silver_inventory_path).expect("Failed to create directory"); 200 | 201 | let tabular_path = silver_inventory_path.join(Path::new("factOrder.sql")); 202 | File::create(&tabular_path) 203 | .expect("Failed to create file") 204 | .write_all( 205 | r#" 206 | select order_id from bronze.inventory.orders; 207 | "# 208 | .as_bytes(), 209 | ) 210 | .expect("Failed to write to file"); 211 | 212 | let changes = vec![ 213 | Change::Addition { 214 | entry_mode: EntryKind::Tree 215 | .try_into() 216 | .expect("Failed to create git entry"), 217 | oid: ObjectId::null(gix::hash::Kind::Sha1), 218 | path: config_path.to_str().unwrap().into(), 219 | }, 220 | Change::Addition { 221 | entry_mode: EntryKind::Tree 222 | .try_into() 223 | .expect("Failed to create git entry"), 224 | oid: ObjectId::null(gix::hash::Kind::Sha1), 225 | path: tabular_path.to_str().unwrap().into(), 226 | }, 227 | ]; 228 | 229 | let dag = update_dag(&changes, None, "main").expect("Failed to create dag"); 230 | 231 | assert_eq!(dag.map.len(), 2); 232 | 233 | let tabular = &dag.dag[*dag 234 | .map 235 | .get("silver.inventory.factOrder") 236 | .expect("Failed to get graph index")]; 237 | 238 | let Node::Tabular(tabular) = tabular else { 239 | panic!("Node is not a tabular") 240 | }; 241 | 242 | assert_eq!( 243 | &tabular.identifier.to_string(), 244 | "silver.inventory.factOrder" 245 | ) 246 | } 247 | 248 | #[test] 249 | fn add_ingest_branch() { 250 | let temp_dir = TempDir::new().unwrap(); 251 | 252 | 
env::set_current_dir(temp_dir.path()).expect("Failed to set current work dir"); 253 | std::env::current_dir().expect("Failed to sync workdir"); 254 | 255 | let bronze_path = temp_dir.path().join("bronze"); 256 | fs::create_dir(&bronze_path).expect("Failed to create directory"); 257 | 258 | let bronze_inventory_path = bronze_path.join("inventory"); 259 | fs::create_dir(&bronze_inventory_path).expect("Failed to create directory"); 260 | 261 | let config_path = bronze_inventory_path.join(Path::new("postgres.ingest.json")); 262 | File::create(&config_path) 263 | .expect("Failed to create file") 264 | .write_all( 265 | r#" 266 | { 267 | "image":"dashbook/source-postgres:sql", 268 | "source":{ 269 | "host": "172.17.0.2", 270 | "port": 5432, 271 | "user": "postgres", 272 | "password": "$POSTGRES_PASSWORD", 273 | "dbname": "postgres", 274 | "filter_schemas": "inventory", 275 | "default_replication_method": "LOG_BASED" 276 | }, 277 | "destination":{ 278 | "catalog": "https://api.dashbook.dev/nessie/cat-1w0qookj", 279 | "bucket": "s3://example-postgres/", 280 | "access_token": "$ACCESS_TOKEN", 281 | "id_token": "$ID_TOKEN" 282 | } 283 | } 284 | "# 285 | .as_bytes(), 286 | ) 287 | .expect("Failed to write to file"); 288 | 289 | let changes = vec![Change::Addition { 290 | entry_mode: EntryKind::Tree 291 | .try_into() 292 | .expect("Failed to create git entry"), 293 | oid: ObjectId::null(gix::hash::Kind::Sha1), 294 | path: config_path.to_str().unwrap().into(), 295 | }]; 296 | 297 | let dag = update_dag(&changes, None, "expenditures").expect("Failed to create dag"); 298 | 299 | assert_eq!(dag.ingests.len(), 1); 300 | assert_eq!(dag.map.len(), 1); 301 | 302 | let orders = dag 303 | .ingests 304 | .get("bronze.inventory") 305 | .expect("Failed to get graph index"); 306 | 307 | assert_eq!(orders[0], "bronze.inventory.postgres"); 308 | 309 | let ingest = &dag.dag[*dag.map.get(&orders[0]).expect("Failed to get graph index")]; 310 | 311 | let Node::Ingest(ingest) = ingest else { 312 | panic!("Node is not an ingest") 313 | }; 314 | 315 | assert_eq!(ingest.image, "dashbook/source-postgres:sql") 316 | } 317 | } 318 | -------------------------------------------------------------------------------- /dashtool/src/dag/identifier.rs: -------------------------------------------------------------------------------- 1 | use std::{fmt::Display, path::Path}; 2 | 3 | use derive_getters::Getters; 4 | use iceberg_rust::catalog::identifier::Identifier; 5 | use serde::{Deserialize, Serialize}; 6 | 7 | use crate::error::Error; 8 | 9 | #[derive(Debug, Serialize, Deserialize, Clone, Hash, PartialEq, Eq, Getters)] 10 | pub(crate) struct FullIdentifier { 11 | catalog_name: String, 12 | namespace_name: String, 13 | table_name: String, 14 | } 15 | 16 | impl FullIdentifier { 17 | pub fn parse(input: &str) -> Result { 18 | let mut parts = input.split("."); 19 | let catalog_name = parts 20 | .next() 21 | .ok_or(Error::Text("Input is empty".to_string()))? 22 | .to_owned(); 23 | let namespace_name = parts 24 | .next() 25 | .ok_or(Error::Text(format!( 26 | "Identifier {} has only one part", 27 | input 28 | )))? 29 | .to_owned(); 30 | let table_name = parts 31 | .next() 32 | .ok_or(Error::Text(format!( 33 | "Identifier {} has only two parts", 34 | input 35 | )))? 
36 | .to_owned(); 37 | Ok(FullIdentifier { 38 | catalog_name, 39 | namespace_name, 40 | table_name, 41 | }) 42 | } 43 | 44 | pub(crate) fn parse_path(input: &Path) -> Result { 45 | let mut parts = input.iter().rev(); 46 | let table_name = parts 47 | .next() 48 | .ok_or(Error::Text(format!( 49 | "Identifier {:?} has only two parts", 50 | input 51 | )))? 52 | .to_str() 53 | .ok_or(Error::Text("Failed to convert OsStr".to_string()))? 54 | .trim_end_matches(".sql") 55 | .trim_end_matches(".ingest.json") 56 | .to_owned(); 57 | let namespace_name = parts 58 | .next() 59 | .ok_or(Error::Text(format!( 60 | "Identifier {:?} has only one part", 61 | input 62 | )))? 63 | .to_str() 64 | .ok_or(Error::Text("Failed to convert OsStr".to_string()))? 65 | .to_owned(); 66 | let catalog_name = parts 67 | .next() 68 | .ok_or(Error::Text("Input is empty".to_string()))? 69 | .to_str() 70 | .ok_or(Error::Text("Failed to convert OsStr".to_string()))? 71 | .to_owned(); 72 | Ok(FullIdentifier { 73 | catalog_name, 74 | namespace_name, 75 | table_name, 76 | }) 77 | } 78 | 79 | pub(crate) fn identifier(&self) -> Result { 80 | Ok(Identifier::try_new( 81 | &vec![self.namespace_name.clone(), self.table_name.clone()], 82 | None, 83 | )?) 84 | } 85 | } 86 | 87 | impl Display for FullIdentifier { 88 | fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { 89 | write!( 90 | f, 91 | "{}.{}.{}", 92 | self.catalog_name, self.namespace_name, self.table_name 93 | ) 94 | } 95 | } 96 | 97 | #[cfg(test)] 98 | mod tests { 99 | use std::path::Path; 100 | 101 | use super::FullIdentifier; 102 | 103 | #[test] 104 | fn parse() { 105 | let identifier = 106 | FullIdentifier::parse("bronze.inventory.orders").expect("Failed to parse identifier"); 107 | 108 | assert_eq!("bronze.inventory.orders", &identifier.to_string()); 109 | } 110 | 111 | #[test] 112 | fn parse_path_sql() { 113 | let identifier = FullIdentifier::parse_path(&Path::new("silver/inventory/factOrder.sql")) 114 | .expect("Failed to parse identifier"); 115 | 116 | assert_eq!("silver.inventory.factOrder", &identifier.to_string()); 117 | } 118 | } 119 | -------------------------------------------------------------------------------- /dashtool/src/dag/mod.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | collections::{hash_map::Entry, HashMap}, 3 | fs, 4 | }; 5 | 6 | use itertools::Itertools; 7 | use petgraph::stable_graph::{NodeIndex, StableDiGraph}; 8 | use serde::{Deserialize, Serialize}; 9 | use serde_json::Value as JsonValue; 10 | 11 | use crate::error::Error; 12 | 13 | use self::identifier::FullIdentifier; 14 | 15 | pub(crate) mod identifier; 16 | 17 | #[derive(Serialize, Deserialize, Debug)] 18 | pub enum Node { 19 | Tabular(Tabular), 20 | Ingest(Ingest), 21 | } 22 | 23 | impl Node { 24 | pub(crate) fn identifier(&self) -> &FullIdentifier { 25 | match self { 26 | Node::Ingest(ingest) => &ingest.identifier, 27 | Node::Tabular(tab) => &tab.identifier, 28 | } 29 | } 30 | } 31 | 32 | #[derive(Serialize, Deserialize, Debug)] 33 | pub struct Tabular { 34 | pub(crate) identifier: FullIdentifier, 35 | pub(crate) branch: String, 36 | pub(crate) query: String, 37 | } 38 | 39 | impl Tabular { 40 | pub(crate) fn new(identifier: &FullIdentifier, branch: &str, query: &String) -> Self { 41 | Self { 42 | identifier: identifier.clone(), 43 | branch: branch.to_owned(), 44 | query: query.to_owned(), 45 | } 46 | } 47 | } 48 | 49 | #[derive(Serialize, Deserialize, Debug)] 50 | pub struct Ingest { 51 | pub(crate) identifier: 
FullIdentifier, 52 | pub(crate) branch: String, 53 | pub(crate) image: String, 54 | pub(crate) source: JsonValue, 55 | pub(crate) destination: JsonValue, 56 | } 57 | 58 | impl Ingest { 59 | pub(crate) fn new( 60 | identifier: &FullIdentifier, 61 | image: &str, 62 | source: JsonValue, 63 | destination: JsonValue, 64 | branch: &str, 65 | ) -> Self { 66 | Self { 67 | identifier: identifier.clone(), 68 | branch: branch.to_owned(), 69 | image: image.to_owned(), 70 | source, 71 | destination, 72 | } 73 | } 74 | } 75 | 76 | #[derive(Serialize, Deserialize, Debug)] 77 | pub(crate) struct IngestConfig { 78 | pub(crate) image: String, 79 | pub(crate) source: JsonValue, 80 | pub(crate) destination: JsonValue, 81 | } 82 | 83 | #[derive(Serialize, Deserialize, Debug)] 84 | pub struct Dag { 85 | pub(crate) ingests: HashMap>, 86 | pub(crate) map: HashMap, 87 | pub(crate) dag: StableDiGraph, 88 | } 89 | 90 | impl Dag { 91 | pub fn new() -> Self { 92 | Self { 93 | ingests: HashMap::new(), 94 | map: HashMap::new(), 95 | dag: StableDiGraph::new(), 96 | } 97 | } 98 | } 99 | 100 | impl Dag { 101 | pub(crate) fn add_node(&mut self, node: Node) -> Result<(), Error> { 102 | let identifier = match &node { 103 | Node::Ingest(ingest) => { 104 | let identifier = ingest.identifier.clone(); 105 | self.ingests 106 | .entry(identifier.catalog_name().clone() + "." + identifier.namespace_name()) 107 | .and_modify(|x| x.push(identifier.to_string())) 108 | .or_insert(vec![identifier.to_string()]); 109 | identifier 110 | } 111 | Node::Tabular(tab) => tab.identifier.clone(), 112 | }; 113 | match self.map.entry(identifier.to_string()) { 114 | Entry::Vacant(entry) => { 115 | let idx = self.dag.add_node(node); 116 | entry.insert(idx); 117 | } 118 | Entry::Occupied(entry) => { 119 | let idx = entry.get(); 120 | self.dag[*idx] = node; 121 | } 122 | }; 123 | Ok(()) 124 | } 125 | 126 | pub(crate) fn add_edge(&mut self, a: &str, b: &str) -> Result<(), Error> { 127 | let a = self 128 | .map 129 | .get(a) 130 | .cloned() 131 | .ok_or(Error::Text(format!("Node {} not in graph.", a)))?; 132 | 133 | let bs = match self.ingests.get(&b.split(".").take(2).join(".")) { 134 | None => vec![self 135 | .map 136 | .get(b) 137 | .cloned() 138 | .ok_or(Error::Text(format!("Node {} not in graph.", b)))?], 139 | Some(ingests) => ingests 140 | .iter() 141 | .map(|ident| { 142 | self.map 143 | .get(ident) 144 | .cloned() 145 | .ok_or(Error::Text(format!("Node {} not in graph.", ident))) 146 | }) 147 | .collect::, Error>>()?, 148 | }; 149 | 150 | for b in bs { 151 | self.dag.add_edge(b, a, ()); 152 | } 153 | 154 | Ok(()) 155 | } 156 | } 157 | 158 | pub fn get_dag(branch: &str) -> Result { 159 | let path = ".dashtool/dags/".to_string() + branch + ".json"; 160 | let dag = if fs::metadata(&path).is_ok() { 161 | let json = fs::read_to_string(&path)?; 162 | let dag: Dag = serde_json::from_str(&json)?; 163 | dag 164 | } else { 165 | Dag { 166 | ingests: HashMap::new(), 167 | map: HashMap::new(), 168 | dag: StableDiGraph::new(), 169 | } 170 | }; 171 | Ok(dag) 172 | } 173 | -------------------------------------------------------------------------------- /dashtool/src/error.rs: -------------------------------------------------------------------------------- 1 | use thiserror::Error; 2 | 3 | #[derive(Error, Debug)] 4 | pub enum Error { 5 | #[error(transparent)] 6 | Anyhow(#[from] anyhow::Error), 7 | #[error(transparent)] 8 | SerdeJSON(#[from] serde_json::Error), 9 | #[error(transparent)] 10 | SerdeYAML(#[from] serde_yaml::Error), 11 | #[error(transparent)] 12 | 
IO(#[from] std::io::Error), 13 | #[error(transparent)] 14 | FuturesChannel(#[from] futures::channel::mpsc::SendError), 15 | #[error(transparent)] 16 | ObjectStore(#[from] object_store::Error), 17 | #[error(transparent)] 18 | Iceberg(#[from] iceberg_rust::error::Error), 19 | #[error(transparent)] 20 | IcebergSpec(#[from] iceberg_rust::spec::error::Error), 21 | #[error(transparent)] 22 | IcebergSqlCatalog(#[from] iceberg_sql_catalog::error::Error), 23 | #[error(transparent)] 24 | IcebergFileCatalog(#[from] iceberg_file_catalog::error::Error), 25 | #[error(transparent)] 26 | GitDiscover(#[from] gix::discover::Error), 27 | #[error(transparent)] 28 | GitReference(#[from] gix::reference::find::Error), 29 | #[error(transparent)] 30 | GitReferenceExisting(#[from] gix::reference::find::existing::Error), 31 | #[error(transparent)] 32 | GitObject(#[from] gix::object::find::Error), 33 | #[error(transparent)] 34 | GitChanges(#[from] gix::diff::tree::changes::Error), 35 | #[error(transparent)] 36 | GitObjectDecode(#[from] gix::objs::decode::Error), 37 | #[error(transparent)] 38 | GitObjectTryInto(#[from] gix::object::try_into::Error), 39 | #[error(transparent)] 40 | GitRecorder(#[from] Box), 41 | #[error(transparent)] 42 | Utf8(#[from] std::string::FromUtf8Error), 43 | #[error(transparent)] 44 | OIDCDiscovery( 45 | #[from] openidconnect::DiscoveryError>, 46 | ), 47 | #[error(transparent)] 48 | OIDCConfiguration(#[from] openidconnect::ConfigurationError), 49 | #[error(transparent)] 50 | OIDCReqestToken( 51 | #[from] 52 | openidconnect::RequestTokenError< 53 | openidconnect::reqwest::Error, 54 | openidconnect::StandardErrorResponse, 55 | >, 56 | ), 57 | #[error(transparent)] 58 | OIDCReqestTokenDevice( 59 | #[from] 60 | openidconnect::RequestTokenError< 61 | openidconnect::reqwest::Error, 62 | openidconnect::StandardErrorResponse, 63 | >, 64 | ), 65 | #[error(transparent)] 66 | Parse(#[from] url::ParseError), 67 | #[error(transparent)] 68 | Envsubst(#[from] shellexpand::LookupError), 69 | #[error(transparent)] 70 | StrParse(#[from] std::str::ParseBoolError), 71 | #[error(transparent)] 72 | SQLParser(#[from] sqlparser::parser::ParserError), 73 | #[error(transparent)] 74 | Datafusion(#[from] datafusion_common::DataFusionError), 75 | #[error(transparent)] 76 | ArgoDagTask(#[from] argo_workflow::schema::DagTaskBuilderError), 77 | #[error(transparent)] 78 | ArgoTemplate(#[from] argo_workflow::schema::TemplateBuilderError), 79 | #[error(transparent)] 80 | ArgoDagTemplate(#[from] argo_workflow::schema::DagTemplateBuilderError), 81 | #[error(transparent)] 82 | ArgoVolumeMount(#[from] argo_workflow::schema::VolumeMountBuilderError), 83 | #[error(transparent)] 84 | ArgoVolume(#[from] argo_workflow::schema::VolumeBuilderError), 85 | #[error(transparent)] 86 | ArgoSecretVolumeSource(#[from] argo_workflow::schema::SecretVolumeSourceBuilderError), 87 | #[error(transparent)] 88 | ArgoConfigMapVolumeSource(#[from] argo_workflow::schema::ConfigMapVolumeSourceBuilderError), 89 | #[error(transparent)] 90 | ArgoContainer(#[from] argo_workflow::schema::ContainerBuilderError), 91 | #[error(transparent)] 92 | ArgoUserContainer(#[from] argo_workflow::schema::UserContainerBuilderError), 93 | #[error(transparent)] 94 | ArgoEnvVar(#[from] argo_workflow::schema::EnvVarBuilderError), 95 | #[error(transparent)] 96 | ArgoInputs(#[from] argo_workflow::schema::InputsBuilderError), 97 | #[error(transparent)] 98 | ArgoParameter(#[from] argo_workflow::schema::ParameterBuilderError), 99 | #[error(transparent)] 100 | ArgoArguments(#[from] 
argo_workflow::schema::ArgumentsBuilderError), 101 | #[error(transparent)] 102 | ArgoObjectMeta(#[from] argo_workflow::schema::ObjectMetaBuilderError), 103 | #[error(transparent)] 104 | ArgoEnvVarSource(#[from] argo_workflow::schema::EnvVarSourceBuilderError), 105 | #[error(transparent)] 106 | ArgoSecretKeySelector(#[from] argo_workflow::schema::SecretKeySelectorBuilderError), 107 | #[error(transparent)] 108 | ArgoCronWorkflowSpec(#[from] argo_workflow::schema::CronWorkflowSpecBuilderError), 109 | #[error(transparent)] 110 | ArgoCronWorkflow(#[from] argo_workflow::schema::CronWorkflowBuilderError), 111 | #[error(transparent)] 112 | ArgoWorkflow(#[from] argo_workflow::schema::WorkflowSpecBuilderError), 113 | #[error("No {0} token received.")] 114 | NoToken(String), 115 | #[error("{0}")] 116 | Text(String), 117 | } 118 | -------------------------------------------------------------------------------- /dashtool/src/git.rs: -------------------------------------------------------------------------------- 1 | use anyhow::anyhow; 2 | use gix::{ 3 | diff::tree::{recorder::Change, Changes, Recorder}, 4 | prelude::Find, 5 | ObjectId, OdbHandle, Repository, 6 | }; 7 | 8 | use crate::error::Error; 9 | 10 | pub(crate) fn diff<'a>( 11 | db: &OdbHandle, 12 | old_id: &Option<ObjectId>, 13 | new_id: &Option<ObjectId>, 14 | ) -> Result<Vec<Change>, Error> { 15 | let mut old_commit_buffer = Vec::new(); 16 | let mut new_commit_buffer = Vec::new(); 17 | let mut old_tree_buffer = Vec::new(); 18 | let mut new_tree_buffer = Vec::new(); 19 | let old_tree = old_id 20 | .as_ref() 21 | .and_then(|x| db.try_find(&x, &mut old_commit_buffer).transpose()) 22 | .transpose()? 23 | .and_then(|x| x.try_into_commit_iter()) 24 | .map(|mut x| x.tree_id()) 25 | .transpose()? 26 | .and_then(|x| db.try_find(&x, &mut old_tree_buffer).transpose()) 27 | .transpose()? 28 | .and_then(|x| x.try_into_tree_iter()); 29 | 30 | let new_tree = new_id 31 | .as_ref() 32 | .or(old_id.as_ref()) 33 | .and_then(|x| db.try_find(&x, &mut new_commit_buffer).transpose()) 34 | .transpose()? 35 | .and_then(|x| x.try_into_commit_iter()) 36 | .map(|mut x| x.tree_id()) 37 | .transpose()? 38 | .and_then(|x| db.try_find(&x, &mut new_tree_buffer).transpose()) 39 | .transpose()? 40 | .and_then(|x| x.try_into_tree_iter()); 41 | 42 | let mut recorder = Recorder::default(); 43 | 44 | if let Some(new_tree) = new_tree { 45 | Changes::from(old_tree).needed_to_obtain( 46 | new_tree, 47 | gix::diff::tree::State::default(), 48 | db, 49 | &mut recorder, 50 | )?; 51 | } 52 | 53 | let diff = recorder.records; 54 | 55 | Ok(diff) 56 | } 57 | 58 | pub(crate) fn branch(repo: &Repository) -> Result<String, Error> { 59 | String::from_utf8( 60 | repo.find_reference("HEAD")? 61 | .target() 62 | .try_name() 63 | .ok_or(Error::Anyhow(anyhow!( 64 | "Dashtool cannot run with uncommitted changes. Please commit all your changes." 65 | )))? 66 | .as_bstr() 67 | .strip_prefix("refs/heads/".as_bytes()) 68 | .to_owned() 69 | .ok_or(Error::Anyhow(anyhow!( 70 | "Dashtool cannot run with uncommitted changes. Please commit all your changes." 71 | )))?
72 | .to_owned(), 73 | ) 74 | .map_err(Error::from) 75 | } 76 | -------------------------------------------------------------------------------- /dashtool/src/lib.rs: -------------------------------------------------------------------------------- 1 | pub mod build; 2 | pub(crate) mod dag; 3 | pub mod error; 4 | pub(crate) mod git; 5 | pub mod plugins; 6 | pub mod state; 7 | pub mod workflow; 8 | -------------------------------------------------------------------------------- /dashtool/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::{fs, process::Command, str::FromStr, sync::Arc}; 2 | 3 | use anyhow::anyhow; 4 | use dashtool::{ 5 | build::build, 6 | error::Error, 7 | plugins::{file::FilePlugin, sql::SqlPlugin, Config, Plugin}, 8 | workflow::workflow, 9 | }; 10 | 11 | use clap::{Parser, Subcommand}; 12 | 13 | #[derive(Parser)] 14 | #[command(version)] 15 | struct Args { 16 | #[command(subcommand)] 17 | commands: Commands, 18 | /// Set file for Argo workflow definition 19 | #[arg(short, long)] 20 | file: Option, 21 | } 22 | 23 | #[derive(Subcommand)] 24 | enum Commands { 25 | Build, 26 | Workflow, 27 | Apply, 28 | } 29 | 30 | static DASHTOOL_CONFIG: &str = "dashtool.json"; 31 | static OUTPUT_FILE: &str = "argo/workflow.yaml"; 32 | static DAG_DIR: &str = ".dashtool/dags"; 33 | 34 | #[tokio::main] 35 | async fn main() -> Result<(), Error> { 36 | let args = Args::parse(); 37 | 38 | let output = args.file.unwrap_or(OUTPUT_FILE.to_owned()); 39 | 40 | fs::create_dir_all(DAG_DIR).ok(); 41 | fs::create_dir_all( 42 | std::path::Path::new(&output) 43 | .parent() 44 | .ok_or(Error::Anyhow(anyhow!("Output file cannot be a directory")))?, 45 | ) 46 | .ok(); 47 | #[cfg(not(target_arch = "wasm32"))] 48 | fs::create_dir_all( 49 | dirs::config_local_dir() 50 | .and_then(|x| Some(String::from_str(x.to_str()?).ok()?)) 51 | .ok_or(Error::Anyhow(anyhow!("Failed to get config directory.")))? 52 | + "/dashtool", 53 | )?; 54 | 55 | let config_json = fs::read_to_string(DASHTOOL_CONFIG)?; 56 | let config: Config = serde_json::from_str(&shellexpand::env(&config_json)?)?; 57 | 58 | let plugin: Arc = match config { 59 | Config::Sql(sql_config) => { 60 | Ok::<_, Error>(Arc::new(SqlPlugin::new(sql_config).await?) as Arc) 61 | } 62 | Config::File(file_config) => { 63 | Ok::<_, Error>(Arc::new(FilePlugin::new(file_config).await?) 
as Arc) 64 | } 65 | }?; 66 | 67 | match args.commands { 68 | Commands::Build => build(plugin).await, 69 | Commands::Workflow => workflow(plugin, &output), 70 | Commands::Apply => { 71 | if cfg!(target_os = "windows") { 72 | Command::new("kubectl") 73 | .args(["apply", "-f", &output]) 74 | .output() 75 | .and_then(|x| { 76 | if x.status.success() { 77 | Ok(()) 78 | } else { 79 | Err(std::io::Error::new( 80 | std::io::ErrorKind::Other, 81 | "Failed to apply Argo workflow to Kubernetes cluster", 82 | )) 83 | } 84 | }) 85 | .map_err(Error::from) 86 | } else { 87 | Command::new("kubectl") 88 | .args(["apply", "-f", &output]) 89 | .output() 90 | .and_then(|x| { 91 | if x.status.success() { 92 | Ok(()) 93 | } else { 94 | Err(std::io::Error::new( 95 | std::io::ErrorKind::Other, 96 | "Failed to apply Argo workflow to Kubernetes cluster", 97 | )) 98 | } 99 | }) 100 | .map_err(Error::from) 101 | } 102 | } 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /dashtool/src/plugins/dashbook/mod.rs: -------------------------------------------------------------------------------- 1 | use std::{collections::HashMap, fs, sync::Arc}; 2 | 3 | use argo_workflow::schema::{ 4 | ConfigMapVolumeSourceBuilder, IoArgoprojWorkflowV1alpha1UserContainer, 5 | IoK8sApiCoreV1EmptyDirVolumeSource, IoK8sApiCoreV1Volume, SecretVolumeSourceBuilder, 6 | UserContainerBuilder, VolumeBuilder, VolumeMountBuilder, 7 | }; 8 | use async_trait::async_trait; 9 | use dashbook_catalog::DashbookS3CatalogList; 10 | use iceberg_rust::catalog::CatalogList; 11 | use serde::{Deserialize, Serialize}; 12 | 13 | use crate::error::Error; 14 | 15 | use self::openid::{authorization, get_refresh_token}; 16 | 17 | use super::Plugin; 18 | 19 | mod openid; 20 | 21 | #[derive(Debug, Serialize, Deserialize)] 22 | #[serde(rename_all = "camelCase")] 23 | pub struct Config { 24 | /// A map from catalog_name to bucket 25 | pub buckets: HashMap, 26 | pub issuer: Option, 27 | pub client_id: Option, 28 | } 29 | 30 | #[derive(Debug)] 31 | pub struct DashbookPlugin { 32 | config: Config, 33 | catalog_list: Arc, 34 | } 35 | 36 | impl DashbookPlugin { 37 | pub async fn new(path: &str) -> Result { 38 | let config_json = fs::read_to_string(path)?; 39 | let config: Config = serde_json::from_str(&config_json)?; 40 | 41 | let refresh_token = get_refresh_token(&config).await?; 42 | 43 | let issuer = config 44 | .issuer 45 | .as_deref() 46 | .unwrap_or("https://auth.dashbook.dev/realms/dashbook"); 47 | 48 | let client_id = config.issuer.as_deref().unwrap_or("dashbook"); 49 | 50 | let (access_token, id_token) = authorization(issuer, client_id, &refresh_token).await?; 51 | 52 | let catalog_list = Arc::new(DashbookS3CatalogList::new(&access_token, &id_token).await?); 53 | 54 | Ok(DashbookPlugin { 55 | config, 56 | catalog_list, 57 | }) 58 | } 59 | } 60 | 61 | #[async_trait] 62 | impl Plugin for DashbookPlugin { 63 | async fn catalog_list(&self) -> Result, Error> { 64 | Ok(self.catalog_list.clone()) 65 | } 66 | 67 | fn bucket(&self, catalog_name: &str) -> &str { 68 | self.config 69 | .buckets 70 | .get(catalog_name) 71 | .map(|x| x.as_str()) 72 | .unwrap() 73 | } 74 | 75 | fn refresh_image(&self) -> &str { 76 | "dashbook/refresh-iceberg-datafusion:dashbook" 77 | } 78 | 79 | fn refresh_config(&self, _identifier: &str, _branch: &str) -> Result { 80 | Ok("dashbook/refresh-iceberg-datafusion:dashbook".to_owned()) 81 | } 82 | 83 | fn init_containters( 84 | &self, 85 | ) -> Result>, Error> { 86 | Ok(Some(vec![ 87 | 
UserContainerBuilder::default() 88 | .name("authorization".to_string()) 89 | .image(Some( 90 | "dashbook/dashtool-authorization".to_string(), 91 | )) 92 | .volume_mounts(vec![ 93 | VolumeMountBuilder::default() 94 | .name("authorization".to_string()) 95 | .mount_path("/tmp/authorization".to_string()) 96 | .build()?, 97 | VolumeMountBuilder::default() 98 | .name("authentication".to_string()) 99 | .mount_path("/tmp/authentication".to_string()) 100 | .build()?, 101 | ]) 102 | .build()?, 103 | UserContainerBuilder::default() 104 | .name("envsubst".to_string()) 105 | .image(Some("dibi/envsubst".to_string())) 106 | .command(vec!["/bin/sh".to_string()]) 107 | .args(vec!["-c".to_string(),"export ACCESS_TOKEN=$(cat /tmp/authorization/access.jwt) && export ID_TOKEN=$(cat /tmp/authorization/id.jwt) && /envsubst-file.sh".to_string()]) 108 | .volume_mounts(vec![ 109 | VolumeMountBuilder::default() 110 | .name("authorization".to_string()) 111 | .mount_path("/tmp/authorization".to_string()) 112 | .build()?, 113 | VolumeMountBuilder::default() 114 | .name("config_template".to_string()) 115 | .mount_path("/workdir".to_string()) 116 | .build()?, 117 | VolumeMountBuilder::default() 118 | .name("config".to_string()) 119 | .mount_path("/processed".to_string()) 120 | .build()?, 121 | ]) 122 | .build()?, 123 | ])) 124 | } 125 | 126 | fn volumes(&self) -> Result>, Error> { 127 | Ok(Some(vec![ 128 | VolumeBuilder::default() 129 | .name("authorization".to_string()) 130 | .empty_dir(Some(IoK8sApiCoreV1EmptyDirVolumeSource::default())) 131 | .build()?, 132 | VolumeBuilder::default() 133 | .name("authentication".to_string()) 134 | .secret(Some( 135 | SecretVolumeSourceBuilder::default() 136 | .secret_name(Some("dashtool-authentication".to_string())) 137 | .build()?, 138 | )) 139 | .build()?, 140 | VolumeBuilder::default() 141 | .name("config".to_string()) 142 | .empty_dir(Some(IoK8sApiCoreV1EmptyDirVolumeSource::default())) 143 | .build()?, 144 | VolumeBuilder::default() 145 | .name("config_template".to_string()) 146 | .config_map(Some(ConfigMapVolumeSourceBuilder::default().build()?)) 147 | .build()?, 148 | ])) 149 | } 150 | } 151 | -------------------------------------------------------------------------------- /dashtool/src/plugins/dashbook/openid.rs: -------------------------------------------------------------------------------- 1 | use std::{fs, str::FromStr}; 2 | 3 | use anyhow::anyhow; 4 | use openidconnect::{ 5 | core::{CoreClient, CoreProviderMetadata}, 6 | reqwest::async_http_client, 7 | ClientId, DeviceAuthorizationResponse, DeviceAuthorizationUrl, 8 | EmptyExtraDeviceAuthorizationFields, IssuerUrl, OAuth2TokenResponse, RefreshToken, 9 | TokenResponse, 10 | }; 11 | 12 | use crate::{error::Error, plugins::dashbook::Config}; 13 | 14 | pub async fn authentication(issuer_url: &str, client_id: &str) -> Result { 15 | let provider_metadata = CoreProviderMetadata::discover_async( 16 | IssuerUrl::new(issuer_url.to_string())?, 17 | async_http_client, 18 | ) 19 | .await?; 20 | let client = CoreClient::from_provider_metadata( 21 | provider_metadata, 22 | ClientId::new(client_id.to_string()), 23 | None, 24 | ) 25 | .enable_openid_scope() 26 | .set_device_authorization_uri(DeviceAuthorizationUrl::new( 27 | issuer_url.to_string() + "/protocol/openid-connect/auth/device", 28 | )?); 29 | let details: DeviceAuthorizationResponse = client 30 | .exchange_device_code()? 
31 | .request_async(async_http_client) 32 | .await?; 33 | 34 | println!( 35 | "Open this URL in your browser:\n{}\nand enter the code: {}", 36 | details.verification_uri().to_string(), 37 | details.user_code().secret().to_string() 38 | ); 39 | let tokens = client 40 | .exchange_device_access_token(&details) 41 | .request_async(async_http_client, tokio::time::sleep, None) 42 | .await?; 43 | 44 | let refresh_token = tokens 45 | .refresh_token() 46 | .ok_or(Error::NoToken("device".to_string()))?; 47 | Ok(refresh_token.secret().to_string()) 48 | } 49 | 50 | pub async fn authorization( 51 | issuer_url: &str, 52 | client_id: &str, 53 | refresh_token: &str, 54 | ) -> Result<(String, String), Error> { 55 | let provider_metadata = CoreProviderMetadata::discover_async( 56 | IssuerUrl::new(issuer_url.to_string())?, 57 | async_http_client, 58 | ) 59 | .await?; 60 | let client = CoreClient::from_provider_metadata( 61 | provider_metadata, 62 | ClientId::new(client_id.to_string()), 63 | None, 64 | ) 65 | .enable_openid_scope(); 66 | 67 | let response = client 68 | .exchange_refresh_token(&RefreshToken::new(refresh_token.to_owned())) 69 | .request_async(async_http_client) 70 | .await?; 71 | 72 | let access_token = response.access_token().secret().to_string(); 73 | let id_token = response 74 | .id_token() 75 | .ok_or(Error::NoToken("id".to_string()))?; 76 | Ok((access_token, id_token.to_string())) 77 | } 78 | 79 | pub(crate) async fn get_refresh_token(config: &Config) -> Result<String, Error> { 80 | #[cfg(not(target_arch = "wasm32"))] 81 | let refresh_token = match fs::read_to_string( 82 | dirs::config_local_dir() 83 | .and_then(|x| Some(String::from_str(x.to_str()?).ok()?)) 84 | .ok_or(Error::Anyhow(anyhow!("Failed to get config directory.")))? 85 | + "/dashtool/refresh.jwt", 86 | ) { 87 | Ok(token) => token, 88 | Err(_) => { 89 | let refresh_token = fetch_refresh_token(&config).await?; 90 | refresh_token 91 | } 92 | }; 93 | #[cfg(all(target_arch = "wasm32", target_os = "wasi"))] 94 | let refresh_token = std::env::var("REFRESH_TOKEN")?; 95 | Ok(refresh_token) 96 | } 97 | 98 | pub(crate) async fn fetch_refresh_token(config: &Config) -> Result<String, Error> { 99 | let issuer = config 100 | .issuer 101 | .as_deref() 102 | .unwrap_or("https://auth.dashbook.dev/realms/dashbook"); 103 | 104 | let client_id = config.client_id.as_deref().unwrap_or("dashbook"); 105 | 106 | let refresh_token = authentication(issuer, client_id).await?; 107 | fs::write( 108 | dirs::config_local_dir() 109 | .and_then(|x| Some(String::from_str(x.to_str()?).ok()?)) 110 | .ok_or(Error::Anyhow(anyhow!("Failed to get config directory.")))?
111 | + "/dashtool/refresh.jwt", 112 | &refresh_token, 113 | )?; 114 | Ok(refresh_token) 115 | } 116 | -------------------------------------------------------------------------------- /dashtool/src/plugins/file/mod.rs: -------------------------------------------------------------------------------- 1 | use argo_workflow::schema::{ 2 | EnvVarBuilder, EnvVarSourceBuilder, SecretKeySelectorBuilder, UserContainerBuilder, 3 | VolumeMountBuilder, 4 | }; 5 | use dashtool_common::ObjectStoreConfig; 6 | use iceberg_file_catalog::FileCatalogList; 7 | use std::{collections::HashMap, sync::Arc}; 8 | 9 | use argo_workflow::schema::{IoArgoprojWorkflowV1alpha1UserContainer, IoK8sApiCoreV1Volume}; 10 | use async_trait::async_trait; 11 | use iceberg_rust::catalog::{bucket::ObjectStoreBuilder, CatalogList}; 12 | use object_store::aws::AmazonS3Builder; 13 | use serde::{Deserialize, Serialize}; 14 | 15 | use crate::error::Error; 16 | 17 | use super::Plugin; 18 | 19 | #[derive(Debug, Serialize, Deserialize)] 20 | #[serde(rename_all = "camelCase")] 21 | pub struct FileConfig { 22 | #[serde(flatten)] 23 | pub object_store: ObjectStoreConfig, 24 | pub catalog_url: String, 25 | pub bucket: String, 26 | /// A nested map that maps a kubernetes secret name to a map from a environement name to the 27 | /// key of the secret value in the secret. 28 | #[serde(default)] 29 | pub secrets: HashMap>, 30 | #[serde(default)] 31 | pub env: HashMap, 32 | } 33 | 34 | #[derive(Debug)] 35 | pub struct FilePlugin { 36 | config: FileConfig, 37 | catalog_list: Arc, 38 | } 39 | 40 | impl FilePlugin { 41 | pub async fn new(mut config: FileConfig) -> Result { 42 | let mut full_bucket_name = config.bucket.clone(); 43 | let object_store = match &config.object_store { 44 | ObjectStoreConfig::Memory => ObjectStoreBuilder::memory(), 45 | ObjectStoreConfig::S3(s3_config) => { 46 | let bucket_name = config.bucket.trim_start_matches("s3://"); 47 | full_bucket_name = "s3://".to_owned() + bucket_name; 48 | 49 | let mut builder = AmazonS3Builder::from_env() 50 | .with_region(&s3_config.aws_region) 51 | .with_bucket_name(bucket_name) 52 | .with_access_key_id(&s3_config.aws_access_key_id); 53 | 54 | if let Some(endpoint) = &s3_config.aws_endpoint { 55 | builder = builder.with_endpoint(endpoint); 56 | } 57 | 58 | if let Some(allow_http) = &s3_config.aws_allow_http { 59 | builder = builder.with_allow_http(allow_http.parse()?); 60 | } 61 | 62 | ObjectStoreBuilder::S3(builder) 63 | } 64 | }; 65 | 66 | config.bucket = full_bucket_name; 67 | 68 | let catalog_list = Arc::new(FileCatalogList::new(&config.catalog_url, object_store).await?); 69 | 70 | Ok(FilePlugin { 71 | config, 72 | catalog_list, 73 | }) 74 | } 75 | } 76 | 77 | #[derive(Debug, Serialize, Deserialize)] 78 | #[serde(rename_all = "camelCase")] 79 | pub struct RefreshConfig { 80 | #[serde(flatten)] 81 | pub object_store: ObjectStoreConfig, 82 | pub catalog_url: String, 83 | pub identifier: String, 84 | pub bucket: Option, 85 | pub branch: Option, 86 | } 87 | 88 | #[async_trait] 89 | impl Plugin for FilePlugin { 90 | async fn catalog_list(&self) -> Result, Error> { 91 | Ok(self.catalog_list.clone()) 92 | } 93 | 94 | fn bucket(&self, _catalog_name: &str) -> &str { 95 | &self.config.bucket 96 | } 97 | 98 | fn refresh_image(&self) -> &str { 99 | "dashbook/refresh-iceberg-datafusion:file" 100 | } 101 | 102 | fn refresh_config(&self, identifier: &str, branch: &str) -> Result { 103 | let mut object_store_config = self.config.object_store.clone(); 104 | match &mut object_store_config { 105 | 
ObjectStoreConfig::S3(config) => { 106 | config.aws_secret_access_key = Some("$AWS_SECRET_ACCESS_KEY".to_owned()); 107 | config.aws_endpoint = self.config.env.get("AWS_ENDPOINT").cloned() 108 | } 109 | _ => (), 110 | } 111 | let config = RefreshConfig { 112 | identifier: identifier.to_owned(), 113 | branch: Some(branch.to_owned()), 114 | object_store: object_store_config, 115 | catalog_url: self 116 | .config 117 | .env 118 | .get("CATALOG_URL") 119 | .cloned() 120 | .unwrap_or(self.config.catalog_url.clone()), 121 | bucket: Some(self.config.bucket.clone()), 122 | }; 123 | Ok(serde_json::to_string(&config).unwrap()) 124 | } 125 | 126 | fn init_containters( 127 | &self, 128 | ) -> Result>, Error> { 129 | let mut builder = UserContainerBuilder::default(); 130 | builder 131 | .name("envsubst".to_string()) 132 | .image(Some("dibi/envsubst".to_string())) 133 | .volume_mounts(vec![ 134 | VolumeMountBuilder::default() 135 | .name("config-template".to_string()) 136 | .mount_path("/workdir".to_string()) 137 | .build()?, 138 | VolumeMountBuilder::default() 139 | .name("config".to_string()) 140 | .mount_path("/processed".to_string()) 141 | .build()?, 142 | ]); 143 | 144 | builder.env( 145 | self.config 146 | .secrets 147 | .iter() 148 | .flat_map(|(secret, map)| { 149 | map.iter().map(|(key, value)| { 150 | Ok(EnvVarBuilder::default() 151 | .name(value.trim_start_matches('$').to_owned()) 152 | .value_from(Some( 153 | EnvVarSourceBuilder::default() 154 | .secret_key_ref(Some( 155 | SecretKeySelectorBuilder::default() 156 | .name(Some(secret.clone())) 157 | .key(key.clone()) 158 | .optional(None) 159 | .build()?, 160 | )) 161 | .config_map_key_ref(None) 162 | .field_ref(None) 163 | .resource_field_ref(None) 164 | .build()?, 165 | )) 166 | .value(None) 167 | .build()?) 
168 |                     })
169 |                 })
170 |                 .collect::<Result<Vec<_>, Error>>()?,
171 |         );
172 | 
173 |         Ok(Some(vec![builder.build()?]))
174 |     }
175 |     fn volumes(&self) -> Result<Option<Vec<IoK8sApiCoreV1Volume>>, Error> {
176 |         Ok(None)
177 |     }
178 | }
179 | 
--------------------------------------------------------------------------------
/dashtool/src/plugins/mod.rs:
--------------------------------------------------------------------------------
1 | use std::{fmt::Debug, sync::Arc};
2 | 
3 | use argo_workflow::schema::{IoArgoprojWorkflowV1alpha1UserContainer, IoK8sApiCoreV1Volume};
4 | use async_trait::async_trait;
5 | use file::FileConfig;
6 | use iceberg_rust::catalog::CatalogList;
7 | use serde::{Deserialize, Serialize};
8 | 
9 | use crate::error::Error;
10 | 
11 | use self::sql::SqlConfig;
12 | 
13 | pub mod file;
14 | pub mod sql;
15 | 
16 | #[async_trait]
17 | pub trait Plugin: Debug {
18 |     async fn catalog_list(&self) -> Result<Arc<dyn CatalogList>, Error>;
19 |     fn bucket(&self, catalog_name: &str) -> &str;
20 |     fn refresh_image(&self) -> &str;
21 |     fn refresh_config(&self, identifier: &str, branch: &str) -> Result<String, Error>;
22 |     fn init_containters(
23 |         &self,
24 |     ) -> Result<Option<Vec<IoArgoprojWorkflowV1alpha1UserContainer>>, Error>;
25 |     fn volumes(&self) -> Result<Option<Vec<IoK8sApiCoreV1Volume>>, Error>;
26 | }
27 | 
28 | #[derive(Serialize, Deserialize)]
29 | #[serde(tag = "plugin", rename_all = "lowercase")]
30 | pub enum Config {
31 |     Sql(SqlConfig),
32 |     File(FileConfig),
33 | }
34 | 
--------------------------------------------------------------------------------
/dashtool/src/plugins/sql/mod.rs:
--------------------------------------------------------------------------------
1 | use argo_workflow::schema::{
2 |     EnvVarBuilder, EnvVarSourceBuilder, SecretKeySelectorBuilder, UserContainerBuilder,
3 |     VolumeMountBuilder,
4 | };
5 | use dashtool_common::ObjectStoreConfig;
6 | use std::{collections::HashMap, sync::Arc};
7 | 
8 | use argo_workflow::schema::{IoArgoprojWorkflowV1alpha1UserContainer, IoK8sApiCoreV1Volume};
9 | use async_trait::async_trait;
10 | use iceberg_rust::catalog::{bucket::ObjectStoreBuilder, CatalogList};
11 | use iceberg_sql_catalog::SqlCatalogList;
12 | use object_store::aws::AmazonS3Builder;
13 | use serde::{Deserialize, Serialize};
14 | 
15 | use crate::error::Error;
16 | 
17 | use super::Plugin;
18 | 
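/// Connection settings for the `sql` plugin, deserialized from `dashtool.json`
/// (selected via the `"plugin": "sql"` tag on [`super::Config`]).
///
/// An illustrative example; all values and secret names are made up, and the flattened
/// object-store keys (e.g. `awsAccessKeyId`, `awsRegion`) come from [`ObjectStoreConfig`]
/// in `dashtool-common`:
///
/// ```json
/// {
///   "plugin": "sql",
///   "catalogUrl": "postgres://user:$CATALOG_PASSWORD@catalog-db:5432/iceberg",
///   "bucket": "s3://example-bucket",
///   "secrets": { "catalog-secret": { "password": "CATALOG_PASSWORD" } },
///   "env": { "AWS_ENDPOINT": "http://example-object-store:9000" }
/// }
/// ```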
19 | #[derive(Debug, Serialize, Deserialize)]
20 | #[serde(rename_all = "camelCase")]
21 | pub struct SqlConfig {
22 |     #[serde(flatten)]
23 |     pub object_store: ObjectStoreConfig,
24 |     pub catalog_url: String,
25 |     pub bucket: String,
26 |     /// A nested map from a kubernetes secret name to a map from the key of the secret value in
27 |     /// the secret to the name of the environment variable it populates.
28 |     #[serde(default)]
29 |     pub secrets: HashMap<String, HashMap<String, String>>,
30 |     #[serde(default)]
31 |     pub env: HashMap<String, String>,
32 | }
33 | 
34 | #[derive(Debug)]
35 | pub struct SqlPlugin {
36 |     config: SqlConfig,
37 |     catalog_list: Arc<SqlCatalogList>, // type parameter reconstructed from the constructors below
38 | }
39 | 
40 | impl SqlPlugin {
41 |     pub async fn new(mut config: SqlConfig) -> Result<Self, Error> {
42 |         let mut full_bucket_name = config.bucket.clone();
43 |         let object_store = match &config.object_store {
44 |             ObjectStoreConfig::Memory => ObjectStoreBuilder::memory(),
45 |             ObjectStoreConfig::S3(s3_config) => {
46 |                 let bucket_name = config.bucket.trim_start_matches("s3://");
47 |                 full_bucket_name = "s3://".to_owned() + bucket_name;
48 | 
49 |                 let mut builder = AmazonS3Builder::from_env()
50 |                     .with_region(&s3_config.aws_region)
51 |                     .with_bucket_name(bucket_name)
52 |                     .with_access_key_id(&s3_config.aws_access_key_id);
53 | 
54 |                 if let Some(endpoint) = &s3_config.aws_endpoint {
55 |                     builder = builder.with_endpoint(endpoint);
56 |                 }
57 | 
58 |                 if let Some(allow_http) = &s3_config.aws_allow_http {
59 |                     builder = builder.with_allow_http(allow_http.parse()?);
60 |                 }
61 | 
62 |                 ObjectStoreBuilder::S3(builder)
63 |             }
64 |         };
65 | 
66 |         config.bucket = full_bucket_name;
67 | 
68 |         let catalog_list = Arc::new(SqlCatalogList::new(&config.catalog_url, object_store).await?);
69 | 
70 |         Ok(SqlPlugin {
71 |             config,
72 |             catalog_list,
73 |         })
74 |     }
75 | }
76 | 
77 | #[cfg(test)]
78 | impl SqlPlugin {
79 |     pub(crate) fn new_with_catalog(
80 |         config: SqlConfig,
81 |         catalog_list: Arc<SqlCatalogList>,
82 |     ) -> Result<Self, Error> {
83 |         Ok(SqlPlugin {
84 |             config,
85 |             catalog_list,
86 |         })
87 |     }
88 | }
89 | 
90 | #[derive(Debug, Serialize, Deserialize)]
91 | #[serde(rename_all = "camelCase")]
92 | pub struct RefreshConfig {
93 |     #[serde(flatten)]
94 |     pub object_store: ObjectStoreConfig,
95 |     pub catalog_url: String,
96 |     pub identifier: String,
97 |     pub bucket: Option<String>,
98 |     pub branch: Option<String>,
99 | }
100 | 
101 | #[async_trait]
102 | impl Plugin for SqlPlugin {
103 |     async fn catalog_list(&self) -> Result<Arc<dyn CatalogList>, Error> {
104 |         Ok(self.catalog_list.clone())
105 |     }
106 | 
107 |     fn bucket(&self, _catalog_name: &str) -> &str {
108 |         &self.config.bucket
109 |     }
110 | 
111 |     fn refresh_image(&self) -> &str {
112 |         "dashbook/refresh-iceberg-datafusion:sql"
113 |     }
114 | 
115 |     fn refresh_config(&self, identifier: &str, branch: &str) -> Result<String, Error> {
116 |         let mut object_store_config = self.config.object_store.clone();
117 |         match &mut object_store_config {
118 |             ObjectStoreConfig::S3(config) => {
119 |                 config.aws_secret_access_key = Some("$AWS_SECRET_ACCESS_KEY".to_owned());
120 |                 config.aws_endpoint = self.config.env.get("AWS_ENDPOINT").cloned()
121 |             }
122 |             _ => (),
123 |         }
124 |         let config = RefreshConfig {
125 |             identifier: identifier.to_owned(),
126 |             branch: Some(branch.to_owned()),
127 |             object_store: object_store_config,
128 |             catalog_url: self
129 |                 .config
130 |                 .env
131 |                 .get("CATALOG_URL")
132 |                 .cloned()
133 |                 .unwrap_or(self.config.catalog_url.clone()),
134 |             bucket: Some(self.config.bucket.clone()),
135 |         };
136 |         Ok(serde_json::to_string(&config).unwrap())
137 |     }
138 | 
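    // The init container built below runs before the refresh container: it mounts the
    // identifier's config-template ConfigMap at /workdir and an emptyDir at /processed
    // (both defined in workflow/template.rs), and the dibi/envsubst image is expected to
    // substitute environment variables in the template and write the result where the
    // main container reads it. Each entry of `secrets` becomes one of those environment
    // variables, sourced from the Kubernetes secret via a SecretKeySelector.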
.mount_path("/processed".to_string()) 154 | .build()?, 155 | ]); 156 | 157 | builder.env( 158 | self.config 159 | .secrets 160 | .iter() 161 | .flat_map(|(secret, map)| { 162 | map.iter().map(|(key, value)| { 163 | Ok(EnvVarBuilder::default() 164 | .name(value.trim_start_matches('$').to_owned()) 165 | .value_from(Some( 166 | EnvVarSourceBuilder::default() 167 | .secret_key_ref(Some( 168 | SecretKeySelectorBuilder::default() 169 | .name(Some(secret.clone())) 170 | .key(key.clone()) 171 | .optional(None) 172 | .build()?, 173 | )) 174 | .config_map_key_ref(None) 175 | .field_ref(None) 176 | .resource_field_ref(None) 177 | .build()?, 178 | )) 179 | .value(None) 180 | .build()?) 181 | }) 182 | }) 183 | .collect::, Error>>()?, 184 | ); 185 | 186 | Ok(Some(vec![builder.build()?])) 187 | } 188 | fn volumes(&self) -> Result>, Error> { 189 | Ok(None) 190 | } 191 | } 192 | -------------------------------------------------------------------------------- /dashtool/src/state/mod.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | 3 | use gix::ObjectId; 4 | use serde::{Deserialize, Serialize}; 5 | 6 | #[derive(Debug, Clone, Serialize, Deserialize, Default)] 7 | pub struct State { 8 | pub commits: HashMap, 9 | } 10 | -------------------------------------------------------------------------------- /dashtool/src/workflow/mod.rs: -------------------------------------------------------------------------------- 1 | use std::{ 2 | collections::{BTreeMap, HashMap}, 3 | fs, 4 | sync::Arc, 5 | }; 6 | 7 | use argo_workflow::schema::{ 8 | ArgumentsBuilder, CronWorkflowBuilder, CronWorkflowSpecBuilder, DagTaskBuilder, 9 | DagTemplateBuilder, IoArgoprojWorkflowV1alpha1Template, ObjectMetaBuilder, ParameterBuilder, 10 | TemplateBuilder, WorkflowSpecBuilder, 11 | }; 12 | use k8s_openapi::api::core::v1::ConfigMap; 13 | use petgraph::Direction; 14 | 15 | use crate::{ 16 | dag::{get_dag, Node}, 17 | error::Error, 18 | git::branch, 19 | plugins::Plugin, 20 | }; 21 | 22 | use self::template::{iceberg_template, ingest_template}; 23 | 24 | mod template; 25 | 26 | static API_VERSION: &str = "argoproj.io/v1alpha1"; 27 | 28 | pub fn workflow(plugin: Arc, output: &str) -> Result<(), Error> { 29 | let repo = gix::discover(".")?; 30 | let branch = branch(&repo)?; 31 | 32 | let dag = get_dag(&branch)?; 33 | 34 | let mut templates: HashMap = 35 | HashMap::from_iter(vec![("refresh".to_string(), iceberg_template(&*plugin)?)]); 36 | 37 | let mut config_maps: HashMap = HashMap::new(); 38 | 39 | let tasks = dag 40 | .dag 41 | .node_indices() 42 | .map(|index| { 43 | let node = &dag.dag[index]; 44 | let task = match node { 45 | Node::Ingest(node) => { 46 | templates 47 | .entry(node.image.clone()) 48 | .or_insert_with(|| ingest_template(&node, &*plugin).unwrap()); 49 | 50 | let mut config_map = ConfigMap::default(); 51 | config_map.metadata.name = Some( 52 | node.identifier 53 | .to_string() 54 | .replace(['/', ':', '_', '.'], "-") 55 | .to_lowercase() 56 | .to_owned() 57 | + "-config-template", 58 | ); 59 | config_map.data = Some(BTreeMap::from_iter(vec![ 60 | ( 61 | "source.json".to_owned(), 62 | serde_json::to_string(&node.source)?, 63 | ), 64 | ( 65 | "destination.json".to_owned(), 66 | serde_json::to_string(&node.destination)?, 67 | ), 68 | ])); 69 | config_maps.insert( 70 | node.identifier 71 | .to_string() 72 | .replace(['/', ':', '_', '.'], "-") 73 | .to_lowercase() 74 | .clone(), 75 | config_map, 76 | ); 77 | 78 | DagTaskBuilder::default() 79 | .name( 80 | 
78 |                     DagTaskBuilder::default()
79 |                         .name(
80 |                             node.identifier
81 |                                 .to_string()
82 |                                 .replace(['/', ':', '_', '.'], "-")
83 |                                 .to_lowercase(),
84 |                         )
85 |                         .template(Some(node.image.clone().replace(['/', ':', '_', '.'], "-")))
86 |                         .arguments(Some(
87 |                             ArgumentsBuilder::default()
88 |                                 .parameters(vec![{
89 |                                     let mut builder: ParameterBuilder = Default::default();
90 |                                     builder
91 |                                         .name("identifier".to_owned())
92 |                                         .value(Some(
93 |                                             node.identifier
94 |                                                 .to_string()
95 |                                                 .replace(['/', ':', '_', '.'], "-")
96 |                                                 .to_lowercase(),
97 |                                         ))
98 |                                         .build()?
99 |                                 }])
100 |                                 .build()?,
101 |                         ))
102 |                         .build()
103 |                 }
104 | 
105 |                 Node::Tabular(node) => {
106 |                     let mut config_map = ConfigMap::default();
107 |                     config_map.metadata.name = Some(
108 |                         node.identifier
109 |                             .to_string()
110 |                             .replace(['/', ':', '_', '.'], "-")
111 |                             .to_lowercase()
112 |                             .to_owned()
113 |                             + "-config-template",
114 |                     );
115 |                     config_map.data = Some(BTreeMap::from_iter(vec![(
116 |                         "refresh.json".to_owned(),
117 |                         plugin.refresh_config(&node.identifier.to_string(), &node.branch)?,
118 |                     )]));
119 |                     config_maps.insert(
120 |                         node.identifier
121 |                             .to_string()
122 |                             .replace(['/', ':', '_', '.'], "-")
123 |                             .to_lowercase()
124 |                             .clone(),
125 |                         config_map,
126 |                     );
127 | 
128 |                     DagTaskBuilder::default()
129 |                         .name(
130 |                             node.identifier
131 |                                 .to_string()
132 |                                 .replace(['/', ':', '_', '.'], "-")
133 |                                 .to_lowercase(),
134 |                         )
135 |                         .template(Some("refresh".to_string()))
136 |                         .arguments(Some(
137 |                             ArgumentsBuilder::default()
138 |                                 .parameters(vec![{
139 |                                     let mut builder: ParameterBuilder = Default::default();
140 |                                     builder
141 |                                         .name("identifier".to_owned())
142 |                                         .value(Some(
143 |                                             node.identifier
144 |                                                 .to_string()
145 |                                                 .replace(['/', ':', '_', '.'], "-")
146 |                                                 .to_lowercase(),
147 |                                         ))
148 |                                         .build()?
149 |                                 }])
150 |                                 .build()?,
151 |                         ))
152 |                         .dependencies(
153 |                             dag.dag
154 |                                 .neighbors_directed(index, Direction::Incoming)
155 |                                 .into_iter()
156 |                                 .map(|x| {
157 |                                     dag.dag[x]
158 |                                         .identifier()
159 |                                         .to_string()
160 |                                         .replace(['/', ':', '_', '.'], "-")
161 |                                         .to_lowercase()
162 |                                         .to_owned()
163 |                                 })
164 |                                 .collect(),
165 |                         )
166 |                         .build()
167 |                 }
168 |             }?;
169 | 
170 |             Ok::<_, Error>(task)
171 |         })
172 |         .collect::<Result<Vec<_>, Error>>()?;
173 | 
174 |     let dag_template = TemplateBuilder::default()
175 |         .name(Some("dashtool".to_string()))
176 |         .dag(Some(DagTemplateBuilder::default().tasks(tasks).build()?))
177 |         .build()?;
178 | 
179 |     templates.insert("dag".to_string(), dag_template);
180 | 
181 |     let workflow = CronWorkflowBuilder::default()
182 |         .api_version(Some(API_VERSION.to_string()))
183 |         .kind(Some("CronWorkflow".to_string()))
184 |         .metadata(
185 |             ObjectMetaBuilder::default()
186 |                 .name(Some("dashtool".to_owned()))
187 |                 .build()?,
188 |         )
189 |         .spec(
190 |             CronWorkflowSpecBuilder::default()
191 |                 .schedule("0 0 * * *".to_owned())
192 |                 .workflow_spec(
193 |                     WorkflowSpecBuilder::default()
194 |                         .entrypoint(Some("dashtool".to_string()))
195 |                         .templates(templates.into_values().collect())
196 |                         .build()?,
197 |                 )
198 |                 .build()?,
199 |         )
200 |         .build()?;
201 | 
202 |     let mut workflow_yaml = serde_yaml::to_string(&workflow)?;
203 | 
204 |     for config_map in config_maps {
205 |         let yaml = serde_yaml::to_string(&config_map.1)?;
206 |         workflow_yaml.push_str("---\n");
207 |         workflow_yaml.push_str(&yaml);
208 |     }
209 | 
210 |     fs::write(output, workflow_yaml)?;
211 | 
212 |     println!("Created workflow successfully.");
213 | 
214 |     Ok(())
215 | }
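// For orientation: with a single materialized view in the DAG, the CronWorkflow written to
// the output file has roughly the following shape (trimmed; the exact serialization depends
// on the generated Argo schema types, and the identifier is illustrative):
//
// apiVersion: argoproj.io/v1alpha1
// kind: CronWorkflow
// metadata:
//   name: dashtool
// spec:
//   schedule: "0 0 * * *"
//   workflowSpec:
//     entrypoint: dashtool
//     templates:
//       - name: refresh
//         ...
//       - name: dashtool
//         dag:
//           tasks:
//             - name: silver-example-view
//               template: refresh
//               arguments:
//                 parameters:
//                   - name: identifier
//                     value: silver-example-view
// ---
// kind: ConfigMap
// metadata:
//   name: silver-example-view-config-template
// data:
//   refresh.json: '{ ... }'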
216 | 
--------------------------------------------------------------------------------
/dashtool/src/workflow/template.rs:
--------------------------------------------------------------------------------
1 | use crate::{dag::Ingest, error::Error, plugins::Plugin};
2 | 
3 | use argo_workflow::schema::{
4 |     ConfigMapVolumeSourceBuilder, ContainerBuilder, InputsBuilder,
5 |     IoArgoprojWorkflowV1alpha1Template, IoK8sApiCoreV1EmptyDirVolumeSource, ParameterBuilder,
6 |     TemplateBuilder, UserContainerBuilder, VolumeBuilder, VolumeMountBuilder,
7 | };
8 | 
9 | pub(crate) fn ingest_template(
10 |     node: &Ingest,
11 |     plugin: &dyn Plugin,
12 | ) -> Result<IoArgoprojWorkflowV1alpha1Template, Error> {
13 |     let template =
14 |         TemplateBuilder::default()
15 |             .name(Some(node.image.clone().replace(['/', ':', '_', '.'], "-")))
16 |             .inputs(Some(
17 |                 InputsBuilder::default()
18 |                     .parameters(vec![{
19 |                         let mut builder: ParameterBuilder = Default::default();
20 |                         builder.name("identifier".to_string()).build()?
21 |                     }])
22 |                     .build()?,
23 |             ))
24 |             .container(Some(
25 |                 ContainerBuilder::default()
26 |                     .image(node.image.clone())
27 |                     .volume_mounts(vec![VolumeMountBuilder::default()
28 |                         .name("config".to_string())
29 |                         .mount_path("/tmp/config".to_string())
30 |                         .build()?])
31 |                     .build()?,
32 |             ))
33 |             .init_containers(plugin.init_containters()?.unwrap_or(
34 |                 vec![UserContainerBuilder::default()
35 |                     .name("envsubst".to_string())
36 |                     .image(Some("dibi/envsubst".to_string()))
37 |                     .volume_mounts(vec![
38 |                         VolumeMountBuilder::default()
39 |                             .name("config-template".to_string())
40 |                             .mount_path("/workdir".to_string())
41 |                             .build()?,
42 |                         VolumeMountBuilder::default()
43 |                             .name("config".to_string())
44 |                             .mount_path("/processed".to_string())
45 |                             .build()?,
46 |                     ])
47 |                     .build()?],
48 |             ))
49 |             .volumes(plugin.volumes()?.unwrap_or(vec![
50 |                 VolumeBuilder::default()
51 |                     .name("config".to_string())
52 |                     .empty_dir(Some(IoK8sApiCoreV1EmptyDirVolumeSource::default()))
53 |                     .build()?,
54 |                 VolumeBuilder::default()
55 |                     .name("config-template".to_string())
56 |                     .config_map(Some(
57 |                         ConfigMapVolumeSourceBuilder::default()
58 |                             .name(Some(
59 |                                 "{{inputs.parameters.identifier}}-config-template".to_string(),
60 |                             ))
61 |                             .build()?,
62 |                     ))
63 |                     .build()?,
64 |             ]))
65 |             .build()
66 |             .unwrap();
67 |     Ok(template)
68 | }
69 | 
70 | pub(crate) fn iceberg_template(
71 |     plugin: &dyn Plugin,
72 | ) -> Result<IoArgoprojWorkflowV1alpha1Template, Error> {
73 |     let template = TemplateBuilder::default()
74 |         .name(Some("refresh".to_string()))
75 |         .inputs(Some(
76 |             InputsBuilder::default()
77 |                 .parameters(vec![{
78 |                     let mut builder: ParameterBuilder = Default::default();
79 |                     builder.name("identifier".to_string()).build()?
80 | }]) 81 | .build()?, 82 | )) 83 | .container(Some( 84 | ContainerBuilder::default() 85 | .image(plugin.refresh_image().to_owned()) 86 | .volume_mounts(vec![VolumeMountBuilder::default() 87 | .name("config".to_string()) 88 | .mount_path("/tmp/config".to_string()) 89 | .build()?]) 90 | .build()?, 91 | )) 92 | .init_containers(plugin.init_containters()?.unwrap_or(vec![ 93 | UserContainerBuilder::default() 94 | .name("envsubst".to_string()) 95 | .image(Some("dibi/envsubst".to_string())) 96 | .volume_mounts(vec![ 97 | VolumeMountBuilder::default() 98 | .name("config-template".to_string()) 99 | .mount_path("/workdir".to_string()) 100 | .build()?, 101 | VolumeMountBuilder::default() 102 | .name("config".to_string()) 103 | .mount_path("/processed".to_string()) 104 | .build()?, 105 | ]) 106 | .build()?, 107 | ])) 108 | .volumes(plugin.volumes()?.unwrap_or(vec![ 109 | VolumeBuilder::default() 110 | .name("config".to_string()) 111 | .empty_dir(Some(IoK8sApiCoreV1EmptyDirVolumeSource::default())) 112 | .build()?, 113 | VolumeBuilder::default() 114 | .name("config-template".to_string()) 115 | .config_map(Some( 116 | ConfigMapVolumeSourceBuilder::default() 117 | .name(Some( 118 | "{{inputs.parameters.identifier}}-config-template".to_string(), 119 | )) 120 | .build()?, 121 | )) 122 | .build()?, 123 | ])) 124 | .build() 125 | .unwrap(); 126 | Ok(template) 127 | } 128 | --------------------------------------------------------------------------------
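The `Plugin` trait in `dashtool/src/plugins/mod.rs` is the extension point for wiring in another Iceberg catalog. The following is a rough sketch only, not part of the repository: `StaticPlugin` and its refresh image are made up, the trait signatures follow the reconstruction above, and a real plugin would build its catalog list and refresh configuration like the `sql` and `file` plugins do.

```rust
use std::sync::Arc;

use argo_workflow::schema::{IoArgoprojWorkflowV1alpha1UserContainer, IoK8sApiCoreV1Volume};
use async_trait::async_trait;
use iceberg_rust::catalog::CatalogList;

use crate::{error::Error, plugins::Plugin};

/// Hypothetical plugin that wraps an already-constructed catalog list.
pub struct StaticPlugin {
    bucket: String,
    catalog_list: Arc<dyn CatalogList>,
}

// Manual Debug impl so the sketch does not rely on `dyn CatalogList` being Debug.
impl std::fmt::Debug for StaticPlugin {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("StaticPlugin").field("bucket", &self.bucket).finish()
    }
}

#[async_trait]
impl Plugin for StaticPlugin {
    async fn catalog_list(&self) -> Result<Arc<dyn CatalogList>, Error> {
        Ok(self.catalog_list.clone())
    }

    fn bucket(&self, _catalog_name: &str) -> &str {
        &self.bucket
    }

    fn refresh_image(&self) -> &str {
        // Illustrative image name only.
        "example/refresh-image:latest"
    }

    fn refresh_config(&self, identifier: &str, branch: &str) -> Result<String, Error> {
        // A real plugin serializes its own RefreshConfig here; a plain JSON object keeps the sketch short.
        Ok(serde_json::json!({ "identifier": identifier, "branch": branch }).to_string())
    }

    fn init_containters(
        &self,
    ) -> Result<Option<Vec<IoArgoprojWorkflowV1alpha1UserContainer>>, Error> {
        // No extra init containers: the defaults from workflow/template.rs are used.
        Ok(None)
    }

    fn volumes(&self) -> Result<Option<Vec<IoK8sApiCoreV1Volume>>, Error> {
        Ok(None)
    }
}
```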