├── .gitignore
├── LICENSE
├── README.md
├── clouds
├── aws
│ └── README.md
├── azure
│ └── README.md
└── gcp
│ └── README.md
├── data-catalogs
├── README.md
├── boringcatalog
│ └── README.md
├── egeria
│ ├── README.md
│ ├── bin
│ │ ├── egeria-screen.sh
│ │ └── egeria-start.sh
│ └── etc
│ │ ├── egeria
│ │ └── homebrew.mxcl.egeria.plist
├── hive-metastore
│ ├── README.md
│ ├── etc
│ │ └── hive-site.xml
│ ├── pyutils
│ │ ├── SparkSessionUtil.py
│ │ └── hive-spark-client.py
│ └── sql
│ │ ├── hive-schema-2.3.0.postgres.sql
│ │ └── hive-txn-schema-2.3.0.postgres.sql
├── nessie
│ └── README.md
└── unity-catalog
│ └── README.md
├── data-processing
├── arkflow
│ └── README.md
├── dbt
│ └── README.md
├── polars
│ └── examples
│ │ └── unity-catalog
│ │ ├── Polars Unity Catalog Table Management Tutorial from Databricks notebook.ipynb
│ │ ├── Polars Unity Catalog Table Management Tutorial from a notebook outside Databricks.ipynb
│ │ └── readme.md
├── preswald
│ └── README.md
├── spark
│ ├── README.md
│ └── tools
│ │ ├── start-connect-server.sh
│ │ ├── start-history-server.sh
│ │ ├── stop-connect-server.sh
│ │ └── stop-history-server.sh
└── sqlmesh
│ ├── README.md
│ ├── examples
│ ├── 001-simple
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── audits
│ │ │ ├── .gitkeep
│ │ │ └── assert_positive_order_ids.sql
│ │ ├── config.yaml
│ │ ├── docs
│ │ │ └── sqlmesh-dag.html
│ │ ├── macros
│ │ │ ├── .gitkeep
│ │ │ └── __init__.py
│ │ ├── models
│ │ │ ├── .gitkeep
│ │ │ ├── full_model.sql
│ │ │ ├── incremental_model.sql
│ │ │ └── seed_model.sql
│ │ ├── seeds
│ │ │ ├── .gitkeep
│ │ │ └── seed_data.csv
│ │ └── tests
│ │ │ ├── .gitkeep
│ │ │ └── test_full_model.yaml
│ ├── 002-postgresql-state
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── audits
│ │ │ ├── .gitkeep
│ │ │ └── assert_positive_order_ids.sql
│ │ ├── config.yaml.sample
│ │ ├── macros
│ │ │ ├── .gitkeep
│ │ │ └── __init__.py
│ │ ├── models
│ │ │ ├── .gitkeep
│ │ │ ├── full_model.sql
│ │ │ ├── incremental_model.sql
│ │ │ └── seed_model.sql
│ │ ├── seeds
│ │ │ ├── .gitkeep
│ │ │ └── seed_data.csv
│ │ └── tests
│ │ │ ├── .gitkeep
│ │ │ └── test_full_model.yaml
│ ├── 003-python-simple
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── audits
│ │ │ └── assert_positive_order_ids.sql
│ │ ├── config.yaml
│ │ ├── macros
│ │ │ ├── .gitkeep
│ │ │ └── __init__.py
│ │ ├── models
│ │ │ ├── .gitkeep
│ │ │ ├── full_model.sql
│ │ │ ├── full_model_python.py
│ │ │ ├── incremental_model.sql
│ │ │ └── seed_model.sql
│ │ ├── requirements.txt
│ │ ├── seeds
│ │ │ ├── .gitkeep
│ │ │ └── seed_data.csv
│ │ ├── tests
│ │ │ ├── .gitkeep
│ │ │ └── test_full_model.yaml
│ │ └── tmp
│ │ │ └── test_sqlglot.py
│ ├── 004-python-ibis
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── audits
│ │ │ ├── .gitkeep
│ │ │ └── assert_positive_order_ids.sql
│ │ ├── config.yaml
│ │ ├── constants.py
│ │ ├── data
│ │ │ └── .gitkeep
│ │ ├── macros
│ │ │ ├── .gitkeep
│ │ │ └── __init__.py
│ │ ├── models
│ │ │ ├── .gitkeep
│ │ │ ├── full_model.sql
│ │ │ ├── ibis_full_model_python.py
│ │ │ ├── ibis_full_model_sql.py
│ │ │ ├── incremental_model.sql
│ │ │ └── seed_model.sql
│ │ ├── requirements.txt
│ │ └── seeds
│ │ │ ├── .gitkeep
│ │ │ └── seed_data.csv
│ ├── 005-pyspark-simple
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── audits
│ │ │ ├── .gitkeep
│ │ │ └── assert_positive_order_ids.sql
│ │ ├── config.yaml
│ │ ├── macros
│ │ │ ├── .gitkeep
│ │ │ └── __init__.py
│ │ ├── models
│ │ │ ├── .gitkeep
│ │ │ ├── full_model.sql
│ │ │ ├── full_model_python.py
│ │ │ ├── incremental_model.sql
│ │ │ ├── pyspark_model.py
│ │ │ └── seed_model.sql
│ │ ├── requirements.txt
│ │ ├── seeds
│ │ │ ├── .gitkeep
│ │ │ └── seed_data.csv
│ │ └── tests
│ │ │ ├── .gitkeep
│ │ │ └── test_full_model.yaml
│ ├── 006-e2e
│ │ └── .gitkeep
│ ├── 007-databricks-simple
│ │ ├── .env.sample
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── audits
│ │ │ ├── .gitkeep
│ │ │ └── assert_positive_order_ids.sql
│ │ ├── config.yaml.in
│ │ ├── macros
│ │ │ ├── .gitkeep
│ │ │ └── __init__.py
│ │ ├── models
│ │ │ ├── .gitignore
│ │ │ ├── .gitkeep
│ │ │ ├── full_model.sql.in
│ │ │ ├── incremental_model.sql.in
│ │ │ └── seed_model.sql.in
│ │ ├── seeds
│ │ │ ├── .gitkeep
│ │ │ └── seed_data.csv
│ │ └── tests
│ │ │ ├── .gitignore
│ │ │ ├── .gitkeep
│ │ │ └── test_full_model.yaml.in
│ └── 008-unitycatalog-simple
│ │ ├── .gitignore
│ │ ├── Makefile
│ │ ├── audits
│ │ ├── .gitkeep
│ │ └── assert_positive_order_ids.sql
│ │ ├── config.yaml
│ │ ├── macros
│ │ ├── .gitkeep
│ │ └── __init__.py
│ │ ├── models
│ │ ├── .gitkeep
│ │ ├── full_model.sql
│ │ ├── incremental_model.sql
│ │ └── seed_model.sql
│ │ ├── seeds
│ │ ├── .gitkeep
│ │ └── seed_data.csv
│ │ └── tests
│ │ ├── .gitkeep
│ │ └── test_full_model.yaml
│ └── tools
│ ├── clean-pg-state.sh
│ ├── init-dbs-w-env.sh
│ └── sqlmesh_dag_to_mermaid.py
├── data-quality
├── great-expectations
│ └── README.md
└── soda
│ └── README.md
├── data-storage
├── lakefs
│ ├── .gitignore
│ ├── README.md
│ ├── etc
│ │ ├── config.yaml
│ │ ├── homebrew.mxcl.lakefs.plist
│ │ └── lakefs
│ └── ipython-notebooks
│ │ └── lakefs-browse.ipynb
└── minio
│ ├── README.md
│ ├── etc
│ ├── homebrew.mxcl.minio.plist
│ └── minio
│ ├── ipython-notebooks
│ └── minio-browse.ipynb
│ └── python
│ └── minio-browse.py
├── db
├── dremio
│ └── README.md
├── duckdb
│ ├── .gitignore
│ ├── README.md
│ ├── data
│ │ ├── csv
│ │ │ └── .gitignore
│ │ └── parquet
│ │ │ └── .gitignore
│ ├── elt-geonames.py
│ └── ipython-notebooks
│ │ ├── data
│ │ └── duckdb-geonames-basic.ipynb
├── postgresql
│ ├── README.md
│ ├── data
│ ├── ipython-notebooks
│ │ ├── .gitignore
│ │ ├── config-sample.json
│ │ ├── confmgr.py
│ │ ├── data
│ │ ├── jars
│ │ ├── postgresql-pyspark.ipynb
│ │ └── postgresql-python-sdk.ipynb
│ ├── jars
│ │ └── .gitignore
│ ├── jupyter
│ │ └── pyspark-kernel.json
│ ├── nginx
│ │ └── conf.d
│ │ │ └── stream-postgresql.conf
│ └── sql
│ │ └── create-geonames-tables.sql
└── trino
│ └── README.md
├── images
└── data-catalogs
│ └── uc-ui.png
├── infrastructure
├── docker
│ ├── README.md
│ └── rancher-desktop.md
├── k8s
│ ├── README.md
│ └── demos
│ │ ├── archive
│ │ └── full-postgresql.yaml
│ │ ├── simple-postgresql.yaml
│ │ └── simple-shell.yaml
├── nexus
│ ├── README.md
│ └── systemd
│ │ └── nexus.service
└── serverless
│ └── README.md
├── orchestrators
├── airflow
│ ├── .gitignore
│ ├── Pipfile.in
│ └── README.md
└── n8n
│ └── README.md
├── packaging
├── deb-world
│ └── README.md
├── mac-world
│ └── README.md
└── rpm-world
│ └── README.md
├── parsers
├── jq
│ └── README.md
└── yq
│ └── README.md
├── programming
├── building
│ └── cmake
│ │ └── README.md
├── java-world
│ └── README.md
├── js-world
│ └── README.md
├── jupyter
│ └── jupyter-pyspark-duckdb
│ │ ├── .gitignore
│ │ ├── README.md
│ │ ├── data
│ │ └── parquet
│ │ │ └── user-details.parquet
│ │ └── ipython-notebooks
│ │ ├── readme.ipynb
│ │ ├── simple-duckdb-w-ext.ipynb
│ │ ├── simple-duckdb.ipynb
│ │ └── simple-spark-pandas.ipynb
└── python
│ ├── README.md
│ ├── example
│ ├── .gitignore
│ ├── .python-version
│ ├── README.md
│ ├── example.py
│ ├── hello.py
│ └── pyproject.toml
│ ├── geo
│ └── README.md
│ └── numbers
│ ├── .gitignore
│ ├── .python-version
│ ├── README.md
│ ├── pyproject.toml
│ └── src
│ └── numbers
│ ├── __init__.py
│ └── py.typed
└── secret-management
└── hashicorp-vault
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | .ipynb_checkpoints/
3 | __pycache__/
4 | # Jupyter
5 | .virtual_documents/
6 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Creative Commons Legal Code
2 |
3 | CC0 1.0 Universal
4 |
5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE
6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN
7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS
8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES
9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS
10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM
11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED
12 | HEREUNDER.
13 |
14 | Statement of Purpose
15 |
16 | The laws of most jurisdictions throughout the world automatically confer
17 | exclusive Copyright and Related Rights (defined below) upon the creator
18 | and subsequent owner(s) (each and all, an "owner") of an original work of
19 | authorship and/or a database (each, a "Work").
20 |
21 | Certain owners wish to permanently relinquish those rights to a Work for
22 | the purpose of contributing to a commons of creative, cultural and
23 | scientific works ("Commons") that the public can reliably and without fear
24 | of later claims of infringement build upon, modify, incorporate in other
25 | works, reuse and redistribute as freely as possible in any form whatsoever
26 | and for any purposes, including without limitation commercial purposes.
27 | These owners may contribute to the Commons to promote the ideal of a free
28 | culture and the further production of creative, cultural and scientific
29 | works, or to gain reputation or greater distribution for their Work in
30 | part through the use and efforts of others.
31 |
32 | For these and/or other purposes and motivations, and without any
33 | expectation of additional consideration or compensation, the person
34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she
35 | is an owner of Copyright and Related Rights in the Work, voluntarily
36 | elects to apply CC0 to the Work and publicly distribute the Work under its
37 | terms, with knowledge of his or her Copyright and Related Rights in the
38 | Work and the meaning and intended legal effect of CC0 on those rights.
39 |
40 | 1. Copyright and Related Rights. A Work made available under CC0 may be
41 | protected by copyright and related or neighboring rights ("Copyright and
42 | Related Rights"). Copyright and Related Rights include, but are not
43 | limited to, the following:
44 |
45 | i. the right to reproduce, adapt, distribute, perform, display,
46 | communicate, and translate a Work;
47 | ii. moral rights retained by the original author(s) and/or performer(s);
48 | iii. publicity and privacy rights pertaining to a person's image or
49 | likeness depicted in a Work;
50 | iv. rights protecting against unfair competition in regards to a Work,
51 | subject to the limitations in paragraph 4(a), below;
52 | v. rights protecting the extraction, dissemination, use and reuse of data
53 | in a Work;
54 | vi. database rights (such as those arising under Directive 96/9/EC of the
55 | European Parliament and of the Council of 11 March 1996 on the legal
56 | protection of databases, and under any national implementation
57 | thereof, including any amended or successor version of such
58 | directive); and
59 | vii. other similar, equivalent or corresponding rights throughout the
60 | world based on applicable law or treaty, and any national
61 | implementations thereof.
62 |
63 | 2. Waiver. To the greatest extent permitted by, but not in contravention
64 | of, applicable law, Affirmer hereby overtly, fully, permanently,
65 | irrevocably and unconditionally waives, abandons, and surrenders all of
66 | Affirmer's Copyright and Related Rights and associated claims and causes
67 | of action, whether now known or unknown (including existing as well as
68 | future claims and causes of action), in the Work (i) in all territories
69 | worldwide, (ii) for the maximum duration provided by applicable law or
70 | treaty (including future time extensions), (iii) in any current or future
71 | medium and for any number of copies, and (iv) for any purpose whatsoever,
72 | including without limitation commercial, advertising or promotional
73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each
74 | member of the public at large and to the detriment of Affirmer's heirs and
75 | successors, fully intending that such Waiver shall not be subject to
76 | revocation, rescission, cancellation, termination, or any other legal or
77 | equitable action to disrupt the quiet enjoyment of the Work by the public
78 | as contemplated by Affirmer's express Statement of Purpose.
79 |
80 | 3. Public License Fallback. Should any part of the Waiver for any reason
81 | be judged legally invalid or ineffective under applicable law, then the
82 | Waiver shall be preserved to the maximum extent permitted taking into
83 | account Affirmer's express Statement of Purpose. In addition, to the
84 | extent the Waiver is so judged Affirmer hereby grants to each affected
85 | person a royalty-free, non transferable, non sublicensable, non exclusive,
86 | irrevocable and unconditional license to exercise Affirmer's Copyright and
87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the
88 | maximum duration provided by applicable law or treaty (including future
89 | time extensions), (iii) in any current or future medium and for any number
90 | of copies, and (iv) for any purpose whatsoever, including without
91 | limitation commercial, advertising or promotional purposes (the
92 | "License"). The License shall be deemed effective as of the date CC0 was
93 | applied by Affirmer to the Work. Should any part of the License for any
94 | reason be judged legally invalid or ineffective under applicable law, such
95 | partial invalidity or ineffectiveness shall not invalidate the remainder
96 | of the License, and in such case Affirmer hereby affirms that he or she
97 | will not (i) exercise any of his or her remaining Copyright and Related
98 | Rights in the Work or (ii) assert any associated claims and causes of
99 | action with respect to the Work, in either case contrary to Affirmer's
100 | express Statement of Purpose.
101 |
102 | 4. Limitations and Disclaimers.
103 |
104 | a. No trademark or patent rights held by Affirmer are waived, abandoned,
105 | surrendered, licensed or otherwise affected by this document.
106 | b. Affirmer offers the Work as-is and makes no representations or
107 | warranties of any kind concerning the Work, express, implied,
108 | statutory or otherwise, including without limitation warranties of
109 | title, merchantability, fitness for a particular purpose, non
110 | infringement, or the absence of latent or other defects, accuracy, or
111 | the present or absence of errors, whether or not discoverable, all to
112 | the greatest extent permissible under applicable law.
113 | c. Affirmer disclaims responsibility for clearing rights of other persons
114 | that may apply to the Work or any use thereof, including without
115 | limitation any person's Copyright and Related Rights in the Work.
116 | Further, Affirmer disclaims responsibility for obtaining any necessary
117 | consents, permissions or other rights required for any use of the
118 | Work.
119 | d. Affirmer understands and acknowledges that Creative Commons is not a
120 | party to this document and has no duty or obligation with respect to
121 | this CC0 or use of the Work.
122 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Knowledge sharing - Cheat sheets
2 | ================================
3 |
4 | # Table of Content (ToC)
5 | * [Overview](#overview)
6 | * [References](#references)
7 | * [Programming](#programming)
8 | * [JavaScript (JS) development](#javascript-js-development)
9 | * [Java development](#java-development)
10 | * [JupyterLab with PySpark and DuckDB](#jupyterlab-with-pyspark-and-duckdb)
11 | * [Databases](#databases)
12 | * [PostgreSQL](#postgresql)
13 | * [Trino (formerly Presto)](#trino-formerly-presto)
14 | * [DuckDB](#duckdb)
15 | * [Cloud services](#cloud-services)
16 | * [Amazon web services (AWS)](#amazon-web-services-aws)
17 | * [Azure cloud services](#azure-cloud-services)
18 | * [Google Cloud Platform (GCP)](#google-cloud-platform-gcp)
19 | * [Data storage](#data-storage)
20 | * [Minio](#minio)
21 | * [LakeFS](#lakefs)
22 | * [Data processing](#data-processing)
23 | * [DBT](#dbt)
24 | * [Data catalogs](#data-catalogs)
25 | * [Unity Catalog](#unity-catalog)
26 | * [Hive Metastore](#hive-metastore)
27 | * [Egeria](#egeria)
28 | * [Infrastructure](#infrastructure)
29 | * [Docker](#docker)
30 | * [Kubernetes (k8s)](#kubernetes-k8s)
31 |
32 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go)
33 |
34 | # Overview
35 | [This project](https://github.com/data-engineering-helpers/ks-cheat-sheets)
36 | aims at collecting cheat sheets about software and data engineering subjects,
37 | such as for instance JavaScript programming or DuckDB.
38 | Each cheat sheet usually gives a quick starter and common use cases.
39 |
40 | Even though the members of the GitHub organization may be employed by
41 | some companies, they speak on their personal behalf and do not represent
42 | these companies.
43 |
44 | # References
45 | * [Architecture principles for data engineering pipelines on the Modern Data Stack (MDS)](https://github.com/data-engineering-helpers/architecture-principles)
46 | + [Material for the Data platform - Architecture principles](https://github.com/data-engineering-helpers/architecture-principles/blob/main/material/README.md)
47 | * Specifications/principles for a
48 | [data engineering pipeline deployment tool](https://github.com/data-engineering-helpers/data-pipeline-deployment)
49 | + [`dpcctl`, the Data Processing Pipeline (DPP) CLI utility](https://github.com/data-engineering-helpers/dppctl), a Minimal Viable Product (MVP) in Go
50 | * [Material for the Data platform - Modern Data Stack (MDS) in a box](https://github.com/data-engineering-helpers/mds-in-a-box/blob/main/README.md)
51 | * [Material for the Data platform - Data life cycle](https://github.com/data-engineering-helpers/data-life-cycle/blob/main/README.md)
52 | * [Material for the Data platform - Data contracts](https://github.com/data-engineering-helpers/data-contracts/blob/main/README.md)
53 | * [Material for the Data platform - Metadata](https://github.com/data-engineering-helpers/metadata/blob/main/README.md)
54 | * [Material for the Data platform - Data quality](https://github.com/data-engineering-helpers/data-quality/blob/main/README.md)
55 |
56 | # Programming
57 |
58 | ## JavaScript (JS) development
59 | * [Directory dedicated to JS world](programming/js-world/)
60 |
61 | ## Java development
62 | * [Directory dedicated to Java-world (including Scala) development](programming/java-world/)
63 |
64 | ## JupyterLab with PySpark and DuckDB
65 | * [Directory dedicated to JupyterLab with PySpark and DuckDB](programming/jupyter/jupyter-pyspark-duckdb)
66 |
67 | # Databases
68 |
69 | ## PostgreSQL
70 | * [Directory dedicated to PostgreSQL](db/postgresql/)
71 |
72 | ## Trino (formerly Presto)
73 | * [Directory dedicated to Trino](db/trino/)
74 |
75 | ## DuckDB
76 | * [Directory dedicated to DuckDB](db/duckdb/)
77 |
78 | # Cloud services
79 |
80 | ## Amazon web services (AWS)
81 | * [Directory dedicated to Amazon Web Services (AWS)](clouds/aws/)
82 |
83 | ## Azure cloud services
84 | * [Directory dedicated to Azure cloud services](clouds/azure/)
85 |
86 | ## Google Cloud Platform (GCP)
87 | * [Directory dedicated to Google Cloud Platform (GCP)](clouds/gcp/)
88 |
89 | # Data storage
90 |
91 | ## Minio
92 | * [Directory dedicated to Minio](data-storage/minio/)
93 |
94 | ## LakeFS
95 | * [Directory dedicated to LakeFS](data-storage/lakefs/)
96 |
97 | # Data processing
98 |
99 | ## DBT
100 | * [Directory dedicated to DBT](data-processing/dbt/)
101 |
102 | # Data catalogs
103 |
104 | ## Unity Catalog
105 | * [Directory dedicated to Unity Catalog](data-catalogs/unity-catalog/)
106 |
107 | ## Hive Metastore
108 | * [Directory dedicated to Hive Metastore](data-catalogs/hive-metastore/)
109 |
110 | ## Egeria
111 | * [Directory dedicated to Egeria OMAG (Open Metadata and Governance)](data-catalogs/egeria/)
112 |
113 | # Infrastructure
114 |
115 | ## Docker
116 | * [Directory dedicated to containerization (Docker)](infrastructure/docker/)
117 |
118 | ## Kubernetes (k8s)
119 | * [Directory dedicated to Kubernetes (k8s)](infrastructure/k8s/)
120 |
121 |
122 |
--------------------------------------------------------------------------------
/clouds/azure/README.md:
--------------------------------------------------------------------------------
1 | Cheat Sheet - Cloud - Azure
2 | ===========================
3 |
4 | # Table of Content (ToC)
5 |
6 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go)
7 |
8 | # Overview
9 | [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/clouds/azure/README.md)
10 | explains how to use and interact with Azure cloud services, that is,
11 | installing Azure command-line utilities such as `az` and `bcp`
12 | and interacting with remote Azure services (_e.g._, ADLS, Azure VM).
13 |
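14 | * For instance, a minimal sketch to install the Azure CLI with Homebrew and
15 |   sign in (the exact login flow may differ depending on your tenant and setup):
16 | ```bash
17 | brew install azure-cli
18 | az login
19 | az account show
20 | ```
21 |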
14 | # References
15 |
16 | ## Data Engineering helpers
17 | * [Architecture principles for data engineering pipelines on the Modern Data Stack (MDS)](https://github.com/data-engineering-helpers/architecture-principles)
18 | * [Data Engineering Helpers - Knowledge Sharing - PostgreSQL](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/postgresql/README.md)
19 | * [Data Engineering Helpers - Knowledge Sharing - Kubernetes (k8s)](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/infrastructure/k8s/README.md)
20 |
21 |
22 |
--------------------------------------------------------------------------------
/clouds/gcp/README.md:
--------------------------------------------------------------------------------
1 | Cheat Sheet - Cloud - Google Cloud Platform (GCP)
2 | =================================================
3 |
4 | # Table of Content (ToC)
5 |
6 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go)
7 |
8 | # Overview
9 | [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/clouds/gcp/README.md)
10 | explains how to use and interact with Google Cloud Platform (GCP) services,
11 | that is, installing GCP command-line utilities such as `gcloud`
12 | and interacting with remote GCP services (_e.g._, GCS, GKE, BigQuery).
13 |
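14 | * For instance, a minimal sketch to install the `gcloud` CLI with Homebrew and
15 |   initialize it (the cask name and login flow may differ on your setup):
16 | ```bash
17 | brew install --cask google-cloud-sdk
18 | gcloud init
19 | gcloud auth login
20 | ```
21 |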
14 | # References
15 |
16 | ## Data Engineering helpers
17 | * [Architecture principles for data engineering pipelines on the Modern Data Stack (MDS)](https://github.com/data-engineering-helpers/architecture-principles)
18 | * [Data Engineering Helpers - Knowledge Sharing - PostgreSQL](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/postgresql/README.md)
19 | * [Data Engineering Helpers - Knowledge Sharing - Kubernetes (k8s)](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/infrastructure/k8s/README.md)
20 |
21 |
22 |
--------------------------------------------------------------------------------
/data-catalogs/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # References
4 |
5 | ## Articles
6 |
7 | ### Comprehensive Data Catalog Comparison
8 | * Title: Comprehensive Data Catalog Comparison
9 | * Author: Kyle Weller
10 | ([Kyle Weller on LinkedIn](https://www.linkedin.com/in/lakehouse/))
11 | * Date: Dec. 2024
12 | * Link to the article: https://www.onehouse.ai/blog/comprehensive-data-catalog-comparison
13 |
14 | ## Iceberg REST API
15 | * Specification in GitHub: https://github.com/apache/iceberg/blob/main/open-api/rest-catalog-open-api.yaml
16 | * Article on Substack by Alex Merced, Feb. 2025: https://amdatalakehouse.substack.com/p/iceberg-rest-catalog-overview-1-introduction
17 |
18 |
--------------------------------------------------------------------------------
/data-catalogs/boringcatalog/README.md:
--------------------------------------------------------------------------------
1 | Cheat Sheet - Boring Catalog
2 | ============================
3 |
4 | # Table of Content (ToC)
5 | * [Overview](#overview)
6 | * [References](#references)
7 | * [Data Engineering helpers](#data-engineering-helpers)
8 | * [BoringData documentation](#boringdata-documentation)
9 | * [Setup](#setup)
10 |
11 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go)
12 |
13 | # Overview
14 | [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-catalogs/boringcatalog/README.md)
15 | explains how to install and to use
16 | [Boring Catalog](https://github.com/boringdata/boring-catalog)
17 | on premises, _e.g._, on a laptop or on a virtual machine (VM).
18 |
19 | # References
20 |
21 | ## Data Engineering helpers
22 | * [Data Engineering Helpers - Knowledge Sharing - Minio](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-storage/minio/README.md)
23 | * [Data Engineering Helpers - Knowledge Sharing - DuckDB](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/duckdb/README.md)
24 |
25 | ## BoringData documentation
26 | * GitHub page: https://github.com/boringdata/boring-catalog
27 | * Article on Substack by Julien Hurault (the author of Boring Catalog):
28 | https://juhache.substack.com/p/boring-iceberg-catalog
29 | * Showcase video by Daniel Beach, May 2025, Substack:
30 | https://substack.com/home/post/p-164251337
31 | * Companion article: https://substack.com/home/post/p-163944714
32 |
33 | # Setup
34 | * Optionally (but recommended), install `uv`
35 | * On MacOS:
36 | ```bash
37 | brew install uv
38 | ```
39 |
40 | * Optionally (but recommended), install a virtual environment with `uv`
41 | ```bash
42 | uv venv
43 | ```
44 |
45 | * Install the Boring-Catalog Python module (as well as a few other utilities)
46 | * With a standard Python environment:
47 | ```bash
48 | pip install -U pyiceberg duckdb polars boringcatalog
49 | ```
50 | * With `uv`:
51 | ```bash
52 | uv pip install -U pyiceberg duckdb polars boringcatalog
53 | ```
54 |
55 | * With a standard Python environment, the shell needs to be reloaded
56 | (`uv` already manages the virtual environment, so no additional step is needed
57 | with it)
58 | * With Bash
59 | ```bash
60 | exec bash
61 | ```
62 | * With Zsh
63 | ```bash
64 | exec zsh
65 | ```
66 |
67 | * From this point on, the `uv` variants of the commands are not repeated.
68 | With `uv`, just prefix every command with `uv run`. For instance:
69 | * Simple Python command: `uv run python -V`
70 | * BoringCatalog command (`ice`): `uv run ice`
71 | * And so on
72 |
73 | * Init the catalog with a storage location on S3:
74 | ```bash
75 | ice init -p warehouse=s3://mybucket/ice-warehouse
76 | ```
77 |
78 | * Download the NYC taxi trip Parquet file (about 45 MB) from the NYC TLC open-data CDN:
79 | ```bash
80 | curl https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2023-01.parquet -o /tmp/yellow_tripdata_2023-01.parquet
81 | ```
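82 | * Optionally, peek at the downloaded Parquet file before committing it, for
83 |   instance with Polars (installed above); a minimal sketch:
84 | ```bash
85 | uv run python -c "import polars as pl; df = pl.read_parquet('/tmp/yellow_tripdata_2023-01.parquet'); print(df.shape); print(df.head())"
86 | ```
87 |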
82 |
83 | * Create a table with the Parquet file:
84 | ```bash
85 | ice commit my_table --source /tmp/yellow_tripdata_2023-01.parquet
86 | ```
87 |
88 | * Launch the DuckDB prompt, pre-configured by Boring Catalog:
89 | ```bash
90 | ice duck
91 | ```
92 |
93 | * From the DuckDB prompt
94 | * Dump everything as DDL:
95 | ```sql
96 | .schema
97 | ```
98 | * Set the Boring-Catalog as the default schema:
99 | ```sql
100 | use ice_default;
101 | ```
102 | * Display the tables:
103 | ```sql
104 | show tables;
105 | ```
106 | * Display a few rows of the newly created table (prefixing the table name
107 | is optional, as a default schema has been specified; the prefix is kept here
108 | so that the line can still be copied/pasted even without specifying a default
109 | schema):
110 | ```sql
111 | select * from ice_default.my_table limit 10;
112 | ```
113 | * Describe the schema:
114 | ```sql
115 | desc ice_default.my_table;
116 | ```
117 | * Count the number of rows:
118 | ```sql
119 | select count(*) as nb_rows from ice_default.my_table;
120 | ┌────────────────┐
121 | │ nb_rows │
122 | │ int64 │
123 | ├────────────────┤
124 | │ 3066766 │
125 | │ (3.07 million) │
126 | └────────────────┘
127 | ```
128 | * Quit the DuckDB Shell:
129 | ```sql
130 | .quit
131 | ```
132 |
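133 | * As a cross-check outside of the `ice duck` prompt, the same row count can be
134 |   computed directly on the raw Parquet file with the DuckDB Python module
135 |   installed above (a minimal sketch; this queries the file itself, not the
136 |   Iceberg table):
137 | ```bash
138 | uv run python -c "import duckdb; print(duckdb.sql(\"select count(*) from read_parquet('/tmp/yellow_tripdata_2023-01.parquet')\").fetchone()[0])"
139 | ```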
--------------------------------------------------------------------------------
/data-catalogs/egeria/README.md:
--------------------------------------------------------------------------------
1 | Cheat Sheet - Data catalogs - Egeria
2 | ====================================
3 |
4 | # Table of Content (ToC)
5 |
6 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go)
7 |
8 | # Overview
9 | [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-catalogs/egeria/README.md)
10 | explains how to install and to use
11 | [Egeria](https://github.com/odpi/egeria/)
12 | on premises, _e.g._, on a laptop or on a virtual machine (VM).
13 |
14 | # References
15 |
16 | ## Data Engineering helpers
17 | * [Material for the Data platform - Modern Data Stack (MDS) in a box](https://github.com/data-engineering-helpers/mds-in-a-box/blob/main/README.md)
18 | * [Material for the Data platform - Data life cycle](https://github.com/data-engineering-helpers/data-life-cycle/blob/main/README.md)
19 | * [Data Engineering Helpers - Knowledge Sharing - Minio](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-storage/minio/README.md)
20 | * [Data Engineering Helpers - Knowledge Sharing - DuckDB](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/duckdb/README.md)
21 | * [Data Engineering Helpers - Knowledge Sharing - PostgreSQL](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/postgresql/README.md)
22 | * [Data Engineering Helpers - Knowledge Sharing - Hive Metastore](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-catalogs/hive-metastore/README.md)
23 | * [Data Engineering Helpers - Knowledge Sharing - Trino](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/trino/README.md)
24 | * [Data Engineering Helpers - Knowledge Sharing - Dremio](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/dremio/README.md)
25 | * [Data Engineering Helpers - Knowledge Sharing - Java world](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/programming/java-world/README.md)
26 |
27 | ## Egeria
28 | * Egeria home page: https://egeria-project.org/
29 | * GitHub project: https://github.com/odpi/egeria
30 |
31 | ## Articles
32 | * [Medium - ](),
33 | by [Xxx](),
34 | May 2024
35 |
36 | # Installation
37 |
38 | ## Egeria Git repository
39 | * If not done already, clone the Egeria Git repository (beware that it takes up
40 | half a GB of storage on disk, so a good internet connection is needed):
41 | ```bash
42 | $ mkdir -p ~/dev/metadata && cd ~/dev/metadata
43 | git clone git@github.com:odpi/egeria.git
44 | ```
45 |
46 | * Go into the Egeria project directory:
47 | ```bash
48 | $ cd ~/dev/metadata/egeria
49 | ```
50 |
51 | ## Egeria as a standalone server without Docker
52 | * Go into the standalone server directory:
53 | ```bash
54 | $ cd open-metadata-distribution/omag-server-platform/build/unpacked/egeria*gz/assembly/platform
55 | ```
56 |
57 | * Start the OMAG (Open Metadata and Governance) server:
58 | ```bash
59 | $ java -Dloader.path=lib,extra -jar omag-server-platform*.jar
60 | ```
61 |
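62 | * Optionally, check that the OMAG Server Platform is listening; a quick sketch,
63 |   assuming the default OMAG platform HTTPS port (9443), to be adjusted if needed:
64 | ```bash
65 | lsof -nP -iTCP:9443 -sTCP:LISTEN
66 | ```
67 |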
62 | * To shut down the server, just type Control-C in the terminal where Egeria was started
63 |
64 | * In order to ease the launching of the Egeria server in the background,
65 | two Shell scripts are provided in this Git repository:
66 | * [GitHub - KS Egeria - `bin/egeria-start.sh`](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-catalogs/egeria/bin/egeria-start.sh)
67 | * [GitHub - KS Egeria - `bin/egeria-screen.sh`](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-catalogs/egeria/bin/egeria-screen.sh)
68 | * If not done already, install `screen`, _e.g._, on MacOS: `brew install screen`
69 | * Copy those two scripts into the local `~/bin/` directory, so that the Egeria server may be launched and monitored as follows:
70 | * Installation:
71 | ```bash
72 | $ mkdir -p ~/bin
73 | curl https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/main/data-catalogs/egeria/bin/egeria-start.sh -o ~/bin/egeria-start.sh
74 | curl https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/main/data-catalogs/egeria/bin/egeria-screen.sh -o ~/bin/egeria-screen.sh
75 | chmod +x ~/bin/egeria-*.sh
76 | ```
77 | * Start the Egeria server in a dedicated screen:
78 | ```bash
79 | $ ~/bin/egeria-screen.sh
80 | ```
81 | * Go into the screen where the Egeria server has started:
82 | ```bash
83 | $ screen -r egeria
84 | ```
85 | * When finished, stop the Egeria server by typing Control-C in the screen where the Egeria server runs
86 |
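87 | * A few `screen` commands come in handy to manage that session (a quick sketch;
88 |   the `egeria` session name comes from the `egeria-screen.sh` script):
89 | ```bash
90 | # List the running screen sessions
91 | screen -ls
92 | # Stop the Egeria screen session (and the server it runs) without attaching to it
93 | screen -S egeria -X quit
94 | ```
95 |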
--------------------------------------------------------------------------------
/data-catalogs/egeria/bin/egeria-screen.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-catalogs/egeria/README.md
4 |
5 | screen -A -m -d -S egeria $HOME/bin/egeria-start.sh &
6 |
7 |
--------------------------------------------------------------------------------
/data-catalogs/egeria/bin/egeria-start.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #
3 | # https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-catalogs/egeria/README.md
4 |
5 | cd $HOME/dev/metadata/egeria/open-metadata-distribution/omag-server-platform/build/unpacked/egeria*gz/assembly/platform
6 | java -Dloader.path=lib,extra -jar omag-server-platform*.jar
7 |
8 |
--------------------------------------------------------------------------------
/data-catalogs/egeria/etc/egeria:
--------------------------------------------------------------------------------
1 | # Egeria default environment variables
2 |
3 |
--------------------------------------------------------------------------------
/data-catalogs/egeria/etc/homebrew.mxcl.egeria.plist:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict>
5 |   <key>KeepAlive</key>
6 |   <true/>
7 |   <key>EnvironmentVariables</key>
8 |   <dict>
9 |     <key>EGERIA_CONFIG_ENV_FILE</key>
10 |     <string>/etc/default/egeria</string>
11 |   </dict>
12 |   <key>Label</key>
13 |   <string>homebrew.mxcl.egeria</string>
14 |   <key>LimitLoadToSessionType</key>
15 |   <array>
16 |     <string>Aqua</string>
17 |     <string>Background</string>
18 |     <string>LoginWindow</string>
19 |     <string>StandardIO</string>
20 |     <string>System</string>
21 |   </array>
22 |   <key>ProgramArguments</key>
23 |   <array>
24 |     <string>$BREW_PFX/bin/egeria</string>
25 |   </array>
26 |   <key>RunAtLoad</key>
27 |   <true/>
28 |   <key>StandardErrorPath</key>
29 |   <string>$BREW_PFX/var/log/egeria.log</string>
30 |   <key>StandardOutPath</key>
31 |   <string>$BREW_PFX/var/log/egeria.log</string>
32 |   <key>WorkingDirectory</key>
33 |   <string>$BREW_PFX</string>
34 | </dict>
35 | </plist>
--------------------------------------------------------------------------------
/data-catalogs/hive-metastore/etc/hive-site.xml:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <configuration>
3 |   <property>
4 |     <name>hive.exec.scratchdir</name>
5 |     <value>scratchdir</value>
6 |     <description>Scratch space for Hive jobs</description>
7 |   </property>
8 |   <property>
9 |     <name>hive.metastore.warehouse.dir</name>
10 |     <value>spark-warehouse</value>
11 |     <description>Spark Warehouse</description>
12 |   </property>
13 |   <property>
14 |     <name>javax.jdo.option.ConnectionURL</name>
15 |     <value>jdbc:postgresql://localhost:5432/metastore</value>
16 |     <description>PostgreSQL JDBC driver connection URL</description>
17 |   </property>
18 |   <property>
19 |     <name>javax.jdo.option.ConnectionDriverName</name>
20 |     <value>org.postgresql.Driver</value>
21 |     <description>PostgreSQL metastore driver class name</description>
22 |   </property>
23 |   <property>
24 |     <name>javax.jdo.option.ConnectionUserName</name>
25 |     <value>metastore</value>
26 |     <description>username for the DB instance</description>
27 |   </property>
28 |   <property>
29 |     <name>javax.jdo.option.ConnectionPassword</name>
30 |     <value>metastore-passwd</value>
31 |     <description>password for the DB instance</description>
32 |   </property>
33 | </configuration>
--------------------------------------------------------------------------------
/data-catalogs/hive-metastore/pyutils/SparkSessionUtil.py:
--------------------------------------------------------------------------------
1 | #
2 | # File: https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-catalogs/hive-metastore/pyutils/SparkSessionUtil.py
3 | #
4 | #
5 | import os
6 | from pyspark import SparkConf, SparkContext
7 | from pyspark.sql import SparkSession
8 | from delta import *
9 | from pathlib import Path
10 |
11 | DATABRICKS_SERVICE_PORT = "8787"
12 |
13 |
14 | class SparkSessionUtil:
15 | """
16 | Helper class for configuring Spark session based on the spark environment being used.
17 | Determines whether are using local spark, databricks-connect or directly executing on a cluster and sets up config
18 | settings for local spark as required.
19 | """
20 |
21 | @staticmethod
22 | def get_configured_spark_session(cluster_id=None):
23 | """
24 | Determines the execution environment and returns a spark session configured for either local or cluster usage
25 | accordingly
26 | :param cluster_id: a cluster_id to connect to if using databricks-connect
27 | :return: a configured spark session. We use the spark.sql.cerespower.session.environment custom property to store
28 | the environment for which the session is created, being either 'databricks', 'db_connect' or 'local'
29 | """
30 | # Note: We must enable Hive support on our original Spark Session for it to work with any we recreate locally
31 | # from the same context configuration.
32 | # if SparkSession._instantiatedSession:
33 | # return SparkSession._instantiatedSession
34 | if SparkSession.getActiveSession():
35 | return SparkSession.getActiveSession()
36 | spark = SparkSession.builder.config("spark.sql.cerespower.session.environment", "databricks").getOrCreate()
37 | if SparkSessionUtil.is_cluster_direct_exec(spark):
38 | # simply return the existing spark session
39 | return spark
40 | conf = SparkConf()
41 | # copy all the configuration values from the current Spark Context
42 | for (k, v) in spark.sparkContext.getConf().getAll():
43 | conf.set(k, v)
44 | if SparkSessionUtil.is_databricks_connect():
45 | # set the cluster for execution as required
46 | # Note: we are unable to check whether the cluster_id has changed as this setting is unset at this point
47 | if cluster_id:
48 | conf.set("spark.databricks.service.clusterId", cluster_id)
49 | conf.set("spark.databricks.service.port", DATABRICKS_SERVICE_PORT)
50 | # stop the spark session context in order to create a new one with the required cluster_id, else we
51 | # will still use the current cluster_id for execution
52 | spark.stop()
53 | con = SparkContext(conf=conf)
54 | sess = SparkSession(con)
55 | return sess.builder.config("spark.sql.cerespower.session.environment", "db_connect",
56 | conf=conf).getOrCreate()
57 | else:
58 | # Set up for local spark installation
59 | # Note: metastore connection and configuration details are taken from \conf\hive-site.xml
60 | conf.set("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
61 | conf.set("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
62 | conf.set("spark.broadcast.compress", "false")
63 | conf.set("spark.shuffle.compress", "false")
64 | conf.set("spark.shuffle.spill.compress", "false")
65 | conf.set("spark.master", "local[*]")
66 | conf.set("spark.driver.host", "localhost")
67 | conf.set("spark.sql.debug.maxToStringFields", 1000)
68 | conf.set("spark.sql.hive.metastore.version", "2.3.7")
69 | conf.set("spark.sql.hive.metastore.schema.verification", "false")
70 | conf.set("spark.sql.hive.metastore.jars", "builtin")
71 | conf.set("spark.sql.hive.metastore.uris", "thrift://localhost:9083")
72 | conf.set("spark.sql.catalogImplementation", "hive")
73 | conf.set("spark.sql.cerespower.session.environment", "local")
74 | spark.stop()
75 | con = SparkContext(conf=conf)
76 | sess = SparkSession(con)
77 | builder = sess.builder.config(conf=conf)
78 |
79 | return configure_spark_with_delta_pip(builder).getOrCreate()
80 |
81 | @staticmethod
82 | def is_databricks_connect():
83 | """
84 | Determines whether the spark session is using databricks-connect, based on the existence of a 'databricks'
85 | directory within the SPARK_HOME directory
87 | :return: True if using databricks-connect to connect to a cluster, else False
88 | """
89 | return Path(os.environ.get('SPARK_HOME'), 'databricks').exists()
90 |
91 | @staticmethod
92 | def is_cluster_direct_exec(spark):
93 | """
94 | Determines whether executing directly on cluster, based on the existence of the clusterName configuration
95 | setting
96 | :param spark: the spark session
97 | :return: True if executing directly on a cluster, else False
98 | """
99 | # Note: using spark.conf.get(...) will cause the cluster to start, whereas spark.sparkContext.getConf().get does
100 | # not. As we may want to change the clusterid when using databricks-connect we don't want to start the wrong
101 | # cluster prematurely.
102 | return spark.sparkContext.getConf().get("spark.databricks.clusterUsageTags.clusterName", None) is not None
103 |
104 |
--------------------------------------------------------------------------------
/data-catalogs/hive-metastore/pyutils/hive-spark-client.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # File: https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-catalogs/hive-metastore/pyutils/hive-spark-client.py
4 | #
5 | #
6 | import ntpath
7 | import posixpath
8 | from os import path
9 | from SparkSessionUtil import SparkSessionUtil
10 |
11 | data_root = 'data'
12 | db_name = 'test_metastore_persist'
13 | table_name = 'test_table'
14 | db_path = f"'{path.join(data_root, db_name)}'".replace(ntpath.sep, posixpath.sep)
15 | spark = SparkSessionUtil.get_configured_spark_session()
16 | spark.sql(f"""create database if not exists {db_name} location {db_path}""")
17 | spark.sql(f"""create table if not exists {db_name}.{table_name}(Id int not null)""")
18 |
19 | # reset our spark session
20 | spark = None
21 |
22 | spark = SparkSessionUtil.get_configured_spark_session()
23 | # confirm the database and table created above are available in the metastore
24 | spark.sql(f"show tables in {db_name}").show(truncate=False)
25 |
--------------------------------------------------------------------------------
/data-catalogs/hive-metastore/sql/hive-txn-schema-2.3.0.postgres.sql:
--------------------------------------------------------------------------------
1 | -- Licensed to the Apache Software Foundation (ASF) under one or more
2 | -- contributor license agreements. See the NOTICE file distributed with
3 | -- this work for additional information regarding copyright ownership.
4 | -- The ASF licenses this file to You under the Apache License, Version 2.0
5 | -- (the "License"); you may not use this file except in compliance with
6 | -- the License. You may obtain a copy of the License at
7 | --
8 | -- http://www.apache.org/licenses/LICENSE-2.0
9 | --
10 | -- Unless required by applicable law or agreed to in writing, software
11 | -- distributed under the License is distributed on an "AS IS" BASIS,
12 | -- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | -- See the License for the specific language governing permissions and
14 | -- limitations under the License.
15 |
16 | --
17 | -- Tables for transaction management
18 | --
19 | -- File: https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-catalogs/hive-metastore/sql/hive-txn-schema-2.3.0.postgres.sql
20 | -- Source: https://github.com/apache/hive/blob/master/metastore/scripts/upgrade/postgres/hive-txn-schema-2.3.0.postgres.sql
21 | --
22 |
23 | CREATE TABLE TXNS (
24 | TXN_ID bigint PRIMARY KEY,
25 | TXN_STATE char(1) NOT NULL,
26 | TXN_STARTED bigint NOT NULL,
27 | TXN_LAST_HEARTBEAT bigint NOT NULL,
28 | TXN_USER varchar(128) NOT NULL,
29 | TXN_HOST varchar(128) NOT NULL,
30 | TXN_AGENT_INFO varchar(128),
31 | TXN_META_INFO varchar(128),
32 | TXN_HEARTBEAT_COUNT integer
33 | );
34 |
35 | CREATE TABLE TXN_COMPONENTS (
36 | TC_TXNID bigint REFERENCES TXNS (TXN_ID),
37 | TC_DATABASE varchar(128) NOT NULL,
38 | TC_TABLE varchar(128),
39 | TC_PARTITION varchar(767) DEFAULT NULL,
40 | TC_OPERATION_TYPE char(1) NOT NULL
41 | );
42 |
43 | CREATE INDEX TC_TXNID_INDEX ON TXN_COMPONENTS USING hash (TC_TXNID);
44 |
45 | CREATE TABLE COMPLETED_TXN_COMPONENTS (
46 | CTC_TXNID bigint,
47 | CTC_DATABASE varchar(128) NOT NULL,
48 | CTC_TABLE varchar(256),
49 | CTC_PARTITION varchar(767)
50 | );
51 |
52 | CREATE TABLE NEXT_TXN_ID (
53 | NTXN_NEXT bigint NOT NULL
54 | );
55 | INSERT INTO NEXT_TXN_ID VALUES(1);
56 |
57 | CREATE TABLE HIVE_LOCKS (
58 | HL_LOCK_EXT_ID bigint NOT NULL,
59 | HL_LOCK_INT_ID bigint NOT NULL,
60 | HL_TXNID bigint,
61 | HL_DB varchar(128) NOT NULL,
62 | HL_TABLE varchar(128),
63 | HL_PARTITION varchar(767) DEFAULT NULL,
64 | HL_LOCK_STATE char(1) NOT NULL,
65 | HL_LOCK_TYPE char(1) NOT NULL,
66 | HL_LAST_HEARTBEAT bigint NOT NULL,
67 | HL_ACQUIRED_AT bigint,
68 | HL_USER varchar(128) NOT NULL,
69 | HL_HOST varchar(128) NOT NULL,
70 | HL_HEARTBEAT_COUNT integer,
71 | HL_AGENT_INFO varchar(128),
72 | HL_BLOCKEDBY_EXT_ID bigint,
73 | HL_BLOCKEDBY_INT_ID bigint,
74 | PRIMARY KEY(HL_LOCK_EXT_ID, HL_LOCK_INT_ID)
75 | );
76 |
77 | CREATE INDEX HL_TXNID_INDEX ON HIVE_LOCKS USING hash (HL_TXNID);
78 |
79 | CREATE TABLE NEXT_LOCK_ID (
80 | NL_NEXT bigint NOT NULL
81 | );
82 | INSERT INTO NEXT_LOCK_ID VALUES(1);
83 |
84 | CREATE TABLE COMPACTION_QUEUE (
85 | CQ_ID bigint PRIMARY KEY,
86 | CQ_DATABASE varchar(128) NOT NULL,
87 | CQ_TABLE varchar(128) NOT NULL,
88 | CQ_PARTITION varchar(767),
89 | CQ_STATE char(1) NOT NULL,
90 | CQ_TYPE char(1) NOT NULL,
91 | CQ_TBLPROPERTIES varchar(2048),
92 | CQ_WORKER_ID varchar(128),
93 | CQ_START bigint,
94 | CQ_RUN_AS varchar(128),
95 | CQ_HIGHEST_TXN_ID bigint,
96 | CQ_META_INFO bytea,
97 | CQ_HADOOP_JOB_ID varchar(32)
98 | );
99 |
100 | CREATE TABLE NEXT_COMPACTION_QUEUE_ID (
101 | NCQ_NEXT bigint NOT NULL
102 | );
103 | INSERT INTO NEXT_COMPACTION_QUEUE_ID VALUES(1);
104 |
105 | CREATE TABLE COMPLETED_COMPACTIONS (
106 | CC_ID bigint PRIMARY KEY,
107 | CC_DATABASE varchar(128) NOT NULL,
108 | CC_TABLE varchar(128) NOT NULL,
109 | CC_PARTITION varchar(767),
110 | CC_STATE char(1) NOT NULL,
111 | CC_TYPE char(1) NOT NULL,
112 | CC_TBLPROPERTIES varchar(2048),
113 | CC_WORKER_ID varchar(128),
114 | CC_START bigint,
115 | CC_END bigint,
116 | CC_RUN_AS varchar(128),
117 | CC_HIGHEST_TXN_ID bigint,
118 | CC_META_INFO bytea,
119 | CC_HADOOP_JOB_ID varchar(32)
120 | );
121 |
122 | CREATE TABLE AUX_TABLE (
123 | MT_KEY1 varchar(128) NOT NULL,
124 | MT_KEY2 bigint NOT NULL,
125 | MT_COMMENT varchar(255),
126 | PRIMARY KEY(MT_KEY1, MT_KEY2)
127 | );
128 |
129 | CREATE TABLE WRITE_SET (
130 | WS_DATABASE varchar(128) NOT NULL,
131 | WS_TABLE varchar(128) NOT NULL,
132 | WS_PARTITION varchar(767),
133 | WS_TXNID bigint NOT NULL,
134 | WS_COMMIT_ID bigint NOT NULL,
135 | WS_OPERATION_TYPE char(1) NOT NULL
136 | );
137 |
--------------------------------------------------------------------------------
/data-catalogs/nessie/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-catalogs/nessie/README.md
--------------------------------------------------------------------------------
/data-processing/arkflow/README.md:
--------------------------------------------------------------------------------
1 | Cheat Sheet - ArkFlow
2 | ======================
3 |
4 | # Table of Content (ToC)
5 | * [Overview](#overview)
6 | * [References](#references)
7 | * [Data Engineering helpers](#data-engineering-helpers)
8 | * [ArkFlow](#arkflow)
9 | * [Quickstart](#quickstart)
10 | * [Installation](#installation)
11 | * [Rust](#rust)
12 | * [Build from the sources](#build-from-the-sources)
13 |
14 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go)
15 |
16 | # Overview
17 | [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-processing/arkflow/README.md)
18 | explains how to install and to use [ArkFlow](https://ark-flow.github.io/arkflow/)
19 | on premises, _e.g._, on a laptop or on a virtual machine (VM).
20 |
21 | # References
22 |
23 | ## Data Engineering helpers
24 | * [Data Engineering Helpers - Knowledge Sharing - SQLMesh](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-processing/sqlmesh/README.md)
25 | * [Material for the Data platform - Modern Data Stack (MDS) in a box](https://github.com/data-engineering-helpers/mds-in-a-box/blob/main/README.md)
26 | * [Material for the Data platform - Data life cycle](https://github.com/data-engineering-helpers/data-life-cycle/blob/main/README.md)
27 | * [Data Engineering Helpers - Knowledge Sharing - Minio](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-storage/minio/README.md)
28 | * [Data Engineering Helpers - Knowledge Sharing - DuckDB](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/duckdb/README.md)
29 |
30 | ## ArkFlow
31 | * GitHub repository: https://github.com/ark-flow/arkflow
32 |
33 | # Quickstart
34 | * Create a configuration file `config.yaml`:
35 | ```yaml
36 | logging:
37 | level: info
38 | streams:
39 | - input:
40 | type: "generate"
41 | context: '{ "timestamp": 1625000000000, "value": 10, "sensor": "temp_1" }'
42 | interval: 1s
43 | batch_size: 10
44 |
45 | pipeline:
46 | thread_num: 4
47 | processors:
48 | - type: "json_to_arrow"
49 | - type: "sql"
50 | query: "SELECT * FROM flow WHERE value >= 10"
51 |
52 | output:
53 | type: "stdout"
54 | ```
55 |
56 | * Run ArkFlow:
57 | ```bash
58 | ./target/release/arkflow --config config.yaml
59 | ```
60 |
61 | # Installation
62 |
63 | ## Rust
64 | Rust tools need to be installed
65 |
66 | * On MacOS:
67 | ```bash
68 | brew install rust
69 | ```
70 |
71 | ## Build from the sources
72 | * Clone the Git repository:
73 | ```bash
74 | mkdir -p ~/dev/infra
75 | git clone https://github.com/ark-flow/arkflow.git ~/dev/infra/arkflow
76 | cd ~/dev/infra/arkflow
77 | ```
78 |
79 | * Build the project:
80 | ```bash
81 | cargo build --release
82 | ```
83 |
84 | * Run tests:
85 | ```bash
86 | cargo test
87 | ```
88 |
89 |
--------------------------------------------------------------------------------
/data-processing/dbt/README.md:
--------------------------------------------------------------------------------
1 | Cheat Sheet - dbt
2 | =================
3 |
4 | # Table of Content (ToC)
5 | * [Overview](#overview)
6 | * [References](#references)
7 | * [Data Engineering helpers](#data-engineering-helpers)
8 | * [DuckDB](#duckdb)
9 | * [dbt](#dbt)
10 | * [dbt\-duckdb](#dbt-duckdb)
11 | * [Quickstart](#quickstart)
12 | * [Installation](#installation)
13 | * [Dependencies](#dependencies)
14 |
15 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go)
16 |
17 | # Overview
18 | [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-processing/dbt/README.md)
19 | explains how to install and to use [dbt](https://getdbt.com/) with
20 | [DuckDB](https://duckdb.org/) on premises, _e.g._, on a laptop or
21 | on a virtual machine (VM).
22 |
23 | > [DuckDB](https://duckdb.org/) is an embedded database, similar to SQLite,
24 | > but designed for OLAP-style analytics. It is crazy fast and allows you
25 | > to read and write data stored in CSV, JSON, and Parquet files directly,
26 | > without requiring you to load them into the database first.
27 |
28 | > [dbt](https://getdbt.com/) is the best way to manage a collection of data
29 | > transformations written in SQL or Python for analytics and data science.
30 | > [`dbt-duckdb`](https://github.com/duckdb/dbt-duckdb) is the project that ties
31 | > DuckDB and dbt together, allowing you to create a
32 | > [Modern Data Stack In A Box](https://duckdb.org/2022/10/12/modern-data-stack-in-a-box.html)
33 | > or a simple and powerful data lakehouse with Python.
34 |
35 | # References
36 |
37 | ## Data Engineering helpers
38 | * [Data Engineering Helpers - Knowledge Sharing - SQLMesh](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-processing/sqlmesh/README.md)
39 | * [Material for the Data platform - Modern Data Stack (MDS) in a box](https://github.com/data-engineering-helpers/mds-in-a-box/blob/main/README.md)
40 | * [Material for the Data platform - Data life cycle](https://github.com/data-engineering-helpers/data-life-cycle/blob/main/README.md)
41 | * [Data Engineering Helpers - Knowledge Sharing - Minio](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-storage/minio/README.md)
42 | * [Data Engineering Helpers - Knowledge Sharing - DuckDB](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/duckdb/README.md)
43 |
44 | ## DuckDB
45 | * Home page: https://duckdb.org/
46 | * [DuckDB doc - HTTPFS extension](https://duckdb.org/docs/extensions/httpfs.html)
47 |
48 | ## dbt
49 | * Home page: https://getdbt.com
50 |
51 | ## `dbt-duckdb`
52 | * Git repository: https://github.com/duckdb/dbt-duckdb
53 |
54 | # Quickstart
55 |
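56 | * A minimal sketch of a first run, assuming `dbt-duckdb` has been installed and
57 |   a DuckDB profile has been configured (see the Installation section below);
58 |   the project name is just a placeholder:
59 | ```bash
60 | $ dbt init my_duckdb_project
61 | cd my_duckdb_project
62 | dbt debug
63 | dbt run
64 | ```
65 |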
56 | # Installation
57 | * Install `dbt-duckdb` as a Python module:
58 | ```bash
59 | $ python -mpip install -U pip dbt-duckdb
60 | ```
61 |
62 | * To enable persistency of the DuckDB-created tables in the
63 | [AWS Glue service](https://aws.amazon.com/glue/),
64 | install the Glue dependency:
65 | ```bash
66 | $ python -mpip install -U "dbt-duckdb[glue]"
67 | ```
68 |
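69 | * For local use, a minimal sketch of a DuckDB profile in the standard
70 |   `~/.dbt/profiles.yml` location; the profile name (`my_duckdb_project`) and the
71 |   database path are placeholders to adapt:
72 | ```bash
73 | mkdir -p ~/.dbt
74 | cat >> ~/.dbt/profiles.yml << 'EOF'
75 | my_duckdb_project:
76 |   target: dev
77 |   outputs:
78 |     dev:
79 |       type: duckdb
80 |       path: /tmp/my_duckdb_project.duckdb
81 |       threads: 4
82 | EOF
83 | ```
84 |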
69 | ## Dependencies
70 |
71 |
--------------------------------------------------------------------------------
/data-processing/polars/examples/unity-catalog/readme.md:
--------------------------------------------------------------------------------
1 | # Cheat Sheet - Polars
--------------------------------------------------------------------------------
/data-processing/preswald/README.md:
--------------------------------------------------------------------------------
1 | Cheat Sheet - Preswald
2 | ======================
3 |
4 | # Table of Content (ToC)
5 |
6 | # Overview
7 | [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-processing/preswald/README.md)
8 | explains how to install and to use
9 | [Preswald](https://github.com/StructuredLabs/preswald), _e.g._,
10 | on a laptop or on a virtual machine (VM).
11 |
12 | # References
13 |
14 | ## Data Engineering helpers
15 | * [Data Engineering Helpers - Knowledge Sharing - SQLMesh](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-processing/sqlmesh/)
16 | * [Data Engineering Helpers - Knowledge Sharing - DuckDB](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/duckdb/)
17 | * [Data Engineering Helpers - Knowledge Sharing - PostgreSQL](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/postgresql/)
18 | * [Data Engineering Helpers - Knowledge Sharing - Unity Catalog (UC)](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-catalogs/unity-catalog/)
19 | * [Data Engineering Helpers - Knowledge Sharing - Spark](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-processing/spark/)
20 | * [Data Engineering Helpers - Knowledge Sharing - dbt](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-processing/dbt/)
21 | * [Data Engineering Helpers - Knowledge Sharing - Airflow](https://github.com/data-engineering-helpers/ks-cheat-sheets/tree/main/orchestrators/airflow)
22 | * [Material for the Data platform - Data life cycle](https://github.com/data-engineering-helpers/data-life-cycle)
23 | * [Material for the Data platform - Modern Data Stack (MDS) in a box](https://github.com/data-engineering-helpers/mds-in-a-box)
24 |
25 | ## Preswald
26 | * GitHub repository: https://github.com/StructuredLabs/preswald
27 | * Home page: https://preswald.com
28 |
29 |
--------------------------------------------------------------------------------
/data-processing/spark/tools/start-connect-server.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #
4 | # Licensed to the Apache Software Foundation (ASF) under one or more
5 | # contributor license agreements. See the NOTICE file distributed with
6 | # this work for additional information regarding copyright ownership.
7 | # The ASF licenses this file to You under the Apache License, Version 2.0
8 | # (the "License"); you may not use this file except in compliance with
9 | # the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 | # Enter posix mode for bash
21 | set -o posix
22 |
23 | # Shell script for starting the Spark Connect server
24 | if [ -z "${SPARK_HOME}" ]; then
25 | export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
26 | fi
27 |
28 | # NOTE: This exact class name is matched downstream by SparkSubmit.
29 | # Any changes need to be reflected there.
30 | CLASS="org.apache.spark.sql.connect.service.SparkConnectServer"
31 |
32 | if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
33 | echo "Usage: ./sbin/start-connect-server.sh [options]"
34 |
35 | "${SPARK_HOME}"/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
36 | exit 1
37 | fi
38 |
39 | . "${SPARK_HOME}/bin/load-spark-env.sh"
40 |
41 | exec "${SPARK_HOME}"/sbin/spark-daemon.sh submit $CLASS 1 --name "Spark Connect server" "$@"
42 |
--------------------------------------------------------------------------------
/data-processing/spark/tools/start-history-server.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #
4 | # Licensed to the Apache Software Foundation (ASF) under one or more
5 | # contributor license agreements. See the NOTICE file distributed with
6 | # this work for additional information regarding copyright ownership.
7 | # The ASF licenses this file to You under the Apache License, Version 2.0
8 | # (the "License"); you may not use this file except in compliance with
9 | # the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 | # Starts the history server on the machine this script is executed on.
21 | #
22 | # Usage: start-history-server.sh
23 | #
24 | # Use the SPARK_HISTORY_OPTS environment variable to set history server configuration.
25 | #
26 |
27 | if [ -z "${SPARK_HOME}" ]; then
28 | export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
29 | fi
30 |
31 | # NOTE: This exact class name is matched downstream by SparkSubmit.
32 | # Any changes need to be reflected there.
33 | CLASS="org.apache.spark.deploy.history.HistoryServer"
34 |
35 | if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
36 | echo "Usage: ./sbin/start-history-server.sh [options]"
37 | pattern="Usage:"
38 | pattern+="\|Using Spark's default log4j profile:"
39 | pattern+="\|Started daemon with process name"
40 | pattern+="\|Registered signal handler for"
41 |
42 | "${SPARK_HOME}"/bin/spark-class $CLASS --help 2>&1 | grep -v "$pattern" 1>&2
43 | exit 1
44 | fi
45 |
46 | . "${SPARK_HOME}/sbin/spark-config.sh"
47 | . "${SPARK_HOME}/bin/load-spark-env.sh"
48 |
49 | exec "${SPARK_HOME}/sbin"/spark-daemon.sh start $CLASS 1 "$@"
50 |
--------------------------------------------------------------------------------
/data-processing/spark/tools/stop-connect-server.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #
4 | # Licensed to the Apache Software Foundation (ASF) under one or more
5 | # contributor license agreements. See the NOTICE file distributed with
6 | # this work for additional information regarding copyright ownership.
7 | # The ASF licenses this file to You under the Apache License, Version 2.0
8 | # (the "License"); you may not use this file except in compliance with
9 | # the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 | # Stops the connect server on the machine this script is executed on.
21 |
22 | if [ -z "${SPARK_HOME}" ]; then
23 | export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
24 | fi
25 |
26 | "${SPARK_HOME}/sbin"/spark-daemon.sh stop org.apache.spark.sql.connect.service.SparkConnectServer 1
27 |
--------------------------------------------------------------------------------
/data-processing/spark/tools/stop-history-server.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | #
4 | # Licensed to the Apache Software Foundation (ASF) under one or more
5 | # contributor license agreements. See the NOTICE file distributed with
6 | # this work for additional information regarding copyright ownership.
7 | # The ASF licenses this file to You under the Apache License, Version 2.0
8 | # (the "License"); you may not use this file except in compliance with
9 | # the License. You may obtain a copy of the License at
10 | #
11 | # http://www.apache.org/licenses/LICENSE-2.0
12 | #
13 | # Unless required by applicable law or agreed to in writing, software
14 | # distributed under the License is distributed on an "AS IS" BASIS,
15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 | # See the License for the specific language governing permissions and
17 | # limitations under the License.
18 | #
19 |
20 | # Stops the history server on the machine this script is executed on.
21 |
22 | if [ -z "${SPARK_HOME}" ]; then
23 | export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)"
24 | fi
25 |
26 | "${SPARK_HOME}/sbin/spark-daemon.sh" stop org.apache.spark.deploy.history.HistoryServer 1
27 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/001-simple/.gitignore:
--------------------------------------------------------------------------------
1 | # SQLMesh
2 | /.cache/
3 | /db.db
4 | /db.db.wal
5 | /logs/
6 |
7 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/001-simple/Makefile:
--------------------------------------------------------------------------------
1 | #
2 | # File: https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-processing/sqlmesh/examples/001-simple/Makefile
3 | #
4 |
5 | clean-cache: ## Clean the SQLMesh cache directory
6 | rm -rf .cache
7 |
8 | clean-logs: ## Clean the SQLMesh logs
9 | rm -rf logs
10 |
11 | clean-wh: ## Clean the SQLMesh warehouse (DuckDB)
12 | rm -f db.db
13 |
14 | clean: clean-cache clean-logs clean-wh ## Clean potential previous states and logs
15 |
16 | hint-change: ## Hint for the changes to be made
17 | @echo "Edit the incremental_model.sql file and uncomment the z column"
18 | @echo "vi models/incremental_model.sql"
19 |
20 | plan-prod: ## Plan, backfill and apply changes
21 | sqlmesh plan
22 |
23 | plan-dev: ## Plan, backfill and apply changes in dev
24 | sqlmesh plan dev --include-unmodified
25 |
26 | audit: ## Audit
27 | sqlmesh audit
28 |
29 | test: ## Tests
30 | sqlmesh test
31 |
32 | list-tables-prod: ## List the tables in prod
33 | sqlmesh fetchdf "use sqlmesh_example; show tables"
34 |
35 | list-tables-dev: ## List the tables in dev
36 | sqlmesh fetchdf "use sqlmesh_example__dev; show tables"
37 |
38 | check-data-prod: ## Check the data in prod
39 | sqlmesh fetchdf "select * from sqlmesh_example.incremental_model"
40 |
41 | check-data-dev: ## Check the data in dev
42 | sqlmesh fetchdf "select * from sqlmesh_example__dev.incremental_model"
43 |
44 | diff: ## Differences between dev and prod
45 | sqlmesh table_diff prod:dev sqlmesh_example.incremental_model
46 |
47 | ui: ## Launch the UI
48 | sqlmesh ui --port 10000
49 |
50 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/001-simple/audits/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/001-simple/audits/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/001-simple/audits/assert_positive_order_ids.sql:
--------------------------------------------------------------------------------
1 | AUDIT (
2 | name assert_positive_order_ids,
3 | );
4 |
5 | SELECT *
6 | FROM @this_model
7 | WHERE
8 | item_id < 0
9 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/001-simple/config.yaml:
--------------------------------------------------------------------------------
1 | gateways:
2 | local:
3 | # DuckDB is used both as the execution engine (connection)
4 | # and to store the state (state_connection)
5 | connection:
6 | type: duckdb
7 | database: db.db
8 |
9 | default_gateway: local
10 |
11 | model_defaults:
12 | dialect: duckdb
13 | start: 2024-12-24
14 |
15 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/001-simple/docs/sqlmesh-dag.html:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/001-simple/macros/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/001-simple/macros/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/001-simple/macros/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/001-simple/macros/__init__.py
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/001-simple/models/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/001-simple/models/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/001-simple/models/full_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name sqlmesh_example.full_model,
3 | kind FULL,
4 | cron '@daily',
5 | grain item_id,
6 | audits (assert_positive_order_ids),
7 | );
8 |
9 | SELECT
10 | item_id,
11 | COUNT(DISTINCT id) AS num_orders,
12 | FROM
13 | sqlmesh_example.incremental_model
14 | GROUP BY item_id
15 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/001-simple/models/incremental_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name sqlmesh_example.incremental_model,
3 | kind INCREMENTAL_BY_TIME_RANGE (
4 | time_column event_date
5 | ),
6 | start '2020-01-01',
7 | cron '@daily',
8 | grain (id, event_date)
9 | );
10 |
11 | SELECT
12 | id,
13 | item_id,
14 | --'z' AS new_column, -- Added column
15 | event_date,
16 | FROM
17 | sqlmesh_example.seed_model
18 | WHERE
19 | event_date BETWEEN @start_date AND @end_date
20 |
21 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/001-simple/models/seed_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name sqlmesh_example.seed_model,
3 | kind SEED (
4 | path '../seeds/seed_data.csv'
5 | ),
6 | columns (
7 | id INTEGER,
8 | item_id INTEGER,
9 | event_date DATE
10 | ),
11 | grain (id, event_date)
12 | );
13 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/001-simple/seeds/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/001-simple/seeds/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/001-simple/seeds/seed_data.csv:
--------------------------------------------------------------------------------
1 | id,item_id,event_date
2 | 1,2,2020-01-01
3 | 2,1,2020-01-01
4 | 3,3,2020-01-03
5 | 4,1,2020-01-04
6 | 5,1,2020-01-05
7 | 6,1,2020-01-06
8 | 7,1,2020-01-07
9 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/001-simple/tests/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/001-simple/tests/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/001-simple/tests/test_full_model.yaml:
--------------------------------------------------------------------------------
1 | test_example_full_model:
2 | model: sqlmesh_example.full_model
3 | inputs:
4 | sqlmesh_example.incremental_model:
5 | rows:
6 | - id: 1
7 | item_id: 1
8 | - id: 2
9 | item_id: 1
10 | - id: 3
11 | item_id: 2
12 | outputs:
13 | query:
14 | rows:
15 | - item_id: 1
16 | num_orders: 2
17 | - item_id: 2
18 | num_orders: 1
19 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/002-postgresql-state/.gitignore:
--------------------------------------------------------------------------------
1 | # SQLMesh
2 | /.cache/
3 | /db.db
4 | /db.db.wal
5 | /logs/
6 | /config.yaml
7 |
8 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/002-postgresql-state/Makefile:
--------------------------------------------------------------------------------
1 | #
2 | # File: https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-processing/sqlmesh/examples/002-postgresql-state/Makefile
3 | #
4 |
5 | clean-cache: ## Clean the SQLMesh cache directory
6 | rm -rf .cache
7 |
8 | clean-logs: ## Clean the SQLMesh logs
9 | rm -rf logs
10 |
11 | clean-wh: ## Clean the SQLMesh warehouse (DuckDB)
12 | rm -f db.db
13 |
14 | clean-pg: ## Clean the SQLMesh state in the local PostgreSQL database
15 | @../../tools/clean-pg-state.sh
16 |
17 | clean: clean-cache clean-logs clean-wh clean-pg ## Clean potential previous states and logs
18 |
19 | init: ## Initialize the project with PostgreSQL to store the state
20 | cp -f config.yaml.sample config.yaml
21 | sed -i.bak -e 's//sqlmesh/' config.yaml && rm -f config.yaml.bak
22 |
23 | hint-change: ## Hint for the changes to be made
24 | @echo "Edit the incremental_model.sql file and uncomment the z column"
25 | @echo "vi models/incremental_model.sql"
26 |
27 | plan-prod: ## Plan, backfill and apply changes
28 | sqlmesh plan
29 |
30 | plan-dev: ## Plan, backfill and apply changes in dev
31 | sqlmesh plan dev --include-unmodified
32 |
33 | audit: ## Audit
34 | sqlmesh audit
35 |
36 | test: ## Tests
37 | sqlmesh test
38 |
39 | list-tables-prod: ## List the tables in prod
40 | sqlmesh fetchdf "use sqlmesh_example; show tables"
41 |
42 | list-tables-dev: ## List the tables in dev
43 | sqlmesh fetchdf "use sqlmesh_example__dev; show tables"
44 |
45 | check-data-prod: ## Check the data in prod
46 | sqlmesh fetchdf "select * from sqlmesh_example.incremental_model"
47 |
48 | check-data-dev: ## Check the data in dev
49 | sqlmesh fetchdf "select * from sqlmesh_example__dev.incremental_model"
50 |
51 | diff: ## Differences between dev and prod
52 | sqlmesh table_diff prod:dev sqlmesh_example.incremental_model
53 |
54 | ui: ## Launch the UI
55 | sqlmesh ui --port 10000
56 |
57 |
58 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/002-postgresql-state/audits/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/002-postgresql-state/audits/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/002-postgresql-state/audits/assert_positive_order_ids.sql:
--------------------------------------------------------------------------------
1 | AUDIT (
2 | name assert_positive_order_ids,
3 | );
4 |
5 | SELECT *
6 | FROM @this_model
7 | WHERE
8 | item_id < 0
9 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/002-postgresql-state/config.yaml.sample:
--------------------------------------------------------------------------------
1 | gateways:
2 | local:
3 | # * DuckDB is used only as the execution engine (connection)
4 | # * PostgreSQL is used to store the state (state_connection)
5 | # * Doc: https://sqlmesh.readthedocs.io/en/stable/guides/configuration/#overrides
6 | # * Doc: https://sqlmesh.readthedocs.io/en/stable/integrations/engines/postgres/#localbuilt-in-scheduler
7 | connection:
8 | type: duckdb
9 | database: db.db
10 | state_connection:
11 | type: postgres
12 | host: localhost
13 | port: 5432
14 | database: sqlmesh
15 | user: sqlmesh
16 | password:
17 |
18 | default_gateway: local
19 |
20 | model_defaults:
21 | dialect: duckdb
22 | start: 2024-12-24
23 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/002-postgresql-state/macros/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/002-postgresql-state/macros/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/002-postgresql-state/macros/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/002-postgresql-state/macros/__init__.py
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/002-postgresql-state/models/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/002-postgresql-state/models/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/002-postgresql-state/models/full_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name sqlmesh_example.full_model,
3 | kind FULL,
4 | cron '@daily',
5 | grain item_id,
6 | audits (assert_positive_order_ids),
7 | );
8 |
9 | SELECT
10 | item_id,
11 | COUNT(DISTINCT id) AS num_orders,
12 | FROM
13 | sqlmesh_example.incremental_model
14 | GROUP BY item_id
15 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/002-postgresql-state/models/incremental_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name sqlmesh_example.incremental_model,
3 | kind INCREMENTAL_BY_TIME_RANGE (
4 | time_column event_date
5 | ),
6 | start '2020-01-01',
7 | cron '@daily',
8 | grain (id, event_date)
9 | );
10 |
11 | SELECT
12 | id,
13 | item_id,
14 | --'z' AS new_column, -- Added column
15 | event_date,
16 | FROM
17 | sqlmesh_example.seed_model
18 | WHERE
19 | event_date BETWEEN @start_date AND @end_date
20 |
21 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/002-postgresql-state/models/seed_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name sqlmesh_example.seed_model,
3 | kind SEED (
4 | path '../seeds/seed_data.csv'
5 | ),
6 | columns (
7 | id INTEGER,
8 | item_id INTEGER,
9 | event_date DATE
10 | ),
11 | grain (id, event_date)
12 | );
13 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/002-postgresql-state/seeds/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/002-postgresql-state/seeds/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/002-postgresql-state/seeds/seed_data.csv:
--------------------------------------------------------------------------------
1 | id,item_id,event_date
2 | 1,2,2020-01-01
3 | 2,1,2020-01-01
4 | 3,3,2020-01-03
5 | 4,1,2020-01-04
6 | 5,1,2020-01-05
7 | 6,1,2020-01-06
8 | 7,1,2020-01-07
9 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/002-postgresql-state/tests/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/002-postgresql-state/tests/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/002-postgresql-state/tests/test_full_model.yaml:
--------------------------------------------------------------------------------
1 | test_example_full_model:
2 | model: sqlmesh_example.full_model
3 | inputs:
4 | sqlmesh_example.incremental_model:
5 | rows:
6 | - id: 1
7 | item_id: 1
8 | - id: 2
9 | item_id: 1
10 | - id: 3
11 | item_id: 2
12 | outputs:
13 | query:
14 | rows:
15 | - item_id: 1
16 | num_orders: 2
17 | - item_id: 2
18 | num_orders: 1
19 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/003-python-simple/.gitignore:
--------------------------------------------------------------------------------
1 | # SQLMesh
2 | /.cache/
3 | /db.db
4 | /dbwost.db
5 | /logs/
6 | /config.yaml
7 |
8 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/003-python-simple/Makefile:
--------------------------------------------------------------------------------
1 | #
2 | # File: https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-processing/sqlmesh/examples/003-python-simple/Makefile
3 | #
4 |
5 | clean-cache: ## Clean the SQLMesh cache directory
6 | rm -rf .cache
7 |
8 | clean-logs: ## Clean the SQLMesh logs
9 | rm -rf logs
10 |
11 | clean-wh: ## Clean the SQLMesh warehouse (DuckDB)
12 | rm -f db.db
13 |
14 | clean: clean-cache clean-logs clean-wh ## Clean potential previous states and logs
15 |
16 | hint-change: ## Hint for the changes to be made
17 | @echo "Edit the full_model_python.py file and uncomment the country column"
18 | @echo "vi models/full_model_python.py"
19 |
20 | plan-prod: ## Plan, backfill and apply changes
21 | sqlmesh plan
22 |
23 | plan-dev: ## Plan, backfill and apply changes in dev
24 | sqlmesh plan dev --include-unmodified
25 |
26 | audit: ## Audit
27 | sqlmesh audit
28 |
29 | test: ## Tests
30 | sqlmesh test
31 |
32 | list-tables-prod: ## List the tables in prod
33 | sqlmesh fetchdf "use sqlmesh_example; show tables"
34 |
35 | list-tables-dev: ## List the tables in dev
36 | sqlmesh fetchdf "use sqlmesh_example__dev; show tables"
37 |
38 | check-data-prod: ## Check the data in prod
39 | sqlmesh fetchdf "select * from sqlmesh_example.full_model_python"
40 |
41 | check-data-dev: ## Check the data in dev
42 | sqlmesh fetchdf "select * from sqlmesh_example__dev.full_model_python"
43 |
44 | diff: ## Differences between dev and prod
45 | sqlmesh table_diff prod:dev sqlmesh_example.full_model_python
46 |
47 | ui: ## Launch the UI
48 | sqlmesh ui --port 10000
49 |
50 |
51 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/003-python-simple/audits/assert_positive_order_ids.sql:
--------------------------------------------------------------------------------
1 | AUDIT (
2 | name assert_positive_order_ids,
3 | );
4 |
5 | SELECT *
6 | FROM @this_model
7 | WHERE
8 | item_id < 0
9 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/003-python-simple/config.yaml:
--------------------------------------------------------------------------------
1 | gateways:
2 | local:
3 | connection:
4 | type: duckdb
5 | database: db.db
6 |
7 | default_gateway: local
8 |
9 | model_defaults:
10 | dialect: duckdb
11 | start: 2024-12-26
12 |
13 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/003-python-simple/macros/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/003-python-simple/macros/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/003-python-simple/macros/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/003-python-simple/macros/__init__.py
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/003-python-simple/models/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/003-python-simple/models/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/003-python-simple/models/full_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name sqlmesh_example.full_model,
3 | kind FULL,
4 | cron '@daily',
5 | grain item_id,
6 | audits (assert_positive_order_ids),
7 | );
8 |
9 | SELECT
10 | item_id,
11 | COUNT(DISTINCT id) AS num_orders,
12 | FROM
13 | sqlmesh_example.incremental_model
14 | GROUP BY item_id
15 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/003-python-simple/models/full_model_python.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 | from datetime import datetime
3 |
4 | import pandas as pd
5 | #from sqlglot import exp
6 |
7 | from sqlmesh import ExecutionContext, model
8 | from sqlmesh.core.model import ModelKindName
9 |
10 |
11 | @model(
12 | "sqlmesh_example.full_model_python",
13 | kind=dict(name=ModelKindName.FULL),
14 | cron="@daily",
15 | columns={
16 | "id": "int",
17 | "name": "text",
18 | #"country": "text",
19 | },
20 | column_descriptions={
21 | "id": "Unique ID",
22 | "name": "Name corresponding to the ID",
23 | },
24 | grain=["id"],
25 | audits=[
26 | ("not_null", {"columns": ["id"]}),
27 | ],
28 | description="Simple Python model",
29 | )
30 | def execute(
31 | context: ExecutionContext,
32 | start: datetime,
33 | end: datetime,
34 | execution_time: datetime,
35 | **kwargs: t.Any,
36 | ) -> pd.DataFrame:
37 |
38 | df = pd.DataFrame([{"id": 1, "name": "Laura"}, {"id": 2, "name": "John"}, {"id": 3, "name": "Lucie"}])
39 | #df = pd.DataFrame([{"id": 1, "name": "Laura", "country": "DE"}, {"id": 2, "name": "John", "country": "UK"}, {"id": 3, "name": "Lucie", "country": "FR"}])
40 |
41 | return df
42 |
43 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/003-python-simple/models/incremental_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name sqlmesh_example.incremental_model,
3 | kind INCREMENTAL_BY_TIME_RANGE (
4 | time_column event_date
5 | ),
6 | start '2020-01-01',
7 | cron '@daily',
8 | grain (id, event_date)
9 | );
10 |
11 | SELECT
12 | id,
13 | item_id,
14 | --'z' AS new_column, -- Added column
15 | event_date,
16 | FROM
17 | sqlmesh_example.seed_model
18 | WHERE
19 | event_date BETWEEN @start_date AND @end_date
20 |
21 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/003-python-simple/models/seed_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name sqlmesh_example.seed_model,
3 | kind SEED (
4 | path '../seeds/seed_data.csv'
5 | ),
6 | columns (
7 | id INTEGER,
8 | item_id INTEGER,
9 | event_date DATE
10 | ),
11 | grain (id, event_date)
12 | );
13 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/003-python-simple/requirements.txt:
--------------------------------------------------------------------------------
1 | sqlmesh[web]
2 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/003-python-simple/seeds/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/003-python-simple/seeds/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/003-python-simple/seeds/seed_data.csv:
--------------------------------------------------------------------------------
1 | id,item_id,event_date
2 | 1,2,2020-01-01
3 | 2,1,2020-01-01
4 | 3,3,2020-01-03
5 | 4,1,2020-01-04
6 | 5,1,2020-01-05
7 | 6,1,2020-01-06
8 | 7,1,2020-01-07
9 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/003-python-simple/tests/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/003-python-simple/tests/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/003-python-simple/tests/test_full_model.yaml:
--------------------------------------------------------------------------------
1 | test_example_full_model:
2 | model: sqlmesh_example.full_model
3 | inputs:
4 | sqlmesh_example.incremental_model:
5 | rows:
6 | - id: 1
7 | item_id: 1
8 | - id: 2
9 | item_id: 1
10 | - id: 3
11 | item_id: 2
12 | outputs:
13 | query:
14 | rows:
15 | - item_id: 1
16 | num_orders: 2
17 | - item_id: 2
18 | num_orders: 1
19 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/003-python-simple/tmp/test_sqlglot.py:
--------------------------------------------------------------------------------
1 | import sqlglot
2 |
3 | #
4 | print(f"SQLGlot version: {sqlglot.__version__}")
5 |
6 | #
7 | sql = sqlglot.parse_one("l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1")
8 | print(f"SQL query: {sql}")
9 |
10 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/.gitignore:
--------------------------------------------------------------------------------
1 | # SQLMesh
2 | /.cache/
3 | /db.db
4 | /db.db.wal
5 | local.duckdb
6 | /logs/
7 | /config.yaml
8 |
9 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/Makefile:
--------------------------------------------------------------------------------
1 | #
2 | # File: https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-processing/sqlmesh/examples/004-python-ibis/Makefile
3 | #
4 |
5 | clean-cache: ## Clean the SQLMesh cache directory
6 | rm -rf .cache
7 |
8 | clean-logs: ## Clean the SQLMesh logs
9 | rm -rf logs
10 |
11 | clean-wh: ## Clean the SQLMesh warehouse (DuckDB)
12 | rm -f db.db
13 |
14 | clean: clean-cache clean-logs clean-wh ## Clean potential previous states and logs
15 |
16 | hint-change: ## Hint for the changes to be made
17 | @echo "Edit the incremental_model.sql file and uncomment the country column"
18 | @echo "vi models/incremental_model.sql"
19 |
20 | plan-prod: ## Plan, backfill and apply changes
21 | sqlmesh plan
22 |
23 | plan-dev: ## Plan, backfill and apply changes in dev
24 | sqlmesh plan dev --include-unmodified
25 |
26 | audit: ## Audit
27 | sqlmesh audit
28 |
29 | test: ## Tests
30 | sqlmesh test
31 |
32 | list-tables-prod: ## List the tables in prod
33 | sqlmesh fetchdf "use ibis; show tables"
34 |
35 | list-tables-dev: ## List the tables in dev
36 | sqlmesh fetchdf "use ibis__dev; show tables"
37 |
38 | check-data-prod: ## Check the data in prod
39 | sqlmesh fetchdf "select * from ibis.incremental_model"
40 |
41 | check-data-dev: ## Check the data in dev
42 | sqlmesh fetchdf "select * from ibis__dev.incremental_model"
43 |
44 | diff: ## Differences between dev and prod
45 | sqlmesh table_diff prod:dev ibis.incremental_model
46 |
47 | ui: ## Launch the UI
48 | sqlmesh ui --port 10000
49 |
50 |
51 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/audits/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/004-python-ibis/audits/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/audits/assert_positive_order_ids.sql:
--------------------------------------------------------------------------------
1 | AUDIT (
2 | name assert_positive_order_ids,
3 | );
4 |
5 | SELECT *
6 | FROM @this_model
7 | WHERE
8 | item_id < 0
9 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/config.yaml:
--------------------------------------------------------------------------------
1 | gateways:
2 | my_gateway:
3 | connection:
4 | type: duckdb
5 | catalogs:
6 | local: 'data/local.duckdb'
7 | model_defaults:
8 | dialect: 'duckdb'
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/constants.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | DB_PATH = os.path.join(os.path.dirname(__file__), "data/local.duckdb")
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/004-python-ibis/data/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/macros/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/004-python-ibis/macros/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/macros/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/004-python-ibis/macros/__init__.py
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/models/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/004-python-ibis/models/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/models/full_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name ibis.full_model,
3 | kind FULL,
4 | cron '@daily',
5 | grain item_id,
6 | audits (assert_positive_order_ids),
7 | );
8 |
9 | SELECT
10 | item_id,
11 | COUNT(DISTINCT id) AS num_orders,
12 | FROM
13 | ibis.incremental_model
14 | GROUP BY item_id
15 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/models/ibis_full_model_python.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 | from datetime import datetime
3 |
4 | import ibis # type: ignore
5 | import pandas as pd
6 | from constants import DB_PATH # type: ignore
7 | from sqlglot import exp
8 |
9 | from sqlmesh import ExecutionContext, model
10 | from sqlmesh.core.model import ModelKindName
11 |
12 |
13 | @model(
14 | "ibis.ibis_full_model_python",
15 | kind=dict(name=ModelKindName.FULL),
16 | columns={
17 | "item_id": "int",
18 | "num_orders": "int",
19 | },
20 | audits=["assert_positive_order_ids"],
21 | description="This model uses ibis to transform a `table` object and return a dataframe",
22 | )
23 | def execute(
24 | context: ExecutionContext,
25 | start: datetime,
26 | end: datetime,
27 | execution_time: datetime,
28 | **kwargs: t.Any,
29 | ) -> pd.DataFrame:
30 | # get physical table name
31 | upstream_model = exp.to_table(context.table("ibis.incremental_model"))
32 | # connect ibis to database
33 | con = ibis.duckdb.connect(DB_PATH)
34 |
35 | # retrieve table
36 | incremental_model = con.table(name=upstream_model.name, database=upstream_model.db)
37 |
38 | # build query
39 | count = incremental_model.id.nunique()
40 | aggregate = incremental_model.group_by("item_id").aggregate(num_orders=count)
41 | query = aggregate.order_by("item_id")
42 |
43 | return query.to_pandas()
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/models/ibis_full_model_sql.py:
--------------------------------------------------------------------------------
1 | import ibis # type: ignore
2 | from ibis.expr.operations import Namespace, UnboundTable # type: ignore
3 |
4 | from sqlmesh.core.macros import MacroEvaluator
5 | from sqlmesh.core.model import model
6 |
7 |
8 | @model(
9 | "ibis.ibis_full_model_sql",
10 | is_sql=True,
11 | kind="FULL",
12 | audits=["assert_positive_order_ids"],
13 | description="This model uses ibis to generate and return a SQL string",
14 | )
15 | def entrypoint(evaluator: MacroEvaluator) -> str:
16 | # create table reference
17 | incremental_model = UnboundTable(
18 | name="incremental_model",
19 | schema={"id": "int32", "item_id": "int32", "ds": "varchar"},
20 | namespace=Namespace(catalog="local", database="ibis"),
21 | ).to_expr()
22 |
23 | # build query
24 | count = incremental_model.id.nunique()
25 | aggregate = incremental_model.group_by("item_id").aggregate(num_orders=count)
26 | query = aggregate.order_by("item_id")
27 |
28 | return ibis.to_sql(query)
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/models/incremental_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name ibis.incremental_model,
3 | kind INCREMENTAL_BY_TIME_RANGE (
4 | time_column event_date
5 | ),
6 | start '2020-01-01',
7 | cron '@daily',
8 | grain (id, event_date)
9 | );
10 |
11 | SELECT
12 | id,
13 | item_id,
14 | --'z' AS new_column, -- Added column
15 | event_date,
16 | FROM
17 | ibis.seed_model
18 | WHERE
19 | event_date BETWEEN @start_date AND @end_date
20 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/models/seed_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name ibis.seed_model,
3 | kind SEED (
4 | path '../seeds/seed_data.csv'
5 | ),
6 | columns (
7 | id INTEGER,
8 | item_id INTEGER,
9 | event_date DATE
10 | ),
11 | grain (id, event_date)
12 | );
13 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/requirements.txt:
--------------------------------------------------------------------------------
1 | ibis-framework[duckdb]
2 | sqlmesh[web]
3 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/seeds/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/004-python-ibis/seeds/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/004-python-ibis/seeds/seed_data.csv:
--------------------------------------------------------------------------------
1 | id,item_id,event_date
2 | 1,2,2020-01-01
3 | 2,1,2020-01-01
4 | 3,3,2020-01-03
5 | 4,1,2020-01-04
6 | 5,1,2020-01-05
7 | 6,1,2020-01-06
8 | 7,1,2020-01-07
9 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/.gitignore:
--------------------------------------------------------------------------------
1 | # SQLMesh
2 | /.cache/
3 | /db.db
4 | /db.db.wal
5 | /logs/
6 | # Spark
7 | derby.log
8 | metastore_db/
9 | spark-warehouse/
10 |
11 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/Makefile:
--------------------------------------------------------------------------------
1 | #
2 | # File: https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-processing/sqlmesh/examples/005-pyspark-simple/Makefile
3 | #
4 |
5 | clean-cache: ## Clean the SQLMesh cache directory
6 | rm -rf .cache
7 |
8 | clean-logs: ## Clean the SQLMesh logs
9 | rm -rf logs
10 |
11 | clean-state: ## Clean the SQLMesh state (DuckDB)
12 | rm -f db.db db.db.wal
13 |
14 | clean-spark: ## Clean the warehouse (Spark metastore and warehouse)
15 | rm -rf derby.log metastore_db spark-warehouse
16 |
17 | clean: clean-cache clean-logs clean-state clean-spark ## Clean potential previous states and logs
18 |
19 | hint-change: ## Hint for the changes to be made
20 | @echo "Edit the full_model_python.py file and uncomment the country column"
21 | @echo "vi models/full_model_python.py"
22 |
23 | plan-prod: ## Plan, backfill and apply changes
24 | sqlmesh plan
25 |
26 | plan-dev: ## Plan, backfill and apply changes in dev
27 | sqlmesh plan dev --include-unmodified
28 |
29 | audit: ## Audit
30 | sqlmesh audit
31 |
32 | test: ## Tests
33 | sqlmesh test
34 |
35 | list-tables-prod: ## List the tables in prod
36 | sqlmesh fetchdf "show tables in spark_catalog.sqlmesh_example"
37 |
38 | list-tables-dev: ## List the tables in dev
39 | sqlmesh fetchdf "show tables in spark_catalog.sqlmesh_example__dev"
40 |
41 | check-data-prod: ## Check the data in prod
42 | sqlmesh fetchdf "select * from spark_catalog.sqlmesh_example.full_model_python"
43 |
44 | check-data-dev: ## Check the data in dev
45 | sqlmesh fetchdf "select * from sqlmesh_example__dev.full_model_python"
46 |
47 | diff: ## Differences between dev and prod
48 | sqlmesh table_diff prod:dev sqlmesh_example.full_model_python
49 |
50 | ui: ## Launch the UI
51 | sqlmesh ui --port 10000
52 |
53 |
54 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/audits/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/005-pyspark-simple/audits/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/audits/assert_positive_order_ids.sql:
--------------------------------------------------------------------------------
1 | AUDIT (
2 | name assert_positive_order_ids,
3 | );
4 |
5 | SELECT *
6 | FROM @this_model
7 | WHERE
8 | item_id < 0
9 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/config.yaml:
--------------------------------------------------------------------------------
1 | gateways:
2 | local:
3 | connection:
4 | type: spark
5 | state_connection:
6 | type: duckdb
7 | database: db.db
8 |
9 | default_gateway: local
10 |
11 | model_defaults:
12 | dialect: spark
13 | start: 2024-12-26
14 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/macros/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/005-pyspark-simple/macros/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/macros/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/005-pyspark-simple/macros/__init__.py
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/models/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/005-pyspark-simple/models/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/models/full_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name sqlmesh_example.full_model,
3 | kind FULL,
4 | cron '@daily',
5 | grain item_id,
6 | audits (assert_positive_order_ids),
7 | );
8 |
9 | SELECT
10 | item_id,
11 | COUNT(DISTINCT id) AS num_orders,
12 | FROM
13 | sqlmesh_example.incremental_model
14 | GROUP BY item_id
15 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/models/full_model_python.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 | from datetime import datetime
3 |
4 | import pandas as pd
5 | #from sqlglot import exp
6 |
7 | from sqlmesh import ExecutionContext, model
8 | from sqlmesh.core.model import ModelKindName
9 |
10 |
11 | @model(
12 | "sqlmesh_example.full_model_python",
13 | kind=dict(name=ModelKindName.FULL),
14 | cron="@daily",
15 | columns={
16 | "id": "int",
17 | "name": "text",
18 | #"country": "text",
19 | },
20 | column_descriptions={
21 | "id": "Unique ID",
22 | "name": "Name corresponding to the ID",
23 | },
24 | grain=["id"],
25 | audits=[
26 | ("not_null", {"columns": ["id"]}),
27 | ],
28 | description="Simple Python model",
29 | )
30 | def execute(
31 | context: ExecutionContext,
32 | start: datetime,
33 | end: datetime,
34 | execution_time: datetime,
35 | **kwargs: t.Any,
36 | ) -> pd.DataFrame:
37 |
38 | df = pd.DataFrame([{"id": 1, "name": "Laura"}, {"id": 2, "name": "John"}, {"id": 3, "name": "Lucie"}])
39 | #df = pd.DataFrame([{"id": 1, "name": "Laura", "country": "DE"}, {"id": 2, "name": "John", "country": "UK"}, {"id": 3, "name": "Lucie", "country": "FR"}])
40 |
41 | return df
42 |
43 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/models/incremental_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name sqlmesh_example.incremental_model,
3 | kind INCREMENTAL_BY_TIME_RANGE (
4 | time_column event_date
5 | ),
6 | start '2020-01-01',
7 | cron '@daily',
8 | grain (id, event_date)
9 | );
10 |
11 | SELECT
12 | id,
13 | item_id,
14 | event_date,
15 | FROM
16 | sqlmesh_example.seed_model
17 | WHERE
18 | event_date BETWEEN @start_date AND @end_date
19 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/models/pyspark_model.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 | from datetime import datetime
3 |
4 | import pandas as pd
5 | from pyspark.sql import DataFrame, functions
6 |
7 | from sqlmesh import ExecutionContext, model
8 |
9 | @model(
10 | "sqlmesh_example.pyspark",
11 | columns={
12 | "id": "int",
13 | "name": "text",
14 | "country": "text",
15 | },
16 | )
17 | def execute(
18 | context: ExecutionContext,
19 | start: datetime,
20 | end: datetime,
21 | execution_time: datetime,
22 | **kwargs: t.Any,
23 | ) -> DataFrame:
24 | # get the upstream model's name and register it as a dependency
25 | table = context.resolve_table("sqlmesh_example.full_model_python")
26 |
27 | # use the spark DataFrame api to add the country column
28 | df = context.spark.table(table).withColumn("country", functions.lit("USA"))
29 |
30 | # returns the pyspark DataFrame directly, so no data is computed locally
31 | return df
32 |
33 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/models/seed_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name sqlmesh_example.seed_model,
3 | kind SEED (
4 | path '../seeds/seed_data.csv'
5 | ),
6 | columns (
7 | id INTEGER,
8 | item_id INTEGER,
9 | event_date DATE
10 | ),
11 | grain (id, event_date)
12 | );
13 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/requirements.txt:
--------------------------------------------------------------------------------
1 | apache-airflow-providers-apache-spark
2 | sqlmesh[web]
3 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/seeds/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/005-pyspark-simple/seeds/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/seeds/seed_data.csv:
--------------------------------------------------------------------------------
1 | id,item_id,event_date
2 | 1,2,2020-01-01
3 | 2,1,2020-01-01
4 | 3,3,2020-01-03
5 | 4,1,2020-01-04
6 | 5,1,2020-01-05
7 | 6,1,2020-01-06
8 | 7,1,2020-01-07
9 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/tests/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/005-pyspark-simple/tests/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/005-pyspark-simple/tests/test_full_model.yaml:
--------------------------------------------------------------------------------
1 | test_example_full_model:
2 | model: sqlmesh_example.full_model
3 | inputs:
4 | sqlmesh_example.incremental_model:
5 | rows:
6 | - id: 1
7 | item_id: 1
8 | - id: 2
9 | item_id: 1
10 | - id: 3
11 | item_id: 2
12 | outputs:
13 | query:
14 | rows:
15 | - item_id: 1
16 | num_orders: 2
17 | - item_id: 2
18 | num_orders: 1
19 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/006-e2e/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/006-e2e/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/.env.sample:
--------------------------------------------------------------------------------
1 | # DataBricks
2 |
3 | ## Example: .cloud.databricks.com
4 | DBS_SVR_HST=${DBS_SVR_HST}
5 |
6 | ## Example: sql/protocolv1/o//
7 | DBS_HTTP_PATH=${DBS_HTTP_PATH}
8 |
9 | ## Personal Access Token (PAT)
10 | DBS_PAT=${DBS_PAT}
11 |
12 | ## DataBricks schema
13 | DBS_SCH=${DBS_SCH}
14 |
15 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/.gitignore:
--------------------------------------------------------------------------------
1 | # SQLMesh
2 | /.cache/
3 | /db.db
4 | /db.db.wal
5 | /logs/
6 | /config.yaml
7 | # Spark
8 | derby.log
9 | metastore_db/
10 | spark-warehouse/
11 | # Env
12 | /.env
13 |
14 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/Makefile:
--------------------------------------------------------------------------------
1 | #
2 | # File: https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-processing/sqlmesh/examples/007-databricks-simple/Makefile
3 | #
4 |
5 | # DBS_ environment variables
6 | include .env
7 |
8 | clean-cache: ## Clean the SQLMesh cache directory
9 | rm -rf .cache
10 |
11 | clean-logs: ## Clean the SQLMesh logs
12 | rm -rf logs
13 |
14 | clean-state: ## Clean the SQLMesh state (DuckDB)
15 | rm -f db.db db.db.wal
16 |
17 | clean-spark: ## Clean the warehouse (Spark metastore and warehouse)
18 | rm -rf derby.log metastore_db spark-warehouse
19 |
20 | clean: clean-cache clean-logs clean-state clean-spark ## Clean potential previous states and logs
21 |
22 | init-python: ## Install Python libraries
23 | python -mpip install -U "sqlmesh[web,databricks,spark]"
24 |
25 | init-files: ## Create files with instantiated env vars
26 | @../../tools/init-dbs-w-env.sh
27 |
28 | init: init-python init-files ## Initialization
29 |
30 | info: ## Info about the project
31 | sqlmesh info
32 |
33 | hint-change: ## Hint for the changes to be made
34 | @echo "Edit the incremental_model.sql file and uncomment the z column"
35 | @echo "vi models/incremental_model.sql"
36 |
37 | plan-prod: ## Plan, backfill and apply changes
38 | sqlmesh plan
39 |
40 | plan-dev: ## Plan, backfill and apply changes in dev
41 | sqlmesh plan dev --include-unmodified
42 |
43 | audit: ## Audit
44 | sqlmesh audit
45 |
46 | test: ## Tests
47 | sqlmesh test
48 |
49 | list-tables-prod: ## List the tables in prod
50 | sqlmesh fetchdf "show tables in $(DBS_SCH) like '*seed_model*|*incremental_model*|*full_model*'"
51 |
52 | list-tables-dev: ## List the tables in dev
53 | sqlmesh fetchdf "show tables in $(DBS_SCH) like '*seed_model__dev*|*incremental_model__dev*|*full_model__dev*'"
54 |
55 | check-data-prod: ## Check the data in prod
56 | sqlmesh fetchdf "select * from $(DBS_SCH).incremental_model"
57 |
58 | check-data-dev: ## Check the data in dev
59 | sqlmesh fetchdf "select * from $(DBS_SCH).incremental_model__dev"
60 |
61 | diff: ## Differences between dev and prod
62 | sqlmesh table_diff --temp-schema $(DBS_SCH) prod:dev $(DBS_SCH).incremental_model
63 |
64 | ui: ## Launch the UI
65 | sqlmesh ui --port 10000
66 |
67 |
68 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/audits/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/007-databricks-simple/audits/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/audits/assert_positive_order_ids.sql:
--------------------------------------------------------------------------------
1 | AUDIT (
2 | name assert_positive_order_ids,
3 | );
4 |
5 | SELECT *
6 | FROM @this_model
7 | WHERE
8 | item_id < 0
9 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/config.yaml.in:
--------------------------------------------------------------------------------
1 | gateways:
2 | local:
3 | connection:
4 | type: databricks
5 | server_hostname: {{ env_var('DBS_SVR_HST') }}
6 | http_path: {{ env_var('DBS_HTTP_PATH') }}
7 | access_token: {{ env_var('DBS_PAT') }}
8 | catalog: hive_metastore
9 | state_connection:
10 | type: duckdb
11 | database: db.db
12 |
13 | physical_schema_mapping:
14 | '^${DBS_SCH}$': {{ env_var('DBS_SCH') }}
15 |
16 | environment_suffix_target: table
17 |
18 | default_gateway: local
19 |
20 | model_defaults:
21 | dialect: spark
22 | start: 2024-12-29
23 |
24 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/macros/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/007-databricks-simple/macros/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/macros/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/007-databricks-simple/macros/__init__.py
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/models/.gitignore:
--------------------------------------------------------------------------------
1 | /*.sql
2 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/models/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/007-databricks-simple/models/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/models/full_model.sql.in:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name ${DBS_SCH}.full_model,
3 | kind FULL,
4 | cron '@daily',
5 | grain item_id,
6 | audits (assert_positive_order_ids),
7 | );
8 |
9 | SELECT
10 | item_id,
11 | COUNT(DISTINCT id) AS num_orders,
12 | FROM
13 | ${DBS_SCH}.incremental_model
14 | GROUP BY item_id
15 |
16 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/models/incremental_model.sql.in:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name ${DBS_SCH}.incremental_model,
3 | kind INCREMENTAL_BY_TIME_RANGE (
4 | time_column event_date
5 | ),
6 | start '2020-01-01',
7 | cron '@daily',
8 | grain (id, event_date)
9 | );
10 |
11 | SELECT
12 | id,
13 | item_id,
14 | --'z' AS new_column, -- Added column
15 | event_date,
16 | FROM
17 | ${DBS_SCH}.seed_model
18 | WHERE
19 | event_date BETWEEN @start_date AND @end_date
20 |
21 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/models/seed_model.sql.in:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name ${DBS_SCH}.seed_model,
3 | kind SEED (
4 | path '../seeds/seed_data.csv'
5 | ),
6 | columns (
7 | id INTEGER,
8 | item_id INTEGER,
9 | event_date DATE
10 | ),
11 | grain (id, event_date)
12 | );
13 |
14 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/seeds/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/007-databricks-simple/seeds/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/seeds/seed_data.csv:
--------------------------------------------------------------------------------
1 | id,item_id,event_date
2 | 1,2,2020-01-01
3 | 2,1,2020-01-01
4 | 3,3,2020-01-03
5 | 4,1,2020-01-04
6 | 5,1,2020-01-05
7 | 6,1,2020-01-06
8 | 7,1,2020-01-07
9 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/tests/.gitignore:
--------------------------------------------------------------------------------
1 | /*.yaml
2 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/tests/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/007-databricks-simple/tests/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/007-databricks-simple/tests/test_full_model.yaml.in:
--------------------------------------------------------------------------------
1 | test_example_full_model:
2 | model: ${DBS_SCH}.full_model
3 | inputs:
4 | ${DBS_SCH}.incremental_model:
5 | rows:
6 | - id: 1
7 | item_id: 1
8 | - id: 2
9 | item_id: 1
10 | - id: 3
11 | item_id: 2
12 | outputs:
13 | query:
14 | rows:
15 | - item_id: 1
16 | num_orders: 2
17 | - item_id: 2
18 | num_orders: 1
19 |
20 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/008-unitycatalog-simple/.gitignore:
--------------------------------------------------------------------------------
1 | # SQLMesh
2 | /.cache/
3 | /db.db
4 | /db.db.wal
5 | /logs/
6 | # Spark
7 | derby.log
8 | metastore_db/
9 | spark-warehouse/
10 | # Env
11 | /.env
12 |
13 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/008-unitycatalog-simple/Makefile:
--------------------------------------------------------------------------------
1 | #
2 | # File: https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-processing/sqlmesh/examples/008-unitycatalog-simple/Makefile
3 | #
4 |
5 | clean-cache: ## Clean the SQLMesh cache directory
6 | rm -rf .cache
7 |
8 | clean-logs: ## Clean the SQLMesh logs
9 | rm -rf logs
10 |
11 | clean-state: ## Clean the SQLMesh state (DuckDB)
12 | rm -f db.db db.db.wal
13 |
14 | clean-spark: ## Clean the warehouse (Spark metastore and warehouse)
15 | rm -rf derby.log metastore_db spark-warehouse
16 |
17 | clean: clean-cache clean-logs clean-state clean-spark ## Clean potential previous states and logs
18 |
19 | init-python: ## Install Python libraries
20 | python -mpip install -U "sqlmesh[web,databricks,spark]"
21 |
22 | init-files: ## Create files with instantiated env vars
23 | @../../tools/init-dbs-w-env.sh
24 |
25 | init: init-python init-files ## Initialization
26 |
27 | info: ## Info about the project
28 | sqlmesh info
29 |
30 | hint-change: ## Hint for the changes to be made
31 | @echo "Edit the incremental_model.sql file and uncomment the z column"
32 | @echo "vi models/incremental_model.sql"
33 |
34 | plan-prod: ## Plan, backfill and apply changes
35 | sqlmesh plan
36 |
37 | plan-dev: ## Plan, backfill and apply changes in dev
38 | sqlmesh plan dev --include-unmodified
39 |
40 | audit: ## Audit
41 | sqlmesh audit
42 |
43 | test: ## Tests
44 | sqlmesh test
45 |
46 | list-tables-prod: ## List the tables in prod
47 | sqlmesh fetchdf "show tables in $(DBS_SCH) like '*seed_model*|*incremental_model*|*full_model*'"
48 |
49 | list-tables-dev: ## List the tables in dev
50 | sqlmesh fetchdf "show tables in $(DBS_SCH) like '*seed_model__dev*|*incremental_model__dev*|*full_model__dev*'"
51 |
52 | check-data-prod: ## Check the data in prod
53 | sqlmesh fetchdf "select * from $(DBS_SCH).incremental_model"
54 |
55 | check-data-dev: ## Check the data in dev
56 | sqlmesh fetchdf "select * from $(DBS_SCH).incremental_model__dev"
57 |
58 | diff: ## Differences between dev and prod
59 | sqlmesh table_diff --temp-schema $(DBS_SCH) prod:dev $(DBS_SCH).incremental_model
60 |
61 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/008-unitycatalog-simple/audits/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/008-unitycatalog-simple/audits/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/008-unitycatalog-simple/audits/assert_positive_order_ids.sql:
--------------------------------------------------------------------------------
1 | AUDIT (
2 | name assert_positive_order_ids,
3 | );
4 |
5 | SELECT *
6 | FROM @this_model
7 | WHERE
8 | item_id < 0
9 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/008-unitycatalog-simple/config.yaml:
--------------------------------------------------------------------------------
1 | gateways:
2 | local:
3 | connection:
4 | type: spark
5 | catalog: unity
6 | state_connection:
7 | type: duckdb
8 | database: db.db
9 |
10 | physical_schema_mapping:
11 | '^default$': default
12 |
13 | environment_suffix_target: table
14 |
15 | default_gateway: local
16 |
17 | model_defaults:
18 | dialect: spark
19 | start: 2024-12-30
20 |
21 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/008-unitycatalog-simple/macros/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/008-unitycatalog-simple/macros/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/008-unitycatalog-simple/macros/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/008-unitycatalog-simple/macros/__init__.py
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/008-unitycatalog-simple/models/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/008-unitycatalog-simple/models/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/008-unitycatalog-simple/models/full_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name default.full_model,
3 | kind FULL,
4 | cron '@daily',
5 | grain item_id,
6 | audits (assert_positive_order_ids),
7 | );
8 |
9 | SELECT
10 | item_id,
11 | COUNT(DISTINCT id) AS num_orders,
12 | FROM
13 | default.incremental_model
14 | GROUP BY item_id
15 |
16 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/008-unitycatalog-simple/models/incremental_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name default.incremental_model,
3 | kind INCREMENTAL_BY_TIME_RANGE (
4 | time_column event_date
5 | ),
6 | start '2020-01-01',
7 | cron '@daily',
8 | grain (id, event_date)
9 | );
10 |
11 | SELECT
12 | id,
13 | item_id,
14 | event_date,
15 | FROM
16 | default.seed_model
17 | WHERE
18 | event_date BETWEEN @start_date AND @end_date
19 |
20 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/008-unitycatalog-simple/models/seed_model.sql:
--------------------------------------------------------------------------------
1 | MODEL (
2 | name default.seed_model,
3 | kind SEED (
4 | path '../seeds/seed_data.csv'
5 | ),
6 | columns (
7 | id INTEGER,
8 | item_id INTEGER,
9 | event_date DATE
10 | ),
11 | grain (id, event_date)
12 | );
13 |
14 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/008-unitycatalog-simple/seeds/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/008-unitycatalog-simple/seeds/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/008-unitycatalog-simple/seeds/seed_data.csv:
--------------------------------------------------------------------------------
1 | id,item_id,event_date
2 | 1,2,2020-01-01
3 | 2,1,2020-01-01
4 | 3,3,2020-01-03
5 | 4,1,2020-01-04
6 | 5,1,2020-01-05
7 | 6,1,2020-01-06
8 | 7,1,2020-01-07
9 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/008-unitycatalog-simple/tests/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-processing/sqlmesh/examples/008-unitycatalog-simple/tests/.gitkeep
--------------------------------------------------------------------------------
/data-processing/sqlmesh/examples/008-unitycatalog-simple/tests/test_full_model.yaml:
--------------------------------------------------------------------------------
1 | test_example_full_model:
2 | model: default.full_model
3 | inputs:
4 | default.incremental_model:
5 | rows:
6 | - id: 1
7 | item_id: 1
8 | - id: 2
9 | item_id: 1
10 | - id: 3
11 | item_id: 2
12 | outputs:
13 | query:
14 | rows:
15 | - item_id: 1
16 | num_orders: 2
17 | - item_id: 2
18 | num_orders: 1
19 |
20 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/tools/clean-pg-state.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # SQLMesh state database parameters
4 | SQLMESH_PG_SVR="localhost"
5 | SQLMESH_PG_DB="sqlmesh"
6 | SQLMESH_PG_USR="sqlmesh"
7 | SQLMESH_PG_SCH="sqlmesh"
8 |
9 | # Sanity checks
10 | if [ $(command -v psql) ]
11 | then
12 | PSQL_VER="$(psql --version 2> /dev/null)"
13 | # echo "PostgreSQL CLI tool found - Version: ${PSQL_VER}"
14 | else
15 | echo "Error - the psql command (PostgreSQL CLI tool) cannot be found"
16 | echo " * On MacOS, it can be installed with brew install postgresql"
17 | echo " * On Linux, it can be installed with the native packager, for instance"
18 | echo " * On Fedora/CentOS/RedHat/Rocky OSes: dnf -y install postgresql"
19 | echo " * On Debian-/Ubuntu-based OSes: sudo apt-get install postgresql-client"
20 | exit 1
21 | fi
22 |
23 | # The credentials for the PostgreSQL database are assumed to be in the ~/.pgpass file (one line per database, formatted as hostname:port:database:username:password)
24 | declare -a table_list=($(psql -h ${SQLMESH_PG_SVR} -U ${SQLMESH_PG_USR} -d ${SQLMESH_PG_DB} -t -c "select table_name from information_schema.tables where table_schema = '${SQLMESH_PG_SCH}'"))
25 | table_list_length=${#table_list[@]}
26 |
27 | # Reporting
28 | if [ "${table_list_length}" == "0" ]
29 | then
30 | echo "The PostgreSQL database (Server: ${SQLMESH_PG_SVR} - DB: ${SQLMESH_PG_DB} - User: ${SQLMESH_PG_USR}) has no more state-related tables"
31 | else
32 | echo "List of tables in PostgreSQL to store the state: ${table_list[@]}"
33 | fi
34 |
35 | # Drop every single state table
36 | for table in "${table_list[@]}"
37 | do echo "Dropping ${table} table..."
38 | echo "Command to be executed: psql -h ${SQLMESH_PG_SVR} -U ${SQLMESH_PG_USR} -d ${SQLMESH_PG_DB} -c \"drop table if exists ${SQLMESH_PG_SCH}.${table};\" "
39 | psql -h ${SQLMESH_PG_SVR} -U ${SQLMESH_PG_USR} -d ${SQLMESH_PG_DB} -c "drop table if exists ${SQLMESH_PG_SCH}.${table};"
40 | echo "... ${table} table dropped"
41 | done
42 |
43 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/tools/init-dbs-w-env.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Sanity checks
4 | if [ -z "${DBS_SVR_HST}" ]
5 | then
6 | echo "Error - The DBS_SVR_HST environment variable should be set, but does not appear to be"
7 | echo " It should point to the DataBricks workspace URL (e.g., .cloud.databricks.com)"
8 | exit 1
9 | else
10 | echo "DataBricks server host: ${DBS_SVR_HST}"
11 | fi
12 | if [ -z "${DBS_HTTP_PATH}" ]
13 | then
14 | echo "Error - The DBS_HTTP_PATH environment variable should be set, but does not appear to be"
15 | echo " It should point to the DataBricks HTTP path of the cluster (e.g., sql/protocolv1/o//)"
16 | exit 1
17 | else
18 |   echo "DataBricks HTTP path: ${DBS_HTTP_PATH}"
19 | fi
20 | if [ -z "${DBS_PAT}" ]
21 | then
22 | echo "Error - The DBS_PAT environment variable should be set, but does not appear to be"
23 | echo " It should point to the DataBricks Personal Access Token (PAT)"
24 | exit 1
25 | else
26 | echo "DataBricks Personal Access Token specified, that is fine"
27 | fi
28 | if [ -z "${DBS_SCH}" ]
29 | then
30 | echo "Error - The DBS_SCH environment variable should be set, but does not appear to be"
31 | echo " It should point to the DataBricks schema (e.g., schema_example)"
32 | exit 1
33 | else
34 | echo "DataBricks schema: ${DBS_SCH}"
35 | fi
36 |
37 | # Collect the template files (*.in) in which environment variables are to be substituted
38 | declare -a file_list=($(ls *.in */*.in))
39 | file_list_length=${#file_list[@]}
40 |
41 | # Reporting
42 | if [ "${file_list_length}" == "0" ]
43 | then
44 | echo "There is no file where environment variables should be substituted"
45 | else
46 | echo "List of files where environment variables should be substituted: ${file_list[@]}"
47 | fi
48 |
49 | # Substitute the environment variables in every template (.in) file
50 | for myfile in "${file_list[@]}"
51 | do
52 | myfile_dir="$(dirname ${myfile})"
53 | myfile_tgt_fn="$(basename ${myfile} .in)"
54 | myfile_tgt="${myfile_dir}/${myfile_tgt_fn}"
55 | echo "Substituting environment variables in ${myfile} => ${myfile_tgt}..."
56 | echo "Command to be executed: envsubst < ${myfile} > ${myfile_tgt}"
57 | envsubst < ${myfile} > ${myfile_tgt}
58 |   echo "... env vars substituted in ${myfile} => ${myfile_tgt}"
59 | done
60 |
61 |
62 |
--------------------------------------------------------------------------------
/data-processing/sqlmesh/tools/sqlmesh_dag_to_mermaid.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env -S uv run
2 | # /// script
3 | # requires-python = ">=3.12"
4 | # dependencies = [
5 | #     "sqlmesh", "beautifulsoup4",
6 | # ]
7 | # ///
8 | #
9 | # File:
10 | #
11 | # Origin: https://github.com/mattiasthalen/obsidian-insights/blob/main/sqlmesh_dag_to_mermaid.py
12 | # Saved here in case the origin happens to disappear in the future
13 | #
14 | import json
15 | import os
16 | import re
17 | import subprocess
18 | import sys
19 | import tempfile
20 |
21 | from bs4 import BeautifulSoup
22 |
23 | def run_sqlmesh_dag():
24 | """Run sqlmesh dag command and save output to a temporary file"""
25 | try:
26 | # Create a temporary file
27 | with tempfile.NamedTemporaryFile(suffix='.html', delete=False) as tmp_file:
28 | temp_path = tmp_file.name
29 |
30 | # Run sqlmesh dag command with the temp file as output
31 | subprocess.run(['sqlmesh', 'dag', temp_path], check=True)
32 |
33 | # Read the contents of the temp file
34 | with open(temp_path, 'r') as f:
35 | content = f.read()
36 |
37 | # Clean up the temporary file
38 | os.unlink(temp_path)
39 |
40 | return content
41 | except subprocess.CalledProcessError as e:
42 | print(f"Error running sqlmesh dag: {e}")
43 | return None
44 | except Exception as e:
45 | print(f"Unexpected error: {e}")
46 | return None
47 |
48 | def extract_graph_data(html_content):
49 | """Extract nodes and edges from the HTML output"""
50 | soup = BeautifulSoup(html_content, 'html.parser')
51 |
52 | # Find the JavaScript code that contains the graph data
53 | script = soup.find('script', string=re.compile('vis.DataSet'))
54 | if not script:
55 | print("No script found with vis.DataSet")
56 | return None, None
57 |
58 | # Extract nodes and edges data using regex
59 | nodes_match = re.search(r'nodes = new vis\.DataSet\((.*?)\)', script.string, re.DOTALL)
60 | edges_match = re.search(r'edges: new vis\.DataSet\((.*?)\)', script.string, re.DOTALL)
61 |
62 | if not nodes_match or not edges_match:
63 | print("Could not find nodes or edges data")
64 | return None, None
65 |
66 | try:
67 | nodes = json.loads(nodes_match.group(1))
68 | edges = json.loads(edges_match.group(1))
69 | return nodes, edges
70 | except json.JSONDecodeError as e:
71 | print(f"Error parsing JSON: {e}")
72 | return None, None
73 |
74 | def get_db_and_schema_from_id(node_id):
75 | """Extract database and schema names from node ID"""
76 | parts = node_id.split('.')
77 | if len(parts) >= 3:
78 | return parts[0].strip('"'), parts[1].strip('"')
79 | return None, None
80 |
81 | def get_schema_order(schema):
82 | """Helper function to determine schema order"""
83 | order = {
84 | 'bronze': 0,
85 | 'silver': 1,
86 | 'gold': 2
87 | }
88 | return order.get(schema, 999) # Unknown schemas go to the end
89 |
90 | def convert_to_mermaid(nodes, edges):
91 | """Convert nodes and edges to Mermaid flowchart format"""
92 | mermaid_code = ["flowchart LR"]
93 |
94 | # Group nodes by database and schema
95 | db_schemas = {}
96 | for node in nodes:
97 | db, schema = get_db_and_schema_from_id(node['id'])
98 | if db and schema:
99 | db_schema_key = f"{db}.{schema}"
100 | if db_schema_key not in db_schemas:
101 | db_schemas[db_schema_key] = []
102 | node_name = node['label'].strip('"')
103 | db_schemas[db_schema_key].append(node_name)
104 |
105 | # Add subgraphs for each schema (including database name)
106 | for db_schema in sorted(db_schemas.keys(), key=lambda x: get_schema_order(x.split('.')[-1])):
107 | mermaid_code.append(f" subgraph {db_schema}[\"{db_schema}\"]")
108 | mermaid_code.append(" direction LR")
109 | for node in sorted(db_schemas[db_schema]):
110 | node_id = node.replace('.', '_').replace('-', '_')
111 | mermaid_code.append(f" {node_id}([\"{node}\"])")
112 | mermaid_code.append(" end")
113 | mermaid_code.append("")
114 |
115 | # Group edges by source and target database.schema
116 | edge_groups = {}
117 | for edge in edges:
118 | from_parts = edge['from'].split('.')
119 | to_parts = edge['to'].split('.')
120 | from_db_schema = f"{from_parts[0].strip('\"')}.{from_parts[1].strip('\"')}"
121 | to_db_schema = f"{to_parts[0].strip('\"')}.{to_parts[1].strip('\"')}"
122 | group_key = f"{from_db_schema} -> {to_db_schema}"
123 |
124 | from_node = edge['from'].split('.')[-1].strip('"').replace('.', '_').replace('-', '_')
125 | to_node = edge['to'].split('.')[-1].strip('"').replace('.', '_').replace('-', '_')
126 |
127 | if group_key not in edge_groups:
128 | edge_groups[group_key] = []
129 | edge_groups[group_key].append(f" {from_node} --> {to_node}")
130 |
131 | # Add grouped relationships with comments in correct order
132 | for group_key in sorted(edge_groups.keys(), key=lambda x: (
133 | get_schema_order(x.split(' -> ')[0].split('.')[-1]),
134 | get_schema_order(x.split(' -> ')[1].split('.')[-1]))):
135 | mermaid_code.append(f" %% {group_key}")
136 | mermaid_code.extend(sorted(edge_groups[group_key]))
137 | mermaid_code.append("")
138 |
139 | return "\n".join(mermaid_code)
140 |
141 | def main():
142 | # Get output path if provided
143 | output_path = sys.argv[1] if len(sys.argv) > 1 else None
144 |
145 | # Run sqlmesh dag and get output
146 | html_output = run_sqlmesh_dag()
147 | if not html_output:
148 | return
149 |
150 | # Extract nodes and edges
151 | nodes, edges = extract_graph_data(html_output)
152 | if not nodes or not edges:
153 | print("Failed to extract graph data")
154 | return
155 |
156 | # Convert to Mermaid
157 | mermaid_code = convert_to_mermaid(nodes, edges)
158 |
159 | if output_path:
160 | if '/' in output_path or '\\' in output_path:
161 | # If path contains separators, create directories
162 | os.makedirs(os.path.dirname(output_path), exist_ok=True)
163 | # Save to specified file
164 | with open(output_path, 'w') as f:
165 | f.write(mermaid_code)
166 | else:
167 | # Print to stdout
168 | print(mermaid_code)
169 |
170 | if __name__ == "__main__":
171 | main()
172 |
--------------------------------------------------------------------------------
/data-quality/great-expectations/README.md:
--------------------------------------------------------------------------------
1 | Cheat Sheet - Data Quality - Great Expectations
2 | ===============================================
3 |
4 | # Table of Content (ToC)
5 |
6 | # Overview
7 | * [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-quality/great-expectations/README.md)
8 | explains how to install and to use Great Expectations (GX),
9 | and associated tools and utilities.
10 |
11 | * Great Expectations (GX) relies on the
12 | [Python programming stack](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/programming/python/)
13 |
14 | ## Data Engineering helpers
15 | * [Data Engineering Helpers - Data quality](https://github.com/data-engineering-helpers/data-quality)
16 | * [Data Engineering Helpers - Data contracts](https://github.com/data-engineering-helpers/data-contracts)
17 | * [Data Engineering Helpers - Data products](https://github.com/data-engineering-helpers/data-products)
18 | * [Data Engineering Helpers - Knowledge Sharing - Python programming stack](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/programming/python/)
19 | * [Data Engineering Helpers - Knowledge Sharing - SODA data quality](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-quality/soda/)
20 | * [Data Engineering Helpers - Knowledge Sharing - Jupyter, PySpark and DuckDB](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/programming/jupyter/jupyter-pyspark-duckdb/)
21 | * [Data Engineering Helpers - Knowledge Sharing - Spark](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-processing/spark/)
22 | * [Data Engineering Helpers - Knowledge Sharing - SQLMesh](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-processing/sqlmesh/)
23 | * [Data Engineering Helpers - Knowledge Sharing - DuckDB](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/duckdb/)
24 | * [Material for the Data platform - Modern Data Stack (MDS) in a box](https://github.com/data-engineering-helpers/mds-in-a-box)
25 |
26 | ## Great Expectations
27 | * Home page: https://greatexpectations.io/
28 | * Documentation: https://docs.greatexpectations.io/docs/home
29 | * GitHub repository: https://github.com/great-expectations/great_expectations
30 |
31 | # Getting started
32 |
33 | # Installation
34 |
35 | ## Python programming stack
36 | * See
37 | [Data Engineering Helpers - Knowledge Sharing - Python programming stack](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/programming/python/)
38 | for more details on how to install and use Python, PyEnv, Poetry, uv and ruff
39 |
40 | ## Great Expectations (GX)
41 |
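42 | * As a minimal sanity-check sketch (assuming GX has been installed beforehand, for instance
43 |   with `python -mpip install -U great_expectations`), the following snippet prints the
44 |   installed version and creates an ephemeral (in-memory) Data Context:
45 | ```python
46 | import great_expectations as gx
47 | 
48 | # Display the installed GX version and create an ephemeral Data Context
49 | print(gx.__version__)
50 | context = gx.get_context()
51 | print(context)
52 | ```
53 | 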
--------------------------------------------------------------------------------
/data-quality/soda/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/data-quality/soda/README.md
--------------------------------------------------------------------------------
/data-storage/lakefs/.gitignore:
--------------------------------------------------------------------------------
1 | # DuckDB
2 | /db.duckdb
3 |
--------------------------------------------------------------------------------
/data-storage/lakefs/etc/config.yaml:
--------------------------------------------------------------------------------
1 | ---
2 | database:
3 | type: "postgres"
4 | postgres:
5 | connection_string: "postgres://localhost:5432/postgres?sslmode=disable"
6 |
7 | auth:
8 | encrypt:
9 | # replace this with a randomly-generated string. Make sure to keep it safe!
10 | secret_key: "10a718b3f285d89c36e9864494cdd1507f3bc85b342df24736ea81f9a1134bcc09e90b6641"
11 |
12 | blockstore:
13 | type: s3
14 | s3:
15 | force_path_style: true
16 | endpoint: http://localhost:9000
17 | discover_bucket_region: false
18 | credentials:
19 | access_key_id:
20 | secret_access_key:
21 |
22 |
--------------------------------------------------------------------------------
/data-storage/lakefs/etc/homebrew.mxcl.lakefs.plist:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict>
5 |   <key>KeepAlive</key>
6 |   <true/>
7 |   <key>EnvironmentVariables</key>
8 |   <dict>
9 |     <key>LAKEFS_CONFIG_ENV_FILE</key>
10 |     <string>/etc/default/lakefs</string>
11 |   </dict>
12 |   <key>Label</key>
13 |   <string>homebrew.mxcl.lakefs</string>
14 |   <key>LimitLoadToSessionType</key>
15 |   <array>
16 |     <string>Aqua</string>
17 |     <string>Background</string>
18 |     <string>LoginWindow</string>
19 |     <string>StandardIO</string>
20 |     <string>System</string>
21 |   </array>
22 |   <key>ProgramArguments</key>
23 |   <array>
24 |     <string>$BREW_PFX/bin/lakefs</string>
25 |     <string>--config</string>
26 |     <string>$LAKEFS_CFG</string>
27 |     <string>run</string>
28 |   </array>
29 |   <key>RunAtLoad</key>
30 |   <true/>
31 |   <key>StandardErrorPath</key>
32 |   <string>$BREW_PFX/var/log/lakefs.log</string>
33 |   <key>StandardOutPath</key>
34 |   <string>$BREW_PFX/var/log/lakefs.log</string>
35 |   <key>WorkingDirectory</key>
36 |   <string>$BREW_PFX</string>
37 | </dict>
38 | </plist>
39 | 
40 | 
--------------------------------------------------------------------------------
/data-storage/lakefs/etc/lakefs:
--------------------------------------------------------------------------------
1 | # LakeFS configuration is usually specified in the following two places:
2 | # ~/.lakectl.yaml
3 | # ~/.lakefs/config.yaml
4 |
5 | LAKEFS_CFG="/Users/DENIS/.lakefs/config.yaml"
--------------------------------------------------------------------------------
/data-storage/lakefs/ipython-notebooks/lakefs-browse.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "238e827d-845d-4a77-a7f0-17dbda6cf63d",
6 | "metadata": {},
7 | "source": [
8 | "Cheat Sheet - Copy files to LakeFS with Python\n",
9 | "=============================================="
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "id": "80ee3ea2-44bb-42b9-8091-43383bd20ddf",
15 | "metadata": {},
16 | "source": [
17 | "# References\n",
18 | "* Cheat sheet: https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/frameworks/lakefs/\n",
19 | "* Local Minio data-lake: http://localhost:9000/browser/silver\n",
20 | "* Local LakeFS: http://localhost:8000/repositories/silver/objects?ref=main\n",
21 | "\n",
22 | "## Minio\n",
23 | "* [Minio](https://min.io/) is a dependency for on-premise deployment\n",
24 | "* Install and deploy Minio on MacOS:\n",
25 | " https://min.io/docs/minio/macos/operations/installation.html\n",
26 | "* Install and deploy containerized Minio:\n",
27 | " https://min.io/docs/minio/container/operations/installation.html\n",
28 | "\n",
29 | "## LakeFS\n",
30 | "* GitHub repository: https://github.com/treeverse/lakeFS\n",
31 | "* End-to-end Write-Audit-Publish (WAP) pattern with LakeFS:\n",
32 | " https://lakefs.io/blog/write-audit-publish-with-lakefs/\n"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 1,
38 | "id": "b540d595-b841-4df4-ae3c-dbb78b1c7a32",
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "import os\n",
43 | "import boto3"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "id": "be4c4e4a-ebf5-4add-b532-573b4c8d2b93",
50 | "metadata": {},
51 | "outputs": [],
52 | "source": [
53 | "# Replace with the proper LakeFS access key\n",
54 | "s3 = boto3.client(\"s3\", endpoint_url=\"http://localhost:8000/\",\n",
55 | " aws_access_key_id=\"AKIAIOSFODNN7EXAMPLE\",\n",
56 | " aws_secret_access_key=\"wJalrXUtnFEMI/K7MDENG/bPxRfiCYEXAMPLEKEY\")\n",
57 | "\n",
58 | "# Here's the source data folder\n",
59 | "folder_path = \"db/duckdb/data/parquet\"\n",
60 | "\n",
61 | "# Set the S3 bucket name and key prefix\n",
62 | "bucket_name = \"silver\"\n",
63 | "branch = \"main\"\n",
64 | "key_prefix = f\"{branch}/src/\"\n",
65 | "\n",
66 | "# Iterate over the files in the folder and upload each file to S3\n",
67 | "for root, dirs, files in os.walk(folder_path):\n",
68 | " for file in files:\n",
69 | " local_path = os.path.join(root, file)\n",
70 | " s3_key = os.path.join(key_prefix, os.path.relpath(local_path, folder_path))\n",
71 | " s3.upload_file(local_path, bucket_name, s3_key)\n",
72 | " print(f\"Uploaded {local_path} to {bucket_name}/{s3_key}\")\n"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "id": "ea000da4-2201-4543-949b-493ffc6bbb37",
79 | "metadata": {},
80 | "outputs": [],
81 | "source": []
82 | }
83 | ],
84 | "metadata": {
85 | "kernelspec": {
86 | "display_name": "Python 3 (ipykernel)",
87 | "language": "python",
88 | "name": "python3"
89 | },
90 | "language_info": {
91 | "codemirror_mode": {
92 | "name": "ipython",
93 | "version": 3
94 | },
95 | "file_extension": ".py",
96 | "mimetype": "text/x-python",
97 | "name": "python",
98 | "nbconvert_exporter": "python",
99 | "pygments_lexer": "ipython3",
100 | "version": "3.11.4"
101 | }
102 | },
103 | "nbformat": 4,
104 | "nbformat_minor": 5
105 | }
106 |
--------------------------------------------------------------------------------
/data-storage/minio/etc/homebrew.mxcl.minio.plist:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict>
5 |   <key>KeepAlive</key>
6 |   <true/>
7 |   <key>EnvironmentVariables</key>
8 |   <dict>
9 |     <key>MINIO_CONFIG_ENV_FILE</key>
10 |     <string>/etc/default/minio</string>
11 |   </dict>
12 |   <key>Label</key>
13 |   <string>homebrew.mxcl.minio</string>
14 |   <key>LimitLoadToSessionType</key>
15 |   <array>
16 |     <string>Aqua</string>
17 |     <string>Background</string>
18 |     <string>LoginWindow</string>
19 |     <string>StandardIO</string>
20 |     <string>System</string>
21 |   </array>
22 |   <key>ProgramArguments</key>
23 |   <array>
24 |     <string>$BREW_PFX/bin/minio</string>
25 |     <string>server</string>
26 |     <string>$MINIO_VOLUMES</string>
27 |     <string>--address=:9000</string>
28 |   </array>
29 |   <key>RunAtLoad</key>
30 |   <true/>
31 |   <key>StandardErrorPath</key>
32 |   <string>$BREW_PFX/var/log/minio.log</string>
33 |   <key>StandardOutPath</key>
34 |   <string>$BREW_PFX/var/log/minio.log</string>
35 |   <key>WorkingDirectory</key>
36 |   <string>$BREW_PFX</string>
37 | </dict>
38 | </plist>
39 | 
40 | 
--------------------------------------------------------------------------------
/data-storage/minio/etc/minio:
--------------------------------------------------------------------------------
1 | # MINIO_ROOT_USER and MINIO_ROOT_PASSWORD sets the root account for the MinIO server.
2 | # This user has unrestricted permissions to perform S3 and administrative API operations on any resource in the deployment.
3 | # Omit to use the default values 'minioadmin:minioadmin'.
4 | # MinIO recommends setting non-default values as a best practice, regardless of environment
5 |
6 | MINIO_ROOT_USER=myminioadmin
7 | MINIO_ROOT_PASSWORD=minio-secret-key-change-me
8 |
9 | # MINIO_VOLUMES sets the storage volume or path to use for the MinIO server.
10 |
11 | MINIO_VOLUMES="/mnt/data"
12 |
13 | # MINIO_SERVER_URL sets the hostname of the local machine for use with the MinIO Server
14 | # MinIO assumes your network control plane can correctly resolve this hostname to the local machine
15 |
16 | # Uncomment the following line and replace the value with the correct hostname for the local machine and port for the MinIO server (9000 by default).
17 |
18 | #MINIO_SERVER_URL="http://minio.example.net:9000"
19 |
--------------------------------------------------------------------------------
/data-storage/minio/ipython-notebooks/minio-browse.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "238e827d-845d-4a77-a7f0-17dbda6cf63d",
6 | "metadata": {},
7 | "source": [
8 | "Cheat Sheet - Browse files from the Minio data lake with Python\n",
9 | "==============================================================="
10 | ]
11 | },
12 | {
13 | "cell_type": "markdown",
14 | "id": "80ee3ea2-44bb-42b9-8091-43383bd20ddf",
15 | "metadata": {},
16 | "source": [
17 | "# References\n",
18 | "* Cheat sheet: https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/frameworks/minio/\n",
19 | " + [This Jupyter notebook on GitHub](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/frameworks/minio/ipython-notebooks/minio-browse.ipynb)\n",
20 | " + [A simple Python script](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/frameworks/minio/python/minio-browse.py)\n",
21 | "* Local Minio data lake: http://localhost:9000\n",
22 | " + Bronze bucket in the local Minio data lake: http://localhost:58995/browser/bronze\n",
23 | "\n",
24 | "## Minio\n",
25 | "* Minio home page: https://min.io/\n"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 1,
31 | "id": "b540d595-b841-4df4-ae3c-dbb78b1c7a32",
32 | "metadata": {},
33 | "outputs": [],
34 | "source": [
35 | "import os\n",
36 | "from cloudpathlib import CloudPath"
37 | ]
38 | },
39 | {
40 | "cell_type": "code",
41 | "execution_count": 2,
42 | "id": "bb2b9727-600f-40ea-9983-a0264ff1afa2",
43 | "metadata": {},
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | "s3://bronze/geonames/allCountries.parquet\n",
50 | "s3://bronze/geonames/alternateNames.parquet\n"
51 | ]
52 | }
53 | ],
54 | "source": [
55 | "geo_dir = CloudPath(\"s3://bronze/geonames\")\n",
56 | "for f in geo_dir.glob(\"**/*.parquet\"):\n",
57 | " print(f)"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": null,
63 | "id": "ea000da4-2201-4543-949b-493ffc6bbb37",
64 | "metadata": {},
65 | "outputs": [],
66 | "source": []
67 | }
68 | ],
69 | "metadata": {
70 | "kernelspec": {
71 | "display_name": "Python 3 (ipykernel)",
72 | "language": "python",
73 | "name": "python3"
74 | },
75 | "language_info": {
76 | "codemirror_mode": {
77 | "name": "ipython",
78 | "version": 3
79 | },
80 | "file_extension": ".py",
81 | "mimetype": "text/x-python",
82 | "name": "python",
83 | "nbconvert_exporter": "python",
84 | "pygments_lexer": "ipython3",
85 | "version": "3.11.4"
86 | }
87 | },
88 | "nbformat": 4,
89 | "nbformat_minor": 5
90 | }
91 |
--------------------------------------------------------------------------------
/data-storage/minio/python/minio-browse.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # File: https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/data-storage/minio/python/minio-browse.py
4 | #
5 | import os
6 | from cloudpathlib import CloudPath
7 |
8 | geo_dir = CloudPath("s3://bronze/geonames")
9 | for f in geo_dir.glob("**/*.parquet"):
10 | print(f)
11 |
12 |
--------------------------------------------------------------------------------
/db/dremio/README.md:
--------------------------------------------------------------------------------
1 | Cheat Sheet - Databases - Dremio
2 | ================================
3 |
4 | # Table of Content (ToC)
5 |
6 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go)
7 |
8 | # Overview
9 | [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/dremio/README.md)
10 | explains how to install and to use
11 | [Dremio OSS (open source software)](https://github.com/dremio/dremio-oss/)
12 | on premises, _e.g._, on a laptop or on a virtual machine (VM).
13 |
14 | # References
15 |
16 | ## Data Engineering helpers
17 | * [Material for the Data platform - Modern Data Stack (MDS) in a box](https://github.com/data-engineering-helpers/mds-in-a-box/blob/main/README.md)
18 | * [Material for the Data platform - Data life cycle](https://github.com/data-engineering-helpers/data-life-cycle/blob/main/README.md)
19 | * [Data Engineering Helpers - Knowledge Sharing - Minio](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/frameworks/minio/README.md)
20 | * [Data Engineering Helpers - Knowledge Sharing - DuckDB](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/duckdb/README.md)
21 | * [Data Engineering Helpers - Knowledge Sharing - PostgreSQL](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/postgresql/README.md)
22 | * [Data Engineering Helpers - Knowledge Sharing - Hive Metastore](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/frameworks/hive-metastore/README.md)
23 | * [Data Engineering Helpers - Knowledge Sharing - Trino](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/frameworks/trino/README.md)
24 | * [Data Engineering Helpers - Knowledge Sharing - Java world](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/programming/java-world/README.md)
25 |
26 | ## Dremio
27 | * Dremio home page: https://dremio.io/
28 | * GitHub project: https://github.com/dremio/dremio-oss
29 |
30 | ## Articles
31 | * [Medium - ](),
32 | by [Xxx](),
33 | May 2024
34 |
35 | # Installation
36 |
37 | ## Dremio
38 |
39 |
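40 | * As a minimal sketch (assuming a locally running Dremio instance with Arrow Flight enabled
41 |   on its default port, namely 32010, the `pyarrow` Python package installed, and placeholder
42 |   credentials to be replaced with the local ones), a SQL query may be submitted from Python
43 |   through Arrow Flight to check that the installation responds:
44 | ```python
45 | from pyarrow import flight
46 | 
47 | # Placeholder endpoint and credentials - adapt them to the local Dremio deployment
48 | client = flight.FlightClient("grpc+tcp://localhost:32010")
49 | bearer_header = client.authenticate_basic_token(b"some_user", b"some_password")
50 | options = flight.FlightCallOptions(headers=[bearer_header])
51 | 
52 | # Submit a trivial query and retrieve the result as an Arrow table
53 | descriptor = flight.FlightDescriptor.for_command("SELECT 1 AS sanity_check")
54 | flight_info = client.get_flight_info(descriptor, options)
55 | reader = client.do_get(flight_info.endpoints[0].ticket, options)
56 | print(reader.read_all())
57 | ```
58 | 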
--------------------------------------------------------------------------------
/db/duckdb/.gitignore:
--------------------------------------------------------------------------------
1 | db.duckdb
2 | allCountries.*
3 | alternateNamesV2.*
4 | alternateNames.*
5 | geonames.parquet
6 | .ipynb_checkpoints/
7 |
--------------------------------------------------------------------------------
/db/duckdb/data/csv/.gitignore:
--------------------------------------------------------------------------------
1 | /*.txt
2 | /*.csv
3 |
--------------------------------------------------------------------------------
/db/duckdb/data/parquet/.gitignore:
--------------------------------------------------------------------------------
1 | /*.parquet
2 |
--------------------------------------------------------------------------------
/db/duckdb/elt-geonames.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # File: https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/duckdb/elt-geonames.py
4 | # Inspired from: http://github.com/opentraveldata/opentraveldata/blob/master/tools/elt-geonames.py
5 | #
6 | import duckdb
7 | import polars as pl
8 | import sqlalchemy
9 | import csv
10 |
11 | conn = duckdb.connect()
12 | conn = duckdb.connect(database='db.duckdb', read_only=False)
13 |
14 | geoname_base_dir: str = "data"
15 | geoname_csv_dir: str = f"{geoname_base_dir}/csv"
16 | geoname_pqt_dir: str = f"{geoname_base_dir}/parquet"
17 |
18 | # allCountries
19 | geoname_allctry_fn: str = "allCountries"
20 | geoname_allctry_csv: str = f"{geoname_csv_dir}/{geoname_allctry_fn}.txt"
21 | geoname_allctry_pqt: str = f"{geoname_pqt_dir}/{geoname_allctry_fn}.parquet"
22 |
23 | geoname_allctry_cln = {
24 | "geonameid": "bigint",
25 | "name": "varchar",
26 | "asciiname": "varchar",
27 | "alternatenames": "varchar",
28 | "latitude": "double",
29 | "longitude": "double",
30 | "fclass": "char(1)",
31 | "fcode": "varchar(10)",
32 | "country": "varchar(2)",
33 | "cc2": "varchar",
34 | "admin1": "varchar",
35 | "admin2": "varchar",
36 | "admin3": "varchar",
37 | "admin4": "varchar",
38 |     "population": "bigint",
39 | "elevation": "integer",
40 | "dem": "integer",
41 | "timezone": "varchar",
42 | "moddate": "date"
43 | }
44 |
45 | geoname_allctry_elt_query: str = f"""
46 | COPY (
47 | SELECT *
48 |     FROM read_csv_auto('{geoname_allctry_csv}',
49 |        header=False,
50 |        dateformat='%Y-%m-%d',
51 |        columns={geoname_allctry_cln},
52 |        quote='',
53 |        filename=True,
54 |        AUTO_DETECT=TRUE)
55 | )
56 | TO '{geoname_allctry_pqt}' (FORMAT 'parquet')
57 | """
58 |
59 | geoname_allctry_view_query: str = f"drop view if exists allcountries; create view allcountries as select * from '{geoname_allctry_pqt}'"
60 |
61 | # Alternate names
62 | geoname_altname_fn: str = "alternateNames"
63 | geoname_altname_csv: str = f"{geoname_csv_dir}/{geoname_altname_fn}.txt"
64 | geoname_altname_pqt: str = f"{geoname_pqt_dir}/{geoname_altname_fn}.parquet"
65 |
66 | geoname_altname_cln = {
67 | "alternatenameId": "bigint",
68 | "geonameid": "bigint",
69 | "isoLanguage": "varchar",
70 | "alternateName": "varchar",
71 | "isPreferredName": "smallint",
72 | "isShortName": "smallint",
73 | "isColloquial": "smallint",
74 | "isHistoric": "smallint"
75 | }
76 |
77 | geoname_altname_elt_query: str = f"""
78 | COPY (
79 | SELECT *
80 |     FROM read_csv_auto('{geoname_altname_csv}',
81 |        header=False,
82 |        dateformat='%Y-%m-%d',
83 |        columns={geoname_altname_cln},
84 |        quote='',
85 |        filename=True,
86 |        AUTO_DETECT=TRUE)
87 | )
88 | TO '{geoname_altname_pqt}' (FORMAT 'parquet')
89 | """
90 |
91 | geoname_altname_view_query: str = f"drop view if exists altnames; create view altnames as select * from '{geoname_altname_pqt}'"
92 |
93 | # Joint of allCountries and altNames on the GeonameID
94 | geoname_joint_fn: str = "geonames"
95 | geoname_joint_pqt: str = f"{geoname_pqt_dir}/{geoname_joint_fn}.parquet"
96 |
97 | geoname_join_view_query: str = f"""
98 | drop view if exists geonames;
99 |
100 | create view geonames as
101 | select *
102 | from allcountries ac
103 | join altnames an
104 | on ac.geonameid = an.geonameid;
105 |
106 | copy geonames to '{geoname_joint_pqt}'
107 | """
108 |
109 | geoame_nce_query: str = "select * from geonames where isoLanguage='iata' and alternateName='NCE'"
110 |
111 | def eltCSVToParquet():
112 | """
113 | Parse CSV files into Parquet
114 | """
115 | # CSV to Parquet for allCountries
116 | conn.execute(geoname_allctry_elt_query)
117 |
118 | # CSV to Parquet for alternateNames
119 | conn.execute(geoname_altname_elt_query)
120 |
121 | def createViews():
122 | """
123 | Create DuckDB views
124 | """
125 | # allCountries
126 | conn.execute(geoname_allctry_view_query)
127 |
128 | # alternateNames
129 | conn.execute(geoname_altname_view_query)
130 |
131 | def joinViews():
132 | """
133 | Join allCountries with altNames on the GeonameID
134 | """
135 | conn.execute(geoname_join_view_query)
136 |
137 | def countRows():
138 | """
139 | Check that everything goes right
140 | """
141 | count_query: str = """
142 | select count(*)/1e6 as nb from allcountries
143 | union all
144 | select count(*)/1e6 as nb from altnames
145 | union all
146 | select count(*)/1e6 as nb from geonames
147 | """
148 |
149 | nb_list = conn.execute(count_query).fetchall()
150 | return nb_list
151 |
152 | def getNCErows():
153 | """
154 | Retrieve all the records featuring NCE as the IATA code
155 | """
156 | nce_recs = conn.execute(geoame_nce_query).fetchall()
157 | return nce_recs
158 |
159 | # Main
160 | eltCSVToParquet()
161 |
162 | createViews()
163 |
164 | joinViews()
165 |
166 | nb_list = countRows()
167 | print(f"Number of rows: {nb_list}")
168 |
169 | nce_recs = getNCErows()
170 | print("List of records featuring NCE as the IATA code:")
171 | for nce_rec in nce_recs:
172 | print(nce_rec)
173 |
174 |
175 |
--------------------------------------------------------------------------------
/db/duckdb/ipython-notebooks/data:
--------------------------------------------------------------------------------
1 | ../data
--------------------------------------------------------------------------------
/db/postgresql/data:
--------------------------------------------------------------------------------
1 | ../duckdb/data
--------------------------------------------------------------------------------
/db/postgresql/ipython-notebooks/.gitignore:
--------------------------------------------------------------------------------
1 | /config.json
2 |
--------------------------------------------------------------------------------
/db/postgresql/ipython-notebooks/config-sample.json:
--------------------------------------------------------------------------------
1 | {
2 | "meta": {
3 | "project_name": "Knowledge Sharing - PostgreSQL cheat sheet",
4 | "project_url": "https://github.com/data-engineering-helpers/ks-cheat-sheets/tree/main/db/postgresql"
5 | },
6 | "db": {
7 | "type": "postgresql",
8 | "host": "localhost",
9 | "port": 5432,
10 | "dbname": "guest",
11 | "user": "guest",
12 | "passwd": ""
13 | }
14 | }
15 |
16 |
--------------------------------------------------------------------------------
/db/postgresql/ipython-notebooks/confmgr.py:
--------------------------------------------------------------------------------
1 | #
2 | # File: https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/postgresql/ipython-notebooks/confmgr.py
3 | #
4 | import json
5 |
6 | k_cfg_fp: str = "config.json"
7 |
8 | def get_conf() -> dict:
9 |     """Retrieve the configuration as a Python dictionary
10 | """
11 | conf: dict = None
12 | try:
13 | with open(k_cfg_fp, "r") as conf_file:
14 | conf = json.load(conf_file)
15 | except Exception as error:
16 | print(f"Error - The '{k_cfg_fp}' configuration file cannot be found "
17 | f"- {error}")
18 | print("Hint: copy the config-sample.json file into config.json "
19 | "and adapt it")
20 | return conf
21 |
22 | def get_db_conn_dict(verbose: bool=False) -> dict:
23 | """Retrieve the database connection parameters, from the configuration
24 | file, as a Python dictionary
25 | """
26 | conf: dict = get_conf()
27 |
28 | #
29 | db_cfg: dict = conf.get("db")
30 |
31 | #
32 | return db_cfg
33 |
34 | def get_db_conn_string(verbose: bool=False) -> str:
35 | """Retrieve the database connection string from the configuration file.
36 | * Only PostgreSQL is supported so far.
37 |     * If the ~/.pgpass file is to be used, leave the password empty in the JSON
38 | configuration file
39 | """
40 | pg_connstr: str = None
41 |
42 | #
43 | db_cfg: dict = get_db_conn_dict()
44 |
45 | #
46 | db_type: str = db_cfg.get("type")
47 | if db_type != "postgresql":
48 |         print(f"In the '{k_cfg_fp}' configuration file, the specified database "
49 |               f"type, namely '{db_type}', is not supported. Only 'postgresql' "
50 |               "is supported so far")
51 | return pg_connstr
52 |
53 | #
54 | pg_host: str = db_cfg.get("host")
55 | pg_port: int = db_cfg.get("port")
56 | pg_dbname: str = db_cfg.get("dbname")
57 | pg_user: str = db_cfg.get("user")
58 | pg_passwd: str = db_cfg.get("passwd")
59 | if pg_passwd == "":
60 | if verbose:
61 | print(f"As the 'passwd' field is left empty in the '{k_cfg_fp}' "
62 | "configuration file, the password will be read from the "
63 | "~/.pgpass secret file")
64 | pg_connstr = f"{db_type}://{pg_user}@{pg_host}:{pg_port}/{pg_dbname}"
65 | else:
66 | pg_connstr = f"{db_type}://{pg_user}:{pg_passwd}@{pg_host}:{pg_port}/{pg_dbname}"
67 |
68 | #
69 | return pg_connstr
70 |
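71 | # Example usage (assuming a config.json file, copied and adapted from config-sample.json,
72 | # sits next to this module):
73 | #   from confmgr import get_db_conn_string
74 | #   pg_connstr = get_db_conn_string(verbose=True)
75 | #   # With the sample settings and an empty password, pg_connstr is
76 | #   # postgresql://guest@localhost:5432/guest (the password then comes from ~/.pgpass)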
--------------------------------------------------------------------------------
/db/postgresql/ipython-notebooks/data:
--------------------------------------------------------------------------------
1 | ../../duckdb/data
--------------------------------------------------------------------------------
/db/postgresql/ipython-notebooks/jars:
--------------------------------------------------------------------------------
1 | ../jars
--------------------------------------------------------------------------------
/db/postgresql/jars/.gitignore:
--------------------------------------------------------------------------------
1 | /*.jar
2 |
--------------------------------------------------------------------------------
/db/postgresql/jupyter/pyspark-kernel.json:
--------------------------------------------------------------------------------
1 | {
2 | "display_name": "PySpark (Spark 3.4.1)",
3 | "language": "python",
4 | "argv": [
5 | "/usr/bin/python3",
6 | "-m",
7 | "ipykernel",
8 | "-f",
9 | "{connection_file}"
10 | ],
11 | "env": {
12 | "SPARK_HOME": "$SPARK_HOME",
13 | "PYSPARK_PYTHON": "/usr/bin/python3"
14 | }
15 | }
16 |
17 |
--------------------------------------------------------------------------------
/db/postgresql/nginx/conf.d/stream-postgresql.conf:
--------------------------------------------------------------------------------
1 |
2 | server {
3 | listen 5432;
4 |
5 | error_log /var/log/nginx/postgresql.error.log notice;
6 |
7 | proxy_connect_timeout 60s;
8 | proxy_socket_keepalive on;
9 | proxy_pass localhost:6543;
10 | }
11 |
12 |
13 |
--------------------------------------------------------------------------------
/db/postgresql/sql/create-geonames-tables.sql:
--------------------------------------------------------------------------------
1 | --
2 | -- File: https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/postgresql/sql/create-geonames-tables.sql
3 | --
4 | -- Tables for Geonames data
5 | --
6 |
7 | --
8 | -- Table structure for table admin1_codes_ascii
9 | --
10 |
11 | DROP TABLE IF EXISTS admin1_codes_ascii;
12 | CREATE TABLE admin1_codes_ascii (
13 | ccode char(2) NOT NULL,
14 | code varchar(7) NOT NULL,
15 | name text NOT NULL,
16 | nameAscii text default NULL,
17 | geonameid integer NOT NULL
18 | );
19 |
20 |
21 | --
22 | -- Table structure for table admin2_codes
23 | --
24 |
25 | DROP TABLE IF EXISTS admin2_codes;
26 | CREATE TABLE admin2_codes (
27 | ccode char(2) NOT NULL,
28 | code1 varchar(7) default NULL,
29 | code2 varchar(100) NOT NULL,
30 | name_local text default NULL,
31 | name text NOT NULL,
32 | geonameid integer NOT NULL
33 | );
34 |
35 |
36 | --
37 | -- Table structure for table airports_pageranked
38 | --
39 |
40 | DROP TABLE IF EXISTS airport_pageranked;
41 | CREATE TABLE airport_pageranked (
42 | iata_code char(3) NOT NULL,
43 | location_type varchar(4) default NULL,
44 | page_rank decimal(15,12) NOT NULL
45 | );
46 |
47 |
48 | --
49 | -- Table structure for table alternate_name
50 | --
51 |
52 | DROP TABLE IF EXISTS alternate_name;
53 | CREATE TABLE alternate_name (
54 | alternatenameId integer NOT NULL,
55 | geonameid integer default NULL,
56 | isoLanguage varchar(7) default NULL,
57 | alternateName varchar(200) default NULL,
58 | isPreferredName boolean default NULL,
59 | isShortName boolean default NULL,
60 | isColloquial boolean default NULL,
61 | isHistoric boolean default NULL
62 | );
63 |
64 |
65 | --
66 | -- Table structure for table continent_codes
67 | --
68 |
69 | DROP TABLE IF EXISTS continent_codes;
70 | CREATE TABLE continent_codes (
71 | code char(2) NOT NULL,
72 | name varchar(20) default NULL,
73 | geonameid integer default NULL
74 | );
75 |
76 |
77 | --
78 | -- Table structure for table country_info
79 | --
80 |
81 | DROP TABLE IF EXISTS country_info;
82 | CREATE TABLE country_info (
83 | iso_alpha2 char(2) default NULL,
84 | iso_alpha3 char(3) default NULL,
85 | iso_numeric integer default NULL,
86 | fips_code varchar(3) default NULL,
87 | name varchar(200) default NULL,
88 | capital varchar(200) default NULL,
89 | areainsqkm float default NULL,
90 | population integer default NULL,
91 | continent char(2) default NULL,
92 | tld varchar(4) default NULL,
93 | currency_code char(3) default NULL,
94 | currency_name varchar(32) default NULL,
95 | phone varchar(16) default NULL,
96 | postal_code_format varchar(64) default NULL,
97 | postal_code_regex varchar(256) default NULL,
98 | languages varchar(200) default NULL,
99 | geonameId integer default NULL,
100 | neighbours varchar(64) default NULL,
101 | equivalent_fips_code varchar(3) default NULL
102 | );
103 |
104 |
105 | --
106 | -- Table structure for table feature_classes
107 | --
108 |
109 | DROP TABLE IF EXISTS feature_classes;
110 | CREATE TABLE feature_classes (
111 | class char(1) NOT NULL,
112 | names varchar(200) default NULL
113 | );
114 |
115 |
116 | --
117 | -- Table structure for table feature_codes
118 | --
119 |
120 | DROP TABLE IF EXISTS feature_codes;
121 | CREATE TABLE feature_codes (
122 | class char(1) NOT NULL,
123 | code varchar(5) NOT NULL,
124 | name_en varchar(200) default NULL,
125 | description_en text default NULL,
126 | name_ru varchar(200) default NULL,
127 | description_ru text default NULL
128 | );
129 |
130 |
131 | --
132 | -- Table structure for table geoname
133 | --
134 |
135 | DROP TABLE IF EXISTS geoname;
136 | CREATE TABLE geoname (
137 | geonameid integer NOT NULL,
138 | name varchar(200) default NULL,
139 | asciiname varchar(200) default NULL,
140 | alternatenames varchar(4000) default NULL,
141 | latitude decimal(10,7) default NULL,
142 | longitude decimal(10,7) default NULL,
143 | fclass char(1) default NULL,
144 | fcode varchar(10) default NULL,
145 | country varchar(2) default NULL,
146 | cc2 varchar(60) default NULL,
147 | admin1 varchar(20) default NULL,
148 | admin2 varchar(80) default NULL,
149 | admin3 varchar(20) default NULL,
150 | admin4 varchar(20) default NULL,
151 | population integer default NULL,
152 | elevation integer default NULL,
153 | gtopo30 integer default NULL,
154 | timezone varchar(40) default NULL,
155 | moddate date default NULL
156 | );
157 |
158 |
159 | --
160 | -- Table structure for table hierarchy
161 | --
162 |
163 | DROP TABLE IF EXISTS hierarchy;
164 | CREATE TABLE hierarchy (
165 | parentId integer NOT NULL,
166 | childId integer NOT NULL,
167 | relationType varchar(20) NOT NULL
168 | );
169 |
170 |
171 | --
172 | -- Table structure for table iso_language_codes
173 | --
174 |
175 | DROP TABLE IF EXISTS iso_language_codes;
176 | CREATE TABLE iso_language_codes (
177 | iso_639_3 char(4) default NULL,
178 | iso_639_2 varchar(50) default NULL,
179 | iso_639_1 varchar(50) default NULL,
180 | language_name varchar(200) default NULL
181 | );
182 |
183 |
184 | --
185 | -- Table structure for table time_zones
186 | --
187 |
188 | DROP TABLE IF EXISTS time_zones;
189 | CREATE TABLE time_zones (
190 | country varchar(2) default NULL,
191 | timeZoneId varchar(200) default NULL,
192 | GMT_offset decimal(3,1) default NULL,
193 | DST_offset decimal(3,1) default NULL,
194 | raw_offset decimal(3,1) default NULL
195 | );
196 |
197 |
198 | --
199 | -- Table structure for table zip_codes
200 | --
201 |
202 | DROP TABLE IF EXISTS zip_codes;
203 | CREATE TABLE zip_codes (
204 | iso_alpha2 char(2) default NULL,
205 | postal_code varchar(10) default NULL,
206 | place_name varchar(200) default NULL,
207 | admin_name1 varchar(100) default NULL,
208 | admin_code1 varchar(20) default NULL,
209 | admin_name2 varchar(100) default NULL,
210 | admin_code2 varchar(20) default NULL,
211 | admin_name3 varchar(100) default NULL,
212 | latitude decimal(10,7) default NULL,
213 | longitude decimal(10,7) default NULL,
214 | accuracy integer default NULL
215 | );
216 |
217 |
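--
-- Usage sketch (assumption: the Geonames dump files, e.g. allCountries.txt,
-- have been downloaded and unzipped locally; adapt the user and database names):
--   psql -U <user> -d <database> -f create-geonames-tables.sql
--   psql -U <user> -d <database> \
--     -c "\copy geoname from 'allCountries.txt' with (format text, null '')"
--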
--------------------------------------------------------------------------------
/db/trino/README.md:
--------------------------------------------------------------------------------
1 | Cheat Sheet - Databases - Trino
2 | ===============================
3 |
4 | # Table of Content (ToC)
5 |
6 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go)
7 |
8 | # Overview
9 | [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/trino/README.md)
10 | explains how to install and to use
11 | [Trino (formerly Presto)](https://trino.io/)
12 | on premises, _e.g._, on a laptop or on a virtual machine (VM).
13 |
14 | > Part of the larger Apache Hive data warehouse platform, the Hive metastore is a repository for details relating to Hive databases and their objects. It is adopted by Spark as the solution for storage of metadata regarding tables, databases and their related properties. An essential element of Spark, it is worth getting to know this better so that it can be safeguarded and leveraged for development appropriately.
15 |
16 | # References
17 |
18 | ## Data Engineering helpers
19 | * [Material for the Data platform - Modern Data Stack (MDS) in a box](https://github.com/data-engineering-helpers/mds-in-a-box/blob/main/README.md)
20 | * [Material for the Data platform - Data life cycle](https://github.com/data-engineering-helpers/data-life-cycle/blob/main/README.md)
21 | * [Data Engineering Helpers - Knowledge Sharing - Minio](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/frameworks/minio/README.md)
22 | * [Data Engineering Helpers - Knowledge Sharing - DuckDB](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/duckdb/README.md)
23 | * [Data Engineering Helpers - Knowledge Sharing - PostgreSQL](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/postgresql/README.md)
24 | * [Data Engineering Helpers - Knowledge Sharing - Hive Metastore](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/frameworks/hive-metastore/README.md)
25 | * [Data Engineering Helpers - Knowledge Sharing - Java world](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/programming/java-world/README.md)
26 |
27 | ## Trino
28 | * Trino home page: https://trino.io/
29 | + Presto home page (still existing, as of end 2023): https://prestodb.io/
30 | * GitHub project: https://github.com/trinodb/trino
31 | * [Trino doc - Hive connector](https://trino.io/docs/current/connector/hive.html)
32 |
33 | ## Articles
34 | * [Medium - Visualize parquet files with Apache Superset using Trino or PrestoSQL](https://sairamkrish.medium.com/visualize-parquet-files-with-apache-superset-using-trino-or-prestosql-511f18a37e3b),
35 | by [Sairam Krish](https://www.linkedin.com/in/sairamkrish/),
36 | Dec. 2021
37 | + [GitHub - Containerized Hive Metastore service](https://github.com/bitsondatadev/hive-metastore)
38 | - [GitHub - Containerized Hive Metastore service - `Dockerfile`](https://github.com/bitsondatadev/hive-metastore/blob/master/Dockerfile)
39 | - [GitHub - Containerized Hive Metastore service - `entrypoint.sh` service launch script](https://github.com/bitsondatadev/hive-metastore/blob/master/scripts/entrypoint.sh)
40 | + [GitHub - Trino demo - `metastore-site.xml` Hive Metastore client configuration](https://github.com/sairamkrish/trino-superset-demo/blob/main/hive/conf/metastore-site.xml)
41 |
42 | # Installation
43 |
44 | ## Hive Metastore
45 | * See the dedicated cheat sheet in this Git repository:
46 | [Data Engineering Helpers - Knowledge Sharing - Hive Metastore](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/frameworks/hive-metastore/README.md)
47 |
48 | ## Trino
49 | * On MacOS, install Trino with HomeBrew:
50 | ```bash
51 | $ brew install trino
52 | ```
53 |
54 | * Setup the Hive Metastore client
55 | (inspired by [GitHub - Trino demo - `metastore-site.xml` Hive Metastore client configuration](https://github.com/sairamkrish/trino-superset-demo/blob/main/hive/conf/metastore-site.xml))
56 |
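* For instance, a minimal Hive catalog may be declared as follows (only a sketch:
  the configuration directory assumes a HomeBrew-based installation and the Thrift URI
  assumes a Hive Metastore running locally on its default port):
```bash
$ mkdir -p $(brew --prefix)/etc/trino/catalog
cat > $(brew --prefix)/etc/trino/catalog/hive.properties << _EOF
connector.name=hive
hive.metastore.uri=thrift://localhost:9083
_EOF
```
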
57 | * Launch the Trino service:
58 | ```bash
59 | $ trino-server start
60 | ```
61 |
62 | * Open the Trino administrative UI in a browser (adapting the host to your own setup): http://192.168.1.99:8080/ui/
63 |
64 | * On the command-line, get the status of the cluster:
65 | ```bash
66 | $ trino
67 | ```
68 | ```sql
69 | trino> SELECT * FROM system.runtime.nodes;
70 | node_id | http_uri | node_version | coordinator | state
71 | --------------------------------------+--------------------------+--------------+-------------+--------
72 | 9243b2b6-9a64-4e29-98e8-c2de6b698553 | http://192.168.1.99:8080 | 412 | true | active
73 | (1 row)
74 |
75 | Query 20231104_100933_00003_v65it, FINISHED, 1 node
76 | Splits: 1 total, 1 done (100.00%)
77 | 0.64 [1 rows, 70B] [1 rows/s, 110B/s]
78 | ```
79 |
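* Queries may also be launched non-interactively with the Trino CLI (`--execute` option), for instance:
```bash
$ trino --execute "SHOW CATALOGS"
```
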
--------------------------------------------------------------------------------
/images/data-catalogs/uc-ui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/images/data-catalogs/uc-ui.png
--------------------------------------------------------------------------------
/infrastructure/docker/rancher-desktop.md:
--------------------------------------------------------------------------------
1 | Rancher Desktop
2 | ===============
3 |
4 | # MacOS
5 |
6 | ## Administrative right issue with Lima
7 | * Reference:
8 | https://github.com/rancher-sandbox/rancher-desktop/issues/1811#issuecomment-1561948344
9 | * Work-around
10 | * Add the following lines, to the `/private/etc/sudoers.d/zzzzz-rancher-desktop-lima` file,
11 | replacing `` with the user name of your MacOS session
12 | (the `sudo` command will request the user password, in order to escalate administrative rights):
13 | ```bash
14 | sudo bash -c 'echo -e "
15 |
16 | # Overrides to support starting rancher-desktop after reboot without VPN.
17 | ALL=(root:wheel) NOPASSWD:NOSETENV: /bin/mkdir -m 775 -p /private/var/run
18 | ALL=(root:wheel) NOPASSWD:NOSETENV: /opt/rancher-desktop/bin/vde_vmnet, /usr/bin/pkill -F /private/var/run/*.pid
19 | ALL=(daemon:everyone) NOPASSWD:NOSETENV: /opt/rancher-desktop/bin/vde_switch, /usr/bin/pkill -F /private/var/run/*.pid
20 |
21 | " >> /private/etc/sudoers.d/zzzzz-rancher-desktop-lima'
22 | ```
23 | * During the launch of Rancher Desktop, when Rancher requests the user password through a pop-up window,
24 | click on the Cancel button rather than on the OK button (otherwise, Rancher Desktop will overwrite the changes
25 | in the `/private/etc/sudoers.d/zzzzz-rancher-desktop-lima` file)
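* Optionally, check the syntax of the resulting sudoers snippet (a quick sanity check
  with the standard `visudo` utility):
```bash
sudo visudo -c -f /private/etc/sudoers.d/zzzzz-rancher-desktop-lima
```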
26 |
--------------------------------------------------------------------------------
/infrastructure/k8s/demos/archive/full-postgresql.yaml:
--------------------------------------------------------------------------------
1 | #
2 | # Ref: https://stackoverflow.com/a/76748564/798053
3 | #
4 | apiVersion: apps/v1
5 | kind: StatefulSet
6 | metadata:
7 | name: db
8 |
9 | spec:
10 | selector:
11 | matchLabels:
12 | app: db
13 |
14 | serviceName: "db"
15 | template:
16 | metadata:
17 | labels:
18 | app: db
19 | spec:
20 |
21 | initContainers:
22 | - name: db-init # using root, change permissions to not use root in main container
23 | image: postgres
24 | securityContext:
25 | allowPrivilegeEscalation: false
26 | runAsNonRoot: true
27 | runAsUser: 999 # postgres user in container
28 | runAsGroup: 999
29 | resources:
30 | limits:
31 | memory: 200Mi
32 | cpu: 300m
33 | requests:
34 | memory: 100Mi
35 | cpu: 100m
36 | command:
37 | - 'sh'
38 | - '-c'
39 | - |
40 | chown -R 999:999 /var/lib/postgresql/data
41 |
42 | volumeMounts:
43 | - name: db
44 | mountPath: /var/lib/postgresql/data
45 |
46 |
47 | containers:
48 | - name: db
49 | image: postgres
50 | securityContext:
51 | runAsNonRoot: true
52 | runAsUser: 999 # postgres user in container
53 | runAsGroup: 999
54 | allowPrivilegeEscalation: false
55 |
56 | resources:
57 | limits:
58 | memory: 200Mi
59 | cpu: 300m
60 | requests:
61 | memory: 100Mi
62 | cpu: 100m
63 |
64 | envFrom:
65 | - secretRef:
66 | name: db-env
67 |
68 | ports:
69 | - containerPort: 5432
70 | name: db
71 |
72 | volumeMounts:
73 | - name: db
74 | mountPath: /var/lib/postgresql/data
75 |
76 | startupProbe:
77 | tcpSocket:
78 | port: db
79 | initialDelaySeconds: 5
80 | periodSeconds: 2
81 | failureThreshold: 15
82 |
83 | readinessProbe:
84 | exec:
85 | command:
86 | - bash
87 | - '-c'
88 | - >
89 | psql -h localhost -U "${POSTGRES_USER}" -c 'select 1'
90 |
91 | initialDelaySeconds: 5
92 | periodSeconds: 5
93 | failureThreshold: 5
94 |
95 | livenessProbe:
96 | exec:
97 | command:
98 | - bash
99 | - '-c'
100 | - >
101 | psql -h localhost -U "${POSTGRES_USER}" -c 'select 1'
102 |
103 | initialDelaySeconds: 5
104 | periodSeconds: 10
105 |
106 | volumes:
107 |
108 | - name: db
109 | emptyDir: {}
110 |
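#
# Usage sketch (assumption: this manifest is saved as full-postgresql.yaml;
# the db-env secret referenced above has to exist beforehand):
#   kubectl create secret generic db-env --from-literal=POSTGRES_PASSWORD=secret
#   kubectl apply -f full-postgresql.yaml
#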
--------------------------------------------------------------------------------
/infrastructure/k8s/demos/simple-postgresql.yaml:
--------------------------------------------------------------------------------
1 | #
2 | # Ref: https://stackoverflow.com/a/76748564/798053
3 | #
4 | apiVersion: apps/v1
5 | kind: StatefulSet
6 | metadata:
7 | name: db
8 |
9 | spec:
10 | selector:
11 | matchLabels:
12 | app: db
13 |
14 | serviceName: "db"
15 | template:
16 | metadata:
17 | labels:
18 | app: db
19 | spec:
20 |
21 | securityContext:
22 | fsGroup: 999
23 |
24 | containers:
25 | - name: db
26 | image: postgres
27 | securityContext:
28 | runAsNonRoot: true
29 | runAsUser: 999 # postgres user in container
30 | runAsGroup: 999
31 | allowPrivilegeEscalation: false
32 |
33 | resources:
34 | limits:
35 | memory: 200Mi
36 | cpu: 300m
37 | requests:
38 | memory: 100Mi
39 | cpu: 100m
40 |
41 | env:
42 | - name: POSTGRES_PASSWORD
43 | value: "secret"
44 | - name: PGDATA
45 | value: "/var/lib/postgresql/data/pgdata"
46 |
47 | ports:
48 | - containerPort: 5432
49 | name: db
50 |
51 | volumeMounts:
52 | - name: db
53 | mountPath: /var/lib/postgresql/data
54 |
55 | volumes:
56 | - name: db
57 | emptyDir: {}
58 |
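#
# Usage sketch (assumption: this manifest is saved as simple-postgresql.yaml):
#   kubectl apply -f simple-postgresql.yaml
#   kubectl port-forward pod/db-0 5432:5432
#   PGPASSWORD=secret psql -h localhost -U postgres
#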
--------------------------------------------------------------------------------
/infrastructure/k8s/demos/simple-shell.yaml:
--------------------------------------------------------------------------------
1 | apiVersion: v1
2 | kind: Pod
3 | metadata:
4 | name: shell-demo
5 | spec:
6 | securityContext:
7 | runAsNonRoot: true
8 | runAsUser: 1000
9 | volumes:
10 | - name: shared-data
11 | emptyDir: {}
12 | containers:
13 | - name: nginx
14 | image: nginxinc/nginx-unprivileged
15 | securityContext:
16 | allowPrivilegeEscalation: false
17 | resources:
18 | limits:
19 | memory: 200Mi
20 | cpu: 300m
21 | requests:
22 | memory: 100Mi
23 | cpu: 100m
24 | volumeMounts:
25 | - name: shared-data
26 | mountPath: /usr/share/nginx/html
27 | hostNetwork: false
28 | dnsPolicy: Default
29 |
30 |
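#
# Usage sketch:
#   kubectl apply -f simple-shell.yaml
#   kubectl exec -it shell-demo -- sh
#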
--------------------------------------------------------------------------------
/infrastructure/nexus/systemd/nexus.service:
--------------------------------------------------------------------------------
1 | [Unit]
2 | Description=nexus service
3 | After=network.target
4 |
5 | [Service]
6 | Type=forking
7 | LimitNOFILE=65536
8 | User=nexus
9 | Group=nexus
10 | ExecStart=/opt/nexus/nexus-latest/bin/nexus start
11 | ExecStop=/opt/nexus/nexus-latest/bin/nexus stop
12 | User=nexus
13 | Restart=on-abort
14 |
15 | [Install]
16 | WantedBy=multi-user.target
17 |
18 |
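#
# Usage sketch (assumption: this unit file is installed as
# /etc/systemd/system/nexus.service):
#   sudo systemctl daemon-reload
#   sudo systemctl enable --now nexus.service
#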
--------------------------------------------------------------------------------
/infrastructure/serverless/README.md:
--------------------------------------------------------------------------------
1 | Cheat Sheet - Serverless (AWS Lambda)
2 | =====================================
3 |
4 | # Table of Content (ToC)
5 |
6 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go)
7 |
8 | # Overview
9 | [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/infrastructure/serverless/README.md)
10 | explains how to deploy payloads on AWS Lambda with the Serverless framework.
11 |
12 | # References
13 |
14 | ## Data Engineering helpers
15 | * [Architecture principles for data engineering pipelines on the Modern Data Stack (MDS)](https://github.com/data-engineering-helpers/architecture-principles)
16 | * [Data Engineering Helpers - Knowledge Sharing - JS world](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/programming/js-world/README.md)
17 |
18 | ## Serverless
19 | * Serverless home page: https://www.serverless.com/
20 | * GitHub repository: https://github.com/serverless/serverless
21 |
22 |
23 | # Installation
24 | * See the [JS world cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/programming/js-world/README.md)
25 | for more details on how to set up NVM, NodeJS and NPM
26 |
27 | ## Install with NPM
28 | * Install with NPM:
29 | ```bash
30 | npm i serverless -g
31 | ```
32 |
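* Check the installation and, if wished, scaffold a sample service (the template name
  below is one of the standard Serverless templates, given as an example):
```bash
serverless --version
serverless create --template aws-python3 --path my-service
```
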
--------------------------------------------------------------------------------
/orchestrators/airflow/.gitignore:
--------------------------------------------------------------------------------
1 | # Python
2 | /.python-version
3 | /Pipfile
4 | /Pipfile.lock
5 |
--------------------------------------------------------------------------------
/orchestrators/airflow/Pipfile.in:
--------------------------------------------------------------------------------
1 | [[source]]
2 | url = "https://pypi.org/simple"
3 | verify_ssl = true
4 | name = "pypi"
5 |
6 | [packages]
7 | apache-airflow = {extras = ["celery"], version = "==$AIRFLOW_VERSION"}
8 | sqlalchemy = "*"
9 | psycopg2 = "*"
10 |
11 | [dev-packages]
12 |
13 | [requires]
14 | python_version = "$PYTHON_VERSION"
15 |
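# Usage sketch (assumption: this template is rendered into a Pipfile by
# substituting the environment variables, e.g. with envsubst):
#   export AIRFLOW_VERSION="2.10.4" PYTHON_VERSION="3.12"
#   envsubst < Pipfile.in > Pipfile && pipenv install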
--------------------------------------------------------------------------------
/orchestrators/n8n/README.md:
--------------------------------------------------------------------------------
1 | Cheat Sheet - n8n
2 | =================
3 |
4 | # Table of Content (ToC)
5 | * [Overview](#overview)
6 | * [References](#references)
7 | * [Data Engineering helpers](#data-engineering-helpers)
8 | * [n8n](#n8n)
9 | * [Key Capabilities](#key-capabilities)
10 | * [Getting started](#getting-started)
11 | * [Installation](#installation)
12 |
13 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go)
14 |
15 | # Overview
16 | [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/orchestrators/n8n/README.md)
17 | explains how to install and to use n8n on premises, _e.g._, on a laptop
18 | or on a virtual machine (VM).
19 |
20 | # References
21 |
22 | ## Data Engineering helpers
23 | * [Data Engineering Helpers - Knowledge Sharing - JavaScript / NodeJS](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/programming/js-world/README.md)
24 | * [Material for the Data platform - Modern Data Stack (MDS) in a box](https://github.com/data-engineering-helpers/mds-in-a-box/blob/main/README.md)
25 | * [Data Engineering Helpers - Knowledge Sharing - Airflow](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/orchestrators/airflow/README.md)
26 |
27 | ## n8n
28 | * Home page: https://n8n.io
29 | * GitHub page: https://github.com/n8n-io/n8n
30 | * A LinkedIn post showcasing how to use n8n on DataBricks:
31 | https://www.linkedin.com/posts/hellomikelo_databricks-apps-started-out-as-a-framework-activity-7293304974766194689-wPhg/
32 | * Motto: Secure Workflow Automation for Technical Teams
33 | > n8n is a workflow automation platform that gives technical teams
34 | > the flexibility of code with the speed of no-code. With 400+ integrations,
35 | > native AI capabilities, and a fair-code license, n8n lets you build powerful
36 | > automations while maintaining full control over your data and deployments.
37 |
38 | ### Key Capabilities
39 | * Code When You Need It: Write JavaScript/Python, add npm packages,
40 | or use the visual interface
41 | * AI-Native Platform: Build AI agent workflows based on LangChain
42 | with your own data and models
43 | * Full Control: Self-host with our fair-code license or use our
44 | [cloud offering](https://app.n8n.cloud/login)
45 | * Enterprise-Ready: Advanced permissions, SSO, and air-gapped deployments
46 | * Active Community: 400+ integrations and 900+ ready-to-use
47 | [templates](https://n8n.io/workflows)
48 |
49 |
50 | # Getting started
51 | * Try n8n instantly with [npx](https://docs.n8n.io/hosting/installation/npm/)
52 | (requires [Node.js](https://nodejs.org/en/)):
53 | ```bash
54 | npx n8n
55 | ```
56 |
57 | * Or deploy with [Docker](https://docs.n8n.io/hosting/installation/docker/):
58 | ```bash
59 | docker volume create n8n_data
60 | docker run -it --rm --name n8n -p 5678:5678 -v n8n_data:/home/node/.n8n docker.n8n.io/n8nio/n8n
61 | ```
62 |
63 | # Installation
64 |
65 |
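* Alternatively to the `npx`- and Docker-based methods above, n8n may be installed
  globally with npm (see the [npm installation docs](https://docs.n8n.io/hosting/installation/npm/)):
```bash
$ npm install -g n8n
n8n start
```
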
--------------------------------------------------------------------------------
/packaging/deb-world/README.md:
--------------------------------------------------------------------------------
1 | Knowledge Sharing (KS) - Cheat Sheets - Debian world
2 | ====================================================
3 |
4 | # Table of Content (ToC)
5 |
6 | # Overview
7 | [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/packaging/deb-world/README.md)
8 | gives a few hints about housekeeping with Debian-based Linux distributions.
9 |
10 | # References
11 |
12 | ## Data Engineering helpers
13 | * [Data Engineering Helpers - Knowledge Sharing - Packaging - RPM world](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/packaging/rpm-world/README.md)
14 | * [Data Engineering Helpers - Knowledge Sharing - Packaging - Debian world (this repository)](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/packaging/deb-world/README.md)
15 |
16 | # Use cases
17 |
18 | ## Identify leaves
19 | * Identify leaves with:
20 | ```bash
21 | $
22 | ```
23 |
24 | ## Identify orphans
25 | * Identify orphans with:
26 | ```bash
27 | $
28 | ```
29 |
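* For instance (an assumption, as the commands above are still to be filled in),
  the `deborphan` utility can help identify orphaned library packages:
```bash
$ sudo apt-get install -y deborphan
deborphan
```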
30 |
31 |
--------------------------------------------------------------------------------
/packaging/mac-world/README.md:
--------------------------------------------------------------------------------
1 | Mac world - HomeBrew
2 | ====================
3 |
4 | # Getting started
5 |
6 | ## Write a HomeBrew file
7 | * Curate a HomeBrew file, with the dependencies and some specific configuration, like:
8 | ```bash
9 | cat > ~/.brewrc << _EOF
10 |
11 | cask_args appdir: "~/Applications"
12 |
13 | cask "bruno"
14 |
15 | brew "git"
16 | brew "gh"
17 | brew "coreutils"
18 | brew "gnu-sed"
19 | brew "gawk"
20 |
21 | _EOF
22 | ```
23 | * Launch HomeBrew with that "bundle":
24 | ```bash
25 | $ brew bundle --file ~/.brewrc
26 | ```
27 | * The resulting applications (_e.g._, Bruno in the above example)
28 | are installed locally in the user folders, namely `~/Applications`
29 |
30 | ## A few packages from taps
31 | * Companies/organizations may release their own packages, which are available
32 | on GitHub in specific `homebrew-tap` repositories in their respective
33 | GitHub public organizations, that is https://github.com/organization/homebrew-tap ,
34 | for instance https://github.com/databricks/homebrew-tap for DataBricks
35 | and https://github.com/hashicorp/homebrew-tap for HashiCorp.
36 |
37 | * In order to install the "tap" from a specific company:
38 | ```bash
39 | $ brew tap company/tap
40 | ```
41 |
42 | * That usually clones the corresponding Git repository locally in a HomeBrew
43 | folder dedicated to taps, namely
44 | `$(brew --prefix)/Library/Taps/organization/homebrew-tap`
45 | (`$(brew --prefix)` usually expands to `/opt/homebrew`)
46 |
47 | * And, then, to install a package from that tap:
48 | ```bash
49 | $ brew install company/tap/package
50 | ```
51 |
52 | ### DataBricks
53 | * References:
54 | * DataBricks helper page:
55 | https://docs.databricks.com/aws/en/dev-tools/cli/install
56 |
57 | * Install the DataBricks tap:
58 | ```bash
59 | $ brew tap databricks/tap
60 | ```
61 |
62 | * Install the DataBricks CLI (command-line interface) utility:
63 | ```bash
64 | $ brew install databricks/tap/databricks
65 | ```
66 |
67 | ### HashiCorp Vault
68 | * References:
69 | * HashiCorp Vault helper page:
70 | https://developer.hashicorp.com/vault/install
71 |
72 | * Install the HashiCorp tap:
73 | ```bash
74 | $ brew tap hashicorp/tap
75 | ```
76 |
77 | * Install the HashiCorp Vault CLI (command-line interface) utility:
78 | ```bash
79 | $ brew install hashicorp/tap/vault
80 | ```
81 |
82 | ### MinIO
83 | * References:
84 | * MinIO helper page to install it on MacOS:
85 | https://min.io/docs/minio/macos/index.html
86 |
87 | * If any previous standard installation has been made, uninstall it:
88 | ```bash
89 | $ brew uninstall minio
90 | ```
91 |
92 | * Install MinIO from its tap:
93 | ```bash
94 | $ brew install minio/stable/minio
95 | ```
96 | 
97 | * Install the MinIO client (`mc`) from the same tap:
98 | ```bash
99 | $ brew install minio/stable/mc
100 | ```
101 |
102 | ### LakeFS
103 | * References:
104 | * GitHub repository: https://github.com/treeverse/homebrew-lakefs
105 |
106 | * Install the LakeFS tap:
107 | ```bash
108 | $ brew tap treeverse/lakefs
109 | ```
110 |
111 | * Install LakeFS:
112 | ```bash
113 | $ brew install lakefs
114 | ```
115 |
116 |
--------------------------------------------------------------------------------
/packaging/rpm-world/README.md:
--------------------------------------------------------------------------------
1 | Knowledge Sharing (KS) - Cheat Sheets - RPM world
2 | =================================================
3 |
4 | # Table of Content (ToC)
5 |
6 | # Overview
7 | [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/packaging/rpm-world/README.md)
8 | gives a few hints about housekeeping with RPM-based Linux distributions.
9 |
10 | # References
11 |
12 | ## Data Engineering helpers
13 | * [Data Engineering Helpers - Knowledge Sharing - Packaging - RPM world (this repository)](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/packaging/rpm-world/README.md)
14 | * [Data Engineering Helpers - Knowledge Sharing - Packaging - Debian world](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/packaging/deb-world/README.md)
15 |
16 | ## Fedora
17 | * [Fedora forum - Preferred Fedora Housecleaning/Package Cleanup?](https://forums.fedoraforum.org/showthread.php?330282-Preferred-Fedora-Housecleaning-Package-Cleanup)
18 | * [Fedora docs - Packaging guidelines](https://docs.fedoraproject.org/en-US/packaging-guidelines/)
19 |
20 | # Use cases
21 |
22 | ## Identify leaves
23 | * Identify leaves with:
24 | ```bash
25 | $ dnf leaves
26 | ```
27 |
28 | ## Identify orphans
29 | * Identify orphans with:
30 | ```bash
31 | $ rpmorphan
32 | ```
33 |
34 |
35 |
--------------------------------------------------------------------------------
/parsers/jq/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/parsers/jq/README.md
--------------------------------------------------------------------------------
/parsers/yq/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/parsers/yq/README.md
--------------------------------------------------------------------------------
/programming/building/cmake/README.md:
--------------------------------------------------------------------------------
1 | Cheat Sheet - CMake
2 | ===================
3 |
4 | # Table of Content (ToC)
5 | * [Overview](#overview)
6 | * [Data Engineering helpers](#data-engineering-helpers)
7 | * [Documentation](#documentation)
8 | * [Installation](#installation)
9 | * [Ubuntu](#ubuntu)
10 | * [Debian](#debian)
11 |
12 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go)
13 |
14 | # Overview
15 | * [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/programming/building/cmake/README.md)
16 | explains how to install and to use CMake.
17 |
18 | ## Data Engineering helpers
19 | * [Data Engineering Helpers - Knowledge Sharing - Programming](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/programming/)
20 |
21 | ## Documentation
22 | * https://linuxcapable.com/how-to-install-cmake-on-debian-linux/
23 | * [GitHub - CMake releases](https://github.com/Kitware/CMake/releases)
24 | * [CMake download page](https://cmake.org/download/)
25 | * [Kitware - apt repositories for Ubuntu](https://apt.kitware.com/)
26 |
27 | # Installation
28 |
29 | ## Ubuntu
30 | * Kitware maintains APT repositories for Ubuntu (LTS from 20.04):
31 | https://apt.kitware.com/
32 |
33 | ## Debian
34 | * To get a newer version of CMake on Debian, it has to be built from source.
35 | Helper documentation:
36 | https://linuxcapable.com/how-to-install-cmake-on-debian-linux/
37 |
38 | * Remove any previous version of CMake (and potential no longer used packages):
39 | ```bash
40 | sudo apt-get remove -y cmake && sudo apt-get autoremove -y
41 | ```
42 |
43 | * Install a few system dependencies:
44 | ```bash
45 | sudo apt install -y build-essential checkinstall zlib1g-dev libssl-dev
46 | ```
47 |
48 | * Create a build directory:
49 | ```bash
50 | sudo mkdir /opt/cmake && sudo chown ${USER} /opt/cmake
51 | ```
52 |
53 | * Derive the version of the latest stable release for CMake:
54 | ```bash
55 | CMAKE_VER=$(curl -Ls https://api.github.com/repos/Kitware/CMake/releases/latest | grep 'tag_name' | cut -d'v' -f2,2 | cut -d'"' -f1,1)
56 | ```
57 |
58 | * Download the source tar-ball:
59 | ```bash
60 | curl -Ls https://github.com/Kitware/CMake/archive/refs/tags/v${CMAKE_VER}.tar.gz -o /opt/cmake/cmake-${CMAKE_VER}.tar.gz
61 | ```
62 |
63 | * Go into the build directory:
64 | ```bash
65 | pushd /opt/cmake
66 | ```
67 |
68 | * Un-tar the source directory and delete the source tar-ball:
69 | ```bash
70 | tar zxf cmake-${CMAKE_VER}.tar.gz && rm -f cmake-${CMAKE_VER}.tar.gz
71 | ```
72 |
73 | * Go into the CMake directory:
74 | ```bash
75 | pushd CMake-${CMAKE_VER}
76 | ```
77 |
78 | * Boot-strap the build of CMake:
79 | ```bash
80 | ./bootstrap
81 | ```
82 |
83 | * Launch the build of CMake:
84 | ```bash
85 | make -j4
86 | ```
87 |
88 | * Install CMake:
89 | ```bash
90 | sudo make install
91 | ```
92 |
93 | * Go back to the working directory:
94 | ```bash
95 | popd && popd
96 | ```
97 |
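* Check the version of the newly installed CMake:
```bash
cmake --version
```
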
--------------------------------------------------------------------------------
/programming/java-world/README.md:
--------------------------------------------------------------------------------
1 | Cheat Sheets - Java and Scala
2 | =============================
3 |
4 | # Table of Content (ToC)
5 | * [Overview](#overview)
6 | * [References](#references)
7 | * [Data Engineering helpers](#data-engineering-helpers)
8 | * [SDKMan](#sdkman)
9 | * [JAR packages on Maven Central](#jar-packages-on-maven-central)
10 | * [Specific JAR packages](#specific-jar-packages)
11 | * [Hadoop](#hadoop)
12 | * [Hive Metastore](#hive-metastore)
13 | * [PostgreSQL JDBC drivers](#postgresql-jdbc-drivers)
14 | * [Spark](#spark)
15 | * [Delta](#delta)
16 |
17 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go)
18 |
19 | # Overview
20 | [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/programming/java-world/README.md)
21 | explains how to install and to maintain a few tools pertaining to
22 | programming with Java and Scala, in particular for Spark-powered data processing.
23 |
24 | # References
25 |
26 | ## Data Engineering helpers
27 | * [Material for the Data platform - Modern Data Stack (MDS) in a box](https://github.com/data-engineering-helpers/mds-in-a-box/blob/main/README.md)
28 | * [Material for the Data platform - Data life cycle](https://github.com/data-engineering-helpers/data-life-cycle/blob/main/README.md)
29 | * [Data Engineering Helpers - Knowledge Sharing - Minio](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/frameworks/minio/README.md)
30 | * [Data Engineering Helpers - Knowledge Sharing - Trino](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/trino/README.md)
31 | * [Data Engineering Helpers - Knowledge Sharing - DuckDB](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/duckdb/README.md)
32 | * [Data Engineering Helpers - Knowledge Sharing - PostgreSQL](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/db/postgresql/README.md)
33 | * [Data Engineering Helpers - Knowledge Sharing - Hive Metastore](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/frameworks/hive-metastore/README.md)
34 |
35 | # SDKMan
36 | * If Java needs to be installed (_e.g._, on systems not packaging it natively),
37 | it is advised to install and use [SDKMan](https://sdkman.io/)
38 | * Once SDKMan has been installed, installing in parallel a specific version of Java becomes as easy as
39 | `sdk install java 11.0.21-amzn` (here, for the Amazon-supported Corretto OpenJDK 11; see the example below)
40 | * On MacOS, Java may simply be installed with HomeBrew: `brew install openjdk`
41 |
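* For instance (the first command below is the official SDKMan bootstrap; afterwards,
  restart the Shell or source `~/.sdkman/bin/sdkman-init.sh`):
```bash
$ curl -s "https://get.sdkman.io" | bash
sdk list java
sdk install java 11.0.21-amzn
```
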
42 | # JAR packages on Maven Central
43 | * The packages may be searched for on [Maven Central](https://mvnrepository.com/)
44 |
45 | # Specific JAR packages
46 |
47 | ## Hadoop
48 | * Hadoop download page (as of end 2023, the [latest version is 3.3.6](https://archive.apache.org/dist/hadoop/common/hadoop-3.3.6/)
49 | and dates back to June 2023): https://archive.apache.org/dist/hadoop/common/current/
50 |
51 | ## Hive Metastore
52 | * Hive Metastore standalone download page (as of end 2023, the latest version is 3.0.0 and dates back to 2018):
53 | https://downloads.apache.org/hive/hive-standalone-metastore-3.0.0/
54 |
55 | ## PostgreSQL JDBC drivers
56 | * The [PostgreSQL JDBC drivers](https://jdbc.postgresql.org/download) require at least Java 8 (JDBC 4.2)
57 | * PostgreSQL JDBC driver:
58 | * [`org.postgresql:postgresql:42.6.0` package page](https://mvnrepository.com/artifact/org.postgresql/postgresql/42.6.0)
59 | ```bash
60 | $ wget https://repo1.maven.org/maven2/org/postgresql/postgresql/42.6.0/postgresql-42.6.0.jar
61 | ```
62 |
63 | ## Spark
64 | * Download page for Apache Spark: https://spark.apache.org/downloads.html
65 |
66 | ## Delta
67 | * Delta Spark:
68 | * [`io.delta:delta-spark_2.12:3.0.0` package page](https://mvnrepository.com/artifact/io.delta/delta-spark_2.12/3.0.0)
69 | * Download the JAR package:
70 | ```bash
71 | $ wget https://repo1.maven.org/maven2/io/delta/delta-spark_2.12/3.0.0/delta-spark_2.12-3.0.0.jar
72 | ```
73 | * Delta standalone:
74 | * [`io.delta:delta-standalone_2.12:3.0.0` package page](https://mvnrepository.com/artifact/io.delta/delta-standalone_2.12/3.0.0)
75 | * Download the JAR package:
76 | ```bash
77 | $ wget https://repo1.maven.org/maven2/io/delta/delta-standalone_2.12/3.0.0/delta-standalone_2.12-3.0.0.jar
78 | ```
79 |
--------------------------------------------------------------------------------
/programming/js-world/README.md:
--------------------------------------------------------------------------------
1 | Cheat Sheets - JavaScript (JS)
2 | ==============================
3 |
4 | # Table of Content (ToC)
5 | * [Overview](#overview)
6 | * [Quick start](#quick-start)
7 | * [NVM \- Parallel installable NodeJS](#nvm---parallel-installable-nodejs)
8 | * [NodeJS](#nodejs)
9 | * [Node modules](#node-modules)
10 | * [npx](#npx)
11 | * [Yarn](#yarn)
12 | * [TypeScript (TS)](#typescript-ts)
13 | * [Update / upgrade](#update--upgrade)
14 |
15 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go)
16 |
17 | # Overview
18 | [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/programming/js-world/README.md)
19 | explains how to install and to maintain a few tools pertaining to
20 | programming with JavaScript.
21 |
22 | # Quick start
23 |
24 | ## NVM - Parallel installable NodeJS
25 | * Reference: https://github.com/nvm-sh/nvm#install--update-script
26 |
27 | * Releases: https://github.com/nvm-sh/nvm/releases
28 | + Tags: https://github.com/nvm-sh/nvm/tags
29 |
30 | * Install, or update, NVM (for parallel installation of Node) into `~/.nvm`:
31 | ```bash
32 | $ NVM_VER=$(curl -Ls https://api.github.com/repos/nvm-sh/nvm/releases/latest | grep 'tag_name' | cut -d'v' -f2 | cut -d'"' -f1)
33 | curl -o- https://raw.githubusercontent.com/nvm-sh/nvm/v${NVM_VER}/install.sh | bash
34 | ```
35 | * To upgrade NVM, just go into the NVM folder (`~/.nvm`) and pull the latest
36 | changes:
37 | ```bash
38 | $ pushd ~/.nvm && git pull && popd
39 | ```
40 | * Reset the terminal:
41 | ```bash
42 | $ exec bash
43 | ```
44 | * Check the version of NVM:
45 | ```bash
46 | $ nvm --version
47 | 0.40.1
48 | ```
49 |
50 | ## NodeJS
51 | * Reference: https://nodejs.org/en/blog/release
52 |
53 | * List the installed versions of NodeJS:
54 | ```bash
55 | $ nvm ls
56 | ```
57 |
58 | * List the available versions of NodeJS, which may be installed locally:
59 | ```bash
60 | $ nvm ls-remote
61 | ```
62 |
63 | * If there is no specific need, it is better to install the latest
64 | Long Term Support (LTS) release
65 |
66 | * Install some specific version of NodeJS:
67 | ```bash
68 | $ nvm install 22.13.1
69 | Downloading and installing node v22.13.1...
70 | ...
71 | Now using node v22.13.1 (npm v10.9.2)
72 | ```
73 |
74 | * Use a specific NodeJS version in the current Shell session:
75 | ```bash
76 | $ nvm use 22.13.1
77 | Now using node v22.13.1 (npm v10.9.2)
78 | ```
79 |
80 | * Uninstall some older version of NodeJS:
81 | ```bash
82 | $ nvm uninstall 22.12.0
83 | Uninstalled node v22.12.0
84 | ```
85 |
86 | * Set the default NodeJS version for new Shell sessions:
87 | ```bash
88 | $ nvm alias default 22.13.1
89 | default -> 22.13.1 (-> v22.13.1)
90 | ```
91 |
92 | ## Node modules
93 | * npm (node package manager) is the dependency/package manager that
94 | we get out of the box when we install Node.js (see above).
95 | It provides a way for developers to install packages both globally
96 | and locally
97 | * First and foremost, it is an online repository for the publishing
98 | of open-source Node.js projects
99 | * Second, it is a CLI tool that helps you install those packages
100 | and manage their versions and dependencies. There are hundreds of thousands
101 | of Node.js libraries and applications on npm and many more are added
102 | every day
103 |
104 | * npm by itself does not run any packages. If we want to run a package
105 | using npm, we must specify that package in the `package.json` file.
106 |
107 | * When executables are installed via npm packages, npm creates links to them:
108 | * local installs have links created at the `./node_modules/.bin/` directory
109 | * global installs have links created from the global `bin/` directory
110 | (for example, `/usr/local/bin` on Linux or at `%AppData%/npm` on MS Windows)
111 |
112 | * To execute a package with npm we either have to type the local path,
113 | like this:
114 | ```bash
115 | $ ./node_modules/.bin/your-package
116 | ```
117 |
118 | * Or we can run a locally installed package by adding it into
119 | the `package.json` file in the scripts section, like this:
120 | ```json
121 | {
122 | "name": "your-application",
123 | "version": "1.0.0",
124 | "scripts": {
125 | "your-package": "your-package"
126 | }
127 | }
128 | ```
129 |
130 | * Then the script may be run using `npm run`:
131 | ```bash
132 | npm run your-package
133 | ```
134 |
135 | * We can see that running a package with plain npm requires quite a bit
136 | of ceremony. Fortunately, this is where npx comes in handy:
137 | * Sometimes we might want to take a look at a specific package
138 | and try out some commands. But we cannot do that without installing
139 | the dependencies in our local `node_modules` folder
140 |
141 | ### npx
142 | * References:
143 | * npx command in the command-line (CLI):
144 | https://docs.npmjs.com/cli/v8/commands/npx
145 | * npx package, now part of npm: https://www.npmjs.com/package/npx
146 |
147 | * The `npx` command allows to run an arbitrary command from an npm package
148 | (either one installed locally, or fetched remotely), in a similar context
149 | as running it via `npm run`.
150 |
151 | ### Yarn
152 | * Reference: https://classic.yarnpkg.com/en/docs/install#mac-stable
153 |
154 | * Install Yarn:
155 | ```bash
156 | $ npm install -g yarn
157 | added 1 package in 883ms
158 | ```
159 |
160 | ### TypeScript (TS)
161 | * Reference: https://www.npmjs.com/package/ts-node
162 |
163 | * Install TypeScript and `ts-node`:
164 | ```bash
165 | $ npm install -g typescript
166 | npm install -g ts-node
167 | ```
168 |
169 | # Update / upgrade
170 | * In a given project
171 | * Download the latest information about packages:
172 | ```bash
173 | $ npm update
174 | ```
175 | * Upgrade the packages of the project (as seen in the `package-lock.json`
176 | file):
177 | ```bash
178 | $ npm upgrade
179 | ```
180 |
181 |
182 |
--------------------------------------------------------------------------------
/programming/jupyter/jupyter-pyspark-duckdb/.gitignore:
--------------------------------------------------------------------------------
1 | .ipynb_checkpoints/
2 | /db.duckdb
3 |
--------------------------------------------------------------------------------
/programming/jupyter/jupyter-pyspark-duckdb/data/parquet/user-details.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/programming/jupyter/jupyter-pyspark-duckdb/data/parquet/user-details.parquet
--------------------------------------------------------------------------------
/programming/jupyter/jupyter-pyspark-duckdb/ipython-notebooks/readme.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "71e6425d-a8b6-4ef7-8023-bc6553a212eb",
6 | "metadata": {},
7 | "source": [
8 | "Cheat Sheet - Jupyter with PySpark and DuckDB\n",
9 | "=============================================\n",
10 | "\n",
11 | "* Homepage on GitHub:\n",
12 | " * Python: https://github.com/data-engineering-helpers/ks-cheat-sheets/tree/main/programming/python\n",
13 | " * Jupyter, DuckDB and Spark: https://github.com/data-engineering-helpers/ks-cheat-sheets/tree/main/programming/jupyter/jupyter-pyspark-duckdb\n",
14 | "\n",
15 | "* The iPython/Jupyter notebooks:\n",
16 | " https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/programming/jupyter/jupyter-pyspark-duckdb/ipython-notebooks/simple-duckdb-w-ext.ipynb"
17 | ]
18 | },
19 | {
20 | "cell_type": "code",
21 | "execution_count": null,
22 | "id": "532a8f03-e0ec-4b3d-8f7b-fda9ef058370",
23 | "metadata": {},
24 | "outputs": [],
25 | "source": []
26 | }
27 | ],
28 | "metadata": {
29 | "kernelspec": {
30 | "display_name": "Python 3 (ipykernel)",
31 | "language": "python",
32 | "name": "python3"
33 | },
34 | "language_info": {
35 | "codemirror_mode": {
36 | "name": "ipython",
37 | "version": 3
38 | },
39 | "file_extension": ".py",
40 | "mimetype": "text/x-python",
41 | "name": "python",
42 | "nbconvert_exporter": "python",
43 | "pygments_lexer": "ipython3",
44 | "version": "3.11.11"
45 | }
46 | },
47 | "nbformat": 4,
48 | "nbformat_minor": 5
49 | }
50 |
--------------------------------------------------------------------------------
/programming/jupyter/jupyter-pyspark-duckdb/ipython-notebooks/simple-duckdb.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "91e77ea5-e8b1-45a1-aa7c-f6f039040be8",
6 | "metadata": {},
7 | "source": [
8 | "Simple DuckDB test\n",
9 | "==================\n"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 1,
15 | "id": "23d35f8d-830b-4e0c-8eb0-657905da501b",
16 | "metadata": {},
17 | "outputs": [
18 | {
19 | "name": "stdout",
20 | "output_type": "stream",
21 | "text": [
22 | "total 8\n",
23 | "-rw-r--r--@ 1 mac-DARNAU24 staff 3.1K Feb 13 17:12 user-details.parquet\n"
24 | ]
25 | }
26 | ],
27 | "source": [
28 | "%%sh\n",
29 | "ls -lFh ../data/parquet/"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 2,
35 | "id": "a06bc998-38c8-4704-a1ca-1302938fc656",
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "user_data_fp: str = \"../data/parquet/user-details.parquet\""
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 3,
45 | "id": "c25a3f1b-f89f-4971-9ff0-d9e69b4f159e",
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "import duckdb\n",
50 | "import csv\n",
51 | "\n",
52 | "conn = duckdb.connect()\n",
53 | "# conn = duckdb.connect(database='../db.duckdb') #, read_only=True)\n",
54 | "conn = duckdb.connect(database=':memory:')"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 4,
60 | "id": "28826007-95bb-4ad9-a865-792dd0338834",
61 | "metadata": {},
62 | "outputs": [
63 | {
64 | "data": {
65 | "text/html": [
66 | "\n",
67 | "\n",
80 | "
\n",
81 | " \n",
82 | " \n",
83 | " | \n",
84 | " User ID | \n",
85 | " Username | \n",
86 | " Browser | \n",
87 | " OS | \n",
88 | "
\n",
89 | " \n",
90 | " \n",
91 | " \n",
92 | " 0 | \n",
93 | " 1580 | \n",
94 | " Barry | \n",
95 | " FireFox | \n",
96 | " Windows | \n",
97 | "
\n",
98 | " \n",
99 | " 1 | \n",
100 | " 5820 | \n",
101 | " Sam | \n",
102 | " MS Edge | \n",
103 | " Linux | \n",
104 | "
\n",
105 | " \n",
106 | " 2 | \n",
107 | " 2340 | \n",
108 | " Harry | \n",
109 | " Vivaldi | \n",
110 | " Windows | \n",
111 | "
\n",
112 | " \n",
113 | " 3 | \n",
114 | " 7860 | \n",
115 | " Albert | \n",
116 | " Chrome | \n",
117 | " Windows | \n",
118 | "
\n",
119 | " \n",
120 | " 4 | \n",
121 | " 1123 | \n",
122 | " May | \n",
123 | " Safari | \n",
124 | " macOS | \n",
125 | "
\n",
126 | " \n",
127 | "
\n",
128 | "
"
129 | ],
130 | "text/plain": [
131 | " User ID Username Browser OS\n",
132 | "0 1580 Barry FireFox Windows\n",
133 | "1 5820 Sam MS Edge Linux\n",
134 | "2 2340 Harry Vivaldi Windows\n",
135 | "3 7860 Albert Chrome Windows\n",
136 | "4 1123 May Safari macOS"
137 | ]
138 | },
139 | "execution_count": 4,
140 | "metadata": {},
141 | "output_type": "execute_result"
142 | }
143 | ],
144 | "source": [
145 | "user_query: str = f\"select * from '{user_data_fp}'\"\n",
146 | "#user_data = conn.execute(user_query).fetchall()\n",
147 | "user_data = conn.sql(user_query).to_df()\n",
148 | "user_data"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 5,
154 | "id": "6a4b1179-126f-478b-aaa3-b22f086fb130",
155 | "metadata": {},
156 | "outputs": [],
157 | "source": [
158 | "conn.close()"
159 | ]
160 | },
161 | {
162 | "cell_type": "code",
163 | "execution_count": null,
164 | "id": "8a5ce0a4-e745-431c-9070-983a419d3aba",
165 | "metadata": {},
166 | "outputs": [],
167 | "source": []
168 | }
169 | ],
170 | "metadata": {
171 | "kernelspec": {
172 | "display_name": "Python 3 (ipykernel)",
173 | "language": "python",
174 | "name": "python3"
175 | },
176 | "language_info": {
177 | "codemirror_mode": {
178 | "name": "ipython",
179 | "version": 3
180 | },
181 | "file_extension": ".py",
182 | "mimetype": "text/x-python",
183 | "name": "python",
184 | "nbconvert_exporter": "python",
185 | "pygments_lexer": "ipython3",
186 | "version": "3.11.11"
187 | }
188 | },
189 | "nbformat": 4,
190 | "nbformat_minor": 5
191 | }
192 |
--------------------------------------------------------------------------------
/programming/jupyter/jupyter-pyspark-duckdb/ipython-notebooks/simple-spark-pandas.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Simple PySpark test\n",
8 | "===================\n",
9 | "\n",
10 | "A Spark DataFrame is created in memory and saved as a Parquet file on the local file-system.\n",
11 | "Spark writes several Parquet files when saving a DataFrame. Hence, Pandas is used as a pivot\n",
12 | "data format and when the Pandas DataFrame is saved, a single Parquet file is produced.\n"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 1,
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "name": "stdout",
22 | "output_type": "stream",
23 | "text": [
24 | "3.11.11 (main, Dec 6 2024, 12:21:43) [Clang 16.0.0 (clang-1600.0.26.4)]\n",
25 | "3.11.11\n"
26 | ]
27 | }
28 | ],
29 | "source": [
30 | "import sys, platform\n",
31 | "print(sys.version)\n",
32 | "print(platform.python_version())"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 2,
38 | "metadata": {
39 | "application/vnd.databricks.v1+cell": {
40 | "cellMetadata": {},
41 | "inputWidgets": {},
42 | "nuid": "fe9e9e64-75f6-4a0c-a946-482d5025e6f1",
43 | "showTitle": false,
44 | "title": ""
45 | }
46 | },
47 | "outputs": [],
48 | "source": [
49 | "user_data_fp: str = \"../data/parquet/user-details.parquet\""
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 3,
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "name": "stderr",
59 | "output_type": "stream",
60 | "text": [
61 | "Setting default log level to \"WARN\".\n",
62 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
63 | "25/02/14 14:28:27 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
64 | ]
65 | },
66 | {
67 | "name": "stdout",
68 | "output_type": "stream",
69 | "text": [
70 | "3.5.4\n"
71 | ]
72 | }
73 | ],
74 | "source": [
75 | "# Import Libraries\n",
76 | "import pyspark.sql.types as T\n",
77 | "from pyspark.sql import SparkSession\n",
78 | "\n",
79 | "# Setup the Configuration\n",
80 | "#conf = pyspark.SparkConf()\n",
81 | "\n",
82 | "# Retrieve the Spark session\n",
83 | "spark = SparkSession.builder.getOrCreate()\n",
84 | "print(spark.version)"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 4,
90 | "metadata": {},
91 | "outputs": [],
92 | "source": [
93 | "# Setup the Schema\n",
94 | "schema = T.StructType([\n",
95 | "T.StructField(\"User ID\", T.IntegerType(), True),\n",
96 | "T.StructField(\"Username\", T.StringType(), True),\n",
97 | "T.StructField(\"Browser\", T.StringType(), True),\n",
98 | "T.StructField(\"OS\", T.StringType(), True),\n",
99 | "])\n",
100 | "\n",
101 | "# Add Data\n",
102 | "data = ([\n",
103 | "(1580, \"Barry\", \"FireFox\", \"Windows\" ),\n",
104 | "(5820, \"Sam\", \"MS Edge\", \"Linux\"),\n",
105 | "(2340, \"Harry\", \"Vivaldi\", \"Windows\"),\n",
106 | "(7860, \"Albert\", \"Chrome\", \"Windows\"),\n",
107 | "(1123, \"May\", \"Safari\", \"macOS\")\n",
108 | "])\n",
109 | "\n",
110 | "# Setup the Data Frame\n",
111 | "user_data_df = spark.createDataFrame(data, schema=schema)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 5,
117 | "metadata": {},
118 | "outputs": [
119 | {
120 | "name": "stderr",
121 | "output_type": "stream",
122 | "text": [
123 | " "
124 | ]
125 | },
126 | {
127 | "data": {
128 | "text/html": [
129 | "\n",
130 | "\n",
143 | "
\n",
144 | " \n",
145 | " \n",
146 | " | \n",
147 | " User ID | \n",
148 | " Username | \n",
149 | " Browser | \n",
150 | " OS | \n",
151 | "
\n",
152 | " \n",
153 | " \n",
154 | " \n",
155 | " 0 | \n",
156 | " 1580 | \n",
157 | " Barry | \n",
158 | " FireFox | \n",
159 | " Windows | \n",
160 | "
\n",
161 | " \n",
162 | " 1 | \n",
163 | " 5820 | \n",
164 | " Sam | \n",
165 | " MS Edge | \n",
166 | " Linux | \n",
167 | "
\n",
168 | " \n",
169 | " 2 | \n",
170 | " 2340 | \n",
171 | " Harry | \n",
172 | " Vivaldi | \n",
173 | " Windows | \n",
174 | "
\n",
175 | " \n",
176 | " 3 | \n",
177 | " 7860 | \n",
178 | " Albert | \n",
179 | " Chrome | \n",
180 | " Windows | \n",
181 | "
\n",
182 | " \n",
183 | " 4 | \n",
184 | " 1123 | \n",
185 | " May | \n",
186 | " Safari | \n",
187 | " macOS | \n",
188 | "
\n",
189 | " \n",
190 | "
\n",
191 | "
"
192 | ],
193 | "text/plain": [
194 | " User ID Username Browser OS\n",
195 | "0 1580 Barry FireFox Windows\n",
196 | "1 5820 Sam MS Edge Linux\n",
197 | "2 2340 Harry Vivaldi Windows\n",
198 | "3 7860 Albert Chrome Windows\n",
199 | "4 1123 May Safari macOS"
200 | ]
201 | },
202 | "execution_count": 5,
203 | "metadata": {},
204 | "output_type": "execute_result"
205 | }
206 | ],
207 | "source": [
208 | "user_data_pdf = user_data_df.toPandas()\n",
209 | "user_data_pdf"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 6,
215 | "metadata": {},
216 | "outputs": [],
217 | "source": [
218 | "user_data_pdf.to_parquet(user_data_fp)"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": 7,
224 | "metadata": {},
225 | "outputs": [
226 | {
227 | "name": "stdout",
228 | "output_type": "stream",
229 | "text": [
230 | "total 8\n",
231 | "-rw-r--r--@ 1 mac-DARNAU24 staff 3.1K Feb 14 14:28 user-details.parquet\n"
232 | ]
233 | }
234 | ],
235 | "source": [
236 | "%%sh\n",
237 | "ls -lFh ../data/parquet/"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": []
246 | }
247 | ],
248 | "metadata": {
249 | "application/vnd.databricks.v1+notebook": {
250 | "dashboards": [],
251 | "language": "python",
252 | "notebookMetadata": {},
253 | "notebookName": "test",
254 | "widgets": {}
255 | },
256 | "kernelspec": {
257 | "display_name": "Python 3 (ipykernel)",
258 | "language": "python",
259 | "name": "python3"
260 | },
261 | "language_info": {
262 | "codemirror_mode": {
263 | "name": "ipython",
264 | "version": 3
265 | },
266 | "file_extension": ".py",
267 | "mimetype": "text/x-python",
268 | "name": "python",
269 | "nbconvert_exporter": "python",
270 | "pygments_lexer": "ipython3",
271 | "version": "3.11.11"
272 | }
273 | },
274 | "nbformat": 4,
275 | "nbformat_minor": 4
276 | }
277 |
--------------------------------------------------------------------------------
/programming/python/example/.gitignore:
--------------------------------------------------------------------------------
1 | # uv
2 | uv.lock
3 |
4 |
--------------------------------------------------------------------------------
/programming/python/example/.python-version:
--------------------------------------------------------------------------------
1 | 3.13
2 |
--------------------------------------------------------------------------------
/programming/python/example/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/programming/python/example/README.md
--------------------------------------------------------------------------------
/programming/python/example/example.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env -S uv run
2 | # /// script
3 | # requires-python = ">=3.13"
4 | # dependencies = [
5 | # "requests",
6 | # ]
7 | # ///
8 | import requests; print(requests.get("https://astral.sh"))
9 |
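# Usage sketch: make the script executable and launch it directly (the shebang
# above delegates to "uv run"), or invoke it explicitly with "uv run example.py".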
--------------------------------------------------------------------------------
/programming/python/example/hello.py:
--------------------------------------------------------------------------------
1 | def main():
2 | print("Hello from example!")
3 |
4 |
5 | if __name__ == "__main__":
6 | main()
7 |
--------------------------------------------------------------------------------
/programming/python/example/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "example"
3 | version = "0.1.0"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | requires-python = ">=3.13"
7 | dependencies = [
8 | "ruff>=0.9.6",
9 | ]
10 |
--------------------------------------------------------------------------------
/programming/python/numbers/.gitignore:
--------------------------------------------------------------------------------
1 | # uv
2 | uv.lock
3 |
4 |
--------------------------------------------------------------------------------
/programming/python/numbers/.python-version:
--------------------------------------------------------------------------------
1 | 3.13
2 |
--------------------------------------------------------------------------------
/programming/python/numbers/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/programming/python/numbers/README.md
--------------------------------------------------------------------------------
/programming/python/numbers/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "numbers"
3 | version = "0.1.0"
4 | description = "Add your description here"
5 | readme = "README.md"
6 | authors = [
7 | { name = "Denis Arnaud", email = "denis.arnaud@decathlon.com" }
8 | ]
9 | requires-python = ">=3.13"
10 | dependencies = []
11 |
12 | [build-system]
13 | requires = ["hatchling"]
14 | build-backend = "hatchling.build"
15 |
16 | [dependency-groups]
17 | dev = [
18 | "ruff>=0.9.6",
19 | ]
20 |
--------------------------------------------------------------------------------
/programming/python/numbers/src/numbers/__init__.py:
--------------------------------------------------------------------------------
1 | from typing import Iterable
2 |
4 |
5 | def sum_even_numbers(numbers: Iterable[int]) -> int:
6 | """Given an iterable of integers, return the sum of all even numbers in the iterable."""
7 | return sum(
8 | num for num in numbers
9 | if num % 2 == 0
10 | )
11 |
12 | def hello() -> str:
13 | return "Hello from numbers!"
14 |
15 |
--------------------------------------------------------------------------------
/programming/python/numbers/src/numbers/py.typed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/data-engineering-helpers/ks-cheat-sheets/ea4c18af5b2a48667ed44352206f69eafd4886ea/programming/python/numbers/src/numbers/py.typed
--------------------------------------------------------------------------------
/secret-management/hashicorp-vault/README.md:
--------------------------------------------------------------------------------
1 | Cheat Sheet - HashiCorp Vault
2 | =============================
3 |
4 | # Table of Contents (ToC)
5 |
6 | Created by [gh-md-toc](https://github.com/ekalinin/github-markdown-toc.go)
7 |
8 | # Overview
9 | [This cheat sheet](https://github.com/data-engineering-helpers/ks-cheat-sheets/blob/main/secret-management/hashicorp-vault/README.md)
10 | explains how to install and use
11 | [HashiCorp Vault](https://developer.hashicorp.com/vault/tutorials/getting-started/getting-started-install)
12 | on premises, _e.g._, on a laptop or on a virtual machine (VM).
13 |
14 | # References
15 | * [HashiCorp - Developer docs - Getting started - Installation](https://developer.hashicorp.com/vault/tutorials/getting-started/getting-started-install)
16 | * [HashiCorp - Developer docs - Install Vault](https://developer.hashicorp.com/vault/install)
17 |
18 | # Setup
19 |
20 | ## MacOS
21 | * Install HashiCorp Vault with Homebrew:
22 | ```bash
23 | brew tap hashicorp/tap
24 | brew install hashicorp/tap/vault
25 | ```
26 |
27 | * In the shell init script (_e.g._, `~/.bashrc`), specify the `VAULT_ADDR` environment variable:
28 | ```bash
29 | export VAULT_ADDR="https://some.vaultdomain.my.cloud"
30 | ```
31 |
32 | # Interact with HashiCorp Vault
33 | * Once in a while, log in to the Vault
34 |   * Sign in with the domain SSO (single sign-on) in a web browser:
35 | ```bash
36 | vault login -method=oidc
37 | ```
38 |   * That opens a tab in the current web browser window, checks
39 |     that the SSO session is still valid, and then stores the resulting
40 |     Vault token in the `~/.vault-token` file
41 |
42 | * List the secret paths and keys available for the project:
43 | ```bash
44 | vault kv list -namespace=mynm -mount="secret-mydom" "some/specific/path"
45 | Keys
46 | ----
47 | path1
48 | path2
49 | somedir/
50 | ```
51 |
52 | * Display the Vault key-value pairs for a specific path:
53 | ```bash
54 | vault kv get -namespace=mynm -mount="secret-mydom" "some/specific/path"
55 | ============= Data =============
56 | Key Value
57 | --- -----
58 | ...
59 | some_key1 some_value1
60 | some_key2 some_value2
61 | ```
62 |
63 |
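64 | * To check whether the current Vault token is still valid (and to display
65 |   its remaining TTL and policies) without triggering a new SSO flow,
66 |   one may use `vault token lookup`:
67 | ```bash
68 | vault token lookup
69 | ```
70 | 
71 | * To retrieve a single value, _e.g._, in order to export it as an environment
72 |   variable, the `-field` option of `vault kv get` may be used. The namespace,
73 |   mount, path and key below are the same placeholders as in the examples
74 |   above, and `SOME_KEY1` is just an illustrative variable name:
75 | ```bash
76 | export SOME_KEY1="$(vault kv get -namespace=mynm -mount="secret-mydom" -field=some_key1 "some/specific/path")"
77 | ```
78 | 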
--------------------------------------------------------------------------------