├── .github
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   └── workflows
│       └── deploy.yaml
├── .gitignore
├── .hugo_build.lock
├── .prettierrc
├── CODE_OF_CONDUCT.md
├── LICENSE.txt
├── Makefile
├── README.md
├── assets
│   ├── indices
│   │   └── .gitkeep
│   ├── js
│   │   ├── callouts.js
│   │   ├── clipboard.js
│   │   ├── code-title.js
│   │   ├── darkmode.js
│   │   ├── full-text-search.js
│   │   ├── graph.js
│   │   ├── popover.js
│   │   ├── router.js
│   │   ├── semantic-search.js
│   │   └── util.js
│   └── styles
│       ├── _callouts.scss
│       ├── _dark_syntax.scss
│       ├── _light_syntax.scss
│       ├── base.scss
│       ├── clipboard.scss
│       ├── code-title.scss
│       ├── custom.scss
│       ├── darkmode.scss
│       └── syntax.scss
├── config.toml
├── content
│   ├── .obsidian
│   │   ├── .vimrc
│   │   ├── app.json
│   │   ├── appearance.json
│   │   ├── community-plugins.json
│   │   ├── core-plugins.json
│   │   ├── graph.json
│   │   ├── hotkeys.json
│   │   ├── plugins
│   │   │   ├── obsidian-admonition
│   │   │   │   ├── data.json
│   │   │   │   ├── main.js
│   │   │   │   ├── manifest.json
│   │   │   │   └── styles.css
│   │   │   └── obsidian-auto-link-title
│   │   │       ├── main.js
│   │   │       ├── manifest.json
│   │   │       └── styles.css
│   │   ├── templates.json
│   │   ├── themes
│   │   │   └── kanagawa.css
│   │   ├── workspace
│   │   └── workspace.json
│   ├── _index.md
│   ├── images
│   │   ├── business-intelligence-tools-landscape.png
│   │   ├── dag.png
│   │   ├── data-catalog-feature-comparison2.png
│   │   ├── data-catalog-overview-sarah.png
│   │   ├── data-contract-example.png
│   │   ├── data-contract.png
│   │   ├── data-engineering-lifecycle.png
│   │   ├── data-hierarchy-of-needs.png
│   │   ├── data-integration.jpg
│   │   ├── data-ops.png
│   │   ├── data-quality.png
│   │   ├── declarative-vs-imperative.png
│   │   ├── elt-tool.png
│   │   ├── etl-tool.png
│   │   ├── etlt-extract-tweak-load-transform.png
│   │   ├── future-modern-data-stack.png
│   │   ├── semantic-warehouse.png
│   │   ├── setup-folder-structure.png
│   │   ├── setup-obsidian-vault.png
│   │   └── sql-levels-explained.png
│   ├── private
│   │   └── private-note.md
│   ├── templates
│   │   ├── airbyte.md
│   │   └── term.md
│   └── term
│       ├── about this glossary.md
│       ├── acid transactions.md
│       ├── airbyte catalog.md
│       ├── airbyte cdk.md
│       ├── airbyte cloud.md
│       ├── airbyte glossary of terms.md
│       ├── airbyte specification.md
│       ├── airbyte streams.md
│       ├── airbyte.md
│       ├── analytics.md
│       ├── apache airflow.md
│       ├── apache arrow.md
│       ├── apache avro.md
│       ├── apache druid.md
│       ├── apache hadoop.md
│       ├── apache hive.md
│       ├── apache hudi.md
│       ├── apache iceberg.md
│       ├── apache parquet.md
│       ├── apache spark.md
│       ├── behavioral data.md
│       ├── big o notation.md
│       ├── bus matrix.md
│       ├── business intelligence tools.md
│       ├── business intelligence.md
│       ├── cdp (customer data platform).md
│       ├── cloud provider.md
│       ├── contribute to glossary.md
│       ├── cte (common table expression).md
│       ├── cursor.md
│       ├── dag directed acyclic graph.md
│       ├── dagster.md
│       ├── data asset.md
│       ├── data catalog.md
│       ├── data contract.md
│       ├── data engineering concepts.md
│       ├── data engineering guides.md
│       ├── data engineering lifecycle.md
│       ├── data engineering.md
│       ├── data enrichment.md
│       ├── data federation.md
│       ├── data governance.md
│       ├── data hierarchy of needs.md
│       ├── data integration.md
│       ├── data lake file format.md
│       ├── data lake table format.md
│       ├── data lake transaction log.md
│       ├── data lake.md
│       ├── data lakehouse.md
│       ├── data lineage.md
│       ├── data literacy.md
│       ├── data mesh.md
│       ├── data modeling.md
│       ├── data observability.md
│       ├── data ops.md
│       ├── data orchestrator.md
│       ├── data processing techniques.md
│       ├── data product.md
│       ├── data quality.md
│       ├── data swamp.md
│       ├── data transformation.md
│       ├── data virtualization.md
│       ├── data warehouse.md
│       ├── database normalization.md
│       ├── declarative.md
│       ├── delta lake.md
│       ├── dev ops.md
│       ├── dimensional modeling.md
│       ├── dimensions.md
│       ├── duckdb.md
│       ├── elt.md
│       ├── etl elt airbyte.md
│       ├── etl vs elt.md
│       ├── etl.md
│       ├── etlt.md
│       ├── full refresh synchronization.md
│       ├── functional data engineering.md
│       ├── functional programming.md
│       ├── general infos.md
│       ├── granularity.md
│       ├── idempotency.md
│       ├── imperative.md
│       ├── in-memory format.md
│       ├── incremental synchronization.md
│       ├── jinja template.md
│       ├── key performance indicator (kpi).md
│       ├── kubernetes.md
│       ├── lambda architecture.md
│       ├── machine learning.md
│       ├── map reduce.md
│       ├── master data management (mdm).md
│       ├── maxime beauchemin.md
│       ├── metric.md
│       ├── metrics layer.md
│       ├── modern data stack.md
│       ├── normalization.md
│       ├── notebooks.md
│       ├── olap (online analytical processing).md
│       ├── oltp (online transactional processing).md
│       ├── open data stack.md
│       ├── orc.md
│       ├── other glossaries.md
│       ├── pandas.md
│       ├── partial success.md
│       ├── programming languages.md
│       ├── push-down.md
│       ├── python.md
│       ├── raw tables.md
│       ├── reverse etl.md
│       ├── rollup.md
│       ├── rust.md
│       ├── schema evolution.md
│       ├── semantic layer.md
│       ├── semantic warehouse.md
│       ├── semi-structured data.md
│       ├── slowly changing dimension scd.md
│       ├── soft delete.md
│       ├── software-defined assets.md
│       ├── sql.md
│       ├── storage layer object store.md
│       ├── structured data.md
│       ├── sync run.md
│       ├── temporal.md
│       ├── time travel.md
│       ├── unstructured data.md
│       └── yaml.md
├── data
│   ├── config.yaml
│   └── graphConfig.yaml
├── deployment.md
├── i18n
│   ├── ar.toml
│   ├── en.toml
│   ├── es.toml
│   ├── fr.toml
│   └── uk.toml
├── layouts
│   ├── 404.html
│   ├── _default
│   │   ├── _markup
│   │   │   ├── render-image.html
│   │   │   └── render-link.html
│   │   ├── baseof.html
│   │   ├── section.html
│   │   ├── single.html
│   │   ├── taxonomy.html
│   │   └── term.html
│   ├── index.html
│   └── partials
│       ├── backlinks.html
│       ├── contact.html
│       ├── darkmode.html
│       ├── date-fmt.html
│       ├── footer.html
│       ├── footerIndex.html
│       ├── github.html
│       ├── graph.html
│       ├── head.html
│       ├── header.html
│       ├── katex.html
│       ├── page-list.html
│       ├── recent.html
│       ├── search.html
│       ├── tags.html
│       ├── textprocessing.html
│       └── toc.html
├── lower_case.py
├── lower_link_index.py
├── static
│   ├── glossary-feature.jpeg
│   └── icon.png
└── utils
    └── requirements.txt
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [jackyzha0]
2 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Something about Quartz isn't working the way you expect
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - Device: [e.g. iPhone6]
28 | - OS: [e.g. iOS]
29 | - Browser [e.g. chrome, safari]
30 |
31 | **Additional context**
32 | Add any other context about the problem here.
33 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea or improvement for Quartz
4 | title: ''
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.github/workflows/deploy.yaml:
--------------------------------------------------------------------------------
1 | name: Deploy to GitHub Pages
2 |
3 | on:
4 | push:
5 | branches:
6 | - hugo
7 |
8 | jobs:
9 | deploy:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@v3
13 | with:
14 | fetch-depth: 0 # Fetch all history for .GitInfo and .Lastmod
15 |
16 | # - name: Checkout python
17 | # uses: actions/checkout@v2
18 | # # Setup steps
19 | # - uses: actions/setup-python@v2
20 | # with:
21 | # python-version: "3.9.11"
22 |
23 | # # hack to have lower-case links instead e.g. instead of term/Apache-Avro we have term/apache-avro
24 | # - name: Convert links of terms to lower case
25 | # run: python3 lower_case.py
26 |
27 | - name: Build Link Index
28 | uses: jackyzha0/hugo-obsidian@v2.18
29 | with:
30 | index: true
31 | input: content
32 | output: assets/indices
33 | root: .
34 |
35 | # - name: Convert linkIndex to lower case (assets)
36 | # run: sudo python3 lower_link_index.py "assets/indices/"
37 |
38 | - name: Setup Hugo
39 | uses: peaceiris/actions-hugo@v2
40 | with:
41 | hugo-version: 'latest'
42 | extended: true
43 |
44 | - name: Build Hugo
45 | run: hugo --minify
46 |
47 | # # hack to also update graph view to lower-case (otherwise links are not showing correctly)
48 | # - name: Convert links to lower case (public)
49 | # run: sudo python3 lower_link_index.py "public/indices/"
50 |
51 | - name: Deploy
52 | uses: peaceiris/actions-gh-pages@v3
53 | with:
54 | # deploy_key: ${{ secrets.ACTIONS_DEPLOY_KEY }}
55 | personal_token: ${{ secrets.PERSONAL_TOKEN }}
56 | publish_dir: ./public
57 | external_repository: airbyteglossary/airbyteglossary.github.io
58 | publish_branch: master # deploying branch
59 | cname: glossary.airbyte.com # without trailing slash #https://airbytehq.github.io/brain
60 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .DS_Store
2 | public
3 | resources
4 | .idea
5 | content/.obsidian
6 | assets/indices/linkIndex.json
7 | assets/indices/contentIndex.json
8 | linkmap
9 | .vscode
10 |
--------------------------------------------------------------------------------
/.hugo_build.lock:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/.hugo_build.lock
--------------------------------------------------------------------------------
/.prettierrc:
--------------------------------------------------------------------------------
1 | {
2 | "printWidth": 100,
3 | "quoteProps": "as-needed",
4 | "trailingComma": "all",
5 | "tabWidth": 2,
6 | "semi": false
7 | }
8 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 jackyzha0
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .DEFAULT_GOAL := serve
2 |
3 | help: ## Show all Makefile targets
4 | @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
5 |
6 | update: ## Update Quartz to the latest version on Github
7 | go install github.com/jackyzha0/hugo-obsidian@latest
8 | @git remote show upstream || (echo "remote 'upstream' not present, setting 'upstream'" && git remote add upstream https://github.com/jackyzha0/quartz.git)
9 | git fetch upstream
10 | git log --oneline --decorate --graph ..upstream/hugo
11 | git checkout -p upstream/hugo -- layouts .github Makefile assets/js assets/styles/base.scss assets/styles/darkmode.scss config.toml data
12 |
13 | update-force: ## Forcefully pull all changes and don't ask to patch
14 | go install github.com/jackyzha0/hugo-obsidian@latest
15 | @git remote show upstream || (echo "remote 'upstream' not present, setting 'upstream'" && git remote add upstream https://github.com/jackyzha0/quartz.git)
16 | git fetch upstream
17 | git checkout upstream/hugo -- layouts .github Makefile assets/js assets/styles/base.scss assets/styles/darkmode.scss config.toml data
18 |
19 | serve: ## Serve Quartz locally
20 | hugo-obsidian -input=content -output=assets/indices -index -root=. && hugo server --enableGitInfo --minify
21 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # A Single Place for All Data Knowledge 🧠
2 |
3 | Hi, happy to see you here.
4 |
5 | This is the source code of the [Data Glossary](https://glossary.airbyte.com).
6 |
7 | * ℹ If you want to know more about the Glossary, see [About this Glossary](https://glossary.airbyte.com/term/about-this-glossary/)
8 | * ✍ Missing a Term or want to fix a typo? Check [How to Contribute](https://glossary.airbyte.com/term/contribute-to-glossary/)
9 | * 👀 Want to discuss or need help? Talk to us on [Slack](https://slack.airbyte.com/)
10 | * 🔨 For more technical details see [Deployment](deployment.md).
11 |
12 | ## Special Thanks
13 | This repo is a fork of [Quartz](https://github.com/jackyzha0/quartz) and we thank Jacky for open-sourcing this gem.
14 |
--------------------------------------------------------------------------------
/assets/indices/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/assets/indices/.gitkeep
--------------------------------------------------------------------------------
/assets/js/callouts.js:
--------------------------------------------------------------------------------
1 | const addCollapsibleCallouts = () => {
2 | const collapsibleCallouts = document.querySelectorAll("blockquote.callout-collapsible");
3 | collapsibleCallouts.forEach(el => el.addEventListener('click', event => {
4 | event.currentTarget.classList.toggle("callout-collapsed");
5 | }));
6 | }
7 |
--------------------------------------------------------------------------------
/assets/js/clipboard.js:
--------------------------------------------------------------------------------
1 | const svgCopy =
2 | '';
3 | const svgCheck =
4 | '';
5 |
6 |
7 | const addCopyButtons = () => {
8 | let els = document.getElementsByClassName("highlight");
9 | // for each highlight
10 | for (let i = 0; i < els.length; i++) {
11 | if (els[i].getElementsByClassName("clipboard-button").length) continue;
12 |
13 | // find pre > code inside els[i]
14 | let codeBlocks = els[i].getElementsByTagName("code");
15 |
16 | // line numbers are inside first code block
17 | let lastCodeBlock = codeBlocks[codeBlocks.length - 1];
18 | const button = document.createElement("button");
19 | button.className = "clipboard-button";
20 | button.type = "button";
21 | button.innerHTML = svgCopy;
22 | // remove every second newline from lastCodeBlock.innerText
23 | button.addEventListener("click", () => {
24 | navigator.clipboard.writeText(lastCodeBlock.innerText.replace(/\n\n/g, "\n")).then(
25 | () => {
26 | button.blur();
27 | button.innerHTML = svgCheck;
28 | setTimeout(() => {
29 | button.innerHTML = svgCopy
30 | button.style.borderColor = ""
31 | }, 2000);
32 | },
33 | (error) => (button.innerHTML = "Error")
34 | );
35 | });
36 | // find chroma inside els[i]
37 | let chroma = els[i].getElementsByClassName("chroma")[0];
38 | els[i].insertBefore(button, chroma);
39 | }
40 | }
41 |
--------------------------------------------------------------------------------
/assets/js/code-title.js:
--------------------------------------------------------------------------------
1 |
2 | function addTitleToCodeBlocks() {
3 | var els = document.getElementsByClassName("highlight");
4 | for (var i = 0; i < els.length; i++) {
5 | if (els[i].title.length) {
6 | let div = document.createElement("div");
7 | if (els[i].getElementsByClassName("code-title").length) continue;
8 | div.textContent=els[i].title;
9 | div.classList.add("code-title")
10 | els[i].insertBefore(div, els[i].firstChild);
11 | }
12 | }
13 | };
14 |
--------------------------------------------------------------------------------
/assets/js/darkmode.js:
--------------------------------------------------------------------------------
1 | const userPref = window.matchMedia('(prefers-color-scheme: light)').matches ? 'light' : 'dark'
2 | const currentTheme = localStorage.getItem('theme') ?? userPref
3 | const syntaxTheme = document.querySelector("#theme-link");
4 |
5 |
6 | {{ $darkSyntax := resources.Get "styles/_dark_syntax.scss" | resources.ToCSS (dict "outputStyle" "compressed") | resources.Fingerprint "md5" | resources.Minify }}
7 | {{ $lightSyntax := resources.Get "styles/_light_syntax.scss" | resources.ToCSS (dict "outputStyle" "compressed") | resources.Fingerprint "md5" | resources.Minify }}
8 |
9 | if (currentTheme) {
10 | document.documentElement.setAttribute('saved-theme', currentTheme);
11 | syntaxTheme.href = currentTheme === 'dark' ? '{{ $darkSyntax.Permalink }}' : '{{ $lightSyntax.Permalink }}';
12 | }
13 |
14 | const switchTheme = (e) => {
15 | if (e.target.checked) {
16 | document.documentElement.setAttribute('saved-theme', 'dark');
17 | localStorage.setItem('theme', 'dark');
18 | syntaxTheme.href = '{{ $darkSyntax.Permalink }}';
19 | }
20 | else {
21 | document.documentElement.setAttribute('saved-theme', 'light')
22 | localStorage.setItem('theme', 'light')
23 | syntaxTheme.href = '{{ $lightSyntax.Permalink }}';
24 | }
25 | }
26 |
27 | window.addEventListener('DOMContentLoaded', () => {
28 | // Darkmode toggle
29 | const toggleSwitch = document.querySelector('#darkmode-toggle')
30 |
31 | // listen for toggle
32 | toggleSwitch.addEventListener('change', switchTheme, false)
33 |
34 | if (currentTheme === 'dark') {
35 | toggleSwitch.checked = true
36 | }
37 | })
38 |
--------------------------------------------------------------------------------
/assets/js/full-text-search.js:
--------------------------------------------------------------------------------
1 | ; (async function() {
2 | const encoder = (str) => str.toLowerCase().split(/([^a-z]|[^\x00-\x7F])/)
3 | const contentIndex = new FlexSearch.Document({
4 | cache: true,
5 | charset: "latin:extra",
6 | optimize: true,
7 | index: [
8 | {
9 | field: "content",
10 | tokenize: "reverse",
11 | encode: encoder,
12 | },
13 | {
14 | field: "title",
15 | tokenize: "forward",
16 | encode: encoder,
17 | },
18 | ],
19 | })
20 |
21 | const { content } = await fetchData
22 | for (const [key, value] of Object.entries(content)) {
23 | contentIndex.add({
24 | id: key,
25 | title: value.title,
26 | content: removeMarkdown(value.content),
27 | })
28 | }
29 |
30 | const formatForDisplay = (id) => ({
31 | id,
32 | url: id,
33 | title: content[id].title,
34 | content: content[id].content,
35 | })
36 |
37 | registerHandlers((e) => {
38 | term = e.target.value
39 | const searchResults = contentIndex.search(term, [
40 | {
41 | field: "content",
42 | limit: 10,
43 | },
44 | {
45 | field: "title",
46 | limit: 5,
47 | },
48 | ])
49 | const getByField = (field) => {
50 | const results = searchResults.filter((x) => x.field === field)
51 | if (results.length === 0) {
52 | return []
53 | } else {
54 | return [...results[0].result]
55 | }
56 | }
57 | const allIds = new Set([...getByField("title"), ...getByField("content")])
58 | const finalResults = [...allIds].map(formatForDisplay)
59 | displayResults(finalResults, true)
60 | })
61 | })()
62 |
--------------------------------------------------------------------------------
/assets/js/popover.js:
--------------------------------------------------------------------------------
1 | function htmlToElement(html) {
2 | const template = document.createElement("template")
3 | html = html.trim()
4 | template.innerHTML = html
5 | return template.content.firstChild
6 | }
7 |
8 | function initPopover(baseURL, useContextualBacklinks, renderLatex) {
9 | const basePath = baseURL.replace(window.location.origin, "")
10 | fetchData.then(({ content }) => {
11 | const links = [...document.getElementsByClassName("internal-link")]
12 | links
13 | .filter(li => li.dataset.src || (li.dataset.idx && useContextualBacklinks))
14 | .forEach(li => {
15 | var el
16 | if (li.dataset.ctx) {
17 | const linkDest = content[li.dataset.src]
18 | const popoverElement = `
19 |
${linkDest.title}
20 |
${highlight(removeMarkdown(linkDest.content), li.dataset.ctx)}...
21 |
${new Date(linkDest.lastmodified).toLocaleDateString()}
22 |
`
23 | el = htmlToElement(popoverElement)
24 | } else {
25 | const linkDest = content[li.dataset.src.replace(/\/$/g, "").replace(basePath, "")]
26 | if (linkDest) {
27 | let splitLink = li.href.split("#")
28 | let cleanedContent = removeMarkdown(linkDest.content)
29 | if (splitLink.length > 1) {
30 | let headingName = splitLink[1].replace(/\-/g, " ")
31 | let headingIndex = cleanedContent.toLowerCase().indexOf("" + headingName + "")
32 | cleanedContent = cleanedContent.substring(headingIndex, cleanedContent.length)
33 | }
34 | const popoverElement = `
35 |
${linkDest.title}
36 |
${cleanedContent.split(" ", 20).join(" ")}...
37 |
${new Date(linkDest.lastmodified).toLocaleDateString()}
38 |
`
39 | el = htmlToElement(popoverElement)
40 | }
41 | }
42 |
43 | if (el) {
44 | li.appendChild(el)
45 | if (renderLatex) {
46 | renderMathInElement(el, {
47 | delimiters: [
48 | { left: '$$', right: '$$', display: false },
49 | { left: '$', right: '$', display: false },
50 | ],
51 | throwOnError: false
52 | })
53 | }
54 |
55 | li.addEventListener("mouseover", () => {
56 | // fix tooltip positioning
57 | window.FloatingUIDOM.computePosition(li, el, {
58 | middleware: [window.FloatingUIDOM.offset(10), window.FloatingUIDOM.inline(), window.FloatingUIDOM.shift()],
59 | }).then(({ x, y }) => {
60 | Object.assign(el.style, {
61 | left: `${x}px`,
62 | top: `${y}px`,
63 | })
64 | })
65 |
66 | el.classList.add("visible")
67 | })
68 | li.addEventListener("mouseout", () => {
69 | el.classList.remove("visible")
70 | })
71 | }
72 | })
73 | })
74 | }
75 |
--------------------------------------------------------------------------------
/assets/js/router.js:
--------------------------------------------------------------------------------
1 | import {
2 | apply,
3 | navigate,
4 | prefetch,
5 | router,
6 | } from "https://unpkg.com/million@1.11.5/dist/router.mjs"
7 |
8 | export const attachSPARouting = (init, rerender) => {
9 | // Attach SPA functions to the global Million namespace
10 | window.Million = {
11 | apply,
12 | navigate,
13 | prefetch,
14 | router,
15 | }
16 |
17 | const render = () => requestAnimationFrame(rerender)
18 |
19 | window.addEventListener("DOMContentLoaded", () => {
20 | apply((doc) => init(doc))
21 | init()
22 | router(".singlePage")
23 | render()
24 | })
25 | window.addEventListener("million:navigate", render)
26 | }
27 |
--------------------------------------------------------------------------------
/assets/js/semantic-search.js:
--------------------------------------------------------------------------------
1 | const apiKey = "{{$.Site.Data.config.operandApiKey}}"
2 |
3 | async function searchContents(query) {
4 | const response = await fetch('https://prod.operand.ai/v3/search/objects', {
5 | method: 'POST',
6 | headers: {
7 | 'Content-Type': 'application/json',
8 | Authorization: apiKey,
9 | },
10 | body: JSON.stringify({
11 | query,
12 | max: 10
13 | }),
14 | });
15 | return (await response.json());
16 | }
17 |
18 | function debounce(func, timeout = 200) {
19 | let timer;
20 | return (...args) => {
21 | clearTimeout(timer)
22 | timer = setTimeout(() => { func.apply(this, args); }, timeout)
23 | };
24 | }
25 |
26 | registerHandlers(debounce((e) => {
27 | term = e.target.value
28 | if (term !== "") {
29 | searchContents(term)
30 | .then((res) => res.results.map(entry => ({
31 | url: entry.object.properties.url,
32 | content: entry.snippet,
33 | title: entry.object.metadata.title
34 | })
35 | ))
36 | .then(results => displayResults(results))
37 | }
38 | }))
39 |
--------------------------------------------------------------------------------
/assets/styles/clipboard.scss:
--------------------------------------------------------------------------------
1 | .clipboard-button {
2 | position: absolute;
3 | display: flex;
4 | float: right;
5 | right: 0;
6 | padding: 0.69em;
7 | margin: 0.5em;
8 | color: var(--outlinegray);
9 | border-color: var(--dark);
10 | background-color: var(--lightgray);
11 | filter: contrast(1.1);
12 | border: 2px solid;
13 | border-radius: 6px;
14 | font-size: 0.8em;
15 | z-index: 1;
16 | opacity: 0;
17 | transition: 0.12s;
18 |
19 | & > svg {
20 | fill: var(--light);
21 | filter: contrast(0.3);
22 | }
23 |
24 | &:hover {
25 | cursor: pointer;
26 | border-color: var(--primary);
27 |
28 | & > svg {
29 | fill: var(--primary);
30 | }
31 | }
32 |
33 | &:focus {
34 | outline: 0;
35 | }
36 | }
37 |
38 | .highlight {
39 | position: relative;
40 |
41 | &:hover > .clipboard-button {
42 | opacity: 1;
43 | transition: 0.2s;
44 | }
45 | }
46 |
47 |
48 |
--------------------------------------------------------------------------------
/assets/styles/code-title.scss:
--------------------------------------------------------------------------------
1 | .code-title {
2 | color: var(--primary) ;
3 | font-family: var(--font-mono);
4 | width: max-content;
5 | overflow-x: auto;
6 | display: inline-block;
7 | vertical-align: middle;
8 | font-weight: normal;
9 | line-height: 1em;
10 | position: relative;
11 | padding: 0.5em 0.6em 0.6em; // + 1.2 em
12 | max-width: calc(100% - 1.2em); // (-1.2 em) fits article width exactly
13 | margin-bottom: -0.2em;
14 | z-index: -1;
15 | border-top-left-radius: 0.3em;
16 | border-top-right-radius: 0.3em;
17 | font-size: 0.9em;
18 | background-color: var(--lightgray);
19 | filter: hue-rotate(-30deg) contrast(1.0) opacity(0.8);
20 | }
--------------------------------------------------------------------------------
/assets/styles/custom.scss:
--------------------------------------------------------------------------------
1 | // Add your own CSS here!
2 | :root {
3 | --light: #FFF;
4 | --dark: #1A194D;
5 | --secondary: #433BFB;
6 | --tertiary: #FF6A4D;
7 | --visited: #615EFF;
8 | --primary: #615EFF;
9 | --gray: #1A194D; //#4e4e4e;
10 | --lightgray: #E8E8ED;
11 | --outlinegray: #D9D9E0;
12 | --million-progress-bar-color: var(--secondary);
13 | --callout-info-accent: #67DAE1 !important;
14 | }
15 |
16 | [saved-theme="dark"] {
17 | --light: #0A0A23 !important;
18 | --dark: #fff !important;
19 | --secondary: #615EFF !important; //#625eff !important; // #ff6a4d !important;
20 | --visited: #7F7EFF !important;
21 | --tertiary: #FF6A4D !important;
22 | --primary: #7F7EFF !important;
23 | --gray: #fff !important;
24 | --lightgray: #1A194D !important;
25 | --outlinegray: #2D3270 !important;
26 | --callout-info-accent: #00A5B5 !important;
27 | }
28 |
--------------------------------------------------------------------------------
/assets/styles/darkmode.scss:
--------------------------------------------------------------------------------
1 | .darkmode {
2 | float: right;
3 | padding: 1em;
4 | min-width: 5px;
5 | position: relative;
6 |
7 | @media all and (max-width: 450px) {
8 | padding: 1em;
9 | }
10 |
11 | & > .toggle {
12 | display: none;
13 | box-sizing: border-box;
14 | }
15 |
16 | & svg {
17 | opacity: 0;
18 | position: absolute;
19 | width: 20px;
20 | height: 20px;
21 | top: calc(50% - 10px);
22 | margin: 0 7px;
23 | fill: var(--gray);
24 | transition: opacity 0.1s ease;
25 | }
26 | }
27 |
28 | .toggle:checked ~ label {
29 | & > #dayIcon {
30 | opacity: 0;
31 | }
32 | & > #nightIcon {
33 | opacity: 1;
34 | }
35 | }
36 |
37 | .toggle:not(:checked) ~ label {
38 | & > #dayIcon {
39 | opacity: 1;
40 | }
41 | & > #nightIcon {
42 | opacity: 0;
43 | }
44 | }
45 |
--------------------------------------------------------------------------------
/assets/styles/syntax.scss:
--------------------------------------------------------------------------------
1 | // Overrides
2 | /* Background */
3 | .chroma {
4 | overflow: hidden !important;
5 | background-color: var(--lightgray) !important;
6 | }
7 |
8 | /* LineTable */
9 | .chroma .lntable {
10 | width: auto !important;
11 | overflow: auto !important;
12 | display: block !important;
13 | }
14 |
15 | /* LineHighlight */
16 | .chroma .hl {
17 | display: block !important;
18 | width: 100% !important;
19 | }
20 |
21 | /* LineNumbersTable */
22 | .chroma .lnt {
23 | margin-right: 0.0em !important;
24 | padding: 0 0.0em 0 0.0em !important;
25 | }
26 |
27 | /* LineNumbers */
28 | .chroma .ln {
29 | margin-right: 0.0em !important;
30 | padding: 0 0.0em 0 0.0em !important;
31 | }
32 |
33 | /* GenericDeleted */
34 | .chroma .gd {
35 | color: #8b080b !important;
36 | }
37 |
38 | /* GenericInserted */
39 | .chroma .gi {
40 | font-weight: bold !important;
41 | }
42 |
43 | .lntd:first-of-type > .chroma {
44 | padding-right: 0 !important;
45 | }
46 |
47 | .chroma code {
48 | font-family: var(--font-mono) !important;
49 | font-size: 0.85em !important;
50 | line-height: 2em !important;
51 | background: none !important;
52 | padding: 0 !important;
53 | }
54 |
55 | .chroma {
56 | border-radius: 3px !important;
57 | margin: 0 !important;
58 | }
59 |
60 | pre.chroma {
61 | -moz-tab-size:4;-o-tab-size:4;tab-size:4;
62 | }
63 |
--------------------------------------------------------------------------------
/config.toml:
--------------------------------------------------------------------------------
1 | baseURL = "https://glossary.airbyte.com/" #make sure it ends with trailing slash
2 | languageCode = "en-us"
3 | googleAnalytics = "G-HDBMVFQGBH"
4 | relativeURLs = false
5 | disablePathToLower = true
6 | ignoreFiles = [
7 | "/content/templates/*",
8 | "/content/private/*",
9 | ]
10 | summaryLength = 20
11 | paginate = 10
12 | enableGitInfo = true
13 |
14 | [markup]
15 | [markup.tableOfContents]
16 | endLevel = 3
17 | ordered = true
18 | startLevel = 2
19 | [markup.highlight]
20 | noClasses = false
21 | anchorLineNos = false
22 | codeFences = true
23 | guessSyntax = true
24 | hl_Lines = ""
25 | lineAnchors = ""
26 | lineNoStart = 1
27 | lineNos = true
28 | lineNumbersInTable = true
29 | style = "dracula"
30 | [frontmatter]
31 | lastmod = ["lastmod", ":git", "date", "publishDate"]
32 | publishDate = ["publishDate", "date"]
33 | [markup.goldmark.renderer]
34 | unsafe = true
35 |
--------------------------------------------------------------------------------
/content/.obsidian/.vimrc:
--------------------------------------------------------------------------------
1 |
2 |
3 | " remap esc key
4 | imap kj
5 | imap jk
6 | imap jj
7 |
8 | " map to visual line instead of full line as in obsidian line are always
9 | " wraped
10 | noremap j
11 | noremap k
12 | nmap j gj
13 | nmap k gk
14 | " nmap $ g$
15 | " nmap ^ g^
16 | " nmap 0 g^
17 |
18 | "yank to clipboard
19 | set clipboard=unnamed
20 |
--------------------------------------------------------------------------------
/content/.obsidian/app.json:
--------------------------------------------------------------------------------
1 | {
2 | "newLinkFormat": "absolute",
3 | "alwaysUpdateLinks": true,
4 | "promptDelete": false,
5 | "vimMode": false,
6 | "attachmentFolderPath": "images",
7 | "newFileLocation": "folder",
8 | "newFileFolderPath": "term",
9 | "useMarkdownLinks": true,
10 | "pdfExportSettings": {
11 | "pageSize": "Letter",
12 | "landscape": false,
13 | "margin": "0",
14 | "downscalePercent": 100
15 | },
16 | "foldHeading": true,
17 | "showInlineTitle": false
18 | }
--------------------------------------------------------------------------------
/content/.obsidian/appearance.json:
--------------------------------------------------------------------------------
1 | {
2 | "cssTheme": "kanagawa",
3 | "accentColor": "",
4 | "showViewHeader": false,
5 | "nativeMenus": true
6 | }
--------------------------------------------------------------------------------
/content/.obsidian/community-plugins.json:
--------------------------------------------------------------------------------
1 | [
2 | "obsidian-admonition",
3 | "obsidian-auto-link-title"
4 | ]
--------------------------------------------------------------------------------
/content/.obsidian/core-plugins.json:
--------------------------------------------------------------------------------
1 | [
2 | "file-explorer",
3 | "global-search",
4 | "switcher",
5 | "graph",
6 | "backlink",
7 | "outgoing-link",
8 | "tag-pane",
9 | "page-preview",
10 | "daily-notes",
11 | "templates",
12 | "note-composer",
13 | "command-palette",
14 | "editor-status",
15 | "starred",
16 | "outline",
17 | "word-count",
18 | "file-recovery"
19 | ]
--------------------------------------------------------------------------------
/content/.obsidian/graph.json:
--------------------------------------------------------------------------------
1 | {
2 | "collapse-filter": false,
3 | "search": "",
4 | "showTags": false,
5 | "showAttachments": true,
6 | "hideUnresolved": false,
7 | "showOrphans": true,
8 | "collapse-color-groups": true,
9 | "colorGroups": [],
10 | "collapse-display": false,
11 | "showArrow": false,
12 | "textFadeMultiplier": 1.5,
13 | "nodeSizeMultiplier": 1,
14 | "lineSizeMultiplier": 1,
15 | "collapse-forces": true,
16 | "centerStrength": 0.518713248970312,
17 | "repelStrength": 10,
18 | "linkStrength": 1,
19 | "linkDistance": 250,
20 | "scale": 1,
21 | "close": true
22 | }
--------------------------------------------------------------------------------
/content/.obsidian/hotkeys.json:
--------------------------------------------------------------------------------
1 | {
2 | "file-explorer:open": [
3 | {
4 | "modifiers": [
5 | "Mod",
6 | "Shift"
7 | ],
8 | "key": "E"
9 | }
10 | ],
11 | "obsidian-auto-link-title:enhance-url-with-title": [],
12 | "file-explorer:move-file": [
13 | {
14 | "modifiers": [
15 | "Mod"
16 | ],
17 | "key": "M"
18 | }
19 | ],
20 | "app:toggle-left-sidebar": [
21 | {
22 | "modifiers": [
23 | "Mod"
24 | ],
25 | "key": "L"
26 | }
27 | ],
28 | "app:toggle-right-sidebar": [
29 | {
30 | "modifiers": [
31 | "Mod"
32 | ],
33 | "key": "R"
34 | }
35 | ],
36 | "file-explorer:reveal-active-file": [
37 | {
38 | "modifiers": [
39 | "Mod",
40 | "Shift"
41 | ],
42 | "key": "R"
43 | }
44 | ],
45 | "editor:focus-left": [
46 | {
47 | "modifiers": [
48 | "Ctrl"
49 | ],
50 | "key": "H"
51 | }
52 | ],
53 | "editor:focus-right": [
54 | {
55 | "modifiers": [
56 | "Ctrl"
57 | ],
58 | "key": "L"
59 | }
60 | ],
61 | "templater-obsidian:insert-templater": [
62 | {
63 | "modifiers": [
64 | "Mod"
65 | ],
66 | "key": "T"
67 | }
68 | ],
69 | "workspace:split-vertical": [
70 | {
71 | "modifiers": [
72 | "Ctrl"
73 | ],
74 | "key": "S"
75 | }
76 | ],
77 | "workspace:split-horizontal": [
78 | {
79 | "modifiers": [
80 | "Ctrl"
81 | ],
82 | "key": "V"
83 | }
84 | ],
85 | "workspace:close": [
86 | {
87 | "modifiers": [
88 | "Ctrl"
89 | ],
90 | "key": "W"
91 | },
92 | {
93 | "modifiers": [
94 | "Mod"
95 | ],
96 | "key": "W"
97 | }
98 | ],
99 | "switcher:open": [
100 | {
101 | "modifiers": [
102 | "Ctrl"
103 | ],
104 | "key": "O"
105 | },
106 | {
107 | "modifiers": [
108 | "Mod"
109 | ],
110 | "key": "O"
111 | }
112 | ],
113 | "graph:open-local": [
114 | {
115 | "modifiers": [
116 | "Mod",
117 | "Shift"
118 | ],
119 | "key": "G"
120 | }
121 | ],
122 | "note-folder-autorename:make-folder-note": [
123 | {
124 | "modifiers": [
125 | "Mod",
126 | "Shift"
127 | ],
128 | "key": "S"
129 | }
130 | ],
131 | "sliding-panes-obsidian:toggle-sliding-panes-leaf-auto-width": [
132 | {
133 | "modifiers": [
134 | "Ctrl"
135 | ],
136 | "key": ";"
137 | }
138 | ],
139 | "editor:toggle-bullet-list": [
140 | {
141 | "modifiers": [
142 | "Mod",
143 | "Shift"
144 | ],
145 | "key": "B"
146 | }
147 | ],
148 | "obsidian-admonition:insert-callout": [
149 | {
150 | "modifiers": [
151 | "Ctrl"
152 | ],
153 | "key": "C"
154 | }
155 | ],
156 | "editor:insert-wikilink": [
157 | {
158 | "modifiers": [
159 | "Ctrl"
160 | ],
161 | "key": "I"
162 | }
163 | ],
164 | "insert-template": [
165 | {
166 | "modifiers": [
167 | "Mod"
168 | ],
169 | "key": "T"
170 | }
171 | ],
172 | "app:open-vault": [
173 | {
174 | "modifiers": [
175 | "Mod",
176 | "Shift"
177 | ],
178 | "key": "O"
179 | }
180 | ]
181 | }
--------------------------------------------------------------------------------
/content/.obsidian/plugins/obsidian-admonition/data.json:
--------------------------------------------------------------------------------
1 | {
2 | "userAdmonitions": {},
3 | "syntaxHighlight": false,
4 | "copyButton": false,
5 | "version": "9.0.3",
6 | "autoCollapse": false,
7 | "defaultCollapseType": "open",
8 | "injectColor": true,
9 | "parseTitles": true,
10 | "dropShadow": true,
11 | "hideEmpty": false,
12 | "open": {
13 | "admonitions": true,
14 | "icons": true,
15 | "other": true,
16 | "advanced": true
17 | },
18 | "icons": [],
19 | "useFontAwesome": true,
20 | "rpgDownloadedOnce": false,
21 | "msDocConverted": false,
22 | "useSnippet": false,
23 | "snippetPath": "custom-admonitions.a37171"
24 | }
--------------------------------------------------------------------------------
/content/.obsidian/plugins/obsidian-admonition/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 | "id": "obsidian-admonition",
3 | "name": "Admonition",
4 | "version": "9.0.3",
5 | "minAppVersion": "0.14.0",
6 | "description": "Enhanced callouts for Obsidian.md",
7 | "author": "Jeremy Valentine",
8 | "authorUrl": "",
9 | "isDesktopOnly": false
10 | }
11 |
--------------------------------------------------------------------------------
/content/.obsidian/plugins/obsidian-auto-link-title/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 | "id": "obsidian-auto-link-title",
3 | "name": "Auto Link Title",
4 | "version": "1.2.5",
5 | "minAppVersion": "0.12.17",
6 | "description": "This plugin automatically fetches the titles of links from the web",
7 | "author": "Matt Furden",
8 | "authorUrl": "https://github.com/zolrath",
9 | "isDesktopOnly": false
10 | }
11 |
--------------------------------------------------------------------------------
/content/.obsidian/plugins/obsidian-auto-link-title/styles.css:
--------------------------------------------------------------------------------
1 | /* no styles */
--------------------------------------------------------------------------------
/content/.obsidian/templates.json:
--------------------------------------------------------------------------------
1 | {
2 | "folder": "templates"
3 | }
--------------------------------------------------------------------------------
/content/_index.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Data Glossary 🧠
3 | ---
4 |
5 | # A Single Place for All Data Knowledge
6 | Welcome to the Data Glossary, a one-stop-shop for data-related concepts. Inspired by the [Digital Garden](https://jzhao.xyz/posts/networked-thought/) analogy, this interactive platform offers a comprehensive collection of data terms, covering various topics. The [Data Glossary](term/about%20this%20glossary.md) aims to help you expand your data knowledge and uncover new insights. Happy learning!
7 |
8 | # Navigation
9 | There are multiple ways to navigate my Second Brain:
10 |
11 | 1. Use the search bar on the top right or press `cmd+k` (`ctrl+k` on Windows) or click on the Search button (top right) to search for any term.
12 | 2. Click on a note to explore its content, and follow the links and backlinks to dive deeper into related topics.
13 | 3. Interact with the graph at the bottom of the page to visualize connections between notes and click on any node to navigate directly to that note.
14 | 4. Click on the [Hashtags](tags) to explore the topics by tags.
15 |
16 | ## Map of Content
17 | The Data Glossary is continuously growing, and while I have some essential Map of Content starting points listed below, there are many more topics to discover as you explore. Feel free to dive into any of the following areas:
18 |
19 | | Category | Topics |
20 | |-----------------------|-----------------------------------------------------------------------------------------------|
21 | | Data Engineering | [Data Engineering Concepts](term/data%20engineering%20concepts), [Data Engineering Guides](term/data%20engineering%20guides), [Data Engineering Lifecycle](term/data%20engineering%20lifecycle) |
22 | | Data Architectures | [Data Warehouse](term/data%20warehouse), [Data Lake](term/data%20lake), [Data Lakehouse](term/data%20lakehouse) |
23 | | Data Processing | [ELT](term/elt), [ETL](term/etl), [EtLT](term/etlt.md), [Reverse ETL](term/reverse%20etl), [Data Integration](term/data%20integration) |
24 | | Data Formats | [Apache Avro](term/apache%20avro), [Apache Parquet](term/apache%20parquet), [Apache ORC](term/orc) |
25 | | Data Analysis, BI & ML | [Analytics](term/analytics), [Business Intelligence](term/business%20intelligence), [Business Intelligence Tools](term/business%20intelligence%20tools.md), [Machine Learning](term/machine%20learning) |
26 | | Programming Languages | [Python](term/python), [Rust](term/rust), [SQL](term/sql) |
27 | | Programming| [Functional Programming](term/functional%20programming), [Functional Data Engineering](term/functional%20data%20engineering) |
28 |
29 | ## Contribute
30 |
31 | > [!info] How to Contribute?
32 | >
33 | > 1. ⭐ Star our [GitHub](https://github.com/airbytehq/glossary) repo
34 | > 2. 🗣️ [Share the Glossary](https://twitter.com/intent/tweet?text=Great%20definitions%20on%20the%20data%20glossary%20🧠%20by%20@airbytehq&url=glossary.airbyte.com)
35 | > 3. ✍️ Missing a Term or want to fix a typo? [Contribute to Glossary](term/contribute%20to%20glossary.md)
36 | > 4. 👀 Want to discuss or need help? Talk to us on [Slack](https://slack.airbyte.com)
37 |
--------------------------------------------------------------------------------
/content/images/business-intelligence-tools-landscape.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/business-intelligence-tools-landscape.png
--------------------------------------------------------------------------------
/content/images/dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/dag.png
--------------------------------------------------------------------------------
/content/images/data-catalog-feature-comparison2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/data-catalog-feature-comparison2.png
--------------------------------------------------------------------------------
/content/images/data-catalog-overview-sarah.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/data-catalog-overview-sarah.png
--------------------------------------------------------------------------------
/content/images/data-contract-example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/data-contract-example.png
--------------------------------------------------------------------------------
/content/images/data-contract.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/data-contract.png
--------------------------------------------------------------------------------
/content/images/data-engineering-lifecycle.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/data-engineering-lifecycle.png
--------------------------------------------------------------------------------
/content/images/data-hierarchy-of-needs.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/data-hierarchy-of-needs.png
--------------------------------------------------------------------------------
/content/images/data-integration.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/data-integration.jpg
--------------------------------------------------------------------------------
/content/images/data-ops.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/data-ops.png
--------------------------------------------------------------------------------
/content/images/data-quality.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/data-quality.png
--------------------------------------------------------------------------------
/content/images/declarative-vs-imperative.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/declarative-vs-imperative.png
--------------------------------------------------------------------------------
/content/images/elt-tool.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/elt-tool.png
--------------------------------------------------------------------------------
/content/images/etl-tool.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/etl-tool.png
--------------------------------------------------------------------------------
/content/images/etlt-extract-tweak-load-transform.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/etlt-extract-tweak-load-transform.png
--------------------------------------------------------------------------------
/content/images/future-modern-data-stack.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/future-modern-data-stack.png
--------------------------------------------------------------------------------
/content/images/semantic-warehouse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/semantic-warehouse.png
--------------------------------------------------------------------------------
/content/images/setup-folder-structure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/setup-folder-structure.png
--------------------------------------------------------------------------------
/content/images/setup-obsidian-vault.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/setup-obsidian-vault.png
--------------------------------------------------------------------------------
/content/images/sql-levels-explained.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/content/images/sql-levels-explained.png
--------------------------------------------------------------------------------
/content/private/private-note.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Private Stuff"
3 | ---
4 |
5 | This page doesn't get published!
--------------------------------------------------------------------------------
/content/templates/airbyte.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "{{title}}"
3 | tags:
4 | - airbyte
5 | ---
6 |
--------------------------------------------------------------------------------
/content/templates/term.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is {{title}}?"
3 | tags:
4 | - data engineering
5 | ---
6 |
--------------------------------------------------------------------------------
/content/term/about this glossary.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "About this Glossary"
3 | tags:
4 | - help
5 | ---
6 | The Airbyte Glossary is built on the [Digital Garden](https://jzhao.xyz/posts/networked-thought/) analogy. Instead of listing all glossary terms on a single level, the digital garden approach lets you go inwards: you can learn about each term and then go deeper into each of its connections. For every term, the Glossary shows its related links in the interactive graph as well as all backlinks.
7 |
8 | This lets you see connections visually that you would not notice otherwise.
9 |
10 | This Glossary is forked from [Quartz](https://github.com/jackyzha0/quartz) and we thank Jacky for open-sourcing this gem.
11 |
12 | ### Navigation
13 | You can simply hit `ctrl/cmd+k` and **search** the whole Data Brain, or click the links and navigate through our content.
14 |
15 | ### Interactive Graph
16 | Use the `Interactive Graph` at the bottom; it appears on every term page. You can zoom and click on different nodes to navigate through the content.
17 |
--------------------------------------------------------------------------------
/content/term/acid transactions.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What are ACID Transactions?"
3 | tags:
4 | - data engineering
5 | ---
6 | An ACID transaction ensures that either all changes are committed successfully or everything is rolled back, so you never end up in an inconsistent state. Different concurrency-control mechanisms guarantee, for example, consistency between concurrent reads and writes. Each [Data Lake Table Format](term/data%20lake%20table%20format.md) has its own implementation and feature set here.
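7 | 
8 | A minimal sketch of the all-or-nothing behavior, using Python's built-in `sqlite3` module (illustrative only; it is not tied to any particular table format):
9 | 
10 | ```python
11 | import sqlite3
12 | 
13 | conn = sqlite3.connect(":memory:")
14 | conn.execute("CREATE TABLE accounts (name TEXT PRIMARY KEY, balance INTEGER)")
15 | conn.execute("INSERT INTO accounts VALUES ('alice', 100), ('bob', 0)")
16 | conn.commit()
17 | 
18 | try:
19 |     with conn:  # opens a transaction: commits on success, rolls back on error
20 |         conn.execute("UPDATE accounts SET balance = balance - 100 WHERE name = 'alice'")
21 |         raise RuntimeError("failure before the second write")
22 |         conn.execute("UPDATE accounts SET balance = balance + 100 WHERE name = 'bob'")
23 | except RuntimeError:
24 |     pass
25 | 
26 | # Neither update is visible: the partial change was rolled back.
27 | print(conn.execute("SELECT name, balance FROM accounts ORDER BY name").fetchall())
28 | ```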
--------------------------------------------------------------------------------
/content/term/airbyte catalog.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Airbyte Catalog"
3 | tags:
4 | - airbyte
5 | ---
6 |
7 | > [!info] Info
8 | >
9 | > This is only relevant for individuals who want to create a connector.
10 |
11 | This refers to how you define the data that you can retrieve from a Source. For example, if you want to retrieve information from an API, the data that you can receive needs to be defined clearly so that Airbyte can have a clear expectation of what endpoints are supported and what the objects that the streams return look like. This is represented as a sort of schema that Airbyte can interpret.
12 |
13 | Learn more on [Beginners Guide to Catalog](https://docs.airbyte.com/understanding-airbyte/beginners-guide-to-catalog).
--------------------------------------------------------------------------------
/content/term/airbyte cdk.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Airbyte CDK"
3 | tags:
4 | - airbyte
5 | ---
6 |
7 | The Airbyte CDK (Connector Development Kit) allows you to create connectors for Sources or Destinations. If your source or destination doesn't exist, you can use the CDK to make the building process a lot easier. It generates all the tests and files you need and all you need to do is write the connector-specific code for your source or destination.
8 |
9 | See this extensive [Step-by-Step Example](https://airbyte.com/tutorials/extract-data-from-the-webflow-api) of how to create a custom Airbyte source connector with the [Python CDK](https://docs.airbyte.com/connector-development/cdk-python/), or another example created by the Faros AI team with [Javascript/Typescript](https://docs.airbyte.com/connector-development/cdk-faros-js).
--------------------------------------------------------------------------------
/content/term/airbyte cloud.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Airbyte Cloud?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | [Airbyte Cloud](https://cloud.airbyte.io) is the fastest, most reliable way to address all your [ELT](term/elt.md) needs, letting you get started in 10 minutes with hundreds of out-of-the-box connectors.
8 |
9 | Airbyte Cloud offers simple, predictable, scalable [pricing](https://airbyte.com/pricing). [Try for free](https://cloud.airbyte.io/signup) or read more on [Airbyte Cloud](https://airbyte.com/offer-cloud).
--------------------------------------------------------------------------------
/content/term/airbyte glossary of terms.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Glossary of Terms (Airbyte Specific)"
3 | tags:
4 | - airbyte
5 | ---
6 |
7 | This is the start of the Airbyte-specific part of the Glossary, covering terms that relate to [docs.airbyte.com](https://docs.airbyte.com/) or come up when using [Airbyte](https://airbyte.com).
8 |
9 | You'll find all terms at [#airbyte](/tags/airbyte/), or you can get inspired with the following terms:
10 | - [Airbyte CDK](term/airbyte%20cdk.md)
11 | - Related to [Incremental Sync](term/incremental%20synchronization.md) and [Full Refresh Sync](term/full%20refresh%20synchronization.md):
12 | - [Cursor](term/cursor.md), [Soft Delete](term/soft%20delete.md), [Partial Success](term/partial%20success.md), and [Raw Tables](term/raw%20tables.md)
13 | - [Airbyte Normalization](term/normalization.md)
14 | - [ETL and ELT](term/etl%20elt%20airbyte.md)
15 |
16 | ## Advanced Terms
17 | - [Airbyte Catalog](term/airbyte%20catalog.md)
18 | - [Airbyte Specification](term/airbyte%20specification.md)
19 | - [Temporal](term/temporal.md)
--------------------------------------------------------------------------------
/content/term/airbyte specification.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Airbyte Specification"
3 | tags:
4 | - airbyte
5 | ---
6 | > [!info] Info
7 | >
8 | > This is only relevant for individuals who want to create a connector.
9 |
10 | This refers to the functions that a Source or Destination must implement to successfully retrieve data and load it, respectively. Implementing these functions using the Airbyte Specification makes a Source or Destination work correctly.
11 |
12 | Learn more on [[Airbyte Protocol]].
--------------------------------------------------------------------------------
/content/term/airbyte streams.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What are Airbyte Streams?"
3 | tags:
4 | - airbyte
5 | ---
6 | In order to understand **AirbyteStreams**, let’s first talk about the **AirbyteCatalog**. An **AirbyteCatalog** describes the structure of data in a data source. It has a single field called streams that contains a list of **AirbyteStreams**. Each **AirbyteStream** contains a _name_ and _json_schema_ field. The _json_schema_ field describes the structure of a stream. This data model is intentionally flexible.
7 |
8 | If we are using a data source that is a traditional relational database, each table in that database would map to an **AirbyteStream**. Each column in the table would be a key in the _properties_ field of the _json_schema_ field.
9 |
10 | If we are using a data source that wraps an API with multiple different resources (e.g. _api/customers_ and _api/products_) each route would correspond to a stream.
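11 | 
12 | As an illustrative sketch (assuming a simple `users` table with `id` and `email` columns), the mapping described above could look roughly like this:
13 | 
14 | ```python
15 | # A relational table users(id, email) mapped to an AirbyteStream:
16 | # each column becomes a key under json_schema["properties"].
17 | users_stream = {
18 |     "name": "users",
19 |     "json_schema": {
20 |         "type": "object",
21 |         "properties": {
22 |             "id": {"type": "integer"},
23 |             "email": {"type": "string"},
24 |         },
25 |     },
26 | }
27 | 
28 | # The AirbyteCatalog is simply the list of such streams.
29 | airbyte_catalog = {"streams": [users_stream]}
30 | ```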
--------------------------------------------------------------------------------
/content/term/airbyte.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Airbyte?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | [Airbyte](https://airbyte.com/) is the modern open-source [ELT](term/elt.md) standard and a [data integration](term/data%20integration.md) platform that syncs data from APIs, databases & files to [data warehouses](term/data%20warehouse.md), [lakes](term/data%20lake.md) and other destinations. In addition to covering the long tail of connectors with the involvement of its community, [[Airbyte Cloud]] differentiates itself with its transparent and predictable volume-based pricing. Airbyte addresses all connector needs through its open-source extensibility. Its ambition is to make [data integrations](term/data%20integration.md) a commodity.
8 |
9 | Read more on [Why Airbyte](https://airbyte.com/why-airbyte), check out our [Connector Catalog](https://docs.airbyte.com/integrations/), or read about the [Company Updates](https://airbyte.com/blog-categories/company-updates).
10 |
--------------------------------------------------------------------------------
/content/term/analytics.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Analytics?"
3 | tags:
4 | - data engineering
5 | aliases:
6 | - Dashboards
7 | - Visualization
8 | ---
9 | Analytics is the systematic computational analysis of [[data]] or statistics. It is used for the discovery, interpretation, and communication of meaningful patterns in data. It also entails applying data patterns toward effective decision-making.
10 |
11 | It's highly related to [Business Intelligence](term/business%20intelligence.md).
--------------------------------------------------------------------------------
/content/term/apache airflow.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Apache Airflow?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | [Airflow](https://airflow.apache.org/) is a [data orchestrator](term/data%20orchestrator.md) and the first that made task scheduling popular with [Python](term/python.md). It was originally created by [Maxime Beauchemin](term/maxime%20beauchemin.md) while working at Airbnb.
8 |
9 | Airflow lets you programmatically author, schedule, and monitor workflows. It follows the [imperative](term/imperative.md) scheduling paradigm, as *how* a DAG is run has to be defined within the Airflow jobs. Airflow calls this *workflow as code*, with the main characteristics:
10 | - **Dynamic**: Airflow pipelines are configured as Python code, allowing for dynamic pipeline generation.
11 | - **Extensible**: The Airflow framework contains operators to connect with numerous technologies. All Airflow components are extensible to easily adjust to your environment.
12 | - **Flexible**: Workflow parameterization is built-in leveraging the [Jinja Templating](term/jinja%20template.md) engine.
13 |
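14 | As a small sketch of this *workflow as code* idea (the DAG id, tasks, and schedule below are made up; the imports follow the standard Airflow 2.x API):
15 | 
16 | ```python
17 | from datetime import datetime
18 | 
19 | from airflow import DAG
20 | from airflow.operators.python import PythonOperator
21 | 
22 | 
23 | def extract():
24 |     print("pull data from the source")
25 | 
26 | 
27 | def transform():
28 |     print("clean and transform the data")
29 | 
30 | 
31 | # The pipeline is defined imperatively in Python: we spell out how and when
32 | # the tasks run, including their order.
33 | with DAG(
34 |     dag_id="example_pipeline",
35 |     start_date=datetime(2023, 1, 1),
36 |     schedule_interval="@daily",
37 |     catchup=False,
38 | ) as dag:
39 |     extract_task = PythonOperator(task_id="extract", python_callable=extract)
40 |     transform_task = PythonOperator(task_id="transform", python_callable=transform)
41 | 
42 |     extract_task >> transform_task  # run extract before transform
43 | ```
44 | 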
--------------------------------------------------------------------------------
/content/term/apache arrow.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Apache Arrow?"
3 | tags:
4 | - data engineering
5 | ---
6 | Apache Arrow is a development platform for in-memory analytics. It contains a set of technologies that enable big data systems to process and move data fast.
7 |
8 | Read more on [Data Lake File Format](term/data%20lake%20file%20format.md).
--------------------------------------------------------------------------------
/content/term/apache avro.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Apache Avro?"
3 | tags:
4 | - data engineering
5 | ---
6 | Avro is an open-source data serialization system that helps with data exchange between systems, [programming languages](term/programming%20languages.md), and processing frameworks. Avro helps define a binary format for your data, as well as map it to the programming language of your choice.
7 |
8 | Avro has a JSON-like data model, but can be represented as either JSON or in a compact binary form. It comes with a **very sophisticated schema description language** that describes data. Avro is another [Data Lake File Format](term/data%20lake%20file%20format.md).
9 |
10 | Read more about how to build a Data Lake on top of it on our [Data Lake and Lakehouse Guide](https://airbyte.com/blog/data-lake-lakehouse-guide-powered-by-table-formats-delta-lake-iceberg-hudi).
--------------------------------------------------------------------------------
/content/term/apache druid.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Apache Druid?"
3 | tags:
4 | - data engineering
5 | ---
6 | Druid is an open-source, column-oriented, distributed data store written in Java. It's designed to quickly ingest massive quantities of event data, and provide low-latency queries on top of the data.
7 |
8 | It is an analytics database designed for fast slice-and-dice analytics ([OLAP](term/olap%20(online%20analytical%20processing).md) queries) on large data sets. Most often, Druid powers use cases where real-time ingestion, fast query performance, and high uptime are important.
9 |
--------------------------------------------------------------------------------
/content/term/apache hadoop.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Apache Hadoop?"
3 | tags:
4 | - data engineering
5 | ---
6 | Apache Hadoop is a collection of open-source software utilities that facilitates using a network of many computers to solve problems involving massive amounts of data and computation. It provides a software framework for distributed storage and processing of big data using the [MapReduce](term/map%20reduce.md) programming model.
7 |
--------------------------------------------------------------------------------
/content/term/apache hive.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Apache Hive?"
3 | tags:
4 | - data engineering
5 | ---
6 | Apache Hive is a [Data Warehouse](term/data%20warehouse.md) software project built on top of [Apache Hadoop](term/apache%20hadoop.md) for providing data queries and analysis. Hive gives an SQL-like interface to query data stored in various databases and file systems that integrate with Hadoop. Traditional SQL queries must be implemented in the [MapReduce](term/map%20reduce.md) Java API to execute SQL applications and queries over distributed data. Hive provides the necessary SQL abstraction to integrate SQL-like queries ([HiveQL](https://en.wikipedia.org/wiki/Apache_Hive#HiveQL)) into the underlying Java without the need to implement queries in the low-level Java API.
--------------------------------------------------------------------------------
/content/term/apache hudi.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Apache Hudi?"
3 | tags:
4 | - data engineering
5 | ---
6 | Apache Hudi is a [Data Lake Table Format](term/data%20lake%20table%20format.md) and was originally developed at Uber in 2016 (code-named and pronounced "Hoodie"), open-sourced at the end of 2016 ([first commit](https://github.com/apache/hudi/commit/0512da094bad2f3bcd2ddddc29e8abfec175dcfe) on 2016-12-16), and submitted to the Apache Incubator in January 2019. More about the back story on [The Apache Software Foundation Announces Apache® Hudi™ as a Top-Level Project](https://www.globenewswire.com/news-release/2020/06/04/2043732/0/en/The-Apache-Software-Foundation-Announces-Apache-Hudi-as-a-Top-Level-Project.html).
7 |
8 | Read more about how to build a Data Lake on top of it on our [Data Lake and Lakehouse Guide](https://airbyte.com/blog/data-lake-lakehouse-guide-powered-by-table-formats-delta-lake-iceberg-hudi).
--------------------------------------------------------------------------------
/content/term/apache iceberg.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Apache Iceberg?"
3 | tags:
4 | - data engineering
5 | ---
6 | Apache Iceberg is a [Data Lake Table Format](term/data%20lake%20table%20format.md) and was [initially developed](https://github.com/Netflix/iceberg) at Netflix to solve long-standing issues with huge, petabyte-scale tables. It was open-sourced in 2018 as an Apache Incubator project and graduated from the incubator on the 19th of May 2020. The [first public commit](https://github.com/apache/iceberg/commit/a5eb3f6ba171ecfc517a4f09ae9654e7d8ae0291) was on 2017-12-19. More insights about the story can be found in [A Short Introduction to Apache Iceberg](https://medium.com/expedia-group-tech/a-short-introduction-to-apache-iceberg-d34f628b6799).
7 |
8 | Read more about how to build a Data Lake on top of it on our [Data Lake and Lakehouse Guide](https://airbyte.com/blog/data-lake-lakehouse-guide-powered-by-table-formats-delta-lake-iceberg-hudi).
--------------------------------------------------------------------------------
/content/term/apache parquet.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Apache Parquet?"
3 | tags:
4 | - data engineering
5 | ---
6 | Apache Parquet is a free and open-source column-oriented [Data Lake File Format](term/data%20lake%20file%20format.md) in the Apache Hadoop ecosystem. It is similar to RCFile and [ORC](term/orc.md), the other columnar-storage file formats in Hadoop, and is compatible with most of the data processing frameworks around Hadoop.
7 |
8 | Read more about how to build a Data Lake on top of it on our [Data Lake and Lakehouse Guide](https://airbyte.com/blog/data-lake-lakehouse-guide-powered-by-table-formats-delta-lake-iceberg-hudi).
--------------------------------------------------------------------------------
/content/term/apache spark.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Apache Spark?"
3 | tags:
4 | - data engineering
5 | ---
6 | Apache Spark™ is an open-source multi-language engine for executing [Data Engineering](term/data%20engineering.md) and [Machine Learning](term/machine%20learning.md) on single-node machines or clusters. It's optimized for large-scale data processing.
7 |
8 | Spark runs well with [Kubernetes](term/kubernetes.md).
9 |
10 |
--------------------------------------------------------------------------------
/content/term/behavioral data.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Behavioral Data?"
3 | tags:
4 | - data engineering
5 | ---
6 | Behavioral data are a result of users performing actions or events while interacting with a product and are therefore also referred to as event data or product-usage data.
7 |
8 | ## Why collect Behavioral Data?
9 | Behavioral data serves two main purposes for teams — understanding how the product is being used or not used (user behavior) and building personalized customer experiences across various touchpoints to influence user behavior.
10 |
11 | Find more in-depth on [How to Collect Behavioral Data? A Guide for Data Engineers and Analysts](https://airbyte.com/blog/collect-behavioral-data-guide).
--------------------------------------------------------------------------------
/content/term/big o notation.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is the Big-O Notation?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | Big-O notation is a way of analyzing an algorithm using [Big-O asymptotic notation](https://www.geeksforgeeks.org/analysis-of-algorithms-set-3asymptotic-notations/). It mostly relates to compute rather than storage, but keep in mind that avoiding unnecessary work, such as copying the same data many times, will save lots of performance and money.
8 |
9 | We can express algorithmic complexity using the big-O notation. For a problem of size N:
10 | - A constant-time function/method is “order 1” : O(1)
11 | - A linear-time function/method is “order N” : O(N)
12 | - A quadratic-time function/method is “order N squared” : O(N^2)
13 |
14 | Check out more on [Analysis of Algorithms | Big-O analysis](https://www.geeksforgeeks.org/analysis-algorithms-big-o-analysis/).
15 |
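16 | A tiny Python illustration of the three orders above, where the length of the input list stands in for the problem size N:
17 | 
18 | ```python
19 | def first_element(items):
20 |     # O(1): constant time, independent of len(items)
21 |     return items[0]
22 | 
23 | 
24 | def total(items):
25 |     # O(N): touches every element exactly once
26 |     result = 0
27 |     for x in items:
28 |         result += x
29 |     return result
30 | 
31 | 
32 | def has_duplicates(items):
33 |     # O(N^2): compares every pair of elements
34 |     for i in range(len(items)):
35 |         for j in range(i + 1, len(items)):
36 |             if items[i] == items[j]:
37 |                 return True
38 |     return False
39 | ```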
--------------------------------------------------------------------------------
/content/term/bus matrix.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Bus Matrix?"
3 | tags:
4 | - data engineering
5 | ---
6 | A Bus Matrix was traditionally used in [OLAP](term/olap%20(online%20analytical%20processing).md) cubes such as Microsoft SSAS and co. It lets you visually see which [[Measure]] can be queried with which [dimensions](term/dimensions.md).
7 |
8 | They look something like this, an example from [SSAS](https://blog.exsilio.com/all/ssas-dimensions-and-cube-basics/):
9 | 
10 |
--------------------------------------------------------------------------------
/content/term/business intelligence tools.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What are Business Intelligence Tools?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | [Business Intelligence](term/business%20intelligence.md) tools visualize your data across the organization. See a curated list of tools, including the referenced image below, on [Catalog of BI tools](https://notion.castordoc.com/catalog-of-bi-tools).
8 |
9 | 
10 |
11 |
--------------------------------------------------------------------------------
/content/term/business intelligence.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Business Intelligence?"
3 | tags:
4 | - data engineering
5 | ---
6 | Business intelligence (BI) leverages software and services to [transform data](term/data%20transformation.md) into actionable insights that inform an organization’s business decisions. The newer term for this field is [Data Engineering](term/data%20engineering.md). The language of a BI engineer is [SQL](term/sql.md).
7 |
8 | ## Goals of BI
9 | BI should produce a simple overview of your business, boost efficiency, and automate repetitive tasks across your organization. In more detail:
10 | * **Roll-up capability** - (data) [Visualization](term/analytics.md) over the most important [KPIs](term/key%20performance%20indicator%20(kpi).md) (aggregations) - like a cockpit in an airplane which gives you the important information at one glance.
11 | * **Drill-down possibilities** - from the above high-level overview, drill down to the very details to figure out why something is not performing as planned. **Slice-and-dice or pivot** your data from different angles.
12 | * **Single source of truth** - instead of multiple spreadsheets or other tools with different numbers, the process is automated and unified for everyone. Employees can talk about the business problem instead of the various numbers everyone has. Reporting, budgeting, and forecasting are automatically updated, consistent, accurate, and delivered in a timely manner.
13 | * **Empower users**: With the so-called self-service BI, every user can analyze their data instead of only BI or IT persons.
14 |
15 | Read more on [Business Intelligence meets Data Engineering with Emerging Technologies](https://www.sspaeti.com/blog/business-intelligence-meets-data-engineering/).
16 |
--------------------------------------------------------------------------------
/content/term/cdp (customer data platform).md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a CDP (Customer Data Platform)?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | A Customer Data Platform (CDP) is a system that collects large quantities of customer data (i.e. information about your customers) from a variety of channels and devices, helping to make this data more accessible to the people who need it. CDPs are responsible for sorting and categorizing data, as well as data cleansing to remove inaccurate or out-of-date information.
8 |
--------------------------------------------------------------------------------
/content/term/cloud provider.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What are the top Cloud Providers?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | Among the biggest cloud providers are [AWS](https://aws.amazon.com/), [Microsoft Azure](https://azure.microsoft.com/), and [Google Cloud](https://cloud.google.com/), whereas [Databricks](https://www.databricks.com/) and [Snowflake](https://www.snowflake.com/) provide dedicated [Data Warehouse](term/data%20warehouse.md) and [Lakehouse](term/data%20lakehouse.md) solutions.
8 |
--------------------------------------------------------------------------------
/content/term/contribute to glossary.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "How to Contribute to the Glossary"
3 | tags:
4 | - help
5 | weight: -9
6 | ---
7 |
8 | > [!info] General Infos
9 | >
10 | > If you want to know more generally how this glossary works, see [General Info](term/general%20infos.md) with a description of the folder structure, how to create a link, etc.
11 |
12 | Deployment-wise you have three options, which are explained in the chapters below:
13 | * Web edits through GitHub
14 | * Creating an Issue and we'll do the rest
15 | * Or cloning and running it locally
16 |
17 | ## Web Edit with GitHub
18 | You can either click on `Edit Source` on each page and directly edit on GitHub or you can create a [New Issue](https://github.com/airbytehq/glossary/issues).
19 |
20 | ## Create an Issue
21 | If you are unsure, you can always [create an issue](https://github.com/airbytehq/glossary/issues) on our GitHub repo and we will make sure to add the new entry.
22 |
23 | ## Changing a lot? Clone locally
24 | ### Clone it locally with git
25 | Clone the [repo](https://github.com/airbytehq/glossary) with:
26 | ```sh
27 | git clone https://github.com/airbytehq/glossary.git
28 | ```
29 |
30 | ### Editors
31 | #### Obsidian as an Editor (Recommended)
32 | I recommend using [Obsidian](https://obsidian.md/) as an editor, as it handles all links when renaming terms, adds a nice Markdown view with lots of features (even if you don't need them), and shows backlinks and the [graph](term/about%20this%20glossary.md#interactive-graph). Just open Obsidian in the folder `content/`; the hidden folder called `.obsidian` in there does the rest.
33 |
34 | 1. 
35 | 2. 
36 |
37 | You can find more details and a step-by-step manual on [Quartz Setup](https://quartz.jzhao.xyz/notes/setup/), how to [Edit Notes](https://quartz.jzhao.xyz/notes/editing/), and [How to set up Obsidian](https://quartz.jzhao.xyz/notes/obsidian/) (although the settings are already done when you open the `.obsidian` folder as described above).
38 |
39 | #### Use any other Editor
40 | Of course, as everything is Markdown, you can edit each file under `content/term` as a normal markdown file and publish (see below) changes to GitHub. Keep in mind that every new term you create will automatically be created as a page in the deployment process or when you run it locally.
41 |
42 | ### Preview Locally
43 | #### Setup
44 | Quartz runs on top of [Hugo](https://gohugo.io/) so all notes are written in [Markdown](https://www.markdownguide.org/getting-started/).
45 |
46 | We need to install golang, hugo-obsidian and hugo. Follow the instructions on [Preview Changes on Quartz](https://quartz.jzhao.xyz/notes/preview-changes/).
47 |
48 | > [!info]
49 | >
50 | > If you are running into an error saying that `command not found: hugo-obsidian`, make sure you set your `GOPATH` correctly! This will allow your terminal to correctly recognize hugo-obsidian as an executable.
51 |
52 | I added to my `~/.zshrc` (or `~/.bashrc`):
53 | ```sh
54 | #go path
55 | export GOPATH=$HOME/go
56 | export PATH=$PATH:$GOROOT/bin:$GOPATH/bin
57 | ```
58 | #### Run it!
59 | All you need to do is go to the root directory of your cloned repo and start `make serve`:
60 | ```sh
61 | # Navigate to your cloned glossary folder
62 | cd glossary
63 |
64 | # Start local server
65 | make serve
66 |
67 | # View your site in a browser at http://localhost:1313/
68 | ```
69 | That's it; from now on that's how you run it. All changes you make will automatically be reflected in the local preview, and you do not need to stop and restart when you add terms, etc. (only the graph view will be updated after stopping and serving again).
70 |
71 | ## How to Publish
72 | Commit and push to the branch `hugo` and wait a couple of minutes until [GitHub Actions](https://github.com/airbytehq/glossary/actions) deploys it automatically. At the moment we do not require PRs, to keep updates as easy as possible.
73 |
74 | If we encounter problems in the future, we might change that. If you are unsure, you can always [create an issue](https://github.com/airbytehq/glossary/issues) and we will make sure to add the new entry.
75 |
--------------------------------------------------------------------------------
/content/term/cte (common table expression).md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a CTE (Common Table Expression)?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | A Common Table Expression (CTE) is a temporary named result set that you can reference within a SELECT, INSERT, UPDATE, or DELETE statement. The CTE can also be used in a View.
8 |
9 | ```sql
10 | WITH cte_query AS
11 | (SELECT … subquery ...)
12 | SELECT main query ... FROM/JOIN with cte_query ...
13 | ```
14 |
15 | ## Types: Recursive and Non-Recursive
16 | ### Non-Recursive CTE
17 | There are two types of CTEs: Recursive and Non-Recursive.
18 |
19 | Non-recursive CTEs are the simple case, where a CTE is used to avoid SQL duplication by referencing a name instead of repeating the actual SQL statement.
20 |
21 | E.g.
22 | ```sql
23 | WITH avg_per_store AS
24 | (SELECT store, AVG(amount) AS average_order
25 | FROM orders
26 | GROUP BY store)
27 | SELECT o.id, o.store, o.amount, avg.average_order AS avg_for_store
28 | FROM orders o
29 | JOIN avg_per_store avg
30 | ON o.store = avg.store;
31 | ```
32 |
33 | ### Recursive CTE
34 |
35 | Recursive CTEs use repeated procedural loops, hence the recursion. The recursive query calls itself until it satisfies a termination condition; in a recursive CTE, we should provide a WHERE condition to terminate the recursion.
36 |
37 | A recursive CTE is useful in querying hierarchical data such as organization charts where one employee reports to a manager or multi-level bill of materials when a product consists of many components, and each component itself also consists of many other components.
38 |
39 | ```sql
40 | WITH levels AS (
41 | SELECT
42 | id,
43 | first_name,
44 | last_name,
45 | superior_id,
46 | 1 AS level
47 | FROM employees
48 | WHERE superior_id IS NULL
49 | UNION ALL
50 | SELECT
51 | employees.id,
52 | employees.first_name,
53 | employees.last_name,
54 | employees.superior_id,
55 | levels.level + 1
56 | FROM employees, levels
57 | WHERE employees.superior_id = levels.id
58 | )
59 |
60 | SELECT *
61 | FROM levels;
62 | ```
63 |
64 | See more on [5 Practical SQL CTE Examples | LearnSQL.com](https://learnsql.com/blog/practical-sql-cte-examples/).
--------------------------------------------------------------------------------
/content/term/cursor.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Cursor?"
3 | tags:
4 | - airbyte
5 | ---
6 | At a conceptual level, a cursor is a tracker used during [incremental synchronization](term/incremental%20synchronization.md) to ensure that only newly updated or inserted records are sent from a data source to a destination in any given synchronization iteration.
7 |
8 | Airbyte’s incremental synchronization can be conceptually thought of as a loop which periodically executes synchronization operations. Each iteration of this loop only replicates records that have been inserted or updated in the source system since the previous execution of this synchronization loop – in other words, each synchronization operation will copy only records that have not previously been replicated by previous synchronizations. This is much more efficient than copying an entire dataset on each iteration, which is the behavior of full refresh synchronization.
9 |
10 | Sending only updated or newly inserted documents requires tracking which records have already been replicated in previous synchronizations. This is done by a cursor, which can be thought of as a pointer to the most recent record that has been replicated by a given synchronization. When selecting documents for synchronization, Airbyte includes the most recent cursor value as part of the query on the source system to ensure that only new/updated records will be replicated.
11 |
12 | For example, a source database could contain records which include a field called `updated_at`, which stores the most recent time that a record is inserted or updated. If `updated_at` is selected as the cursor field, then after a given synchronization operation the cursor will remember the largest `updated_at` value that has been seen in the records that have been replicated to the destination in that synchronization. In the subsequent synchronization operation, records that have been inserted or updated on the source are retrieved by including the cursor value as part of the query, so that it only selects records where the `updated_at` value is greater than (and in some edge cases greater than or equal to) the largest `updated_at` value seen in the previous synchronization.
13 |
14 | Note that while it is not strictly necessary to choose a time field for a cursor field, the field that is chosen should be monotonically increasing over time.
15 |
16 | Read more on [Incremental data synchronization between Postgres databases](https://airbyte.com/tutorials/incremental-data-synchronization).
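17 | 
18 | A minimal Python sketch of one iteration of this loop, assuming a hypothetical `run_query` helper, a table called `orders`, and `updated_at` as the cursor field (the real Airbyte implementation additionally handles state persistence and the edge cases mentioned above):
19 | 
20 | ```python
21 | def incremental_sync(run_query, write_to_destination, cursor_value=None):
22 |     """Replicate only records newer than the saved cursor value."""
23 |     if cursor_value is None:
24 |         # First sync: replicate everything.
25 |         records = run_query("SELECT * FROM orders ORDER BY updated_at")
26 |     else:
27 |         # Subsequent syncs: only records updated since the previous sync.
28 |         records = run_query(
29 |             "SELECT * FROM orders WHERE updated_at > %s ORDER BY updated_at",
30 |             (cursor_value,),
31 |         )
32 | 
33 |     for record in records:
34 |         write_to_destination(record)
35 |         # Remember the largest updated_at seen so far as the new cursor.
36 |         if cursor_value is None or record["updated_at"] > cursor_value:
37 |             cursor_value = record["updated_at"]
38 | 
39 |     return cursor_value  # persist this value and pass it into the next sync
40 | ```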
--------------------------------------------------------------------------------
/content/term/dag directed acyclic graph.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Directed Acyclic Graph (DAG)?"
3 | tags:
4 | - data engineering
5 | ---
6 | DAG stands for **Directed Acyclic Graph**. A DAG is a graph where information must travel along a finite set of nodes connected by directed edges. There is no particular start or end node, and there is no way for data to travel through the graph in a loop that circles back to the starting point.
7 |
8 | It's a popular way of building data pipelines in tools like [[Airflow]], [[Dagster]], and [[Prefect]]. It clearly defines the [Data Lineage](term/data%20lineage.md). It also lends itself to a functional approach where you have [idempotency](term/idempotency.md) and can restart pipelines without side effects.
9 |
10 | 
--------------------------------------------------------------------------------
/content/term/dagster.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Dagster?"
3 | tags:
4 | - data engineering
5 | ---
6 | [Dagster](https://dagster.io/) is a [data orchestrator](term/data%20orchestrator.md) focusing on data-aware scheduling that supports the whole development lifecycle, with integrated lineage and observability, a [declarative](term/declarative.md) programming model, and best-in-class testability.
7 |
8 | Key features are:
9 | - Manage your data assets with code
10 | - A single pane of glass for your data platform
--------------------------------------------------------------------------------
/content/term/data asset.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Data Asset?"
3 | tags:
4 | - data engineering
5 | ---
6 | A data asset is typically a database table, a machine learning model, or a report: a persistent object that captures some understanding of the world. It's the more technical term, whereas [Data Product](term/data%20product.md) is used more generally or in the context of [Data Mesh](term/data%20mesh.md).
--------------------------------------------------------------------------------
/content/term/data catalog.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Data Catalog?"
3 | tags:
4 | - data engineering
5 | ---
6 | A Data Catalog is a centralized store where all metadata about your data is made searchable.
7 |
8 | **Think of a Google Search for your internal metadata.** This is vital, as with a [Data Lake](term/data%20lake.md) and other data stores, you want the ability to search for your data. Data is growing exponentially, with 90% of the world’s data having been generated in the last two years alone. It's hard to keep an overview of this amount of data over time. A data catalog solves the problem of handling this fast-growing volume of data internally.
9 |
10 | An interesting read about the beginning of the Data Catalog is explained in the 2017 published paper about a [Data Context Service](http://cidrdb.org/cidr2017/papers/p111-hellerstein-cidr17.pdf).
11 |
12 | See a High-Level Feature Comparison by the [Awesome Data Discovery and Observability](https://github.com/opendatadiscovery/awesome-data-catalogs) list on GitHub (check out the link for more):
13 | 
14 |
15 | Or a great overview by Sarah Krasnik on [Choosing a Data Catalog](https://sarahsnewsletter.substack.com/p/choosing-a-data-catalog):
16 | 
17 |
--------------------------------------------------------------------------------
/content/term/data contract.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Data Contract?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | Data Contracts are API-like agreements between software/data engineers who own services and data consumers that understand how the business works. The goal is to generate well-modeled, high-quality, trusted, real-time data.
8 |
9 | It's an **abstraction** that allows engineers to decouple their databases and services from analytics and ML requirements. It will avoid production-breaking incidents when modifying the schema as they are validated and enforced.
10 |
11 | 
12 | Illustration by Chad Sanderson on [The Rise of Data Contracts - by Chad Sanderson](https://dataproducts.substack.com/p/the-rise-of-data-contracts)
13 |
14 | [Chad Sanderson](https://www.linkedin.com/in/chad-sanderson/) said that at Convoy, they use [[Protobuf]] and [[Apache Kafka]] to abstract the CRUD transactions. They define the schema based on what they *need*, not what they get from the source. Same as [[Software-Defined Assets]] describe the [Data Asset](term/data%20asset.md) in a declarative manner and set [expectations](https://github.com/dagster-io/dagster/discussions/9543).
15 |
16 | Confluent also built similar functions on top of Kafka with their [Schema Registry](https://docs.confluent.io/platform/current/schema-registry/), and terms such as [Semantic Layer](term/metrics%20layer.md) and [Analytics API](https://www.sspaeti.com/blog/analytics-api-with-graphql-the-next-level-of-data-engineering/#what-is-an-analytics-api) (with [[GraphQL]]) are trying to achieve similar things.
17 |
18 | Data Contracts are not meant to replace data pipelines and the [Modern Data Stack](term/modern%20data%20stack.md), which follow a more batch-oriented approach and are good for fast prototyping. You can start defining data contracts once you have some knowledge about your data.
19 |
20 | Interestingly, the differentiation from [Data Mesh](term/data%20mesh.md) is that Data Mesh is an organizational framework with a micro-service approach to data. Data Mesh doesn't specify which data should be emitted, nor does it validate that the data being emitted from production is correct or conforms to a consumer's expectations.
21 |
22 | Also, data contracts are a form of [Data Governance](term/data%20governance.md). This term is very vague and gets more concrete with explicit contracts. You can also use [Great Expectations](https://greatexpectations.io/) to set expectations for your data, which I believe is a great way to start.
23 |
24 | ## From the Discussion on YouTube w/ Chad Sanderson vs Ethan Aaron
25 |
26 | Chad Sanderson says in [Data Contract Battle Royale w/ Chad Sanderson vs Ethan Aaron - YouTube](https://youtu.be/4BEpYAp3Qu4) :
27 | - It's just a database version of a real-world contract.
28 | - A real-world contract is just an agreement between two parties where:
29 | - There's some mechanism for enforcing that it happens.
30 | - A data contract is a similar agreement, but it's **between someone that produces data and consumes data** to vend a particular data set which usually includes a schema and some enforcement mechanism.
31 | - Differentiation between data contract and data product:
32 | - **Data contract**, which is *what* is the data and *how* do we enforce this quality
33 | - **[Data Product](term/data%20product.md)** which is *why* do we need this data
34 |
35 | Ethan Aaron's problem with data contracts is that you focus on defining the interface/contract too early. E.g., if you have a big task done by several teams or people, you have a contract to agree on an interface. I'd argue that's precisely what data products are, and instead of agreeing on some artificial contract, decide on the product, so the tools and teams can be distinct.
36 |
37 | ## Summary Blog Posts
38 | An excellent summary about data contracts by [Mehdi Ouazza](https://www.linkedin.com/in/mehd-io) is [From Zero To Hero](https://towardsdatascience.com/data-contracts-from-zero-to-hero-343717ac4d5e). He illustrates how [[Apache Kafka]] could also be the interface that defines the contract.
39 |
40 | 
41 | Illustration from [Data Contracts — From Zero To Hero](https://towardsdatascience.com/data-contracts-from-zero-to-hero-343717ac4d5e)
42 |
43 | See also [Semantic Warehouse](term/semantic%20warehouse.md).
44 |
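45 | As a plain-Python sketch (independent of any particular tool such as Protobuf, a schema registry, or Great Expectations), a data contract boils down to an agreed schema plus an enforcement step that rejects records violating it; the field names below are made up:
46 | 
47 | ```python
48 | # Hypothetical contract agreed between the producing service and its consumers.
49 | ORDER_CONTRACT = {
50 |     "order_id": int,
51 |     "customer_id": int,
52 |     "amount_usd": float,
53 | }
54 | 
55 | 
56 | def enforce_contract(record: dict, contract: dict = ORDER_CONTRACT) -> dict:
57 |     """Validate a record against the contract before it is emitted downstream."""
58 |     missing = contract.keys() - record.keys()
59 |     if missing:
60 |         raise ValueError(f"contract violation, missing fields: {missing}")
61 |     for field, expected_type in contract.items():
62 |         if not isinstance(record[field], expected_type):
63 |             raise ValueError(f"contract violation, {field} is not {expected_type.__name__}")
64 |     return record
65 | 
66 | 
67 | enforce_contract({"order_id": 1, "customer_id": 42, "amount_usd": 19.99})
68 | ```
69 | 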
--------------------------------------------------------------------------------
/content/term/data engineering concepts.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Data Engineering Concepts"
3 | tags:
4 | - data engineering
5 | - concept
6 | ---
7 | Some core concepts we are going to explore:
8 |
9 | | Data Engineering Topics |
10 | |--------------------------------------------------------------------------------------------------------|
11 | | [Data Warehouse](term/data%20warehouse.md), [Data Lake](term/data%20lake.md), [Data Lakehouse](term/data%20lakehouse.md) |
12 | | [Storage Layer](term/storage%20layer%20object%20store.md), [Data Lake File Format](term/data%20lake%20file%20format.md), [Data Lake Table Format](term/data%20lake%20table%20format.md) |
13 | | [Data Catalog](term/data%20catalog.md) |
14 | | [Modern Data Stack](term/modern%20data%20stack.md), [Open Data Stack](term/open%20data%20stack.md) |
15 | | [Data Engineering Lifecycle](term/data%20engineering%20lifecycle.md) |
16 | | [ELT](term/elt.md), [ETL](term/etl.md), [EtLT](term/etlt.md) |
17 | | [Functional Data Engineering](term/functional%20data%20engineering.md), [Software-Defined Assets](term/software-defined%20assets.md) |
18 | | [Metrics Layer](term/metrics%20layer.md), [Semantic Warehouse](term/semantic%20warehouse.md), [Data Virtualization](term/data%20virtualization.md) |
19 | | [Metrics](term/metric.md), [Key Performance Indicator (KPI)](term/key%20performance%20indicator%20(kpi).md) |
20 | | [Push-Downs](term/push-down.md), [Rollup](term/rollup.md) |
21 | | [Data Modeling](term/data%20modeling.md), [Dimensional Modeling](term/dimensional%20modeling.md) |
22 | | [Data Contract](term/data%20contract.md) |
23 | | [OLAP](term/olap%20(online%20analytical%20processing).md), [OLTP](term/oltp%20(online%20transactional%20processing).md) |
24 | | [MapReduce](term/map%20reduce.md), [Apache Hadoop](term/apache%20hadoop.md) |
25 | | [Declarative vs Imperative](term/declarative.md) |
26 | | [Notebooks](term/notebooks.md) |
27 |
28 |
29 |
30 |
31 |
32 |
37 |
38 | See also [What is Data Engineering](term/data%20engineering.md).
--------------------------------------------------------------------------------
/content/term/data engineering guides.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Data Engineering Guides"
3 | tags:
4 | - data engineering
5 | - concept
6 | ---
7 |
8 | Some Data Engineering Guides that will help you learn [data engineering](term/data%20engineering.md):
9 |
10 | - **[Data Quality](https://airbyte.com/blog/data-quality-issues)**
11 | - How to handle [[term/data quality]] issues by detecting, understanding, fixing, and reducing them
12 | - **[Data Lake / Lakehouse Guide](https://airbyte.com/blog/data-lake-lakehouse-guide-powered-by-table-formats-delta-lake-iceberg-hudi)**
13 | - The what & why of a [Data Lake](term/data%20lake.md)
14 | - Differences between [Lakehouse](term/data%20lakehouse.md) & [Data Warehouse](term/data%20warehouse.md)
15 | - Components of a data lake
16 | 1. [Storage Layer](term/storage%20layer%20object%20store.md)
17 | 2. [Data Lake File Format](term/data%20lake%20file%20format.md)
18 | 3. [Data Lake Table Format](term/data%20lake%20table%20format.md) with [Apache Parquet](term/apache%20parquet.md), [Apache Iceberg](term/apache%20iceberg.md), and [Apache Hudi](term/apache%20hudi.md)
19 | - Trends in the market
20 | - We answer questions such as:
21 | - How to build an open-source data lake offloading data for analytics?
22 | - How to [govern](term/data%20governance.md) your hundreds to thousands of files and have more database-like features?
23 | - **[Reverse ETL Explained](https://airbyte.com/blog/reverse-etl)**
24 | - A Brief Story of Data Integration: [ETL](term/etl.md) vs. [ELT](term/elt.md)
25 | - So, What is a [Reverse ETL](term/reverse%20etl.md)?
26 | - Technical Differences Between ETL and Reverse ETL
27 | - Typical Reverse ETL Use Cases
28 | - Reverse ETL and the [Data Hierarchy of Needs](term/data%20hierarchy%20of%20needs.md)
29 | - [Data Orchestration Trends](https://airbyte.com/blog/data-orchestration-trends)
30 | - [Data Integration Guide](https://airbyte.com/blog/data-integration)
31 | - [Understanding Change Data Capture (CDC)](https://airbyte.com/blog/change-data-capture-definition-methods-and-benefits)
32 | - [Using an ETL Framework vs Writing Yet Another ETL Script](https://airbyte.com/blog/etl-framework-vs-etl-script)
33 |
34 | See more on [Data Insights Blog Posts](https://airbyte.com/blog-categories/data-insights).
--------------------------------------------------------------------------------
/content/term/data engineering lifecycle.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is the Data Engineering Lifecycle?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | A data engineer today oversees the whole data engineering process, from collecting data from various sources to making it available for downstream processes. The role requires familiarity with the multiple stages of the data engineering lifecycle and an aptitude for evaluating data tools for optimal performance across several dimensions, including price, speed, flexibility, scalability, simplicity, reusability, and interoperability.
8 |
9 | 
10 | The data engineering lifecycle, inspired by [Fundamentals of Data Engineering](https://www.oreilly.com/library/view/fundamentals-of-data/9781098108298/)
11 |
12 | > [!example] Example Open Data Stack Project
13 | >
14 | > With the [Open Data Stack](term/open%20data%20stack.md) project, we are implementing a hands-on example with the core components of the lifecycle, such as ingestion, [transformation](term/data%20transformation.md), [analytics](term/analytics.md), and [machine learning](term/machine%20learning.md).
15 |
16 | Read more on [The Evolution of The Data Engineer: A Look at The Past, Present & Future](https://airbyte.com/blog/data-engineering-past-present-and-future).
17 |
--------------------------------------------------------------------------------
/content/term/data engineering.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Data Engineering?"
3 | tags:
4 | - data engineering
5 | ---
6 | The definition from the [Fundamentals of Data Engineering](https://www.oreilly.com/library/view/fundamentals-of-data/9781098108298/), as it’s one of the most recent and complete:
7 | > Data engineering is the development, implementation, and maintenance of systems and processes that take in raw data and produce high-quality, consistent information that supports downstream use cases, such as analysis and machine learning. Data engineering intersects security, data management, DataOps, data architecture, orchestration, and software engineering.
8 |
9 | A data engineer today oversees the whole data engineering process, from collecting data from various sources to making it available for downstream processes. The role requires familiarity with the multiple stages of the [Data Engineering Lifecycle](term/data%20engineering%20lifecycle.md) and an aptitude for evaluating data tools for optimal performance across several dimensions, including price, speed, flexibility, scalability, simplicity, reusability, and interoperability.
10 |
11 | Data Engineering helps also overcome the bottlenecks of [Business Intelligence](term/business%20intelligence.md):
12 | - More transparency as tools are open-source mostly
13 | - More frequent data loads
14 | - Supporting [Machine Learning](term/machine%20learning.md) capabilities
15 |
16 | Compared to existing roles, it is a **software engineer plus business intelligence engineer, including big data abilities** such as the [Hadoop](term/apache%20hadoop.md) ecosystem, streaming, and computation at scale. The business creates more reporting artifacts themselves, but with more data that needs to be collected, cleaned, and updated in near real-time, the complexity is expanding every day.
17 |
18 | With that said, more programming skills are needed, similar to software engineering. **The emerging language at the moment is [Python](term/python.md)**, which is used in engineering with tools like [[Apache Airflow]], [[Dagster]], and [[Prefect]], as well as in data science with powerful libraries.
19 |
20 | As a data engineer, you use mainly [SQL](term/sql.md) for almost everything except when using external data from an API. Here you'd use [ELT](term/elt.md) tools or write some [[data pipelines]] with the tools mentioned above.
21 |
22 | If you want to know more about [The Evolution of The Data Engineer: A Look at The Past, Present & Future](https://airbyte.com/blog/data-engineering-past-present-and-future), check out the linked article or watch the video version of it:
23 | {{< youtube Si14Hgj4Lok >}}
24 |
--------------------------------------------------------------------------------
/content/term/data enrichment.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Data Enrichment?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | Data enrichment is a kind of [data transformation](term/data%20transformation.md) that adds additional information to the data in order to make new kinds of queries possible and/or more efficient.
8 |
9 | ## Example of data enrichment
10 | Imagine that you have a “System A” that contains an IP address to country mapping, and a “System B” that contains a data set with records that include an IP address (but no country). If you would like to execute queries on “System B” by country, it would be beneficial to transform records in “System B” to include a country field. This can be achieved by running a transformation job that reads the IP address from each record on “System B”, performs a lookup on “System A” to get the associated country name, and that writes the country name back into an “enriched” data set on “System B”. Future queries which break down the data by country can then be efficiently executed against this enriched data set on “System B”.
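11 | 
12 | A small Python sketch of this example (the lookup table and the records are made up; in practice “System A” and “System B” would be real systems queried by a transformation job):
13 | 
14 | ```python
15 | # "System A": IP address -> country mapping
16 | ip_to_country = {
17 |     "203.0.113.7": "Switzerland",
18 |     "198.51.100.23": "Brazil",
19 | }
20 | 
21 | # "System B": records that only contain an IP address
22 | records = [
23 |     {"user_id": 1, "ip": "203.0.113.7"},
24 |     {"user_id": 2, "ip": "198.51.100.23"},
25 | ]
26 | 
27 | # Enrichment job: look up the country for each record and write it back
28 | enriched = [
29 |     {**record, "country": ip_to_country.get(record["ip"], "unknown")}
30 |     for record in records
31 | ]
32 | # Queries that break down the data by country can now run against `enriched`.
33 | ```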
--------------------------------------------------------------------------------
/content/term/data federation.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Data Federation?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | Data Federation is a virtual layer very similar to [Data Virtualization](term/data%20virtualization.md). The slight difference is that data federation includes federated query engines such as [[Trino]], [[Presto]], [[Spark]], and the like.
--------------------------------------------------------------------------------
/content/term/data governance.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Data Governance?"
3 | tags:
4 | - data engineering
5 | ---
6 | [**Data governance**](https://www.talend.com/resources/what-is-data-governance/) **is a collection of processes, roles, policies, standards, and metrics that ensure the effective and efficient use of information in enabling an organization to achieve its goals.** It establishes the processes and responsibilities that ensure the [data quality](term/data%20quality.md) and security of the data used across a business or organization. Data governance defines who can take what action, upon what data, in what situations, and using what methods.
7 |
8 | Read more on our [Data Lake and Lakehouse Guide](https://airbyte.com/blog/data-lake-lakehouse-guide-powered-by-table-formats-delta-lake-iceberg-hudi).
--------------------------------------------------------------------------------
/content/term/data hierarchy of needs.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "The Data Hierarchy of Needs"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | The data hierarchy of needs in this image is inspired by [Grouparoo's blog post](https://www.grouparoo.com/blog/data-hierarchy-of-needs):
8 | 
9 |
10 | More on [Reverse ETL Explained](https://airbyte.com/blog/reverse-etl#so-what-is-a-reverse-etl).
11 |
--------------------------------------------------------------------------------
/content/term/data integration.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Data Integration?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | Data integration is the process of combining data from disparate source systems into a single unified view. This can be accomplished via manual integration, data virtualization, application integration, or by moving data from multiple sources into a unified destination. These data integration methods are discussed below.
8 |
9 | ## Manual integration
10 | Before implementing a systematic approach to data integration, organizations may initially make use of manual integration when trying to make sense of data that is spread across multiple systems. This involves analysts manually logging into source systems, analyzing and/or exporting data on these systems, and creating reports based on their findings.
11 |
12 | Manual integration as a data integration strategy has several disadvantages. In addition to being time-consuming, analysts require access to multiple operational systems which creates security risks. Furthermore, analysts may run expensive analytics operations on systems that are not optimized for such workloads, which may interfere with the functioning of these systems. Finally, data in the source systems may frequently change which means that manually generated reports will quickly become outdated.
13 |
14 | ## Data virtualization
15 | Organizations may also consider adopting a data virtualization solution to integrate their data. In this type of data integration, data from multiple sources is left in place and is accessed via a virtualization layer so that it _appears_ as a single data store. This virtualization layer makes use of adapters that translate queries executed on the virtualization layer into a format that each connected source system can execute. The virtualization layer then combines the responses from these source systems into a single result. This data integration strategy is sometimes used when a BI tool like Tableau needs to access data from multiple data sources.
16 |
17 | One disadvantage of data virtualization is that analytics workloads are executed on operational systems, which could interfere with their functioning. Another disadvantage is that the virtualization layer may act as a bottleneck on the performance of analytics operations.
18 |
19 | ## Application integration
20 | Another alternative data integration solution is to directly link multiple applications to each other and move data directly between them. This is known as application integration, and linking can be done via point-to-point communications, via a middleware layer such as an enterprise service bus (ESB), or through an application integration tool.
21 |
22 | Application integration may result in many copies of the same data across multiple source systems, which may increase cost, and may cause a large amount of point-to-point traffic between various systems. Furthermore, as with the previous data integration types, executing analytics workloads directly on operational systems could interfere with their functioning.
23 |
24 | ## Moving data to a unified destination
25 | Sending data from across an enterprise into a centralized system such as a database, a data warehouse, a data lake, or a data lakehouse results in a **single unified location for accessing and analyzing all the information that is flowing through an organization**. At Airbyte we are advocates of this data integration methodology, and the next section of this article is dedicated to discussing its benefits in more detail.
26 |
27 | Below is a high-level representation of [data replication](https://airbyte.com/blog/what-is-data-replication) from multiple sources into Google BigQuery.
28 |
29 | 
30 | Data replication into a central destination
31 |
32 | Read more on [Data Integration Guide: Techniques, Technologies, and Tools | Airbyte](https://airbyte.com/blog/data-integration).
--------------------------------------------------------------------------------
/content/term/data lake file format.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Data Lake File Format?"
3 | tags:
4 | - data engineering
5 | ---
6 | Data lake file formats are the new CSVs on the cloud. They are more column-oriented and compress large files with added features. The main players here are [Apache Parquet](term/apache%20parquet.md), [Apache Avro](term/apache%20avro.md), and [Apache Arrow](term/apache%20arrow.md). It’s the physical store with the actual files distributed around different buckets on your [Object Store](term/storage%20layer%20object%20store.md).
7 |
8 | You can build more features with [Data Lake Table Format](term/data%20lake%20table%20format.md) on top. Read more on our [Data Lake and Lakehouse Guide](https://airbyte.com/blog/data-lake-lakehouse-guide-powered-by-table-formats-delta-lake-iceberg-hudi).
--------------------------------------------------------------------------------
/content/term/data lake table format.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Data Lake Table Format?"
3 | tags:
4 | - data engineering
5 | ---
6 | Data lake table formats are very attractive as they act like databases on a [Data Lake](term/data%20lake.md). Like a table in a database, a **data lake table format bundles distributed files into one table that would otherwise be hard to manage**. You can think of it as an abstraction layer between your physical data files and how they are structured to form a table.
7 |
8 |
9 |
10 | It is built on top of the [Storage Layer](term/storage%20layer%20object%20store.md) and the [Data Lake File Format](term/data%20lake%20file%20format.md). Table formats are [Delta Lake](term/delta%20lake.md), [Apache Iceberg](term/apache%20iceberg.md), or [Apache Hudi](term/apache%20hudi.md). Read more on our [Data Lake and Lakehouse Guide](https://airbyte.com/blog/data-lake-lakehouse-guide-powered-by-table-formats-delta-lake-iceberg-hudi).
11 |
--------------------------------------------------------------------------------
/content/term/data lake transaction log.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Data Lake Transaction Log?"
3 | tags:
4 | - data engineering
5 | ---
6 | The transaction log is the ordered record of every transaction, with a configurable duration that can optionally be set to retain all transactions (i.e. data infinite). A transaction log is a common component used through many of its above-mentioned features, including [ACID Transactions](term/acid%20transactions.md), scalable metadata handling, and [Time Travel](term/time%20travel.md). For example, [Delta Lake](term/delta%20lake.md) creates a single [folder called `_delta_log`](https://airbyte.com/tutorials/load-data-into-delta-lake-on-databricks-lakehouse#step-5).
--------------------------------------------------------------------------------
/content/term/data lake.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Data Lake?"
3 | tags:
4 | - data engineering
5 | ---
6 | A Data Lake is a storage system with vast amounts of unstructured and structured data, stored as-is, without a specific purpose in mind, that can be built on multiple technologies such as Hadoop, NoSQL, Amazon Simple Storage Service, a relational database, or various combinations and different formats (e.g. Excel, CSV, Text, Logs, etc.).
7 |
8 | According to [Hortonworks Data Lake Whitepaper](http://hortonworks.com/wp-content/uploads/2014/05/TeradataHortonworks_Datalake_White-Paper_20140410.pdf), the data lake arose because new types of data needed to be captured and exploited by the enterprise. As this data became increasingly available, early adopters discovered that they could extract insight through new applications built to serve the business. The data lake supports the following capabilities:
9 | - To capture and store raw data at scale for a low cost
10 | - To store many types of data in the same repository
11 | - To perform [data transformation](term/data%20transformation.md) on the data where the purpose may not be defined
12 | - To perform new types of data processing
13 | - To perform single-subject analytics based on particular use cases
14 |
15 | The related [Data Lakehouse](term/data%20lakehouse.md) concept was introduced by Databricks in the [CIDR Paper](http://cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf) in 2021. Read more on our [Data Lake and Lakehouse Guide](https://airbyte.com/blog/data-lake-lakehouse-guide-powered-by-table-formats-delta-lake-iceberg-hudi).
--------------------------------------------------------------------------------
/content/term/data lakehouse.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Data Lakehouse?"
3 | tags:
4 | - data engineering
5 | aliases:
6 | - Lakehouse
7 | ---
8 |
9 | A Data Lakehouse is an open data management architecture that combines the flexibility, cost-efficiency, and scale of a [Data Lake](term/data%20lake.md) with the data management and ACID transactions of a [Data Warehouse](term/data%20warehouse.md), using [Data Lake Table Formats](term/data%20lake%20table%20format.md) ([Delta Lake](term/delta%20lake.md), [Apache Iceberg](term/apache%20iceberg.md) & [Apache Hudi](term/apache%20hudi.md)) that enable Business Intelligence (BI) and Machine Learning (ML) on all data.
10 |
11 | The initial concept was created by Databricks in the [CIDR Paper](http://cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf) in 2021. Read more on our [Data Lake and Lakehouse Guide](https://airbyte.com/blog/data-lake-lakehouse-guide-powered-by-table-formats-delta-lake-iceberg-hudi).
--------------------------------------------------------------------------------
/content/term/data lineage.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Data Lineage?"
3 | tags:
4 | - data engineering
5 | ---
6 | Data lineage uncovers the life cycle of data. It aims to show the complete data flow from start to finish. Data lineage is the process of understanding, recording, and visualizing data as it flows from data sources to consumption. This includes all [data transformation](term/data%20transformation.md) (what changed and why).
--------------------------------------------------------------------------------
/content/term/data literacy.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Data Literacy?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | Data literacy is the ability to read, work with, analyze, and argue with data in order to extract meaningful information and make informed decisions. This skill set is crucial for employees across various levels of an organization, especially as data-driven decision-making becomes increasingly important.
8 |
9 | Organizations should invest in data literacy training programs to empower their employees with the necessary skills to effectively engage with data. A data-literate employee can read charts, draw correct conclusions, recognize when data is being used inappropriately or misleadingly, and gain a deeper understanding of the business domain. This enables them to communicate more effectively using a common language of data, spot unexpected operational issues, identify root causes, and prevent poor decision-making due to data misinterpretation.
10 |
11 | Examples of data literacy in action include:
12 |
13 | * Implementing the Adoptive Framework to create a Data Literacy Program.
14 | * Employees working with spreadsheets to understand the rationale behind data-driven decisions and advocating for alternative courses of action.
15 | * Work teams identifying areas where data needs clarification for a project.
16 |
17 | By nurturing a data-literate workforce, businesses can improve their ability to make informed decisions, drive innovation, and achieve better outcomes.
18 |
--------------------------------------------------------------------------------
/content/term/data mesh.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Data Mesh?"
3 | tags:
4 | - data engineering
5 | ---
6 | Data Mesh, introduced in the [Data Mesh Paper](https://martinfowler.com/articles/data-monolith-to-mesh.html), tries to eliminate silos between data teams, ensuring that experience and knowledge about data are shared among all data consumers in the company. Data Mesh sees [Data as a Product](term/data%20product.md). Data meshes are also about connecting the platforms those teams use so data can be easily moved around for the organization's benefit. Companies will try to find better ways of unifying and connecting their tools so that data professionals don’t have to switch contexts and work in a silo.
7 |
8 | A data mesh tries to resolve the tension between decentralizing and centralizing data resources: some infrastructure is shared, but ownership is otherwise mostly decentralized. It empowers data teams and gives ownership to domain experts.
9 |
10 | More valuable resources include a [short version](https://cnr.sh/essays/what-the-heck-data-mesh), a [visually appealing one](https://www.datamesh-architecture.com/), and the concept [applied in practice](https://youtu.be/eiUhV56uVUc).
--------------------------------------------------------------------------------
/content/term/data modeling.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Data Modeling?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | Data modeling in software engineering is the process of creating a data model for an information system by applying certain formal techniques. One specific example of data modeling is [Dimensional Modeling](term/dimensional%20modeling.md), which remains highly relevant even in modern data architectures.
8 |
9 | Read more on [Wikipedia](https://en.wikipedia.org/wiki/Data_modeling).
--------------------------------------------------------------------------------
/content/term/data observability.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Data Observability?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | Data observability, closely related to monitoring, continuously collects metrics about your data. You can collect metrics such as the number of rows, columns, and properties for each dataset. You can also manage metadata about the dataset, such as when it was last updated.
8 |
9 | From the great article [Choosing a Data Quality Tool - by Sarah Krasnik](https://sarahsnewsletter.substack.com/p/choosing-a-data-quality-tool?s=r), there are also different categories for observability:
10 | - **Auto-profiling data**
11 | - [Bigeye](https://www.bigeye.com/): unique in a wide range of ML-driven automatic threshold tests and alerts
12 | - [Datafold](https://www.datafold.com/): unique Github integration presenting Data Diff between environments with custom tests
13 | - [Monte Carlo](https://www.montecarlodata.com/): unique in being the most enterprise-ready, with many data lake integrations
14 | - [Lightup](https://www.lightup.ai/): unique self-hosted deployment option, appealing to highly regulated industries
15 | - [Metaplane](https://www.metaplane.dev/): unique in a high level of configuration for a hosted tool with both out-of-the-box and custom tests
16 | - **Pipeline Testing**
17 | - [Great Expectations](https://greatexpectations.io/): unique in its data quality specific community and automatic documentation of tests
18 | - [Soda](https://www.soda.io/): unique in its self-hosted cloud option
19 | - [dbt tests](https://docs.getdbt.com/docs/building-a-dbt-project/tests): unique in integration with dbt core and dbt Cloud builds (naturally), but not as versatile outside of the dbt ecosystem
20 | - **Infrastructure monitoring**
21 | - [DataDog](https://www.datadoghq.com/): unique agent implementation that can be deployed anywhere for monitoring, even at the container level, with custom Airflow metric reporting
22 | - [New Relic](https://newrelic.com/): unique one-step integration with the big three cloud providers
23 | - **A little bit of everything**
24 | - [Databand](https://databand.ai/): unique integration with Airflow and specific Airflow metric monitoring
25 | - [Unravel](https://www.unraveldata.com/): unique support for other data sources like Spark, data lake, and NoSQL databases
26 | - [Data Catalogs](term/data%20catalog.md): Helping observe existing data
27 |
28 | Related terms are [Data Governance](term/data%20governance.md) and [Data Quality](term/data%20quality.md).
29 |
--------------------------------------------------------------------------------
/content/term/data ops.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is DataOps?"
3 | tags:
4 | - devops
5 | ---
6 | Similar to how [DevOps](term/dev%20ops.md) changed the way software is developed, DataOps is changing the way data products are created. With DataOps, data engineers and data scientists can work together, bringing a level of collaboration and communication, with a common goal of producing valuable insight for the business.
7 |
8 | 
9 | Read more on [The Rise of DataOps](https://medium.com/towards-data-science/the-rise-of-dataops-2788958034ee).
10 |
--------------------------------------------------------------------------------
/content/term/data orchestrator.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Data Orchestrator?"
3 | tags:
4 | - data engineering
5 | ---
6 | A Data Orchestrator models dependencies between different tasks in [complex heterogeneous cloud environments](https://mattturck.com/data2021/) end-to-end. It handles integrations with legacy systems, new cloud-based tools, and your data lakes and data warehouses. It invokes [computation](https://en.wikipedia.org/wiki/Orchestration_(computing)), such as wrangling your business logic in [SQL](term/sql.md) and [Python](term/python.md) and applying ML models at the right time based on a time-based trigger or by custom-defined logic.
7 |
8 | More Insights in [Data Orchestration Trends: The Shift from Data Pipelines to Data Products](https://airbyte.com/blog/data-orchestration-trends).
--------------------------------------------------------------------------------
/content/term/data processing techniques.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What are Data Processing Techniques (row-based, columnar, vectorized)?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | A collection of concepts and technologies that encompass various methods and optimizations for storing, retrieving, and processing data in database systems. This glossary page includes definitions for **columnar storage, row-based storage, and vectorized engines**, which are all techniques that aim to improve the efficiency and performance of different types of workloads, such as transactional, analytical, and large-scale data processing tasks.
8 |
9 | By understanding these techniques, database users and developers can make informed decisions about which approach best fits their specific use cases and requirements.
10 |
11 | ## Columnar Storage
12 | A database storage technique that stores data by columns rather than rows, allowing for more efficient compression and faster querying for analytical workloads. Columnar storage is particularly useful for read-heavy operations and large-scale data analytics, as it enables the retrieval of specific columns without the need to access the entire row. This contrasts with traditional row-based storage, where data is stored row by row, making it more suited for transactional workloads and operations that involve frequent updates and inserts.
13 |
14 | ## Row-based Storage
15 | A traditional database storage technique where data is stored in consecutive rows, which allows for efficient processing of operations that involve entire records, such as inserts, updates, and deletions. Row-based storage is well-suited for transactional systems (OLTP) that require fast access to individual records. However, it can be less efficient for analytical workloads and large-scale data processing, where columnar storage offers better performance due to its ability to selectively retrieve specific columns without accessing the entire row.
16 |
17 | ## Vectorized Engine
18 | A modern database query execution engine designed to optimize data processing by leveraging vectorized operations and SIMD (Single Instruction, Multiple Data) capabilities of modern CPUs. Vectorized engines, such as Databricks' Photon Engine or [DuckDB](term/duckdb.md), process data in large blocks or batches, allowing for improved parallelism, cache locality, and reduced overhead compared to traditional row-at-a-time processing engines. This results in significantly faster query performance, particularly for [analytical](term/analytics.md) and large-scale data processing workloads, making vectorized engines a popular choice in the era of big data and real-time analytics.
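19 |
20 | To make the contrast concrete, here is a small, purely illustrative Python sketch of a row-based versus a columnar layout, plus a vectorized aggregation (it assumes NumPy is installed):
21 |
22 | ```python
23 | import numpy as np
24 |
25 | # Row-based layout: one record per row, good for touching whole records (OLTP).
26 | rows = [
27 |     {"id": 1, "region": "EU", "amount": 120.0},
28 |     {"id": 2, "region": "US", "amount": 80.0},
29 |     {"id": 3, "region": "EU", "amount": 200.0},
30 | ]
31 |
32 | # Columnar layout: one array per column, good for scanning a few columns (OLAP).
33 | columns = {
34 |     "id": np.array([1, 2, 3]),
35 |     "region": np.array(["EU", "US", "EU"]),
36 |     "amount": np.array([120.0, 80.0, 200.0]),
37 | }
38 |
39 | # Row-at-a-time aggregation: an interpreted loop over every record.
40 | total_loop = sum(r["amount"] for r in rows)
41 |
42 | # Vectorized aggregation: one batch operation over the whole column.
43 | total_vectorized = columns["amount"].sum()
44 |
45 | print(total_loop, total_vectorized)  # 400.0 400.0
46 | ```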
--------------------------------------------------------------------------------
/content/term/data product.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Data Product?"
3 | tags:
4 | - data engineering
5 | ---
6 | [DJ Patil](https://twitter.com/dpatil), the former Chief Data Scientist of the United States, defined a data product as "a product that facilitates an end goal through data." [Data Mesh](term/data%20mesh.md) likewise talks about "data as a product", which applies more product thinking, whereas a "Data Product" is essentially a dashboard, a report, a table in a [Data Warehouse](term/data%20warehouse.md), or a Machine Learning model. Sometimes Data Products are also called [Data Asset](term/data%20asset.md)s.
--------------------------------------------------------------------------------
/content/term/data quality.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Data Quality?"
3 | tags:
4 | - data engineering
5 | ---
6 | Data quality is the process of ensuring data meets expectations.
7 |
8 | There are three main ways to detect a data quality issue:
9 | - A business user reports an issue.
10 | - A data test fails.
11 | - Data monitoring raises an alert.
12 |
13 | 
14 |
15 | Read more on [Why is data quality harder than code quality?](https://airbyte.com/blog/data-quality-issues).
--------------------------------------------------------------------------------
/content/term/data swamp.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Data Swamp?"
3 | tags:
4 | - data engineering
5 | ---
6 | Data swamps start to arise when there is a lack of responsibilities, data ownership, availability, and data governance. It's when a [Data Lake](term/data%20lake.md) is unmanaged or unable to provide value. Sometimes a Data Swamp can also arise from a [Data Warehouse](term/data%20warehouse.md) due to existing hybrid models.
--------------------------------------------------------------------------------
/content/term/data transformation.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is data transformation?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | Data transformation is the process of converting data from one format into a different format. Reasons for doing this could be to optimize the data for a different use case than it was originally intended for, or to meet the requirements for storing data in a different system. Data transformation may involve steps such as cleansing, normalizing, [structuring](term/structured%20data.md), validation, sorting, joining, or [enriching](term/data%20enrichment.md) data.
8 |
9 | ## How is data transformation done
10 | Data is often transformed as part of an [ETL (Extract, Transform, Load)](term/etl.md) or [ELT (Extract, Load, Transform)](term/elt.md) approach to [data integration](term/data%20integration.md).
11 |
12 | See [ETL vs. ELT](term/etl%20vs%20elt.md) for a comparison of these two approaches.
13 |
14 | Additionally, a hybrid approach has recently emerged which is known as [EtLT (Extract, “tweak”, Load, Transform)](term/etlt.md). This combines aspects of both ETL and ELT.
15 |
16 | ## Benefits of data transformation
17 | When used correctly, data transformation can provide the following benefits:
18 |
19 | - Improved query-time efficiency and speed.
20 | - Conversion of data into a format that is required by a target system.
21 | - Enrichment of data with additional information that allows insights to be more easily extracted.
22 | - Improved data quality by validating and fixing data, and removal of duplicates.
23 |
24 | ## Examples of data transformation
25 | Below are some examples of how data may be transformed to achieve some of the benefits mentioned above.
26 |
27 | ### Improved efficiency and speed
28 | One kind of transformation could be the extraction of structured data from data that is stored in a string. Imagine data that looks as follows:
29 |
30 | ```
31 | input_string: "Bob is 29"
32 | ```
33 |
34 | In order to efficiently process this data in the future, it may be preferable to transform this data into additional/new fields, and store it as:
35 |
36 | ```
37 | name: "Bob"
38 | age: 29
39 | ```
40 |
41 | Storing the data in this manner makes it much more efficient to analyze with operations such as:
42 |
43 | ```sql
44 | SELECT * FROM X WHERE age = 29
45 | ```
46 |
47 | ### Enriching data
48 | [Data enrichment](term/data%20enrichment.md) is a data transformation that adds additional information to the data that makes new kinds of queries possible.
49 |
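50 |
51 | ### A minimal transformation sketch
52 | The following Python sketch implements the "Bob is 29" example above; it assumes the input always has the form `<name> is <age>`:
53 |
54 | ```python
55 | import re
56 |
57 | def transform(input_string: str) -> dict:
58 |     """Extract structured fields from a string like 'Bob is 29'."""
59 |     match = re.fullmatch(r"(?P<name>\w+) is (?P<age>\d+)", input_string)
60 |     if match is None:
61 |         raise ValueError(f"Unexpected format: {input_string!r}")
62 |     # The new fields can now be stored and queried efficiently.
63 |     return {"name": match["name"], "age": int(match["age"])}
64 |
65 | print(transform("Bob is 29"))  # {'name': 'Bob', 'age': 29}
66 | ```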
--------------------------------------------------------------------------------
/content/term/data virtualization.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Data Virtualization?"
3 | tags:
4 | - data engineering
5 | ---
6 | Data Virtualization helps when you have many source systems built on different technologies, all of which respond quickly, and you don't run many operational applications on top of them. Instead of moving, copying, and pre-aggregating data, you build a [Semantic Layer](term/metrics%20layer.md) where you define your business models (such as cubes); the underlying data sources are only queried when you query this virtualization layer. Tools such as [Dremio](https://www.dremio.com/), for example, use [Apache Arrow](term/apache%20arrow.md) to cache and optimize heavily in memory, giving you astonishingly fast response times.
7 |
8 | It's tightly connected to [Data Federation](term/data%20federation.md) and [Push-Downs](term/push-down.md).
--------------------------------------------------------------------------------
/content/term/data warehouse.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Data Warehouse?"
3 | tags:
4 | - data engineering
5 | ---
6 | A Data Warehouse, in short DWH, also known as an Enterprise Data Warehouse (EDW), is the traditional way of collecting data, as we have done for [30+ years](https://tdwi.org/articles/2016/02/01/data-warehousing-30.aspx). The DWH handles [data integration](term/data%20integration.md) from many different sources, serves as the single point of truth, and takes care of data management, meaning data is cleaned, historized, and joined together. It provides greater executive insight into corporate performance with management dashboards, reports, or ad-hoc analyses.
7 |
8 | Various types of business data are analyzed with Data Warehouses. The need for one often becomes evident when analytic requirements run afoul of the ongoing performance of operational databases. Running a complex query on a database requires the database to enter a temporarily fixed state, which is often untenable for transactional databases. A data warehouse is employed to do the analytical work, leaving the transactional database free to focus on transactions.
9 |
10 | The other characteristic is analyzing data from multiple origins (e.g., your Google Analytics with your CRM data). It is highly [transformed](term/data%20transformation.md) and structured due to the [ETL (Extract Transform Load)](term/etl.md) process.
11 |
12 | If you wonder about the difference between a Data Warehouse, Data Lake, and a Lakehouse, read more on our [Data Lake and Lakehouse Guide](https://airbyte.com/blog/data-lake-lakehouse-guide-powered-by-table-formats-delta-lake-iceberg-hudi).
--------------------------------------------------------------------------------
/content/term/database normalization.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Normalization?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | Normalization is used in relational database design to reduce data redundancy and improve data integrity. Developed by British computer scientist [Edgar F. Codd](https://en.wikipedia.org/wiki/Edgar_F._Codd) in the 1970s as part of his relational model, normalization involves organizing the columns (attributes) and tables (relations) in a database to ensure proper enforcement of dependencies through database integrity constraints.
8 |
9 | This is achieved by applying formal rules during the synthesis (creation of a new database design) or decomposition (improvement of an existing database design) process.
10 |
11 | 1. **First Normal Form (1NF)**:
12 | - Eliminate duplicate data by ensuring each attribute contains only atomic values and each table has a unique primary key.
13 | 2. **Second Normal Form (2NF)**:
14 | - Meet all requirements of 1NF and remove partial dependencies by ensuring that every non-prime attribute (attribute not part of any candidate key) entirely depends on the primary key.
15 | 3. **Third Normal Form (3NF)**:
16 | - Meet all requirements of 2NF and remove transitive dependencies by ensuring that no non-prime attribute is transitively dependent on the primary key.
17 |
18 | ## Denormalization
19 | **Denormalization**, on the other hand, is the process of intentionally introducing redundancy into a database design by combining tables or adding redundant data, aiming to improve query performance or simplify the database structure. Denormalization is the **opposite of normalization**. Please consider the trade-offs between data integrity and query performance. This technique is used with [Dimensional Modeling](term/dimensional%20modeling.md) in [OLAP](term/olap%20(online%20analytical%20processing).md) cubes, for example.
20 |
21 |
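22 | To make the normalization idea concrete, here is a small illustration (the table and column names are made up): a denormalized set of order records with repeated customer details can be decomposed into two relations, so each customer fact is stored exactly once.
23 |
24 | ```python
25 | # Denormalized: customer details are repeated on every order row.
26 | orders_denormalized = [
27 |     {"order_id": 1, "customer_id": 10, "customer_name": "Acme", "customer_city": "Berlin", "total": 50},
28 |     {"order_id": 2, "customer_id": 10, "customer_name": "Acme", "customer_city": "Berlin", "total": 75},
29 |     {"order_id": 3, "customer_id": 11, "customer_name": "Globex", "customer_city": "Paris", "total": 20},
30 | ]
31 |
32 | # Normalized: customer attributes live in one place, orders reference them by key.
33 | customers = {
34 |     row["customer_id"]: {"name": row["customer_name"], "city": row["customer_city"]}
35 |     for row in orders_denormalized
36 | }
37 | orders = [
38 |     {"order_id": row["order_id"], "customer_id": row["customer_id"], "total": row["total"]}
39 |     for row in orders_denormalized
40 | ]
41 |
42 | # Updating a customer's city now happens in exactly one place.
43 | customers[10]["city"] = "Hamburg"
44 | ```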
--------------------------------------------------------------------------------
/content/term/declarative.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is declarative?"
3 | tags:
4 | - data engineering
5 | ---
6 | A **[declarative](term/declarative.md)** data pipeline does not specify the order in which its steps need to be executed but instead allows each step/task to find the best time and way to run. The declarative approach describes *what* the program does without explicitly specifying its control flow. [Functional Data Engineering](term/functional%20data%20engineering.md) and [Functional Programming](term/functional%20programming.md) are **declarative** programming paradigms, in contrast to **[imperative](term/imperative.md)** programming paradigms.
7 |
8 | ## Declarative vs Imperative
9 | Declarative approaches appeal because they make systems easier to debug and automate: intention is stated explicitly, which offers a simple way to manage and apply changes. By explicitly declaring how the pipeline should look, for example by **defining the data products that should exist**, it becomes much easier to discover when reality diverges from that declaration, to understand why, and to reconcile the two (see the sketch at the end of this page). It's the foundation layer for your entire platform's lineage, observability, and [data quality](https://airbyte.com/blog/data-quality-issues) monitoring.
10 |
11 | 
12 |
13 | Read more on [Data Orchestration Trends: The Shift From Data Pipelines to Data Products](https://airbyte.com/blog/data-orchestration-trends).
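14 |
15 | Below is a minimal, framework-free Python sketch of the declarative idea (the asset names and dependencies are made up): we only declare *which* data products should exist and what they depend on, and let a tiny resolver work out a valid execution order.
16 |
17 | ```python
18 | from graphlib import TopologicalSorter
19 |
20 | # Declare the data products that should exist and their upstream dependencies.
21 | assets = {
22 |     "raw_orders": [],
23 |     "cleaned_orders": ["raw_orders"],
24 |     "orders_by_region": ["cleaned_orders"],
25 |     "revenue_dashboard": ["orders_by_region", "cleaned_orders"],
26 | }
27 |
28 | # The "how" and "when" are derived from the declaration, not spelled out by hand.
29 | run_order = list(TopologicalSorter(assets).static_order())
30 | print(run_order)
31 | # ['raw_orders', 'cleaned_orders', 'orders_by_region', 'revenue_dashboard']
32 | ```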
--------------------------------------------------------------------------------
/content/term/delta lake.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Delta Lake?"
3 | tags:
4 | - data engineering
5 | ---
6 | Delta Lake is an open-source [Data Lake Table Format](term/data%20lake%20table%20format.md) project created by Databricks and kindly open-sourced with its [first public GitHub commit](https://github.com/delta-io/delta/commit/14cb4e0267cc188e0fdd47e5b4f0235baf87874e) on 2019-04-22. Databricks recently announced [Delta Lake 2.0](https://www.databricks.com/blog/2022/06/30/open-sourcing-all-of-delta-lake.html), which open-sources all of Delta Lake.
7 |
8 | Read more about how to build a Data Lake on top of it on our [Data Lake and Lakehouse Guide](https://airbyte.com/blog/data-lake-lakehouse-guide-powered-by-table-formats-delta-lake-iceberg-hudi) or if you are curious to build a Delta Lake destination with Airbyte [Load Data into Delta Lake on Databricks Lakehouse](https://airbyte.com/tutorials/load-data-into-delta-lake-on-databricks-lakehouse).
--------------------------------------------------------------------------------
/content/term/dev ops.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is DevOps?"
3 | tags:
4 | - devops
5 | ---
6 | DevOps is a combination of software developers (dev) and operations (ops). It is defined as a software engineering methodology that aims to integrate the work of software development and software operations teams by facilitating a culture of collaboration and shared responsibility.
7 |
8 | It is also related to [DataOps](term/data%20ops.md).
--------------------------------------------------------------------------------
/content/term/dimensional modeling.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Dimensional Modeling?"
3 | tags:
4 | - data engineering
5 | ---
6 | Dimensional modeling (DM) is part of the Business Dimensional Lifecycle methodology developed by [[Ralph Kimball]], which includes a set of methods, techniques, and concepts for use in [Data Warehouse](term/data%20warehouse.md) design.
7 |
8 | As a bottom-up approach, it focuses on identifying the critical business processes within a business and modeling and implementing these before adding additional business processes. An alternative approach from [[Bill Inmon]] advocates a top-down design of the model of all the enterprise data using tools such as Entity-Relationship Modeling (ER).
9 |
10 | Read more on [Data Modeling with SQL and dbt](https://airbyte.com/blog/sql-data-modeling-with-dbt).
11 |
12 |
--------------------------------------------------------------------------------
/content/term/dimensions.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What are Dimensions?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | Dimensions are the categorical buckets that can be used to segment, filter, or group—such as sales amount, region, city, product, color, and distribution channel. Traditionally known from [OLAP](term/olap%20(online%20analytical%20processing).md) cubes with [Bus Matrixes](term/bus%20matrix.md).
8 |
--------------------------------------------------------------------------------
/content/term/elt.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is ELT?"
3 | tags:
4 | - data engineering
5 | ---
6 | ELT (Extract, Load, and [Transform](term/data%20transformation.md)) is a [data integration](term/data%20integration.md) approach that extracts (E) data from a source system, and loads (L) raw data into a destination system before it transforms (T) the data. In other words, in the ELT approach, transformation (T) of the data is done _within_ the destination [Data Warehouse](term/data%20warehouse.md) after data has been loaded.
7 |
8 | ELT is in contrast to the more traditional [ETL](term/etl.md) data integration approach, in which data is transformed before it arrives at the destination. See [ETL vs ELT](term/etl%20vs%20elt.md) for a more detailed comparison of these approaches.
9 |
10 | The shift from the ETL paradigm to the ELT paradigm has been made possible thanks to the plummeting cost of cloud-based computation and storage, and the appearance of cloud-based data warehouses like Redshift, BigQuery, or Snowflake.
11 |
12 | The following image demonstrates the ELT approach to data integration -- in this diagram [dbt](https://docs.getdbt.com/docs/introduction) creates and manages the SQL that is used for transforming the data in the destination:
13 |
14 | 
15 |
16 | ELT is also related to [Reverse ETL](term/reverse%20etl.md) which you can find more information about at: [Reverse ETL Explained](https://airbyte.com/blog/reverse-etl#so-what-is-a-reverse-etl) or [Airbyte.com](https://airbyte.com).
17 |
--------------------------------------------------------------------------------
/content/term/etl elt airbyte.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "ETL and ELT with Airbyte"
3 | tags:
4 | - airbyte
5 | ---
6 | [ELT](term/elt.md) and [ETL](term/etl.md) specific to Airbyte mean:
7 | - **Extract**: Retrieve data from a [source](https://docs.airbyte.com/integrations/#Sources), which can be an application, database, or anything really.
8 | - **Load**: Move data to your [destination](https://docs.airbyte.com/integrations/#Destinations).
9 | - **Transform**: Clean up the data. This is referred to as [normalization](term/normalization.md) in Airbyte and involves [incremental synchronization](term/incremental%20synchronization.md) and [deduplication](https://docs.airbyte.com/understanding-airbyte/connections/incremental-deduped-history), changing data types, formats, and more.
--------------------------------------------------------------------------------
/content/term/etl vs elt.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "ETL vs. ELT"
3 | tags:
4 | - data engineering
5 | ---
6 | [ETL](term/etl.md) (Extract, Transform, and Load) and [ELT](term/elt.md) (Extract, Load, and Transform) are two paradigms for moving data from one system to another. The main difference between them is that when an ETL approach is used, data is transformed before it is loaded into a destination system. On the other hand, in the case of ELT, any required transformations are done after the data has been written to the destination and are _then_ done _inside_ the destination -- often by executing SQL commands. The difference is easier to understand with a visual comparison of the two approaches.
7 |
8 | The image below demonstrates the ETL approach to [data integration](term/data%20integration.md):
9 |
10 | 
11 |
12 | While the following image demonstrates the ELT approach to data integration:
13 |
14 | 
15 |
16 | ETL was originally used for [Data Warehousing](term/data%20warehouse.md) and ELT for creating a [Data Lake](term/data%20lake.md).
17 |
18 | ## Disadvantages of ETL compared to ELT
19 |
20 | **ETL** has several **disadvantages compared to ELT**, including the following:
21 |
22 | - Generally, only transformed data is stored in the destination system, and so analysts must know beforehand every way they are going to use the data, and every report they are going to produce.
23 | - Modifications to requirements can be costly, and often require re-ingesting data from source systems.
24 | - Every transformation that is performed on the data may obscure some of the underlying information, and analysts only see what was kept during the transformation phase.
25 | - Building an ETL-based data pipeline is often beyond the technical capabilities of analysts.
26 |
27 | Find more on [An overview of Airbyte’s replication modes](https://airbyte.com/blog/understanding-data-replication-modes).
28 |
29 | ## ELT/ETL Tool Comparison
30 | Need to find the best data integration tool for your business? Which platform integrates with your data sources and destinations? Which one provides the features you’re looking for?
31 |
32 | We made it simple for you. Here’s a [spreadsheet](https://docs.google.com/spreadsheets/d/1QKrtBpg6PliPMpcndpmkZpDVIz_o6_Y-LWTTvQ6CfHA/edit?usp=sharing) with a comparison of all those actors. Or see an extensive, detailed comparison of the tools in [Top ETL tools compared in details](https://airbyte.com/etl-tools-comparison).
33 |
34 | See also more on [Airbyte.com](https://airbyte.com) or [Reverse ETL Explained](https://airbyte.com/blog/reverse-etl#so-what-is-a-reverse-etl).
35 |
--------------------------------------------------------------------------------
/content/term/etl.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is ETL?"
3 | tags:
4 | - data engineering
5 | ---
6 | ETL (Extract, [Transform](term/data%20transformation.md), and Load) is a paradigm for moving data from one system to another. It involves reading data (Extract) from one system, modifying the data (Transform), and then sending it (Load) to a destination system. The ETL paradigm emerged in the 1970s.
7 |
8 | A key feature of ETL is that data is transformed before being sent to the destination, as demonstrated in the following image:
9 |
10 | 
11 |
12 | However in recent years, the preferred data movement paradigm has shifted to [ELT](term/elt.md) (Extract, Load, and Transform). The ELT philosophy dictates that data should be untouched – apart from minor cleaning and filtering – as it moves through the extraction and loading stages so that the raw data is always accessible in the destination [Data Warehouse](term/data%20warehouse.md). See [ETL vs ELT](term/etl%20vs%20elt.md) for a comparison of these approaches.
13 |
14 |
15 | ## ETL is Changing
16 | The way we do ETL is changing. For a long time ETL was done with tools such as Informatica, IBM Datastage, Cognos, AbInitio, or Microsoft SSIS. Today we use more programmatic or configuration-driven platforms like [[Airflow]], [[Dagster]], and [Temporal](term/temporal.md).
17 |
18 | Historically **ETL was once preferred** over ELT for the following **no-longer-valid reasons**:
19 | - ETL could achieve cost savings by removing unwanted data before sending it to the destination – however, with the plummeting cost of cloud-based computation and storage the value of this proposition is greatly reduced.
20 | - Because ETL transforms data before it is stored, it avoids the complexity of transforming data _after_ sending it to the destination – however, new tools such as [[dbt]] (data build tool) make it preferable and easy to transform data in the destination.
21 |
22 |
--------------------------------------------------------------------------------
/content/term/etlt.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is EtLT?"
3 | tags:
4 | - data engineering
5 | - concept
6 | ---
7 | EtLT refers to Extract, “tweak”, Load, [Transform](term/data%20transformation.md), and can be thought of an extension to the [ELT](term/elt.md) approach to [data integration](term/data%20integration.md).
8 |
9 | When compared to ELT, the EtLT approach incorporates an additional light “tweak” (small “t”) transformation, which is done on the data after it is extracted from the source and before it is loaded into the destination. This is demonstrated in the following image:
10 |
11 | 
12 |
13 | For a more detailed explanation of EtLT see: [EtLT for improved GDPR compliance](https://airbyte.com/blog/etlt-gdpr-compliance).
14 |
--------------------------------------------------------------------------------
/content/term/full refresh synchronization.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Full Refresh Synchronization?"
3 | tags:
4 | - airbyte
5 | ---
6 | A Full Refresh Sync will attempt to retrieve all data from the source every time a sync is run. Then there are two choices, *Overwrite* and *Append*. Overwrite deletes the data in the destination before running the sync and Append doesn't.
7 |
8 |
--------------------------------------------------------------------------------
/content/term/functional data engineering.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Functional Data Engineering?"
3 | tags:
4 | - data engineering
5 | ---
6 | Functional Data Engineering brings _clarity_. When functions are "pure," they do not have side effects. They can be written, tested, reasoned about, and debugged in isolation without understanding the external context or history of events surrounding their execution. It's [Functional Programming](term/functional%20programming.md) applied to the field of data engineering, initiated by [Maxime Beauchemin](term/maxime%20beauchemin.md) with [Functional Data Engineering — a modern paradigm for batch data processing](https://maximebeauchemin.medium.com/functional-data-engineering-a-modern-paradigm-for-batch-data-processing-2327ec32c42a).
7 |
--------------------------------------------------------------------------------
/content/term/functional programming.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Functional Programming?"
3 | tags:
4 | - coding
5 | ---
6 |
7 | Functional Programming is a style of building functions that treats computation as the evaluation of mathematical functions and avoids changing state and mutable data. It is a declarative programming paradigm, which means programs are written expressively and [declaratively](term/declarative.md), as opposed to imperatively. It's getting more popular with the rise of [Functional Data Engineering](term/functional%20data%20engineering.md).
8 |
9 | See also [Programming Languages](term/programming%20languages.md).
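10 |
11 | A short Python illustration of the difference between a pure function and one with side effects:
12 |
13 | ```python
14 | # Pure: the output depends only on the inputs; nothing outside the function changes.
15 | def add_tax(amount: float, rate: float) -> float:
16 |     return amount * (1 + rate)
17 |
18 | # Impure: reads and mutates external state, so the result depends on call history.
19 | total = 0.0
20 | def add_to_total(amount: float) -> float:
21 |     global total
22 |     total += amount
23 |     return total
24 |
25 | assert add_tax(100.0, 0.2) == add_tax(100.0, 0.2)  # always the same result
26 | print(add_to_total(100.0), add_to_total(100.0))    # 100.0 200.0 -> state-dependent
27 | ```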
--------------------------------------------------------------------------------
/content/term/general infos.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "General Infos (Folder Structure, Links)"
3 | tags:
4 | - help
5 | weight: -9
6 | ---
7 | [Quartz](https://quartz.jzhao.xyz) runs on top of [Hugo](https://gohugo.io/) so all notes are written in [Markdown](https://www.markdownguide.org/getting-started/) and can be edited through any editor.
8 |
9 | ## Folder Structure
10 | The content of the Glossary is in `content/term` folder. The rest outside of `content/` is the website/framework.
11 |
12 | To edit the main home page, open `/content/_index.md`.
13 | ## Links
14 | To create a link between terms in the glossary, just create a normal link using Markdown pointing to the document in question. Please note that **all links should be relative to the root `/content` path**.
15 | ```markdown
16 | For example, I want to link this current document to `term/config.md`.
17 | [A link to the config page](term/config.md)
18 | ```
19 |
20 | Similarly, you can put local images anywhere in the `/content` folder.
21 | ```markdown
22 | Example image (source is in content/images/example.png)
23 | 
24 | ```
25 |
26 | ## Lower Case
27 | Term files are lower case so that links are also lowercase. When we create a link to a term, we usually capitalize the beginning of each word to make it look nice, e.g. `[Apache Arrow](term/apache%20arrow.md)`. Others, such as YAML, are written all in capitals.
28 |
29 | We didn't activate wikilinks, but that would be an option as well. See more on [editing](https://quartz.jzhao.xyz/notes/editing/).
30 | ## Metatag with Front Matter
31 | Hugo is picky when it comes to metadata for files. Make sure that your title is double-quoted and that you have a title defined at the top of your file like so. You can also add tags here as well.
32 | ```yaml
33 | ---
34 | title: "What is a Glossary?"
35 | tags:
36 | - example-tag
37 | - here i can add more we keep it lower case
38 | url: "term/my-other-domain"
39 | aliases:
40 | - Digital Garden
41 | - Second Brain
42 | ---
43 |
44 | Rest of your content here.
45 | ```
46 |
47 | - `url`: this is not needed, only if the default link (name of the note) is not sufficient
48 | - all spaces will be replaced with `-` (dash).
49 | - `aliases`: are like tags; you can add multiple, and they will be linkable the same way adding a new term would be.
50 |
51 |
--------------------------------------------------------------------------------
/content/term/granularity.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Granularity"
3 | tags:
4 | - data engineering
5 | ---
6 | Declaring the granularity (or _grain_) is the pivotal step in [Dimensional Modeling](term/dimensional%20modeling.md). The grain establishes exactly what a single fact table row represents. The grain declaration becomes a binding contract on the design. The grain must be declared before choosing [[dimensions]] or [[facts]] because every candidate dimension or fact must be consistent with the grain. This consistency enforces uniformity on all dimensional designs which is critical to [Business Intelligence](term/business%20intelligence.md) application performance and ease of use.
7 |
8 | For example, in the **transformation layer**, you must balance low and high granularity. At what level do you aggregate and store (e.g., [rolling up](term/rollup.md) hourly data to daily to save storage), and which valuable dimensions do you add? With each dimension and its columns added, rows will [explode](https://www.ibm.com/docs/en/ida/9.1.1?topic=phase-step-identify-measures#c_dm_design_cycle_4__c_dm_4_step7) exponentially, and we can’t persist each of these representations to the filesystem.
9 |
10 | A [Semantic Layer](term/semantic%20layer.md) is much more flexible and makes the most sense on top of [transformed data](term/data%20transformation.md) in a [Data Warehouse](term/data%20warehouse.md); it avoids extensive reshuffles or reprocessing of large amounts of data. Think of [OLAP](term/olap%20(online%20analytical%20processing).md) cubes, where you can slice and dice ad hoc on significant amounts of data without storing every representation ahead of time.
11 |
12 | Read more on [Kimball Dimensional Modeling Techniques](https://www.kimballgroup.com/data-warehouse-business-intelligence-resources/kimball-techniques/dimensional-modeling-techniques/grain/). Also related is [Rollup](term/rollup.md).
13 |
14 |
--------------------------------------------------------------------------------
/content/term/idempotency.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Idempotency?"
3 | tags:
4 | - data engineering
5 | ---
6 | Idempotency is the property of a particular operation that can be applied multiple times without changing the resulting outcome by being given the same inputs. It is used in [Functional Programming](term/functional%20programming.md) and was the foundation for [Functional Data Engineering](term/functional%20data%20engineering.md).
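7 |
8 | A small Python sketch of the idea: a keyed "upsert" is idempotent and can safely be re-run, while a naive "append" is not (the destination structures below are stand-ins for real tables):
9 |
10 | ```python
11 | destination = {}   # keyed store, e.g. a table with a primary key
12 | log = []           # append-only store
13 |
14 | def upsert(record: dict) -> None:
15 |     """Idempotent: running it once or ten times leaves the same end state."""
16 |     destination[record["id"]] = record
17 |
18 | def append(record: dict) -> None:
19 |     """Not idempotent: every re-run duplicates the record."""
20 |     log.append(record)
21 |
22 | record = {"id": 1, "name": "Bob"}
23 | for _ in range(3):  # simulate retries / re-runs of the same task
24 |     upsert(record)
25 |     append(record)
26 |
27 | print(len(destination), len(log))  # 1 3
28 | ```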
--------------------------------------------------------------------------------
/content/term/imperative.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is imperative?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | An **imperative** pipeline tells _how_ to proceed at each step in a procedural manner. In contrast, a **[declarative](term/declarative.md)** data pipeline does not specify the order in which its steps need to be executed but instead allows each step/task to find the best time and way to run.
8 |
9 | The *how* should be taken care of by the tool, framework, or platform it runs on: for example, updating an asset when its upstream data has changed. Both approaches result in the same output. However, the declarative approach benefits from **leveraging compile-time query planners** and **considering runtime statistics** to choose the best way to compute and find patterns to reduce the amount of transformed data.
10 |
11 | Read more on [Data Orchestration Trends: The Shift From Data Pipelines to Data Products](https://airbyte.com/blog/data-orchestration-trends).
12 |
--------------------------------------------------------------------------------
/content/term/in-memory format.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is an In-Memory Format?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | In-memory formats are optimized to:
8 | - hit fast instruction sets
9 | - be cache friendly
10 | - be parallelizable
11 |
12 | Formats:
13 | - [Apache Arrow](term/apache%20arrow.md)
14 | - [Apache Spark](term/apache%20spark.md) [[DataFrames]]
15 | - [NumPy](term/numpy.md)
16 | - [Pandas](term/pandas.md)
17 |
18 | In contrast to in-memory formats, [Data Lake File Formats](term/data%20lake%20file%20format.md) save space, are cross-language, and serve as long-term storage. More on this in the Data + AI Summit talk [From Bits to Data Frames](https://microsites.databricks.com/sites/default/files/2022-07/Sound-Data-Engineering-in-Rust_From-Bits%20to-DataFrames.pdf) by Jorge C. Leitao.
19 |
--------------------------------------------------------------------------------
/content/term/incremental synchronization.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Incremental Synchronization?"
3 | tags:
4 | - airbyte
5 | ---
6 | Incremental synchronization is a process that efficiently copies data to a destination system by periodically querying a source system for records that have been updated or inserted since the previous sync operation. Only records that have recently been inserted or updated are sent to the destination, which is much more efficient than copying the entire data set on each sync. Incremental synchronization makes use of a cursor field such as `updated_at` (or whatever you wish to call the field) to determine which records should be propagated: only records with an `updated_at` value that is newer than the `updated_at` value of the most recent record sent in the previous sync are replicated (see the sketch at the end of this page).
7 |
8 | However, without special consideration, records that have been deleted in the source system will not be propagated to the destination as they will never appear in the results from such a query. This may be addressed by [Soft Deletes](term/soft%20delete.md) or by making use of [CDC replication](https://airbyte.com/blog/change-data-capture-definition-methods-and-benefits).
9 |
10 | Read more on [Incremental data synchronization between Postgres databases](https://airbyte.com/tutorials/incremental-data-synchronization) or see related [Full Refresh Synchronization](term/full%20refresh%20synchronization.md).
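11 |
12 | Below is a simplified Python sketch of the cursor logic (the table, column names, and in-memory SQLite database are illustrative; a real connector also handles state persistence, pagination, and deletes):
13 |
14 | ```python
15 | import sqlite3
16 |
17 | conn = sqlite3.connect(":memory:")
18 | conn.execute("CREATE TABLE users (id INTEGER, name TEXT, updated_at TEXT)")
19 | conn.executemany(
20 |     "INSERT INTO users VALUES (?, ?, ?)",
21 |     [(1, "Ann", "2023-01-01"), (2, "Bob", "2023-02-01"), (3, "Eve", "2023-03-01")],
22 | )
23 |
24 | last_cursor = "2023-01-15"  # cursor value stored from the previous sync
25 |
26 | # Only records updated since the previous sync are extracted.
27 | new_records = conn.execute(
28 |     "SELECT id, name, updated_at FROM users WHERE updated_at > ? ORDER BY updated_at",
29 |     (last_cursor,),
30 | ).fetchall()
31 |
32 | # Advance the cursor so the next run starts where this one left off.
33 | if new_records:
34 |     last_cursor = new_records[-1][2]
35 |
36 | print(new_records)  # [(2, 'Bob', '2023-02-01'), (3, 'Eve', '2023-03-01')]
37 | print(last_cursor)  # 2023-03-01
38 | ```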
--------------------------------------------------------------------------------
/content/term/jinja template.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Jinja Template?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | Jinja is a fast, expressive, extensible templating engine. Special placeholders in the template allow writing code similar to [Python](term/python.md) syntax. Then the template is passed data to render the final document.
8 |
9 | Most popularized by [[dbt]]. Read more on the [Jinja Documentation](https://jinja.palletsprojects.com/).
10 |
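11 | A tiny example using the `jinja2` Python package (assumed to be installed), templating a SQL statement the way dbt does:
12 |
13 | ```python
14 | from jinja2 import Template
15 |
16 | # Placeholders in {{ ... }} (and control flow in {% ... %}) look much like Python.
17 | template = Template("SELECT * FROM {{ table }} WHERE created_at >= '{{ start_date }}'")
18 |
19 | # The template is passed data to render the final document.
20 | print(template.render(table="orders", start_date="2023-01-01"))
21 | # SELECT * FROM orders WHERE created_at >= '2023-01-01'
22 | ```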
--------------------------------------------------------------------------------
/content/term/key performance indicator (kpi).md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Key Performance Indicator (KPI)?"
3 | tags:
4 | - data engineering
5 | ---
6 | A performance indicator or key performance indicator (KPI) is a type of performance measurement. KPIs evaluate the success of an organization or of a particular activity (such as projects, programs, products, and other initiatives) in which it engages.
7 |
8 | See more on [What is a Metric](term/metric.md), as it is a synonym for KPI.
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/content/term/kubernetes.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Kubernetes?"
3 | tags:
4 | - devops
5 | ---
6 | It’s a platform that allows you to run and orchestrate container workloads. [**Kubernetes**](https://stackoverflow.blog/2020/05/29/why-kubernetes-getting-so-popular/) **has become the de-facto standard** for your cloud-native apps to (auto-) [scale-out](https://stackoverflow.com/a/11715598/5246670) and deploy your open-source zoo fast, cloud-provider-independent. No lock-in here. Kubernetes is the **move from infrastructure as code** towards **infrastructure as data**, specifically as [YAML](term/yaml.md). With Kubernetes, developers can quickly write applications that run across multiple operating environments. Costs can be reduced by scaling down.
--------------------------------------------------------------------------------
/content/term/lambda architecture.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Lambda Architecture?"
3 | tags:
4 | - data engineering
5 | ---
6 | Lambda architecture is a data-processing architecture designed to handle massive quantities of data by taking advantage of both batch and stream-processing methods. This approach to architecture attempts to balance latency, throughput, and fault tolerance using batch processing to provide comprehensive and accurate views of batch data, while simultaneously using real-time stream processing to provide views of online data. The two view outputs may be joined before the presentation. The rise of lambda architecture is correlated with the growth of big data, real-time analytics, and the drive to mitigate the latencies of [MapReduce](term/map%20reduce.md).
--------------------------------------------------------------------------------
/content/term/machine learning.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Machine Learning?"
3 | tags:
4 | - data engineering
5 | aliases:
6 | - AI
7 | - data science
8 | ---
9 | Machine learning (ML) is a type of artificial intelligence (AI) that allows software applications to become more accurate at predicting outcomes without being explicitly programmed to do so. Machine learning [algorithms](https://www.techtarget.com/whatis/definition/algorithm) use historical data as input to predict new output values.
10 |
11 | More on [Machine Learning | Tech Target](https://www.techtarget.com/searchenterpriseai/definition/machine-learning-ML).
--------------------------------------------------------------------------------
/content/term/map reduce.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is MapReduce?"
3 | tags:
4 | - data engineering
5 | ---
6 | MapReduce is a programming paradigm that enables massive scalability across hundreds or thousands of servers in a Hadoop cluster. As the processing component, MapReduce is the heart of [Apache Hadoop](term/apache%20hadoop.md). The term "MapReduce" refers to the two separate and distinct tasks that Hadoop programs perform: a map task, which converts a set of input data into intermediate key/value pairs, and a reduce task, which combines those pairs into a smaller, aggregated set of results.
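7 |
8 | A toy Python sketch of the two phases, using the classic word-count example (a real MapReduce job distributes the map and reduce tasks across the cluster):
9 |
10 | ```python
11 | from collections import defaultdict
12 |
13 | documents = ["the quick brown fox", "the lazy dog", "the quick dog"]
14 |
15 | # Map: emit intermediate (key, value) pairs.
16 | mapped = [(word, 1) for doc in documents for word in doc.split()]
17 |
18 | # Shuffle: group intermediate values by key.
19 | grouped = defaultdict(list)
20 | for word, count in mapped:
21 |     grouped[word].append(count)
22 |
23 | # Reduce: aggregate the values for each key.
24 | word_counts = {word: sum(counts) for word, counts in grouped.items()}
25 |
26 | print(word_counts["the"], word_counts["dog"])  # 3 2
27 | ```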
--------------------------------------------------------------------------------
/content/term/master data management (mdm).md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Master Data Management (MDM)?"
3 | tags:
4 | - data engineering
5 | ---
6 | Master data management is a method to centralize master data. It is the bridge between the business people, who maintain the data and know it best, and the data folks, and it is typically supported by a dedicated tool. It helps with uniformity, accuracy, stewardship, semantic consistency, and accountability of mostly enterprise master [Data Assets](term/data%20asset.md).
--------------------------------------------------------------------------------
/content/term/maxime beauchemin.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Maxime Beauchemin"
3 | tags:
4 | - data engineering
5 | ---
6 | Creator of [[Apache Airflow]] and [Apache Superset](term/apache%20superset.md).
7 |
8 | Started as a [Business Intelligence](term/business%20intelligence.md) Engineer and now works at [[Preset]] (Superset as a Service).
9 |
10 | Initiator of [functional data engineering](term/functional%20data%20engineering.md) and a strong advocate of [idempotency](term/idempotency.md) in data pipelines.
11 |
--------------------------------------------------------------------------------
/content/term/metric.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Metric?"
3 | tags:
4 | - data engineering
5 | aliases:
6 | - Measure
7 | ---
8 |
9 | A Metric, also called a [KPI](term/key%20performance%20indicator%20(kpi).md) or (calculated) measure, is one of the building blocks for how business performance is both measured and defined. It is fundamental to have a common understanding of how an organization's KPIs are defined. Metrics usually surface in business reports and dashboards with direct access for the entire organization.
10 |
11 | For example, think of operational metrics that represent your company's performance and service level or financial metrics that describe its financial health. Today these metrics are primarily defined in a lengthy [SQL](term/sql.md) statement inside the [BI tools](term/business%20intelligence%20tools.md).
12 |
13 | Calculated measures are part of metrics and apply to specific [dimensions](term/dimensions.md) traditionally mapped inside a [Bus Matrix](term/bus%20matrix.md). Dimensions are the categorical buckets that can be used to segment, filter, group, slice, and dice—such as sales amount, region, city, product, color, and distribution channel. Dimensions (and facts) are also known from the concept of [Dimensional Modeling](term/dimensional%20modeling.md).
14 |
15 | See more on [Semantic Layer](term/semantic%20layer.md) or [Metrics Layer](term/metrics%20layer.md).
16 |
--------------------------------------------------------------------------------
/content/term/metrics layer.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Metrics Layer?"
3 | tags:
4 | - data engineering
5 | aliases:
6 | - Metrics Store
7 | ---
8 | > [!info] Similarities to a Semantic Layer
9 | >
10 | > The metrics layer is one component of a [Semantic Layer](term/semantic%20layer.md). A limited metrics layer is usually built into a [BI tool](term/business%20intelligence%20tools.md), translating its [metrics](term/metric.md) to only that BI tool.
11 |
12 | A metrics layer, sometimes also called a metrics store, includes a specification of metrics such as [measures](term/metric.md) and [dimensions](term/dimensions.md). Additionally, it can contain model parsing from files (mostly [YAML](term/yaml.md)) and APIs to create and execute metric logic; some include a cache layer. A metrics layer encourages us to enforce the [DRY](https://en.wikipedia.org/wiki/Don%27t_repeat_yourself) (Do not repeat yourself) principle by defining metrics once before populating them to any BI tools used or integrating them into internal applications or processes.
13 |
14 | Read more on [Semantic Layer](term/semantic%20layer.md) or [The Rise of the Semantic Layer](https://airbyte.com/blog/the-rise-of-the-semantic-layer-metrics-on-the-fly).
15 |
--------------------------------------------------------------------------------
/content/term/modern data stack.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is the Modern Data Stack?"
3 | tags:
4 | - data engineering
5 | ---
6 | The Modern Data Stack (MDS) is a heap of open-source tools to achieve end-to-end analytics from ingestion to [transformation](term/data%20transformation.md) to ML over to a columnar data warehouse or lake solution with an analytics BI dashboard backend. This stack is extendable like lego blocks. Usually, it consists of **[data integration](term/data%20integration.md), a transformation tool, an [Orchestrator](term/data%20orchestrator.md), and a [Business Intelligence Tool](term/business%20intelligence%20tools.md)**. With growing data, you might add [Data Quality](term/data%20quality.md) and observability tools, [Data Catalog](term/data%20catalog.md), [Semantic Layer](term/semantic%20layer.md), and more.
7 |
8 | In a way, it is [unbundling](https://blog.fal.ai/the-unbundling-of-airflow-2/) the data stack as Gorkem says:
9 | > Products start small, in time, add adjacent verticals and functionality to their offerings, and become a platform. Once these **platforms** become big enough, people begin to figure out how to serve better-neglected verticals or abstract out functionality to break it down into purpose-built chunks, and the unbundling starts.
10 |
11 | The goal of an MDS is to derive data insights with the most suitable tool for each part of the stack. It's noteworthy that the term itself is still relatively new.
12 |
13 | > [!note] New Terms popping up
14 | >
15 | > There is already a newer term, [ngods (new generation open-source data stack)](https://blog.devgenius.io/modern-data-stack-demo-5d75dcdfba50), as well as *DataStack 2.0* in Dagster's recent [blog post](https://dagster.io/blog/evolution-iq-case-study).
16 |
17 | ## The Future of MDS
18 | Looking a little further into the future, Barr Moses illustrates in her article [What's In Store For The Future Of The Modern Data Stack?](https://www.montecarlodata.com/blog-the-future-of-the-modern-data-stack/) more features such as data sharing, universal [Data Governance](term/data%20governance.md), the [Data Lake](term/data%20lake.md) and [Data Warehouse](term/data%20warehouse.md) becoming equalized, and a newer evolution of predictive analytics:
19 | 
--------------------------------------------------------------------------------
/content/term/normalization.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Airbyte Normalization"
3 | tags:
4 | - airbyte
5 | ---
6 | Normalization is the process of structuring data from the source into a format appropriate for consumption in the destination. For example, when writing data from a nested, dynamically typed source like a JSON API to a relational destination like Postgres, normalization is the process that un-nests JSON from the source into a relational table format that uses the appropriate column types in the destination.
7 |
8 | Read more on our [docs](https://docs.airbyte.com/cloud/core-concepts#normalization) or what [Database Normalization](term/database%20normalization.md) means in general.
9 |
--------------------------------------------------------------------------------
/content/term/notebooks.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What are Notebooks?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | Data notebooks, popularized by Jupyter, are centralized, browser-based IDEs for doing collaborative work.
8 |
9 | 1. Notebooks that popularized the format and are in heavy use today:
10 | - [Jupyter Notebook](https://jupyter.org/) and [JupyterHub](https://jupyter.org/hub)
11 | - Automation on top of Jupyter notebooks: [Naas](https://github.com/jupyter-naas/awesome-notebooks)
12 | - [Zeppelin](https://zeppelin.apache.org/)
13 | - [Databricks Notebook](https://docs.databricks.com/notebooks/index.html)
14 | 2. Newer takes on Jupyter notebooks with more integrated features and a hosted cloud offering:
15 | - [HEX](https://hex.tech/)
16 | - [Deepnote](https://deepnote.com/)
17 | - [Count.co](https://count.co)
18 |
19 |
--------------------------------------------------------------------------------
/content/term/olap (online analytical processing).md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is OLAP (Online Analytical Processing)?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | OLAP is an acronym for **Online Analytical Processing**. OLAP performs multidimensional analysis of business data and provides the capability for complex calculations, trend analysis, and sophisticated [data modeling](term/data%20modeling.md). An **OLAP cube** is a multidimensional database that is optimized for [data warehouse](term/data%20warehouse.md) and online analytical processing (OLAP) applications.
8 |
9 | An OLAP cube is a method of storing data in a multidimensional form, generally for reporting purposes. In OLAP cubes, data ([Measures](term/metric.md)) are categorized by [dimensions](term/dimensions.md).
10 |
11 | In order to manage and perform processes with an OLAP cube, Microsoft developed a query language known as [multidimensional expressions (MDX)](https://learn.microsoft.com/en-us/analysis-services/multidimensional-models/mdx/) in the late 1990s. Many other vendors of multidimensional databases have adopted MDX for querying data, but because it is such a specific language, managing the cube requires personnel with that skill set.
12 |
13 | The opposite of OLAP is [OLTP](term/oltp%20(online%20transactional%20processing).md). Read more on [Wikipedia](https://en.wikipedia.org/wiki/Online_analytical_processing).
--------------------------------------------------------------------------------
/content/term/oltp (online transactional processing).md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is OLTP (Online Transactional Processing)="
3 | tags:
4 | - data engineering
5 | ---
6 | In online transaction processing (**OLTP**), information systems typically facilitate and manage **transaction-oriented** applications. It's the opposite of [OLAP (Online Analytical Processing)](term/olap%20(online%20analytical%20processing).md).
7 |
8 | Read more on [Wikipedia](https://en.wikipedia.org/wiki/Online_transaction_processing).
--------------------------------------------------------------------------------
/content/term/open data stack.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is the Open Data Stack?"
3 | tags:
4 | - data engineering
5 | ---
6 |
7 | The open data stack is a better term for the [Modern Data Stack](term/modern%20data%20stack.md): it focuses on solutions built on open source and open standards, covering the [Data Engineering Lifecycle](term/data%20engineering%20lifecycle.md). The open data stack is maintained by everyone using it. Companies can reuse existing, battle-tested solutions and build on them instead of reinventing the wheel by re-implementing critical components of the data stack.
8 |
9 | The *open* piece is so important and often overlooked: it is what makes the #opendatastack embeddable. Tools from the open data stack such as [Airbyte](term/airbyte.md), [dbt](dbt), [Dagster](Dagster), [Superset](term/apache%20superset), and so forth let you integrate them into your own services, unlike closed-source services.
10 |
11 | > [!example] See a reference project building the open-data-stack
12 | >
13 | > This is the start of the open data stack in action. Check out the GitHub repo [Open-Data-Stack](https://github.com/airbytehq/open-data-stack/).
14 |
15 | Alternative names that came up beside the Modern Data Stack are [ngods (new generation open-source data stack)](https://blog.devgenius.io/modern-data-stack-demo-5d75dcdfba50), [DataStack 2.0](https://dagster.io/blog/evolution-iq-case-study), [DAD Stack](https://www.reddit.com/r/dataengineering/comments/11fhmqu/comment/jajkydk/?context=3), or more as a joke on Twitter, the boring data stack.
16 |
17 | See more on the topic of [The Open (aka *Modern*) Data Stack Distilled into Four Core Tools](https://airbyte.com/blog/modern-open-data-stack-four-core-tools).
18 |
19 |
--------------------------------------------------------------------------------
/content/term/orc.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is ORC?"
3 | tags:
4 | - data engineering
5 | ---
6 | The **Optimized Row Columnar** (ORC) [Data Lake File Format](term/data%20lake%20file%20format.md) provides a highly efficient way to store Hive data. It was designed to overcome the limitations of the other Hive file formats. Using ORC files improves performance when Hive is reading, writing, and processing data.
7 |
8 | Read more about how to build a Data Lake on top of it on our [Data Lake and Lakehouse Guide](https://airbyte.com/blog/data-lake-lakehouse-guide-powered-by-table-formats-delta-lake-iceberg-hudi).
9 |
--------------------------------------------------------------------------------
/content/term/other glossaries.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Other Data Glossaries"
3 | tags:
4 | - References
5 | ---
6 | Other helpful data glossaries:
7 | - [Secoda Glossary](https://www.secoda.co/glossary)
8 | - [Prisma's Data Guide](https://www.prisma.io/dataguide/)
9 | - [Reddit Data Engineering Wiki](https://dataengineering.wiki/)
--------------------------------------------------------------------------------
/content/term/pandas.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Pandas?"
3 | tags:
4 | - data engineering
5 | ---
6 | Pandas is a software library written for the [Python](term/python.md) programming language for data manipulation and analysis. In particular, it offers data structures and operations for manipulating numerical tables and time series. It is free software released under the three-clause BSD license.
7 |
8 | ## What is a Pandas DataFrame
9 | Two-dimensional, size-mutable, potentially heterogeneous tabular data.
10 |
11 | The data structure also contains labeled axes (rows and columns). Arithmetic operations align on both row and column labels. It can be thought of as a dict-like container for Series objects, and it is the primary pandas data structure.
12 |
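As a minimal illustration (the column names below are made up), a DataFrame can be built from a dict of columns and then aggregated along its labeled axes:

```python
import pandas as pd

# Build a small DataFrame with labeled rows and columns.
df = pd.DataFrame(
    {"region": ["EU", "US", "EU"], "revenue": [120, 340, 90]},
    index=["jan", "jan", "feb"],
)

# Operations align on the labels, e.g. summing revenue per region.
print(df.groupby("region")["revenue"].sum())
```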
13 | See more on [Pandas Documentation](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html).
14 |
15 | Another DataFrame implementation with the same API is [Koalas](https://github.com/databricks/koalas), created by Databricks and optimized for larger data sets running on [Apache Spark](term/apache%20spark.md).
16 |
--------------------------------------------------------------------------------
/content/term/partial success.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Partial Success"
3 | tags:
4 | - airbyte
5 | ---
6 | A Partial Success indicates that some records were successfully committed to the destination during a sync, even when the overall sync status was reported as a failure.
7 |
--------------------------------------------------------------------------------
/content/term/programming languages.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What are Data Engineering Programming Languages?"
3 | tags:
4 | - coding
5 | ---
6 | 2022 marks JavaScript’s tenth year in a row as the most commonly used programming language, according to the [Stack Overflow Developer Survey 2022](https://survey.stackoverflow.co/2022/#section-most-popular-technologies-programming-scripting-and-markup-languages). Further, they say: "People learning to code are more likely than Professional Developers to report using Python (58% vs 44%), C++ (35% vs 20%), and C (32% vs 17%)."
7 |
8 | Programming Languages (so far):
9 | - [SQL](term/sql.md)
10 | - [Python](term/python.md)
11 |
12 | See also [Functional Programming](term/functional%20programming.md) or [Functional Data Engineering](term/functional%20data%20engineering.md).
--------------------------------------------------------------------------------
/content/term/push-down.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Push-Down?"
3 | tags:
4 | - data engineering
5 | aliases:
6 | - Query Pushdown
7 | ---
8 | Query pushdown aims to execute as much work as possible in the source databases.
9 |
10 | Push-downs or query pushdowns push transformation logic to the source database. This reduces the need to store data physically and to transfer it over the network.
11 |
12 | For example, a [semantic layer](term/semantic%20layer.md) or [data virtualization](term/data%20virtualization.md) translates the transformation logic into [SQL](term/sql.md) queries and sends the SQL queries to the database. The source database runs the SQL queries to process the transformations.
13 |
14 | Pushdown optimization increases mapping performance when the source database can process transformation logic faster than the semantic layer itself.
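
As a rough illustration (not how any specific semantic layer or virtualization engine is implemented), a pushdown can be thought of as translating a metric request into a single SQL statement that the source database executes; the request shape and the `to_pushdown_sql` helper below are hypothetical:

```python
# Hypothetical metric request coming from a BI tool or data app.
request = {
    "measure": "SUM(amount)",
    "table": "orders",
    "dimensions": ["region", "product"],
    "filters": ["order_date >= '2023-01-01'"],
}

def to_pushdown_sql(req: dict) -> str:
    """Render the request as SQL so the source database does the heavy lifting."""
    dims = ", ".join(req["dimensions"])
    where = " AND ".join(req["filters"]) or "1 = 1"
    return (
        f"SELECT {dims}, {req['measure']} AS value "
        f"FROM {req['table']} WHERE {where} GROUP BY {dims}"
    )

# The generated statement is sent to the source database, which returns only
# the small aggregated result instead of shipping the raw rows over the network.
print(to_pushdown_sql(request))
```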
15 |
--------------------------------------------------------------------------------
/content/term/python.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Python?"
3 | tags:
4 | - data engineering
5 | - coding
6 | ---
7 | Python is a high-level, general-purpose programming language. Its design philosophy emphasizes code readability with the use of significant indentation. Python is dynamically-typed and garbage-collected. It supports multiple programming paradigms, including structured, object-oriented, and [Functional Programming](term/functional%20programming.md).
8 |
9 | Python is the de facto standard for [Data Engineering](term/data%20engineering.md) next to [SQL](term/sql.md). If you want to learn Python, see the Freecodecamp Python Course in under 300 hours:
10 | {{< youtube vMl4YUch7x4 >}}
11 |
--------------------------------------------------------------------------------
/content/term/raw tables.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Raw Tables"
3 | tags:
4 | - airbyte
5 | ---
6 | Airbyte spits out tables with the prefix `_airbyte_raw_`. This is your replicated data, but the prefix indicates that it's not normalized. If you select basic [normalization](term/normalization.md), Airbyte will create renamed versions without the prefix.
--------------------------------------------------------------------------------
/content/term/reverse etl.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Reverse ETL?"
3 | tags:
4 | - data engineering
5 | aliases:
6 | - Data Activation
7 | - Operational Analytics
8 | ---
9 | Reverse ETL is the flip side of the [ETL](term/etl.md)/[ELT](term/elt.md). **With Reverse ETL, the data warehouse becomes the source rather than the destination**. Data is taken from the warehouse, transformed to match the destination's data formatting requirements, and loaded into an application – for example, a CRM like Salesforce – to enable action.
10 |
11 | In a way, the Reverse ETL concept is not new to data engineers, who have been enabling data movement warehouses to business applications for a long time. As [Maxime Beauchemin](term/maxime%20beauchemin.md) mentions in [his article](https://preset.io/blog/reshaping-data-engineering/), Reverse ETL “appears to be a modern new means of addressing a subset of what was formerly known as [Master Data Management (MDM)](term/master%20data%20management%20(mdm).md).”
12 |
13 | Read more about it in [Reverse ETL Explained](https://airbyte.com/blog/reverse-etl#so-what-is-a-reverse-etl).
--------------------------------------------------------------------------------
/content/term/rollup.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Rollup?"
3 | tags:
4 | - data engineering
5 | ---
6 | Rollup is a form of summarization or pre-aggregation. Rolling up data can dramatically reduce the size of the data to be stored, cutting row counts by potentially orders of magnitude.
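
A toy sketch of a rollup with pandas (the column names are made up): event-level rows are pre-aggregated to one row per day and page, and only the aggregate is stored.

```python
import pandas as pd

# Event-level data: one row per page view.
events = pd.DataFrame(
    {
        "ts": pd.to_datetime(["2023-05-01 10:02", "2023-05-01 10:07", "2023-05-02 09:15"]),
        "page": ["/home", "/home", "/pricing"],
        "views": [1, 1, 1],
    }
)

# Roll up to one row per day and page; row counts shrink accordingly.
events["day"] = events["ts"].dt.date
rollup = events.groupby(["day", "page"], as_index=False)["views"].sum()
print(rollup)
```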
7 |
8 |
--------------------------------------------------------------------------------
/content/term/rust.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Rust?"
3 | tags:
4 | - data engineering
5 | ---
6 | Former Mozilla employee Graydon Hoare initially created [Rust](https://glossary.airbyte.com/term/rust) as a personal project. The first stable release, Rust 1.0, came out on May 15, 2015. Rust is a [**multi-paradigm programming language**](https://en.wikipedia.org/wiki/Comparison_of_multi-paradigm_programming_languages) that supports imperative procedural, concurrent actor, object-oriented, and pure [functional](https://glossary.airbyte.com/term/functional-programming) styles, and it supports generic programming and metaprogramming, both statically and dynamically.
7 |
8 | > The goal of Rust is to be a good programming language for creating highly **concurrent, safe, and performant systems**. [Learning Rust](https://learning-rust.github.io/docs/a1.why_rust.html)
9 |
10 | Find more comparisons to [python](term/python.md) and how Rust will take over [data engineering](term/data%20engineering.md) on [Will Rust Take over Data Engineering?](https://airbyte.com/blog/rust-for-data-engineering).
11 |
12 |
--------------------------------------------------------------------------------
/content/term/schema evolution.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Schema Evolution?"
3 | tags:
4 | - data engineering
5 | ---
6 | Automatic schema evolution is a crucial feature of [Data Lake Table Format](term/data%20lake%20table%20format.md)s, as changing formats is still a pain point in a data engineer's daily work. Schema evolution means adding new columns without breaking anything, or even widening column types. You can even rename or reorder columns, although that might break backward compatibility. Still, we can change one table, and the table format takes care of applying the change across all distributed files. Best of all, this does not require a rewrite of your table and underlying files.
7 |
8 | See also [ACID Transactions](term/acid%20transactions.md).
--------------------------------------------------------------------------------
/content/term/semantic layer.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Semantic Layer"
3 | tags:
4 | - data engineering
5 | aliases:
6 | - Headless BI
7 | ---
8 |
9 | > A semantic layer (sometimes also called Headless BI) calculates complex business [metrics](term/metric.md) at *query time*. It sits between your data sources/transformation layer and your analytics tools. You define a metric's aggregations (daily, weekly, monthly, and quarterly) and dimensions (region, customer, product). Examples of metrics could be "monthly active users", "weekly revenue", "number of paying customers", and so on.
10 |
11 | You can think of a semantic layer as a translation layer between any data presentation layer ([business intelligence](term/business%20intelligence.md), [notebooks](term/notebooks.md), data apps) and the data sources. A translation layer includes many features, such as integrating data sources, modeling the metrics, and integrating with the data consumers by translating metrics into [SQL](term/sql.md), REST, or GraphQL.
12 |
13 | Because everyone has different definitions of “active” users or “paying” customers, the semantic layer lets you resolve these discrepancies by defining each metric once, company-wide. Without it, you end up with several different versions: each presentation tool, e.g. the BI tool, would show a different number than your Jupyter notebook or data app. And if a metric's definition changes, with a semantic layer you change it in only one place. This powerful feature gives domain experts and data practitioners a common understanding of business metrics.
14 |
15 | A sub-layer of the semantic layer is the [Metrics Layer](term/metrics%20layer.md).
16 |
17 | Read more on [The Rise of the Semantic Layer](https://airbyte.com/blog/the-rise-of-the-semantic-layer-metrics-on-the-fly) or other fascinating reads on the topic:
18 | - [Down the Semantic Rabbit Hole](https://jpmonteiro.substack.com/p/down-the-semantic-rabbit-hole)
19 | - [The Missing Piece of the Modern Data Stack](https://benn.substack.com/p/metrics-layer)
20 | - [Deep Dive: What The Heck Is the Metrics Layer](https://pedram.substack.com/p/what-is-the-metrics-layer)
21 | - Follow-up: [Deep dive: What the heck is the Semantic Layer](https://cube.dev/blog/what-the-heck-is-the-headless-bi)
22 | - [The Great Data Debate by Atlan](https://atlan.com/great-data-debate/)
23 | - [The Metrics Layer has Growing up to do](https://prakasha.substack.com/p/the-metrics-layer-has-growing-up)
24 | - [The Universal Semantic Layer, More Important Than Ever](https://www.atscale.com/blog/what-is-a-universal-semantic-layer-why-would-you-want-one/)
25 | - [Demystifying the Metrics Store and Semantic Layer](https://thenewstack.io/demystifying-the-metrics-store-and-semantic-layer/)
26 | - Semantic Superiority series: [Part 1](https://davidsj.substack.com/p/semantic-superiority-part-1), [Part 2](https://davidsj.substack.com/p/semantic-superiority-part-2), [Part 3](https://davidsj.substack.com/p/semantic-superiority-part-3), [Part 4](https://davidsj.substack.com/p/semantic-superiority-part-4), and [Part 5](https://davidsj.substack.com/p/semantic-superiority-part-5)
27 |
--------------------------------------------------------------------------------
/content/term/semantic warehouse.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Semantic Warehouse"
3 | tags:
4 | - data engineering
5 | ---
6 | A semantic warehouse incorporates best practices espoused by [Bill Inmon](Bill%20Inmon) for robust, scalable warehouse design built for the cloud, as an abstraction of the [[Modern Data Stack]] with [[term/data modeling]] at its core.
7 |
8 | 
9 | Illustrating the Semantic Warehouse from [Chad Sanderson on LinkedIn](https://www.linkedin.com/posts/chad-sanderson_im-very-happy-to-unveil-the-semantic-warehouse-activity-6958091220157964288-JSXj/)
10 |
11 | Chad Sanderson first introduced the term in this [LinkedIn post](https://www.linkedin.com/posts/chad-sanderson_im-very-happy-to-unveil-the-semantic-warehouse-activity-6958091220157964288-JSXj/). Some defining features:
12 | - Data as a product and capturing the natural world through events instead of batch processing, with a clearly defined schema.
13 | - [Data Contract](term/data%20contract.md) as a foundation to introduce contracts to its underlying source tables.
14 | - Collaborative, peer-reviewed data modeling.
15 | - Centralized metrics with a [Metrics/Logical Layer](term/metrics%20layer.md) allow collaborative data modeling between the business and the (data) engineers and abstract away the complexity of the data stack.
16 | - Built-in incentives with semantics and modeling are required to generate good [Data Products](term/data%20product.md).
17 |
18 | The semantic warehouse tries to solve the following problems:
19 | 1. The [Modern Data Stack](term/modern%20data%20stack.md) (MDS) is a good set of tools for building things, but they do not help ensure that what is being built is high quality.
20 | 2. Most data architectures and data foundations are not scalable. The first version of data infrastructure (typically set up by engineers or junior data devs) is never refactored because it is tough to do so
21 | 3. Producers do not (but should) own data quality. Data Engineers should not be middle-men caught in the cross-fire of consumers
22 | 4. Semantics and context are missing. Data devs spend days to weeks just trying to understand what data we have, what it means, how it maps to services, and whether data can be trusted
23 | 5. Data modeling was not a first-class citizen. Modeling was challenging to do (because of #4) and, in some cases, impossible, thanks to data simply being missing.
24 | 6. Our [Data Warehouse](term/data%20warehouse.md) did not reflect the real world. Instead, it was a dumping ground for production services and 3rd party APIs.
25 | 7. A lack of interoperability due to tools not 'speaking the same language.' We have multiple products which each require their distinct modeling environment and no shared understanding of business concepts
26 | 8. [Data Governance](term/data%20governance.md) is critical, but businesses will reject it if it becomes a bottleneck. We cannot scale our data team horizontally with the complexity
27 |
28 | See also [[Metrics Layer|Semantic Layer]] and [[Data Contract]].
--------------------------------------------------------------------------------
/content/term/semi-structured data.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is semi-structured data?"
3 | tags:
4 | - data engineering
5 | - concept
6 | ---
7 |
8 | Semi-structured data is data that lacks a rigid structure and that does not conform directly to a data model, but that has tags, metadata, or elements that describe the data. Examples of semi-structured data are JSON or XML files. Semi-structured data often contains enough information that it can be relatively easily converted into structured data.
9 |
10 | JSON data embedded inside of a string is an example of semi-structured data. The string contains all the information required to understand the structure of the data, but for the moment it is still just a string -- it hasn't been structured yet. The raw JSON stored by Airbyte during ELT is an example of semi-structured data, and looks as follows:
11 |
12 | | | **\_airbyte_data**|
13 | |---------| -----------|
14 | |Record 1| \"{'id': 1, 'name': 'Mary X'}\" |
15 | |Record 2| \"{'id': 2, 'name': 'John D'}\"|
16 |
17 | ## Semi-structured vs structured data
18 | In contrast to semi-structured data, [structured data](term/structured%20data.md) refers to data that has been formatted into a well-defined schema. An example would be data that is stored with precisely defined columns in a relational database or Excel spreadsheet. Examples of structured fields could be age, name, phone number, credit card number, or address.
19 |
20 | ## Structuring of semi-structured data
21 |
22 | It is often relatively straightforward to convert semi-structured data into structured data. Converting semi-structured data into structured data is often done during the [data transformation](term/data%20transformation.md) stage in an [ETL](term/etl.md) or [ELT](term/elt.md) process.
23 |
24 | For example, if normalization is enabled then Airbyte will automatically convert the JSON stored in the `_airbyte_data` field in the table above, into a table that looks as follows:
25 |
26 | | | **id** | **name** |
27 | |---------| -----------|---- |
28 | |Record 1| 1 | "Mary X" |
29 | |Record 2|2| "John D" |
30 |
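As a rough sketch of that conversion outside of Airbyte, the same un-nesting can be done in plain Python by parsing each JSON string and expanding its keys into columns:

```python
import json
import pandas as pd

# Semi-structured records as they might sit in a raw table.
raw = pd.DataFrame(
    {"_airbyte_data": ['{"id": 1, "name": "Mary X"}', '{"id": 2, "name": "John D"}']}
)

# Parse each JSON string and expand the keys into structured columns.
structured = pd.json_normalize(raw["_airbyte_data"].map(json.loads).tolist())
print(structured)
#    id    name
# 0   1  Mary X
# 1   2  John D
```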
31 | ## A real-world example of converting semi-structured to structured data
32 |
33 | If the semi-structured JSON data were stored in Postgres, then it could be converted into structured data by making use of [JSON Functions and Operators](https://www.postgresql.org/docs/9.4/functions-json.html). A real-world implementation of this is discussed in the tutorial [Explore Airbyte's full refresh data synchronization](https://airbyte.com/tutorials/full-data-synchronization).
--------------------------------------------------------------------------------
/content/term/slowly changing dimension scd.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Slowly Changing Dimension?"
3 | tags:
4 | - data engineering
5 | ---
6 | A Slowly Changing Dimension (SCD) is **a dimension that stores and manages both current and historical data over time in a [Data Warehouse](term/data%20warehouse.md)**. It is considered and implemented as one of the most critical ETL tasks in tracking the history of dimension records.
--------------------------------------------------------------------------------
/content/term/soft delete.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Soft Delete?"
3 | tags:
4 | - airbyte
5 | ---
6 | In order to propagate records that have been deleted when using [Incremental Synchronization](term/incremental%20synchronization.md) modes, records in a database may include a field that indicates that a record should be treated as if it has been removed. This is necessary because incremental synchronization does not replicate documents that are fully deleted from a source system.
7 |
8 | For example, a boolean flag such as `is_deleted` could be used to indicate that a record should be treated as if it has been deleted. All queries would need to be written so as to exclude records/documents where `is_deleted` is set, and periodically executed background jobs can be used to remove all documents where `is_deleted` is set.
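
A minimal illustration of the pattern in Python (the record layout is made up): a delete is expressed by setting the flag, reads filter on it, and a separate cleanup step can later remove flagged records permanently.

```python
records = [
    {"id": 1, "email": "a@example.com", "is_deleted": False},
    {"id": 2, "email": "b@example.com", "is_deleted": False},
]

# Instead of physically deleting record 2, flag it so the change can still be
# picked up by an incremental sync.
records[1]["is_deleted"] = True

# Every read path has to exclude soft-deleted records.
active = [r for r in records if not r["is_deleted"]]
print(active)  # only record 1 remains visible

# A periodically executed background job would later purge flagged records for good.
```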
9 |
10 |
11 |
--------------------------------------------------------------------------------
/content/term/software-defined assets.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Software-Defined Asset?"
3 | tags:
4 | - data engineering
5 | ---
6 | The software-defined asset was first [introduced](https://dagster.io/blog/software-defined-assets) by [Dagster](term/Dagster.md) with the following definition:
7 | > A new, [declarative](term/declarative.md) approach to managing data and orchestrating its maintenance.
8 | > Declarative data management starts with using code to define the data assets that you want to exist. These asset definitions, version-controlled through git and inspectable via tooling, allow anyone in your organization to understand your canonical set of data assets, enable you to reproduce them at any time, and offer a foundation for asset-based orchestration.
9 |
10 | The key to software-defined assets is declaring a data asset/product before runtime. The software-defined asset function in Dagster is like a microservice: the code that defines the asset in a [functional way](term/functional%20data%20engineering.md), so that it can live independently. With the declarative approach, more information is defined as code, which helps the [orchestrator](term/data%20orchestrator.md) figure out the lineage, how to run it, and so on.
11 |
12 | Best of all, you get the actual data lineage of your physical assets, not an arbitrary lineage of tasks (which is interesting for engineers but not for data consumers).
13 |
14 | In the future, more code will be written with software-defined assets, as they reduce the need for boilerplate: being declarative, an asset definition describes what the asset is supposed to do and include, rather than how that is handled and run by Dagster. See more on [Data Orchestration Trends](https://airbyte.com/blog/data-orchestration-trends), where this shift from an [imperative](term/imperative.md) pipeline with ops, jobs, graphs, etc., to Software-defined Assets is explained.
15 |
16 | Much has been announced at the [Dagster Community Day](https://www.youtube.com/live/An78xLxM9zQ?feature=share), where Nick, the founder of Dagster, said: "Think of an iPhone: it feels like one device, but underneath there is a lot of complexity and heterogeneity." The same is true for orchestration, which could be the future way to bundle the [Open Data Stack](term/open%20data%20stack.md) into one coherent data stack. An alternative would be vertical integration with one of the prominent vendors.
17 |
18 |
19 |
--------------------------------------------------------------------------------
/content/term/sql.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is SQL?"
3 | tags:
4 | - data engineering
5 | - coding
6 | ---
7 | SQL is **a standardized language used to interact with relational [[databases]]**. It stands for structured query language (SQL) and defines a standard [programming language](term/programming%20languages.md) utilized to extract, organize, manage, and manipulate data stored in relational databases.
8 |
9 | Here are different levels you can go into ([Source](https://twitter.com/largedatabank/status/1559651463919452161)):
10 | 
11 |
12 | See more on [SQL-Levels Explained](https://github.com/airbytehq/SQL-Levels-Explained).
--------------------------------------------------------------------------------
/content/term/storage layer object store.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a Storage Layer / Object Store?"
3 | tags:
4 | - data engineering
5 | aliases:
6 | - Object Store
7 | ---
8 | A storage layer or object store is a service offered by the three big cloud providers: AWS S3, Azure Blob Storage, and Google Cloud Storage. The web user interface is easy to use. **Their features are very basic, but these object stores store distributed files exceptionally well.** They are also highly configurable, with solid security and reliability built in.
9 |
10 | You can build on top of them with a [Data Lake File Format](term/data%20lake%20file%20format.md) or a [Data Lake Table Format](term/data%20lake%20table%20format.md). Read more on our [Data Lake and Lakehouse Guide](https://airbyte.com/blog/data-lake-lakehouse-guide-powered-by-table-formats-delta-lake-iceberg-hudi).
--------------------------------------------------------------------------------
/content/term/sync run.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is a sync run?"
3 | tags:
4 | - airbyte
5 | ---
6 | Airbyte replication can be thought of as a loop which periodically requests records from a data source and sends them to a destination. Each iteration of this loop is referred to as a sync run, which is discussed in more detail in [How we scale workflow orchestration with Temporal](https://airbyte.com/blog/scale-workflow-orchestration-with-temporal#triggering-a-sync-run).
--------------------------------------------------------------------------------
/content/term/temporal.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Temporal"
3 | tags:
4 | - data engineering
5 | ---
6 | > [!info] Info
7 | >
8 | > This is only relevant for individuals who want to learn about or contribute to our underlying platform.
9 |
10 | [Temporal](https://temporal.io/) is a development kit that lets you create workflows, parallelize them, and handle failures/retries gracefully. We use it to reliably schedule each step of the ELT process, and a Temporal service is always deployed with each Airbyte installation.
11 |
12 | Read more on [How we Scale Workflow Orchestration with Temporal](https://airbyte.com/blog/scale-workflow-orchestration-with-temporal) at Airbyte.
--------------------------------------------------------------------------------
/content/term/time travel.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is Time Travel?"
3 | tags:
4 | - data engineering
5 | ---
6 | With time travel, the [Data Lake Table Format](term/data%20lake%20table%20format.md) versions the big data you store in your [Data Lake](term/data%20lake.md). You can access any historical version of that data, which simplifies data management: it is easy to audit, to roll back data in case of accidental bad writes or deletes, and to reproduce experiments and reports. Time travel also enables reproducible queries, as you can query two different versions simultaneously.
7 |
8 | Read more about how to build a Data Lake on top of it on our [Data Lake and Lakehouse Guide](https://airbyte.com/blog/data-lake-lakehouse-guide-powered-by-table-formats-delta-lake-iceberg-hudi).
--------------------------------------------------------------------------------
/content/term/unstructured data.md:
--------------------------------------------------------------------------------
1 |
2 | ---
3 | title: "What is unstructured data?"
4 | tags:
5 | - data engineering
6 | - concepts
7 | ---
8 |
9 | Unstructured data is data that does not conform to a data model and has no easily identifiable structure. Unstructured data cannot be easily used by programs, and is difficult to analyze. Examples of unstructured data could be the contents of an email, contents of a word document, data from social media, photos, videos, survey results, etc.
10 |
11 | ## An example of unstructured data
12 |
13 | A simple example of unstructured data is a string that contains interesting information inside of it, but that has not been formatted into a well-defined schema. An example is given below:
14 |
15 | | | **UnstructuredString**|
16 | |---------| -----------|
17 | |Record 1| "Bob is 29" |
18 | |Record 2| "Mary just turned 30"|
19 |
20 | ## Unstructured vs structured data
21 |
22 | In contrast with unstructured data, [structured data](term/structured%20data.md) refers to data that has been formatted into a well-defined schema. An example would be data that is stored with precisely defined columns in a relational database or Excel spreadsheet. Examples of structured fields could be age, name, phone number, credit card number, or address. Storing data in a structured format allows it to be easily understood and queried by machines and with tools such as SQL.
23 |
24 | ## Structuring of unstructured data
25 |
26 | Extracting structured data from unstructured data is often done during the [data transformation](term/data%20transformation.md) stage in an [ETL](term/etl.md) or [ELT](term/elt.md) process.
27 |
28 | For example, in order to efficiently make use of the unstructured data given in the previous example, it may be desirable to transform it into structured data such as the following:
29 |
30 | | | **name** | **age** |
31 | |---------| -----------|---- |
32 | |Record 1| "Bob" | 29 |
33 | |Record 2| "Mary"| 30 |
34 |
35 | Storing the data in a structured manner makes it much more efficient to query. For example, after structuring the example data, it is possible to easily and efficiently execute queries by name or by age.
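
As a toy sketch of that structuring step in Python (the regular expression only covers the two example strings; real unstructured data is far messier and usually needs more sophisticated extraction):

```python
import re

unstructured = ["Bob is 29", "Mary just turned 30"]

structured = []
for text in unstructured:
    # Pull out a leading word as the name and the first number as the age.
    match = re.match(r"(?P<name>\w+)\D*(?P<age>\d+)", text)
    if match:
        structured.append({"name": match.group("name"), "age": int(match.group("age"))})

print(structured)
# [{'name': 'Bob', 'age': 29}, {'name': 'Mary', 'age': 30}]
```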
36 |
37 |
38 |
--------------------------------------------------------------------------------
/content/term/yaml.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: "What is YAML?"
3 | tags:
4 | - devops
5 | ---
6 | YAML is a data serialization language often used to write configuration files. Depending on whom you ask, YAML stands for "yet another markup language" or "YAML ain't markup language" (a recursive acronym); the latter emphasizes that YAML is for data, not documents.
--------------------------------------------------------------------------------
/data/config.yaml:
--------------------------------------------------------------------------------
1 | name: Airbyte
2 | enableToc: true
3 | openToc: false
4 | enableLinkPreview: true
5 | enableLatex: true
6 | enableCodeBlockTitle: true
7 | enableCodeBlockCopy: true
8 | enableCallouts: true
9 | enableSPA: true
10 | enableFooter: true
11 | enableContextualBacklinks: true
12 | enableRecentNotes: true
13 | enableGitHubEdit: true
14 | GitHubLink: https://github.com/airbytehq/glossary/tree/hugo/content
15 | enableSemanticSearch: false
16 | operandApiKey: "REPLACE-WITH-YOUR-OPERAND-API-KEY"
17 | description:
18 | A single place for all data knowledge
19 | page_title:
20 | "Data Glossary 🧠"
21 | links:
22 | - link_name: Twitter
23 | link: https://twitter.com/AirbyteHQ
24 | - link_name: Github
25 | link: https://github.com/airbytehq
26 |
--------------------------------------------------------------------------------
/data/graphConfig.yaml:
--------------------------------------------------------------------------------
1 | # if true, a Global Graph will be shown on home page with full width, no backlink.
2 | # A different set of Local Graphs will be shown on sub pages.
3 | # if false, Local Graph will be default on every page as usual
4 | enableGlobalGraph: false
5 |
6 | ### Local Graph ###
7 |
8 | localGraph:
9 | enableLegend: false
10 | enableDrag: true
11 | enableZoom: true
12 | depth: 1 # set to -1 to show full graph
13 | scale: 1.5
14 | repelForce: 8
15 | centerForce: 2
16 | linkDistance: 1
17 | fontSize: 0.6
18 | opacityScale: 3
19 |
20 | ### Global Graph ###
21 |
22 | globalGraph:
23 | enableLegend: false
24 | enableDrag: true
25 | enableZoom: true
26 | depth: -1 # set to -1 to show full graph
27 | scale: 1.4
28 | repelForce: 1
29 | centerForce: 1
30 | linkDistance: 1
31 | fontSize: 0.5
32 | opacityScale: 3
33 |
34 | ### For all graphs ###
35 |
36 | paths:
37 | - /moc: "#4388cc"
38 |
--------------------------------------------------------------------------------
/deployment.md:
--------------------------------------------------------------------------------
1 | # Deployment Details
2 | There are two GitHub Actions. The first, [Deploy to GitHub Pages](https://github.com/airbytehq/glossary/actions/workflows/deploy.yaml), packages the `hugo` branch and runs the deployment (i.e. running GoHugo), essentially rendering the content to a static webpage. It builds and publishes the static website to the [master](https://github.com/airbyteglossary/airbyteglossary.github.io/tree/master) branch on the publishing repo.
3 |
4 | The `master` branch is also what you see on [glossary.airbyte.com](https://glossary.airbyte.com). So whenever you push changes to the `hugo` branch, they will automatically be merged and deployed.
5 |
6 | The `hugo` branch is not protected, and everyone can contribute to it. We may change that later; for now, we want a fast update cycle.
7 |
8 |
--------------------------------------------------------------------------------
/i18n/ar.toml:
--------------------------------------------------------------------------------
1 | [404_message]
2 | other = "يبدو أنك ضللت الطريق. هذه الصفحة غير موجودة (أو قد تكون خاصة)."
3 |
4 | [404_back]
5 | other = "↳ العودة للرئيسية."
6 |
7 | [all_posts]
8 | other = "كل منشورات {{.Title}}"
9 |
10 | [last_updated]
11 | other = "آخر تعديل"
12 |
13 | [notes_count]
14 | other = "ملاحظات بهذه التسمية"
15 |
16 | [first_10]
17 | other = "(تعرض أول 10 نتائج فقط)"
18 |
19 | [tag]
20 | other = "التسمية"
21 |
22 | [backlinks]
23 | other = "الروابط الخلفية"
24 |
25 | [no_backlinks]
26 | other = "لا توجد روابط خلفية"
27 |
28 | [home]
29 | other = "الرئيسية"
30 |
31 | [light_mode]
32 | other = "السمة الفاتحة"
33 |
34 | [dark_mode]
35 | other = "السمة الداكنة"
36 |
37 | [edit_source]
38 | other = "تعديل المصدر"
39 |
40 | [interactive_graph]
41 | other = "المخطط التفاعلي"
42 |
43 | [search]
44 | other = "البحث"
45 |
46 | [search_icon]
47 | other = "أيقونة البحث"
48 |
49 | [icon_search]
50 | other = "أيقونة فتح نافذة البحث"
51 |
52 | [recent_notes]
53 | other = "الملاحظات اﻷخيرة"
54 |
55 | [first_3_notes]
56 | other = "أول 3 {{ .notes }}"
57 |
58 | [search_for_something]
59 | other = "ابحث عن شيء ما..."
60 |
61 | [toc]
62 | other = "الفهرس"
63 |
64 | [copyright]
65 | other = "صُمم بواسطة {{ .name }} باستخدام كوارتز، {{ .year }} ©"
66 |
--------------------------------------------------------------------------------
/i18n/en.toml:
--------------------------------------------------------------------------------
1 | [404_message]
2 | other = "Hey! You look a little lost. This page doesn't exist (or may be private)."
3 |
4 | [404_back]
5 | other = "↳ Let's get you home."
6 |
7 | [all_posts]
8 | other = "All {{.Title}}"
9 |
10 | [last_updated]
11 | other = "Last updated"
12 |
13 | [notes_count]
14 | other = "notes with this tag"
15 |
16 | [first_10]
17 | other = "showing first 10 results"
18 |
19 | [tag]
20 | other = "Tag"
21 |
22 | [backlinks]
23 | other = "Backlinks"
24 |
25 | [no_backlinks]
26 | other = "No backlinks found"
27 |
28 | [home]
29 | other = "Home"
30 |
31 | [all_tags]
32 | other = "All Tags"
33 |
34 | [light_mode]
35 | other = "Light Mode"
36 |
37 | [dark_mode]
38 | other = "Dark Mode"
39 |
40 | [edit_source]
41 | other = "Edit Source"
42 |
43 | [interactive_graph]
44 | other = "Interactive Graph"
45 |
46 | [search]
47 | other = "Search"
48 |
49 | [search_icon]
50 | other = "Search Icon"
51 |
52 | [icon_search]
53 | other = "Icon to open search"
54 |
55 | [recent_notes]
56 | other = "Recent Notes"
57 |
58 | [first_3_notes]
59 | other = "first 3 {{ .notes }}"
60 |
61 | [search_for_something]
62 | other = "Search for something..."
63 |
64 | [toc]
65 | other = "Table of Contents"
66 |
67 | [copyright]
68 | other = "Made by {{ .name }} using Quartz, © {{ .year }}"
69 |
--------------------------------------------------------------------------------
/i18n/es.toml:
--------------------------------------------------------------------------------
1 | [404_message]
2 | other = "Hey! Te ves un poco perdido. Esta página no existe (o puede que sea privada)."
3 |
4 | [404_back]
5 | other = "↳ Vamos a llevarte de regreso a casa."
6 |
7 | [all_posts]
8 | other = "Todos {{.Title}}"
9 |
10 | [last_updated]
11 | other = "Actualizado por última vez"
12 |
13 | [notes_count]
14 | other = "notas con esta etiqueta"
15 |
16 | [first_10]
17 | other = "mostrando los primeros 10 resultados"
18 |
19 | [tag]
20 | other = "Etiqueta"
21 |
22 | [backlinks]
23 | other = "Backlinks"
24 |
25 | [no_backlinks]
26 | other = "No se encontraron backlinks"
27 |
28 | [home]
29 | other = "Casa"
30 |
31 | [light_mode]
32 | other = "Modo Claro"
33 |
34 | [dark_mode]
35 | other = "Modo Oscuro"
36 |
37 | [edit_source]
38 | other = "Editar Fuente"
39 |
40 | [interactive_graph]
41 | other = "Gráfico Interactivo"
42 |
43 | [search]
44 | other = "Búsqueda"
45 |
46 | [search_icon]
47 | other = "Ícono de Búsqueda"
48 |
49 | [icon_search]
50 | other = "Ícono para abrir la búsqueda"
51 |
52 | [recent_notes]
53 | other = "Notas Recientes"
54 |
55 | [first_3_notes]
56 | other = "primeras 3 {{ .notes }}"
57 |
58 | [search_for_something]
59 | other = "Buscar algo..."
60 |
61 | [toc]
62 | other = "Tabla de Contenido"
63 |
64 | [copyright]
65 | other = "Hecho por {{ .name }} usando Quartz, © {{ .year }}"
66 |
--------------------------------------------------------------------------------
/i18n/fr.toml:
--------------------------------------------------------------------------------
1 | [404_message]
2 | other = "Hey ! Vous semblez perdu‧e. Cette page n'existe pas (ou est privée)."
3 |
4 | [404_back]
5 | other = "↳ On va te faire retourner à l'accueil"
6 |
7 | [all_posts]
8 | other = "Tout {{.Title}}"
9 |
10 | [last_updated]
11 | other = "Dernière modification"
12 |
13 | [notes_count]
14 | other = "notes avec ce tag"
15 |
16 | [first_10]
17 | other = "affichant les 10 premiers résultats"
18 |
19 | [tag]
20 | other = "Tag"
21 |
22 | [backlinks]
23 | other = "Backlinks"
24 |
25 | [no_backlinks]
26 | other = "Pas de backlinks trouvés"
27 |
28 | [home]
29 | other = "Accueil"
30 |
31 | [light_mode]
32 | other = "Mode Clair"
33 |
34 | [dark_mode]
35 | other = "Mode Sombre"
36 |
37 | [edit_source]
38 | other = "Editer la source"
39 |
40 | [interactive_graph]
41 | other = "Graphique interactif"
42 |
43 | [search]
44 | other = "Rechercher"
45 |
46 | [search_icon]
47 | other = "Icône de recherche"
48 |
49 | [icon_search]
50 | other = "Icon pour ouvrir la recherche"
51 |
52 | [recent_notes]
53 | other = "Notes récentes"
54 |
55 | [first_3_notes]
56 | other = "3 premières {{ .notes }}"
57 |
58 | [search_for_something]
59 | other = "Rechercher quelque-chose..."
60 |
61 | [toc]
62 | other = "Table des matières"
63 |
64 | [copyright]
65 | other = "Fait par {{ .name }} en utilisant Quartz, © {{ .year }}"
66 |
--------------------------------------------------------------------------------
/i18n/uk.toml:
--------------------------------------------------------------------------------
1 | [404_message]
2 | other = "Хей! Виглядаєте здивовано. Цієї сторінки не існує (або вона приватна)."
3 |
4 | [404_back]
5 | other = "↳ Повернемося додому."
6 |
7 | [all_posts]
8 | other = "Всі {{.Title}}"
9 |
10 | [last_updated]
11 | other = "Оновлено"
12 |
13 | [notes_count]
14 | other = "нонаток з цим тегом"
15 |
16 | [first_10]
17 | other = "показано 10 перших результатів"
18 |
19 | [tag]
20 | other = "Тег"
21 |
22 | [backlinks]
23 | other = "Зворотнє посилання"
24 |
25 | [no_backlinks]
26 | other = "Зворотних посилань не знайдено"
27 |
28 | [home]
29 | other = "Дім"
30 |
31 | [light_mode]
32 | other = "Світлий Режим"
33 |
34 | [dark_mode]
35 | other = "Темний Режим"
36 |
37 | [edit_source]
38 | other = "Редагувати Джерело"
39 |
40 | [interactive_graph]
41 | other = "Інтерактивний граф"
42 |
43 | [search]
44 | other = "Пошук"
45 |
46 | [search_icon]
47 | other = "Іконка Пошуку"
48 |
49 | [icon_search]
50 | other = "Іконка для відкриття пошуку"
51 |
52 | [recent_notes]
53 | other = "Нещодавні Нотатки"
54 |
55 | [first_3_notes]
56 | other = "перші 3 {{ .notes }}"
57 |
58 | [search_for_something]
59 | other = "Знайти щось..."
60 |
61 | [toc]
62 | other = "Зміст"
63 |
64 | [copyright]
65 | other = "Створено {{ .name }} з використанням Quartz, © {{ .year }}"
66 |
--------------------------------------------------------------------------------
/layouts/404.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | {{ partial "head.html" . }}
4 |
5 |
6 |
7 | {{partial "darkmode.html" .}}
8 |
13 |
14 |
15 |
16 |
--------------------------------------------------------------------------------
/layouts/_default/_markup/render-image.html:
--------------------------------------------------------------------------------
1 | {{$src := .Destination | safeURL }}
2 | {{$width := index (split .Text "|") 1 | default "auto" }}
3 | {{$external := strings.HasPrefix $src "http" }}
4 | {{- if $external -}}
5 |
6 | {{- else -}}
7 | {{$fixedUrl := (cond (hasPrefix $src "/") $src (print "/" $src)) | urlize}}
8 |
9 | {{- end -}}
10 |
--------------------------------------------------------------------------------
/layouts/_default/_markup/render-link.html:
--------------------------------------------------------------------------------
1 | {{$trimmed := strings.TrimSuffix ".md" (.Destination | safeURL)}}
2 | {{$dashedurl := replace $trimmed "%20" "-" }}
3 | {{$external := strings.HasPrefix $dashedurl "http" }}
4 | {{- if $external -}}
5 | {{ .Text | safeHTML }}
6 | {{- else -}}
7 | {{$spacedurl := replace $trimmed "%20" " " }}
8 | {{$fixedUrl := (cond (hasPrefix $spacedurl "/") $spacedurl (print "/" $spacedurl)) | urlize}}
9 | {{$nonexistent := eq (.Page.GetPage $spacedurl).RelPermalink ""}}
10 | {{$rooted := default $spacedurl ((.Page.GetPage $spacedurl).RelPermalink) }}
11 | {{- .Text | safeHTML -}}
15 |
16 | {{- end -}}
17 |
--------------------------------------------------------------------------------
/layouts/_default/baseof.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | {{ block "head" . }}
4 | {{ end }}
5 |
6 |
7 | {{ block "main" . }}
8 | {{ end }}
9 |
10 |
--------------------------------------------------------------------------------
/layouts/_default/section.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | {{ partial "head.html" . }}
4 |
5 |
6 | {{partial "search.html" .}}
7 |
8 |
9 |
10 | {{partial "header.html" .}}
11 | {{ i18n "all_posts" . }}
12 | {{with .Params.description}}
13 | {{.}}
14 | {{end}}
15 | {{partial "page-list.html" .Paginator.Pages.ByLastmod.Reverse }}
16 | {{ template "_internal/pagination.html" .}}
17 | {{partial "contact.html" .}}
18 |
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/layouts/_default/single.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | {{ partial "head.html" . }}
4 |
5 |
6 | {{partial "search.html" .}}
7 |
8 |
9 |
10 | {{partial "header.html" .}}
11 | {{if .Title}}{{ .Title }}
{{end}}
12 |
13 | {{ i18n "last_updated" }} {{ partial "date-fmt.html" .}}
14 | {{ partial "github.html" . }}
15 |
16 |
21 | {{partial "toc.html" .}}
22 | {{partial "textprocessing.html" . }}
23 | {{partial "footer.html" .}}
24 |
25 |
26 | {{partial "backlinks.html" .}}
27 |
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/layouts/_default/taxonomy.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | {{ partial "head.html" . }}
4 |
5 |
6 | {{partial "search.html" .}}
7 |
8 |
9 |
10 | {{partial "header.html" .}}
11 | {{ i18n "all_posts" . }}
12 | {{with .Params.description}}
13 | {{.}}
14 | {{end}}
15 |
26 | {{partial "contact.html" .}}
27 |
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/layouts/_default/term.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | {{ partial "head.html" . }}
4 |
5 |
6 | {{partial "search.html" .}}
7 |
8 |
9 |
10 | {{partial "header.html" .}}
11 | {{ i18n "tag" }}: {{ .Title }}
12 | {{with .Params.description}}
13 | {{.}}
14 | {{end}}
15 | {{partial "page-list.html" .Paginator.Pages}}
16 | {{ template "_internal/pagination.html" . }}
17 | {{partial "contact.html" .}}
18 |
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/layouts/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 | {{ partial "head.html" . }}
4 |
5 |
6 | {{partial "search.html" .}}
7 |
8 |
9 |
10 | {{partial "header.html" .}}
11 | {{partial "toc.html" .}}
12 | {{partial "textprocessing.html" . }}
13 | {{if $.Site.Data.config.enableRecentNotes}}
14 | {{partial "recent.html" . }}
15 | {{end}}
16 | {{partial "footerIndex.html" .}}
17 |
18 |
19 |
20 |
21 |
--------------------------------------------------------------------------------
/layouts/partials/backlinks.html:
--------------------------------------------------------------------------------
1 | {{ i18n "backlinks" }}
2 |
3 | {{$url := urls.Parse .Site.BaseURL }}
4 | {{$host := strings.TrimRight "/" $url.Path }}
5 | {{$curPage := strings.TrimPrefix $host (strings.TrimRight "/" .Page.RelPermalink)}}
6 | {{$linkIndex := getJSON "/assets/indices/linkIndex.json"}}
7 | {{$inbound := index $linkIndex.index.backlinks $curPage}}
8 | {{$contentTable := getJSON "/assets/indices/contentIndex.json"}}
9 | {{if $inbound}}
10 | {{$backlinks := dict "SENTINEL" "SENTINEL"}}
11 | {{range $k, $v := $inbound}}
12 | {{$cleanedInbound := replace $v.source " " "-"}}
13 | {{$ctx := $v.text}}
14 | {{$backlinks = merge $backlinks (dict $cleanedInbound $ctx)}}
15 | {{end}}
16 | {{- range $lnk, $ctx := $backlinks -}}
17 | {{$l := printf "%s%s/" $host $lnk}}
18 | {{$l = cond (eq $l "//") "/" $l}}
19 | {{with (index $contentTable $lnk)}}
20 | -
21 | {{index (index . "title")}}
22 |
23 | {{end}}
24 | {{- end -}}
25 | {{else}}
26 | -
27 | {{ i18n "no_backlinks" }}
28 |
29 | {{end}}
30 |
31 |
--------------------------------------------------------------------------------
/layouts/partials/contact.html:
--------------------------------------------------------------------------------
1 |
2 | {{ $config := cond (eq $.Site.Language.Lang "en") "config" (printf "config.%s" $.Site.Language.Lang) }}
3 | {{ $data := index $.Site.Data $config }}
4 |
5 |
21 |
--------------------------------------------------------------------------------
/layouts/partials/darkmode.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
9 |
15 |
--------------------------------------------------------------------------------
/layouts/partials/date-fmt.html:
--------------------------------------------------------------------------------
1 | {{if .Date}}
2 | {{.Date.Format "Jan 2, 2006"}}
3 | {{else if .Lastmod}}
4 | {{.Lastmod.Format "Jan 2, 2006"}}
5 | {{else}}
6 | Unknown
7 | {{end}}
8 |
--------------------------------------------------------------------------------
/layouts/partials/footer.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | {{if $.Site.Data.config.enableFooter}}
6 |
11 | {{end}}
12 |
13 | {{partial "contact.html" .}}
14 |
--------------------------------------------------------------------------------
/layouts/partials/footerIndex.html:
--------------------------------------------------------------------------------
1 | {{if $.Site.Data.config.enableFooter}}
2 | {{if $.Site.Data.graphConfig.enableGlobalGraph}}
3 |
10 | {{else}}
11 |
12 |
18 | {{end}}
19 | {{end}}
20 |
21 | {{partial "contact.html" .}}
22 |
--------------------------------------------------------------------------------
/layouts/partials/github.html:
--------------------------------------------------------------------------------
1 | {{if $.Site.Data.config.enableGitHubEdit}}
2 |
3 | - Edit Source
4 | {{end}}
5 |
--------------------------------------------------------------------------------
/layouts/partials/graph.html:
--------------------------------------------------------------------------------
1 |
6 | {{ i18n "interactive_graph" }}
7 |
8 |
17 | {{ $js := resources.Get "js/graph.js" | resources.Fingerprint "md5" }}
18 |
19 |
--------------------------------------------------------------------------------
/layouts/partials/header.html:
--------------------------------------------------------------------------------
1 |
2 | {{ $config := cond (eq $.Site.Language.Lang "en") "config" (printf "config.%s" $.Site.Language.Lang) }}
3 |
4 |
5 |
6 |
{{ i18n "search" }}
7 |
8 |
9 | {{partial "tags.html" .}}
10 | {{partial "darkmode.html" .}}
11 |
12 |
--------------------------------------------------------------------------------
/layouts/partials/katex.html:
--------------------------------------------------------------------------------
1 | {{if $.Site.Data.config.enableLatex}}
2 |
3 |
4 |
5 |
6 | {{end}}
7 |
--------------------------------------------------------------------------------
/layouts/partials/page-list.html:
--------------------------------------------------------------------------------
1 |
2 | {{- range . -}}
3 | -
4 |
5 |
6 | {{partial "date-fmt.html" .}}
7 |
8 |
11 |
12 |
17 |
18 |
19 | {{- end -}}
20 |
21 |
--------------------------------------------------------------------------------
/layouts/partials/recent.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
Recent Updates
4 |
10 | {{$notes := .Site.RegularPages.ByLastmod.Reverse}}
11 | {{partial "page-list.html" (first 6 $notes)}}
12 |
13 |
14 |
--------------------------------------------------------------------------------
/layouts/partials/search.html:
--------------------------------------------------------------------------------
1 |
9 | {{if $.Site.Data.config.enableSemanticSearch}}
10 | {{ $js := resources.Get "js/semantic-search.js" | resources.ExecuteAsTemplate "js/semantic-search.js" . | resources.Fingerprint "md5" | resources.Minify }}
11 |
12 | {{else}}
13 |
15 | {{ $js := resources.Get "js/full-text-search.js" | resources.Fingerprint "md5" | resources.Minify }}
16 |
17 | {{end}}
18 |
19 |
--------------------------------------------------------------------------------
/layouts/partials/tags.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
13 |
21 |
22 |
23 |
--------------------------------------------------------------------------------
/layouts/partials/toc.html:
--------------------------------------------------------------------------------
1 | {{ $hasHeaders := gt (len (findRE "<h[1-6].*?>(.|\n)*?</h[1-6]>" .Content)) 0 }}
2 | {{ if (and $.Site.Data.config.enableToc (ne .Params.enableToc false) $hasHeaders) }}
3 | {{ if not .IsHome }}
4 |
10 | {{end}}{{end}}
11 |
--------------------------------------------------------------------------------
/lower_case.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | content_path = "content"
5 |
6 |
7 | def convert_glossary_terms_lower_case(file_path: str):
8 | # recursively convert all filenames to lower case
9 | for root, dirs, files in os.walk(file_path):
10 | for filename in files:
11 | os.rename(
12 | os.path.join(root, filename), os.path.join(root, filename.lower())
13 | )
14 |
15 |
16 | if __name__ == "__main__":
17 | convert_glossary_terms_lower_case(content_path)
18 |
--------------------------------------------------------------------------------
/lower_link_index.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 |
4 | # accept first argument
5 | link_index_path = sys.argv[1] # "public/indices/"
6 |
7 |
8 | def read_file_name(file_path: str):
9 | prefixed = [
10 | filename for filename in os.listdir(file_path) if filename.startswith("link")
11 | ]
12 | return os.path.join(file_path, prefixed[0])
13 |
14 |
15 | def convert_to_lower_case(file_path: str):
16 | """converting linkIndex.json file that is used for creating the paths. As we lowe lower case all glossary terms, we need to update these here as well"""
17 | with open(file_path, "r") as f:
18 | content = f.read()
19 | content = content.lower()
20 | with open(file_path, "w") as f:
21 | f.write(content)
22 |
23 |
24 | if __name__ == "__main__":
25 | convert_to_lower_case(read_file_name(link_index_path))
26 |
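For context, the two helper scripts are meant to work together: lower_case.py renames every note under content/ to lower case, and lower_link_index.py then lower-cases the paths recorded in the generated linkIndex.json so backlinks keep resolving. A minimal driver sketch (hypothetical glue code, not part of the repo; it assumes Hugo has already written the index into public/indices/):

    from lower_case import convert_glossary_terms_lower_case
    from lower_link_index import convert_to_lower_case, read_file_name

    convert_glossary_terms_lower_case("content")              # rename note files to lower case
    convert_to_lower_case(read_file_name("public/indices/"))  # rewrite paths inside linkIndex.json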
--------------------------------------------------------------------------------
/static/glossary-feature.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/static/glossary-feature.jpeg
--------------------------------------------------------------------------------
/static/icon.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/airbytehq/glossary/90017025f55eb565184c51e6b110bf8da9704a9e/static/icon.png
--------------------------------------------------------------------------------
/utils/requirements.txt:
--------------------------------------------------------------------------------
1 | pandoc
2 |
--------------------------------------------------------------------------------