├── argo ├── input-files-example │ ├── tap-covid-19 │ │ └── state.json │ ├── target-csv │ │ └── config.json │ ├── mc-cp-files.sh │ └── tap-covid-19-workflow-with-files.yml ├── argo-artifact-patch.yml ├── template-example │ ├── invoke-template.yml │ ├── workflow-template.yml │ └── exit-handler-template.yml ├── tap-exchange-rate-workflow.yml ├── cronworkflow-example │ └── tap-to-target-cronworkflow.yml └── tap-covid-19-example │ └── tap-covid-19-workflow.yml ├── assets ├── argo-ui.png ├── argo-dag-example.png ├── argo-tap-workflow.png └── kubernetes-resources.png ├── singer-containers ├── target-csv │ ├── Makefile │ ├── Dockerfile │ └── entrypoint.py ├── tap-covid-19 │ ├── Makefile │ ├── entrypoint.py │ └── Dockerfile └── tap-exchange-rates │ ├── Makefile │ ├── entrypoint.py │ └── Dockerfile ├── .gitignore ├── LICENSE └── README.md /argo/input-files-example/tap-covid-19/state.json: -------------------------------------------------------------------------------- 1 | {"start_date": "2020-09-10"} -------------------------------------------------------------------------------- /assets/argo-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stkbailey/data-replication-on-kubernetes/HEAD/assets/argo-ui.png -------------------------------------------------------------------------------- /argo/input-files-example/target-csv/config.json: -------------------------------------------------------------------------------- 1 | {"delimiter": "\t", "quotechar": "'", "destination_path": "/tmp/data/"} -------------------------------------------------------------------------------- /assets/argo-dag-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stkbailey/data-replication-on-kubernetes/HEAD/assets/argo-dag-example.png -------------------------------------------------------------------------------- /assets/argo-tap-workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stkbailey/data-replication-on-kubernetes/HEAD/assets/argo-tap-workflow.png -------------------------------------------------------------------------------- /assets/kubernetes-resources.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stkbailey/data-replication-on-kubernetes/HEAD/assets/kubernetes-resources.png -------------------------------------------------------------------------------- /singer-containers/target-csv/Makefile: -------------------------------------------------------------------------------- 1 | ACCOUNT := stkbailey 2 | REPO := target-csv 3 | 4 | docker-publish: 5 | docker login 6 | docker build --tag ${ACCOUNT}/${REPO}:latest . 7 | docker push ${ACCOUNT}/${REPO}:latest 8 | -------------------------------------------------------------------------------- /singer-containers/tap-covid-19/Makefile: -------------------------------------------------------------------------------- 1 | ACCOUNT := stkbailey 2 | REPO := tap-covid-19 3 | 4 | docker-publish: 5 | docker login 6 | docker build --tag ${ACCOUNT}/${REPO}:latest . 
7 | docker push ${ACCOUNT}/${REPO}:latest 8 | -------------------------------------------------------------------------------- /singer-containers/tap-exchange-rates/Makefile: -------------------------------------------------------------------------------- 1 | ACCOUNT := stkbailey 2 | REPO := tap-exchange-rates 3 | 4 | docker-publish: 5 | docker login 6 | docker build --tag ${ACCOUNT}/${REPO}:latest . 7 | docker push ${ACCOUNT}/${REPO}:latest 8 | -------------------------------------------------------------------------------- /argo/argo-artifact-patch.yml: -------------------------------------------------------------------------------- 1 | data: 2 | artifactRepository: | 3 | archiveLogs: true 4 | s3: 5 | bucket: artifacts 6 | endpoint: argo-artifacts:9000 7 | insecure: true 8 | accessKeySecret: 9 | name: argo-artifacts 10 | key: accesskey 11 | secretKeySecret: 12 | name: argo-artifacts 13 | key: secretkey -------------------------------------------------------------------------------- /singer-containers/tap-covid-19/entrypoint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from singer_container_utils import TapRunner 3 | 4 | tap_configs = dict( 5 | execute_command="tap-covid-19", 6 | required_config_keys=["api_token", "start_date"], 7 | path_to_config="/tmp/config.json", 8 | path_to_catalog= "/tmp/catalog.json", 9 | path_to_state = "/tmp/state.json", 10 | path_to_output="/tmp/tap_output.txt", 11 | discover_catalog=True, 12 | ) 13 | 14 | if __name__ == "__main__": 15 | tap = TapRunner(**tap_configs) 16 | tap.run() 17 | -------------------------------------------------------------------------------- /singer-containers/tap-exchange-rates/entrypoint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from singer_container_utils import TapRunner 3 | 4 | 5 | tap_configs = dict( 6 | execute_command="tap-exchangeratesapi", 7 | required_config_keys=["start_date"], 8 | path_to_config="/tmp/config.json", 9 | path_to_catalog= "/tmp/catalog.json", 10 | path_to_state = "/tmp/state.json", 11 | path_to_output="/tmp/tap_output.txt", 12 | discover_catalog=False, 13 | ) 14 | 15 | if __name__ == "__main__": 16 | tap = TapRunner(**tap_configs) 17 | tap.run() 18 | -------------------------------------------------------------------------------- /singer-containers/target-csv/Dockerfile: -------------------------------------------------------------------------------- 1 | # Set up the container dependencies 2 | FROM python:3.8-slim 3 | RUN apt-get update && apt-get install -y gcc git libpq-dev libssl-dev 4 | 5 | # Install a singer-container helper library 6 | RUN pip install https://github.com/immuta/singer-container-utils/archive/master.zip 7 | 8 | # Install the tap 9 | RUN pip install target-csv 10 | 11 | # Set /opt/code as the default directory and copy entrypoint script 12 | RUN mkdir -p /opt/code 13 | WORKDIR /opt/code 14 | COPY entrypoint.py . 
15 | 
16 | # Run the entrypoint file on container start
17 | ENTRYPOINT [ "python", "./entrypoint.py" ]
18 | 
--------------------------------------------------------------------------------
/singer-containers/tap-covid-19/Dockerfile:
--------------------------------------------------------------------------------
1 | # Set up the container dependencies
2 | FROM python:3.8-slim
3 | RUN apt-get update && apt-get install -y gcc git libpq-dev libssl-dev
4 | 
5 | # Install a singer-container helper library
6 | RUN pip install https://github.com/immuta/singer-container-utils/archive/master.zip
7 | 
8 | # Install the tap
9 | RUN pip install tap-covid-19
10 | 
11 | # Set /opt/code as the default directory and copy entrypoint script
12 | RUN mkdir -p /opt/code
13 | WORKDIR /opt/code
14 | COPY entrypoint.py .
15 | 
16 | # Run the entrypoint file on container start
17 | ENTRYPOINT [ "python", "./entrypoint.py" ]
18 | 
--------------------------------------------------------------------------------
/singer-containers/tap-exchange-rates/Dockerfile:
--------------------------------------------------------------------------------
1 | # Set up the container dependencies
2 | FROM python:3.8-slim
3 | RUN apt-get update && apt-get install -y gcc git libpq-dev libssl-dev
4 | 
5 | # Install a singer-container helper library
6 | RUN pip install https://github.com/immuta/singer-container-utils/archive/master.zip
7 | 
8 | # Install the tap
9 | RUN pip install tap-exchangeratesapi
10 | 
11 | # Set /opt/code as the default directory and copy entrypoint script
12 | RUN mkdir -p /opt/code
13 | WORKDIR /opt/code
14 | COPY entrypoint.py .
15 | 
16 | # Run the entrypoint file on container start
17 | ENTRYPOINT [ "python", "./entrypoint.py" ]
18 | 
--------------------------------------------------------------------------------
/argo/input-files-example/mc-cp-files.sh:
--------------------------------------------------------------------------------
1 | # Make sure the local MinIO service is mapped for the MinIO CLI
2 | mc config host add argo-artifacts-local http://localhost:9000 YOURACCESSKEY YOURSECRETKEY
3 | 
4 | # If the `singer` bucket is not created, create it
5 | mc mb argo-artifacts-local/singer
6 | 
7 | # Copy the whole "tap-covid-19" config directory to the bucket
8 | mc cp --recursive \
9 |     argo/input-files-example/tap-covid-19 \
10 |     argo-artifacts-local/singer/config/
11 | 
12 | # Copy the whole "target-csv" config directory to the bucket
13 | mc cp --recursive \
14 |     argo/input-files-example/target-csv \
15 |     argo-artifacts-local/singer/config/
--------------------------------------------------------------------------------
/singer-containers/target-csv/entrypoint.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import pathlib
3 | import zipfile
4 | 
5 | from singer_container_utils import TargetRunner
6 | 
7 | 
8 | # target-csv does not connect to a database, but rather outputs a set of files
9 | # that need to be zipped up and mapped to an artifact
10 | out = pathlib.Path("/tmp/data")
11 | out.mkdir(parents=True, exist_ok=True)
12 | 
13 | # Run the target
14 | if __name__ == "__main__":
15 |     target = TargetRunner(
16 |         execute_command="target-csv",
17 |         required_config_keys=["destination_path"],
18 |         path_to_config="/tmp/config.json",
19 |         path_to_input="/tmp/tap_input.txt",
20 |         path_to_output="/tmp/target_output.txt",
21 |     )
22 |     target.run()
23 | 
24 |     # Zip up the CSV outputs into a single archive for the output artifact,
25 |     # storing each file at the root of the zip
26 |     with zipfile.ZipFile(out / "data.zip", mode="w") as zf:
27 |         for f in out.glob("*.csv"):
28 |             zf.write(f, arcname=f.name)
29 | 
-------------------------------------------------------------------------------- /argo/template-example/invoke-template.yml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: invoked-template-example- 5 | namespace: argo 6 | spec: 7 | entrypoint: workflow-runner 8 | 9 | templates: 10 | - name: workflow-runner 11 | dag: 12 | tasks: 13 | - name: workflow-1 14 | arguments: 15 | parameters: 16 | - name: tap_image 17 | value: tap-exchange-rates 18 | templateRef: 19 | name: singer-tap-to-csv-template 20 | template: tap-to-target 21 | 22 | - name: workflow-2 23 | arguments: 24 | parameters: 25 | - name: tap_image 26 | value: tap-exchange-rates 27 | templateRef: 28 | name: singer-tap-to-csv-template 29 | template: tap-to-target 30 | 31 | - name: workflow-3 32 | arguments: 33 | parameters: 34 | - name: tap_image 35 | value: tap-exchange-rates 36 | templateRef: 37 | name: singer-tap-to-csv-template 38 | template: tap-to-target 39 | 40 | -------------------------------------------------------------------------------- /argo/template-example/workflow-template.yml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: WorkflowTemplate 3 | metadata: 4 | name: singer-tap-to-csv-template 5 | namespace: argo 6 | spec: 7 | entrypoint: tap-to-target 8 | 9 | templates: 10 | - name: tap-to-target 11 | inputs: 12 | parameters: 13 | - name: tap_image 14 | steps: 15 | - - name: tap 16 | template: singer-tap 17 | arguments: 18 | parameters: 19 | - name: tap_image 20 | value: "{{inputs.parameters.tap_image}}" 21 | - - name: target 22 | template: singer-target 23 | arguments: 24 | parameters: 25 | - name: target_image 26 | value: "target-csv" 27 | - name: tap_image 28 | value: "{{inputs.parameters.tap_image}}" 29 | artifacts: 30 | - name: tap-output 31 | from: "{{steps.tap.outputs.artifacts.tap-output}}" 32 | 33 | - name: singer-tap 34 | container: 35 | image: "stkbailey/{{inputs.parameters.tap_image}}:latest" 36 | inputs: 37 | parameters: 38 | - name: tap_image 39 | artifacts: 40 | - name: tap-config 41 | path: /tmp/config.json 42 | raw: 43 | data: | 44 | {"start_date": "2020-08-01"} 45 | outputs: 46 | artifacts: 47 | - name: tap-output 48 | path: /tmp/tap_output.txt 49 | 50 | - name: singer-target 51 | container: 52 | image: "stkbailey/{{inputs.parameters.target_image}}:latest" 53 | inputs: 54 | parameters: 55 | - name: target_image 56 | - name: tap_image 57 | artifacts: 58 | - name: target-config 59 | path: /tmp/config.json 60 | raw: 61 | data: | 62 | {"delimiter": "\t", "quotechar": "'", "destination_path": "/tmp/data/"} 63 | - name: tap-output 64 | path: /tmp/tap_input.txt 65 | outputs: 66 | artifacts: 67 | - name: target-output 68 | path: /tmp/target_output.txt 69 | - name: target-data 70 | path: /tmp/data/data.zip 71 | s3: 72 | bucket: singer 73 | key: "outputs/{{inputs.parameters.tap_image}}/results.zip" 74 | endpoint: argo-artifacts:9000 75 | insecure: true 76 | accessKeySecret: 77 | name: argo-artifacts 78 | key: accesskey 79 | secretKeySecret: 80 | name: argo-artifacts 81 | key: secretkey 82 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / 
packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # LLocal Settings 132 | .vscode 133 | .venv -------------------------------------------------------------------------------- /argo/tap-exchange-rate-workflow.yml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: singer-tap-exchange-rates- 5 | namespace: argo 6 | spec: 7 | entrypoint: tap-to-target 8 | 9 | templates: 10 | - name: tap-to-target 11 | inputs: 12 | parameters: 13 | - name: tap_image 14 | value: tap-exchange-rates 15 | - name: target_image 16 | value: target-csv 17 | steps: 18 | - - name: tap 19 | template: singer-tap 20 | arguments: 21 | parameters: 22 | - name: tap_image 23 | value: "{{inputs.parameters.tap_image}}" 24 | - - name: target 25 | template: singer-target 26 | arguments: 27 | parameters: 28 | - name: target_image 29 | value: "{{inputs.parameters.target_image}}" 30 | - name: tap_image 31 | value: "{{inputs.parameters.tap_image}}" 32 | artifacts: 33 | - name: tap-output 34 | from: "{{steps.tap.outputs.artifacts.tap-output}}" 35 | 36 | - name: singer-tap 37 | container: 38 | image: "stkbailey/{{inputs.parameters.tap_image}}:latest" 39 | inputs: 40 | parameters: 41 | - name: tap_image 42 | artifacts: 
43 |       - name: tap-config
44 |         path: /tmp/config.json
45 |         raw:
46 |           data: |
47 |             {"start_date": "2020-08-01"}
48 |     outputs:
49 |       artifacts:
50 |       - name: tap-output
51 |         path: /tmp/tap_output.txt
52 | 
53 |   - name: singer-target
54 |     container:
55 |       image: "stkbailey/{{inputs.parameters.target_image}}:latest"
56 |     inputs:
57 |       parameters:
58 |       - name: target_image
59 |       - name: tap_image
60 |       artifacts:
61 |       - name: target-config
62 |         path: /tmp/config.json
63 |         raw:
64 |           data: |
65 |             {"delimiter": "\t", "quotechar": "'", "destination_path": "/tmp/data/"}
66 |       - name: tap-output
67 |         path: /tmp/tap_input.txt
68 |     outputs:
69 |       artifacts:
70 |       - name: target-output
71 |         path: /tmp/target_output.txt
72 |       - name: target-data
73 |         path: /tmp/data/data.zip
74 |         s3:
75 |           bucket: singer
76 |           key: "outputs/{{inputs.parameters.tap_image}}/results.zip"
77 |           endpoint: argo-artifacts:9000
78 |           insecure: true
79 |           accessKeySecret:
80 |             name: argo-artifacts
81 |             key: accesskey
82 |           secretKeySecret:
83 |             name: argo-artifacts
84 |             key: secretkey
85 | 
--------------------------------------------------------------------------------
/argo/template-example/exit-handler-template.yml:
--------------------------------------------------------------------------------
1 | apiVersion: argoproj.io/v1alpha1
2 | kind: WorkflowTemplate
3 | metadata:
4 |   name: slack-exit-handler-template
5 |   namespace: argo
6 | spec:
7 |   entrypoint: exit-handler
8 | 
9 |   templates:
10 |   - name: exit-handler
11 |     inputs:
12 |       parameters:
13 |       - name: workflow_status
14 |         value: null
15 |       - name: workflow_name
16 |         value: null
17 |       - name: workflow_failures
18 |         value: null
19 |       - name: webhook_url
20 |         value: "https://slack.com/my-webhook-url"
21 |     steps:
22 |     - - name: woop-woop-message
23 |         template: slack-notify
24 |         arguments:
25 |           parameters:
26 |           - name: workflow_name
27 |             value: "{{inputs.parameters.workflow_name}}"
28 |           - name: workflow_status
29 |             value: "{{inputs.parameters.workflow_status}}"
30 |           - name: workflow_failures
31 |             value: "{{inputs.parameters.workflow_failures}}"
32 |           - name: webhook_url
33 |             value: "{{inputs.parameters.webhook_url}}"
34 |           - name: slack_color
35 |             value: "#008000"
36 |         when: "{{inputs.parameters.workflow_status}} == Succeeded"
37 |       - name: arrgghho-message
38 |         template: slack-notify
39 |         arguments:
40 |           parameters:
41 |           - name: workflow_name
42 |             value: "{{inputs.parameters.workflow_name}}"
43 |           - name: workflow_status
44 |             value: "{{inputs.parameters.workflow_status}}"
45 |           - name: workflow_failures
46 |             value: "{{inputs.parameters.workflow_failures}}"
47 |           - name: webhook_url
48 |             value: "{{inputs.parameters.webhook_url}}"
49 |           - name: slack_color
50 |             value: "#FF0000"
51 |         when: "{{inputs.parameters.workflow_status}} != Succeeded"
52 | 
53 |   - name: slack-notify
54 |     inputs:
55 |       parameters:
56 |       - name: slack_color
57 |       - name: slack_channel
58 |         value: "notify-channel"
59 |       - name: webhook_url
60 |       - name: workflow_name
61 |       - name: workflow_status
62 |       - name: workflow_failures
63 |     container:
64 |       image: technosophos/slack-notify
65 |       env:
66 |       - name: SLACK_WEBHOOK
67 |         value: "{{inputs.parameters.webhook_url}}"
68 |       - name: SLACK_TITLE
69 |         value: "Workflow Complete: {{inputs.parameters.workflow_name}}"
70 |       - name: SLACK_MESSAGE
71 |         value: |
72 |           Name: {{inputs.parameters.workflow_name}}
73 |           Status: {{inputs.parameters.workflow_status}}
74 |           Failures: {{inputs.parameters.workflow_failures}}
75 |       - name: SLACK_CHANNEL
76 |         value: "{{inputs.parameters.slack_channel}}"
77 |       - name: SLACK_COLOR
78 |         value: "{{inputs.parameters.slack_color}}"
79 | 
--------------------------------------------------------------------------------
/argo/cronworkflow-example/tap-to-target-cronworkflow.yml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: CronWorkflow 3 | metadata: 4 | generateName: singer-tap-exchange-rates- 5 | namespace: argo 6 | spec: 7 | schedule: "0 0 * * *" 8 | workflowSpec: 9 | entrypoint: tap-to-target 10 | 11 | templates: 12 | - name: tap-to-target 13 | inputs: 14 | parameters: 15 | - name: tap_image 16 | value: tap-exchange-rates 17 | - name: target_image 18 | value: target-csv 19 | steps: 20 | - - name: tap 21 | template: singer-tap 22 | arguments: 23 | parameters: 24 | - name: tap_image 25 | value: "{{inputs.parameters.tap_image}}" 26 | - - name: target 27 | template: singer-target 28 | arguments: 29 | parameters: 30 | - name: target_image 31 | value: "{{inputs.parameters.target_image}}" 32 | - name: tap_image 33 | value: "{{inputs.parameters.tap_image}}" 34 | artifacts: 35 | - name: tap-output 36 | from: "{{steps.tap.outputs.artifacts.tap-output}}" 37 | 38 | - name: singer-tap 39 | container: 40 | image: "stkbailey/{{inputs.parameters.tap_image}}:latest" 41 | inputs: 42 | parameters: 43 | - name: tap_image 44 | artifacts: 45 | - name: tap-config 46 | path: /tmp/config.json 47 | raw: 48 | data: | 49 | {"start_date": "2020-08-01"} 50 | outputs: 51 | artifacts: 52 | - name: tap-output 53 | path: /tmp/tap_output.txt 54 | 55 | - name: singer-target 56 | container: 57 | image: "stkbailey/{{inputs.parameters.target_image}}:latest" 58 | inputs: 59 | parameters: 60 | - name: target_image 61 | - name: tap_image 62 | artifacts: 63 | - name: target-config 64 | path: /tmp/config.json 65 | raw: 66 | data: | 67 | {"delimiter": "\t", "quotechar": "'", "destination_path": "/tmp/data/"} 68 | - name: tap-output 69 | path: /tmp/tap_input.txt 70 | outputs: 71 | artifacts: 72 | - name: target-output 73 | path: /tmp/target_output.txt 74 | - name: target-data 75 | path: /tmp/data/data.zip 76 | s3: 77 | bucket: singer 78 | key: "outputs/{{inputs.parameters.tap_image}}/results.zip" 79 | endpoint: argo-artifacts:9000 80 | insecure: true 81 | accessKeySecret: 82 | name: argo-artifacts 83 | key: accesskey 84 | secretKeySecret: 85 | name: argo-artifacts 86 | key: secretkey 87 | -------------------------------------------------------------------------------- /argo/tap-covid-19-example/tap-covid-19-workflow.yml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: singer-tap-covid-19- 5 | namespace: argo 6 | spec: 7 | entrypoint: tap-to-target 8 | 9 | templates: 10 | - name: tap-to-target 11 | inputs: 12 | parameters: 13 | - name: tap_image 14 | value: tap-covid-19 15 | - name: target_image 16 | value: target-csv 17 | - name: github_api_token 18 | value: 3cd19e1bac7d9696ce83a81f93ed500fd7c44d6f 19 | steps: 20 | - - name: tap 21 | template: singer-tap 22 | arguments: 23 | parameters: 24 | - name: tap_image 25 | value: "{{inputs.parameters.tap_image}}" 26 | - name: github_api_token 27 | value: "{{inputs.parameters.github_api_token}}" 28 | - - name: target 29 | template: singer-target 30 | arguments: 31 | parameters: 32 | - name: target_image 33 | value: "{{inputs.parameters.target_image}}" 34 | - name: tap_image 35 | value: "{{inputs.parameters.tap_image}}" 36 | artifacts: 37 | - name: tap-output 38 | from: "{{steps.tap.outputs.artifacts.tap-output}}" 39 | 40 | - name: singer-tap 41 | container: 42 | image: 
"stkbailey/{{inputs.parameters.tap_image}}:latest" 43 | inputs: 44 | parameters: 45 | - name: tap_image 46 | - name: github_api_token 47 | artifacts: 48 | - name: tap-config 49 | path: /tmp/config.json 50 | raw: 51 | data: | 52 | { 53 | "api_token": "{{inputs.parameters.github_api_token}}", 54 | "start_date": "2020-09-01T00:00:00Z", 55 | "user_agent": "tap-covid-19 tutorial@immuta.com" 56 | } 57 | outputs: 58 | artifacts: 59 | - name: tap-output 60 | path: /tmp/tap_output.txt 61 | 62 | - name: singer-target 63 | container: 64 | image: "stkbailey/{{inputs.parameters.target_image}}:latest" 65 | inputs: 66 | parameters: 67 | - name: target_image 68 | - name: tap_image 69 | artifacts: 70 | - name: target-config 71 | path: /tmp/config.json 72 | raw: 73 | data: | 74 | {"delimiter": "\t", "quotechar": "'", "destination_path": "/tmp/data/"} 75 | - name: tap-output 76 | path: /tmp/tap_input.txt 77 | outputs: 78 | artifacts: 79 | - name: target-output 80 | path: /tmp/target_output.txt 81 | - name: target-data 82 | path: /tmp/data/data.zip 83 | s3: 84 | bucket: singer 85 | key: "outputs/{{inputs.parameters.tap_image}}/results.zip" 86 | endpoint: argo-artifacts:9000 87 | insecure: true 88 | accessKeySecret: 89 | name: argo-artifacts 90 | key: accesskey 91 | secretKeySecret: 92 | name: argo-artifacts 93 | key: secretkey 94 | -------------------------------------------------------------------------------- /argo/input-files-example/tap-covid-19-workflow-with-files.yml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: singer-tap-covid-19- 5 | namespace: argo 6 | spec: 7 | entrypoint: tap-to-target 8 | 9 | templates: 10 | - name: tap-to-target 11 | inputs: 12 | parameters: 13 | - name: tap_image 14 | value: tap-covid-19 15 | - name: target_image 16 | value: target-csv 17 | steps: 18 | - - name: tap 19 | template: singer-tap 20 | arguments: 21 | parameters: 22 | - name: tap_image 23 | value: "{{inputs.parameters.tap_image}}" 24 | - - name: target 25 | template: singer-target 26 | arguments: 27 | parameters: 28 | - name: target_image 29 | value: "{{inputs.parameters.target_image}}" 30 | - name: tap_image 31 | value: "{{inputs.parameters.tap_image}}" 32 | artifacts: 33 | - name: tap-output 34 | from: "{{steps.tap.outputs.artifacts.tap-output}}" 35 | 36 | - name: singer-tap 37 | container: 38 | image: "stkbailey/{{inputs.parameters.tap_image}}:latest" 39 | inputs: 40 | parameters: 41 | - name: tap_image 42 | artifacts: 43 | - name: tap-config 44 | path: /tmp/config.json 45 | s3: 46 | bucket: singer 47 | key: "config/{{inputs.parameters.tap_image}}/config.json" 48 | endpoint: argo-artifacts:9000 49 | insecure: true 50 | accessKeySecret: 51 | name: argo-artifacts 52 | key: accesskey 53 | secretKeySecret: 54 | name: argo-artifacts 55 | key: secretkey 56 | - name: tap-catalog 57 | path: /tmp/catalog.json 58 | s3: 59 | bucket: singer 60 | key: "config/{{inputs.parameters.tap_image}}/catalog.json" 61 | endpoint: argo-artifacts:9000 62 | insecure: true 63 | accessKeySecret: 64 | name: argo-artifacts 65 | key: accesskey 66 | secretKeySecret: 67 | name: argo-artifacts 68 | key: secretkey 69 | - name: tap-state 70 | path: /tmp/state.json 71 | s3: 72 | bucket: singer 73 | key: "config/{{inputs.parameters.tap_image}}/state.json" 74 | endpoint: argo-artifacts:9000 75 | insecure: true 76 | accessKeySecret: 77 | name: argo-artifacts 78 | key: accesskey 79 | secretKeySecret: 80 | name: argo-artifacts 81 | 
key: secretkey 82 | outputs: 83 | artifacts: 84 | - name: tap-output 85 | path: /tmp/tap_output.txt 86 | 87 | - name: singer-target 88 | container: 89 | image: "stkbailey/{{inputs.parameters.target_image}}:latest" 90 | inputs: 91 | parameters: 92 | - name: target_image 93 | - name: tap_image 94 | artifacts: 95 | - name: target-config 96 | path: /tmp/config.json 97 | s3: 98 | bucket: singer 99 | key: "config/{{inputs.parameters.target_image}}/config.json" 100 | endpoint: argo-artifacts:9000 101 | insecure: true 102 | accessKeySecret: 103 | name: argo-artifacts 104 | key: accesskey 105 | secretKeySecret: 106 | name: argo-artifacts 107 | key: secretkey 108 | - name: tap-output 109 | path: /tmp/tap_input.txt 110 | outputs: 111 | artifacts: 112 | - name: target-output 113 | path: /tmp/target_output.txt 114 | s3: 115 | bucket: singer 116 | key: "config/{{inputs.parameters.tap_image}}/state.json" 117 | endpoint: argo-artifacts:9000 118 | insecure: true 119 | accessKeySecret: 120 | name: argo-artifacts 121 | key: accesskey 122 | secretKeySecret: 123 | name: argo-artifacts 124 | key: secretkey 125 | - name: target-data 126 | path: /tmp/data/data.zip 127 | s3: 128 | bucket: singer 129 | key: "outputs/{{inputs.parameters.tap_image}}/results.zip" 130 | endpoint: argo-artifacts:9000 131 | insecure: true 132 | accessKeySecret: 133 | name: argo-artifacts 134 | key: accesskey 135 | secretKeySecret: 136 | name: argo-artifacts 137 | key: secretkey 138 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # data-replication-on-kubernetes 2 | 3 | ELT is the hot new data pipelining pattern, and thousands of data teams are using some combination of Stitch / FiveTran, dbt and a cloud data warehouse to manage their data warehouse. There are great SaaS offerings for these tools, but a lot of folks like pain or don't have budget. As a small data team at Immuta, we were both. 
4 | 
5 | This article assumes some familiarity with [Docker](https://docker.com), [Kubernetes](https://kubernetes.io/) and the [Singer](https://singer.io) specification. Even if you're new to these technologies, though, I will try to point out helpful resources to get you moving in the right direction.
6 | 
7 | First, we will discuss the problem we are trying to solve and why using containers can make a lot of sense. Then, I'll walk you through setting up some workflows at home (as long as your home runs macOS Catalina). And finally, we'll talk about what to consider if you'd like to move this into production.
8 | 
9 | ## Motivation
10 | 
11 | Let's start the discussion with two assertions:
12 | 
13 | 1. Data replication is a "solved" problem.
14 | 2. Kubernetes is a scalable data (process) management platform.
15 | 
16 | ETL is not the reason that anyone gets into data science or engineering. There is little creativity, lots of maintenance, and no recognition until something goes wrong. Fortunately, SaaS tools like [Stitch](https://www.stitchdata.com) and [FiveTran](https://www.fivetran.com) have pretty much turned data replication into a commodity that small teams can leverage. And where there are no existing supported connectors (for internal applications, say), a data scientist could write their own [Singer](https://singer.io) script and load the data themselves.
17 | 
18 | Keep this last scenario in mind, because it's the one I want to focus on here -- though of course a "data pipeline" is hardly _just_ data replication.
19 | 
20 | The "solved" nature of data replication makes it easier for data scientists to own projects end-to-end, freeing data engineers to think about the "platform" rather than point solutions. ([StitchFix has a terrific post on this.](https://multithreaded.stitchfix.com/blog/2016/03/16/engineers-shouldnt-write-etl/)) In fact, the players in this market recognize that it's really the stuff "around" the integrations that is the differentiator: the [Meltano](https://meltano.com) project out of GitLab, for example, found a niche in being [a "runner" for integration processes](https://www.dataengineeringpodcast.com/meltano-data-integration-episode-141/), rather than managing the end-to-end analytics pipeline.
21 | 
22 | ### Singer taps and targets
23 | 
24 | A quick summary of the Singer spec mentioned above:
25 | 
26 | - a `tap` connects to a service, extracts data, then emits a standardized stream of schemas and records as JSON
27 | - a `target` reads the record stream of a tap and loads it into a warehouse
28 | 
29 | This separation of tap and target decouples the "extract" step from the "load" step. So an organization with 10 sources and 1 data warehouse has ten tap-to-target pipelines to manage. If they migrate to a new warehouse, though, they only need to "swap out" the target; they won't make a single change to any taps.
30 | 
31 | One peculiarity of this isolation is that a best practice for running taps and targets is to isolate them in their own Python virtual environments, since each tap and target may have different dependencies, be built on different Singer versions, etc. And when I first read that, I thought: containers! Yet, there is scarcely a mention of containers in the Singer community, at least as far as I could tell.
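To make this concrete, here is what a tap-to-target run looks like on a laptop. (A minimal sketch: it assumes you've installed `tap-exchangeratesapi` and `target-csv` into their own virtual environments, per the best practice above, and that the two config files already exist.)

```{zsh}
# Each connector runs from its own virtualenv to keep dependencies isolated
~/.virtualenvs/tap-exchangeratesapi/bin/tap-exchangeratesapi --config tap_config.json \
  | ~/.virtualenvs/target-csv/bin/target-csv --config target_config.json
```

Everything that follows in this article is, in essence, this one pipe -- broken into containers and orchestrated by Kubernetes.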
32 | 
33 | ### Kubernetes and Argo Workflows
34 | 
35 | At Immuta, we have a small data team but a world-class engineering organization. As a rule, I like to defer to the engineering team, who have both knowledge and skill, since I, as a data scientist, have neither of these things.
36 | 
37 | The rest of our internal infrastructure runs on Kubernetes; as the lone data scientist, I felt the peer pressure of learning k9s and managing kubeconfigs. And I'm here to say -- this is a great approach, if you're willing to do some dirty work.
38 | 
39 | Enter [Argo](https://argoproj.github.io/), the "Kubernetes-Native Workflow Engine".
40 | 
41 | Argo Workflows is an open-source, container-native workflow engine for orchestrating parallel jobs on Kubernetes. It is an alternative to other orchestration tools, such as Airflow or Prefect, and the key differentiator is that it is container-based. This Data Council talk provides a nice comparison with Airflow specifically: [Kubernetes-Native Workflow Orchestration with Argo](https://www.datacouncil.ai/talks/kubernetes-native-workflow-orchestration-with-argo)
42 | 
43 | Now that we have a motivation for using Singer and Argo together, let's get to work!
44 | 
45 | ## Tutorial
46 | 
47 | In this tutorial, we will first set up a local Kubernetes cluster with Argo and MinIO storage for artifacts and config files,
48 | then deploy a Singer tap-to-target workflow, and finally discuss enhancements and extensions.
49 | 
50 | We could go deep on any one of these areas, but I'll try to keep it shallow and manageable for the tutorial, leaving further discussion of "production" considerations to the end. The tutorial has the following prerequisites:
51 | 
52 | 1. Docker Desktop, installed locally with its built-in Kubernetes cluster enabled. (Minikube or another cluster is fine as well.)
53 | 2. Helm, which we will use to install MinIO for this tutorial.
54 | 
55 | To start, we are going to be using two of the simplest singer.io packages -- `tap-exchangeratesapi` and `target-csv` -- to demonstrate how this works.
56 | 
57 | ### 1. Setting up Argo
58 | 
59 | In this first section, we need to set up a Kubernetes cluster, Argo Workflows, an artifact repository, and a couple of storage buckets. The simplest approach is to use Docker Desktop, which is multi-platform and can deploy a Kubernetes cluster for you. This also makes local development a bit easier, since you can build containers and deploy to a local cluster without pushing to external container repositories.
60 | 
61 | #### Install Docker Desktop and enable Kubernetes
62 | 
63 | You will need a Kubernetes cluster to which you have admin access and can deploy resources using `kubectl`. If you already have that, you can skip this section. If you don't, the easiest way to get one is Docker Desktop. You'll want to follow the documentation on Docker's website.
64 | 
65 | 1. Install Docker Desktop. [Mac](https://docs.docker.com/docker-for-mac/install/)
66 | 2. Enable Kubernetes. [Mac](https://docs.docker.com/docker-for-mac/kubernetes/)
67 | 3. Bump up Docker's resource allocation to at least 12 GB to ensure enough room for MinIO storage.
68 |    - ![](assets/kubernetes-resources.png)
69 | 4. Test out the `kubectl` command. [Mac](https://docs.docker.com/docker-for-mac/kubernetes/#use-the-kubectl-command)
70 | 5. Install the latest Argo CLI. [Mac](https://github.com/argoproj/argo/releases)
71 | 
72 | Once you have that working, feel free to play around with your Kubernetes cluster.
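A couple of quick smoke tests will confirm the cluster is reachable before you move on:

```{zsh}
kubectl config get-contexts   # "docker-desktop" should be the active context
kubectl get nodes             # should report a single node in "Ready" status
```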
73 | 
74 | #### Install Argo Workflows
75 | 
76 | Next, we need to install Argo. You can follow the [Quick Start Guide](https://argoproj.github.io/argo/quick-start/) to get some context, or simply use the commands below to create an `argo` namespace in your cluster and deploy the resources.
77 | 
78 | ```{zsh}
79 | # Create the argo namespace, then deploy the "quick start" resources
80 | kubectl create ns argo
81 | kubectl apply -n argo -f https://raw.githubusercontent.com/argoproj/argo/stable/manifests/quick-start-postgres.yaml
82 | ```
83 | 
84 | You should now have an Argo server and a few new Kubernetes resource types used by Argo, including `Workflow` and `CronWorkflow`. To get a glimpse of the Argo Workflows UI, you can forward the Kubernetes port to your localhost.
85 | 
86 | ```{zsh}
87 | kubectl -n argo port-forward deployment/argo-server 2746:2746
88 | ```
89 | 
90 | ![The Argo UI](assets/argo-ui.png)
91 | 
92 | #### Set up MinIO Storage
93 | 
94 | Argo Workflows can pass files into or out of a container through the use of "artifacts". For local deployments, an easy way to configure artifact passing is through a Kubernetes deployment of [MinIO](https://min.io/). Argo has [plenty of guidance](https://argoproj.github.io/argo/configure-artifact-repository/) on setting this up with other services, but you can follow along below for a quick MinIO deployment.
95 | 
96 | Note that you'll need to have `helm` installed (essentially, a Kubernetes package manager). On Mac, you can use Homebrew: `brew install helm`. Then, run the following:
97 | 
98 | ```{zsh}
99 | # Add the MinIO helm chart
100 | helm repo add minio https://helm.min.io/
101 | helm repo update
102 | 
103 | # Deploy MinIO in the "argo" namespace (this may take a minute)
104 | helm install argo-artifacts minio/minio \
105 |   -n argo \
106 |   --set service.type=LoadBalancer \
107 |   --set defaultBucket.enabled=true \
108 |   --set defaultBucket.name=artifacts \
109 |   --set persistence.enabled=false \
110 |   --set fullnameOverride=argo-artifacts
111 | ```
112 | 
113 | You now have an artifacts server running, but it's empty! Let's create an artifacts bucket, along with a bucket for Singer configuration and outputs. To use the commands below, you'll need the MinIO CLI tool: `brew install minio/stable/mc`.
114 | 
115 | ```{zsh}
116 | # Add config host
117 | mc config host add argo-artifacts-local http://localhost:9000 YOURACCESSKEY YOURSECRETKEY
118 | 
119 | # Create buckets in MinIO
120 | mc mb argo-artifacts-local/artifacts
121 | mc mb argo-artifacts-local/singer
122 | ```
123 | 
124 | You can check the buckets in your browser by running `kubectl -n argo port-forward service/argo-artifacts 9000:9000` and visiting `http://localhost:9000`, or list them from the command line:
125 | 
126 | ```{zsh}
127 | mc ls argo-artifacts-local
128 | ```
129 | 
130 | #### Map Argo and MinIO together
131 | 
132 | Finally, we need to tell Argo where the default artifact repository is, so that it knows which bucket to map artifacts to and has the appropriate secrets for authentication. You could follow the instructions [elsewhere](https://sourcegraph.com/github.com/argoproj/argo@095d67f8d0f1d309529c8a400cb16d0a0e2765b9/-/blob/demo.md#5-install-an-artifact-repository), but for simplicity's sake, I suggest applying this "patch" to the workflow controller's ConfigMap.
133 | 
134 | ```{zsh}
135 | wget -O argo-artifact-patch.yml https://raw.githubusercontent.com/stkbailey/data-replication-on-kubernetes/master/argo/argo-artifact-patch.yml
136 | kubectl patch configmap workflow-controller-configmap -n argo --patch "$(cat argo-artifact-patch.yml)"
137 | ```
138 | 
139 | To make sure you've got everything working in this first section, try running the "artifact-passing" example from the Argo examples repository.
140 | 
141 | ```{zsh}
142 | argo submit -n argo https://raw.githubusercontent.com/argoproj/argo/master/examples/artifact-passing.yaml --watch
143 | ```
144 | 
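If the example succeeds, artifact passing is wired up. Because the patch also sets `archiveLogs: true`, the step logs are archived as well, so you should be able to see the workflow's files land in the `artifacts` bucket:

```{zsh}
mc ls --recursive argo-artifacts-local/artifacts
```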
145 | ### The Argo workflow
146 | 
147 | You've got Kubernetes, you've got Argo: let's not wait around! Run the following command:
148 | 
149 | ```{zsh}
150 | argo submit -n argo --watch https://raw.githubusercontent.com/stkbailey/data-replication-on-kubernetes/master/argo/tap-exchange-rate-workflow.yml
151 | ```
152 | 
153 | You should see a workflow kick off in your terminal. (Alternatively, you could go to that link, copy the text and paste it into the Argo Workflows "new workflow" UI.) If you receive an error message along the lines of `failed to save outputs: timed out waiting for the condition`, simply try running the workflow again.
154 | 
155 | You should see a two-step Workflow create and finish. Let's check the "outputs" storage bucket and see what's available.
156 | 
157 | ```{zsh}
158 | mc ls argo-artifacts-local/singer/outputs/tap-exchange-rates/
159 | ```
160 | 
161 | #### Dissecting the workflow
162 | 
163 | Let's walk through what this workflow does:
164 | 
165 | 1. It starts a `tap-to-target` multi-step workflow, consisting of two child steps: the `tap` step and the `target` step.
166 | 2. It kicks off the `tap` child step and passes in some configuration files, which get mapped to `/tmp/config.json`. When the tap step completes, the output file at `/tmp/tap_output.txt` is stored in the default MinIO artifact repository.
167 | 3. It then kicks off the `target` child step and maps both the configuration and the tap output into the container's file system. When the target runs, the output (from the CSV target) is mapped to the `outputs` MinIO bucket.
168 | 
169 | Argo makes all of this extremely "template-able", as we'll see in a few moments. But it is all dependent on the containers knowing where to expect certain files. Which leads us to ask: what's going on in the containers?
170 | 
171 | #### The Preamble
172 | 
173 | We begin the Workflow by adding some metadata and specifying that we want to create a "Workflow" resource. This is a single run of the workflow.
174 | 
175 | ```{yaml}
176 | apiVersion: argoproj.io/v1alpha1
177 | kind: Workflow
178 | metadata:
179 |   name: singer-workflow
180 |   namespace: argo
181 | spec:
182 |   entrypoint: tap-to-target
183 |   templates: [ ... ]
184 | ```
185 | 
186 | Next, we specify our templates. We will be building three of these: a `tap-to-target` DAG, a `tap`, and a `target`. We _could_ do this all in a single step, but we'll talk about reusability in a moment.
187 | 
188 | Let's begin with the DAG template, which takes a couple of input parameters -- the names, the images and the artifact bucket:
189 | 
190 | - name:
191 | - input parameters
192 | - steps:
193 | 
194 | The DAG template references a couple of other "templates" that also need to be defined:
195 | 
196 | - name:
197 | - container
198 | - input parameters:
199 | - input artifacts:
200 | - output artifacts:
201 | 
202 | You can re-run the workflow, changing the configuration as desired. And that's the beauty of this workflow: once set up, adding a new tap is simply a matter of changing some container locations and config files, as sketched below.
203 | 
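For instance, workflow parameters can be overridden at submit time with the Argo CLI's `-p` flag. (A hypothetical example: `tap-some-service` stands in for any tap image you've published that accepts the same raw config as the exchange-rates tap.)

```{zsh}
argo submit -n argo --watch \
    -p tap_image=tap-some-service \
    https://raw.githubusercontent.com/stkbailey/data-replication-on-kubernetes/master/argo/tap-exchange-rate-workflow.yml
```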
204 | #### Dockerizing Singer
205 | 
206 | Let's put Kubernetes to the side for a moment and focus on a typical Singer pipeline. It's a linear process:
207 | 
208 | 1. Tap inputs: configuration file, catalog file, state file.
209 | 2. Tap outputs: a stream of messages (schemas, records, state) in Singer format.
210 | 3. Target inputs: configuration file, tap output stream.
211 | 4. Target outputs: loaded/exported data (e.g. to a database or CSV file), state file.
212 | 
213 | We can treat the containers themselves as black boxes, so long as we know how to feed in the appropriate inputs and collect the outputs. To keep things simple in this tutorial, I have pre-created `tap` and `target` containers for our use. To test one out, run this command:
214 | 
215 | ```{zsh}
216 | docker run -e START_DATE=2020-08-01 stkbailey/tap-exchange-rates
217 | ```
218 | 
219 | This should kick off the `tap-exchangeratesapi`, which doesn't require any special configuration to run.
220 | 
221 | Without going into too much detail, we are wrapping the Singer "tap / target" commands in a Runner object that defines where to look for the necessary files. (It's available at [singer-container-utils](https://github.com/immuta/singer-container-utils).) There are many ways to skin that cat, but the important thing is to make it easy to reproduce across multiple taps.
222 | 
223 | We won't dig into the details, but each of these Docker containers runs a Python `entrypoint.py` script when it is initialized. The gist of the entrypoint script is:
224 | 
225 | 1. Identify the paths where `config`, `catalog`, `state` and `input` files exist.
226 | 2. Build the appropriate tap/target executable command, based on file availability.
227 | 3. Run the command and write its output to a file.
228 | 
229 | It's a lot of overhead for a single command, but when you have multiple containers to run, the abstractions make life easier. In shell terms, a tap container's logic amounts to the sketch below.
230 | 
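(This is a rough shell equivalent for illustration only -- the actual logic lives in the `TapRunner` class of `singer-container-utils`.)

```{zsh}
# Build up the tap invocation based on which files were mounted in
CMD="tap-covid-19 --config /tmp/config.json"
[ -f /tmp/catalog.json ] && CMD="${CMD} --catalog /tmp/catalog.json"
[ -f /tmp/state.json ] && CMD="${CMD} --state /tmp/state.json"

# Run the tap and write the message stream to the file Argo exports as an artifact
${CMD} > /tmp/tap_output.txt
```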
231 | ## Discussion
232 | 
233 | It is hopefully not hard to see how, with a few additional tweaks and some well-protected S3 buckets, this could be turned into a fairly robust architecture. We can easily turn the workflow specified above into an Argo `WorkflowTemplate` and reference it in a `CronWorkflow` to have it run on an hourly or daily basis.
234 | 
235 | But, exciting as all this is, it has to be noted that this is not for the faint of heart. As a data scientist, there are a lot of considerations:
236 | 
237 | - How are you building and updating the containers? Does your cluster have access to that container repository?
238 | - How are you triggering new workflows?
239 | - Networking can be a pain.
240 | 
241 | You may find that `pipelinewise` or `meltano` can do the same work for you, with less overhead. Alternatively, you may find that running taps on Argo gives you a nice method for doing custom replications.
242 | 
243 | ### Logging and observability are _different_ in containers
244 | 
245 | Workflow logs are scattered across pods, so you need a deliberate strategy for surfacing failures. We use a Slack exit-handler (see `argo/template-example/exit-handler-template.yml`) to notify our team of successes and failures.
246 | 
247 | ### Schedules and templates
248 | 
249 | The "promised land" of Argo, though, is that once it is set up, you simply have to change the image location to add a new tap -- and this is true! The only things that should change between runs are the container that's invoked and the configuration files passed in.
250 | 
251 | The questions you need to ask are:
252 | 
253 | - Do I have the budget to just pay for this (Stitch / FiveTran)?
254 | - Does a simpler (single-node) architecture work, such as Meltano?
255 | - Does a Kubernetes architecture fit with the rest of the company's infrastructure?
256 | - Can I reuse the Argo architecture for other processes, such as transformation pipelines, data quality testing, etc.?
257 | 
258 | At Immuta, we went with this architecture because:
259 | 
260 | 1. We didn't have budget (yet) for an enterprise service but found ourselves needing to run custom taps.
261 | 2. We were already comfortable with containerized applications.
262 | 3. The rest of our company's infrastructure was run on Kubernetes and leveraged other Argo products.
263 | 4. We had other projects, such as data quality jobs, that needed a platform to run on, and we did not have previous expertise with Airflow or Prefect.
264 | 
265 | Thanks for reading!
266 | 
267 | ## References
268 | 
269 | - Getting Started with Singer [GitHub](https://github.com/singer-io/getting-started)
270 | - Building a Singer Tap - Infographic [Blog](https://www.stitchdata.com/blog/how-to-build-a-singer-tap-infographic/)
271 | - Singer Container Utils [GitHub](https://github.com/immuta/singer-container-utils)
272 | 
273 | ## Draft
274 | 
275 | #### Doing this at home
276 | 
277 | 1. Create an Argo Workflow template that uses variables to select the tap and target container. This serves as the backbone of your process; everything else is parameterized.
278 | 2. Create a `target` container for your data warehouse.
279 | 3. Save the `config.json` for your target to a secure location.
280 | 
281 | Once that's set up, for each new tap, you:
282 | 
283 | 1. Create a new `tap` container. The tap runs a Python script on start that checks a few default locations for configs, catalogs, etc. Deploy the new container to ECR so that it is accessible by the Argo service user.
284 | 2. Save the `config.json`, `catalog.json`, and an initial `state.json` to a config folder on S3.
285 | 3. Create a new CronWorkflow (or WorkflowTemplate) for your job that references your new tap image and config locations.
286 | 
287 | You should see that the container emits a logging stream of updates -- but it does not emit the actual data itself. This is slightly different from how the tap would work if you were to run it locally. What we have done instead is write the tap output to a file inside the container.
288 | 
289 | This decision makes it less straightforward to simply "pipe" the output of one container to another, but it gives us greater control over where the logs (which are the data) are ultimately stored.
290 | 
291 | Now, let's try mapping a configuration file into the container, rather than providing a `START_DATE` environment variable directly.
292 | 
293 | ```{zsh}
294 | docker run \
295 |     --mount type=bind,source="$(pwd)"/singer-configs/tap-exchange-rates/config.json,target=/opt/code/config.json \
296 |     stkbailey/tap-exchange-rates:latest
297 | ```
298 | 
299 | You should see a similar result to the first run.
300 | 
301 | Next, let's take a quick look at how the target works by running:
302 | 
303 | ```{zsh}
304 | docker run \
305 |     --env DESTINATION_PATH=/tmp \
306 |     stkbailey/target-csv:latest
307 | ```
308 | 
309 | Once again, we could map some additional files into the container -- and will need to do so to pass the tap output along.
310 | 
--------------------------------------------------------------------------------