├── argo ├── input-files-example │ ├── tap-covid-19 │ │ └── state.json │ ├── target-csv │ │ └── config.json │ ├── mc-cp-files.sh │ └── tap-covid-19-workflow-with-files.yml ├── argo-artifact-patch.yml ├── template-example │ ├── invoke-template.yml │ ├── workflow-template.yml │ └── exit-handler-template.yml ├── tap-exchange-rate-workflow.yml ├── cronworkflow-example │ └── tap-to-target-cronworkflow.yml └── tap-covid-19-example │ └── tap-covid-19-workflow.yml ├── assets ├── argo-ui.png ├── argo-dag-example.png ├── argo-tap-workflow.png └── kubernetes-resources.png ├── singer-containers ├── target-csv │ ├── Makefile │ ├── Dockerfile │ └── entrypoint.py ├── tap-covid-19 │ ├── Makefile │ ├── entrypoint.py │ └── Dockerfile └── tap-exchange-rates │ ├── Makefile │ ├── entrypoint.py │ └── Dockerfile ├── .gitignore ├── LICENSE └── README.md /argo/input-files-example/tap-covid-19/state.json: -------------------------------------------------------------------------------- 1 | {"start_date": "2020-09-10"} -------------------------------------------------------------------------------- /assets/argo-ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stkbailey/data-replication-on-kubernetes/HEAD/assets/argo-ui.png -------------------------------------------------------------------------------- /argo/input-files-example/target-csv/config.json: -------------------------------------------------------------------------------- 1 | {"delimiter": "\t", "quotechar": "'", "destination_path": "/tmp/data/"} -------------------------------------------------------------------------------- /assets/argo-dag-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stkbailey/data-replication-on-kubernetes/HEAD/assets/argo-dag-example.png -------------------------------------------------------------------------------- /assets/argo-tap-workflow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stkbailey/data-replication-on-kubernetes/HEAD/assets/argo-tap-workflow.png -------------------------------------------------------------------------------- /assets/kubernetes-resources.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stkbailey/data-replication-on-kubernetes/HEAD/assets/kubernetes-resources.png -------------------------------------------------------------------------------- /singer-containers/target-csv/Makefile: -------------------------------------------------------------------------------- 1 | ACCOUNT := stkbailey 2 | REPO := target-csv 3 | 4 | docker-publish: 5 | docker login 6 | docker build --tag ${ACCOUNT}/${REPO}:latest . 7 | docker push ${ACCOUNT}/${REPO}:latest 8 | -------------------------------------------------------------------------------- /singer-containers/tap-covid-19/Makefile: -------------------------------------------------------------------------------- 1 | ACCOUNT := stkbailey 2 | REPO := tap-covid-19 3 | 4 | docker-publish: 5 | docker login 6 | docker build --tag ${ACCOUNT}/${REPO}:latest . 
7 | docker push ${ACCOUNT}/${REPO}:latest 8 | -------------------------------------------------------------------------------- /singer-containers/tap-exchange-rates/Makefile: -------------------------------------------------------------------------------- 1 | ACCOUNT := stkbailey 2 | REPO := tap-exchange-rates 3 | 4 | docker-publish: 5 | docker login 6 | docker build --tag ${ACCOUNT}/${REPO}:latest . 7 | docker push ${ACCOUNT}/${REPO}:latest 8 | -------------------------------------------------------------------------------- /argo/argo-artifact-patch.yml: -------------------------------------------------------------------------------- 1 | data: 2 | artifactRepository: | 3 | archiveLogs: true 4 | s3: 5 | bucket: artifacts 6 | endpoint: argo-artifacts:9000 7 | insecure: true 8 | accessKeySecret: 9 | name: argo-artifacts 10 | key: accesskey 11 | secretKeySecret: 12 | name: argo-artifacts 13 | key: secretkey -------------------------------------------------------------------------------- /singer-containers/tap-covid-19/entrypoint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from singer_container_utils import TapRunner 3 | 4 | tap_configs = dict( 5 | execute_command="tap-covid-19", 6 | required_config_keys=["api_token", "start_date"], 7 | path_to_config="/tmp/config.json", 8 | path_to_catalog= "/tmp/catalog.json", 9 | path_to_state = "/tmp/state.json", 10 | path_to_output="/tmp/tap_output.txt", 11 | discover_catalog=True, 12 | ) 13 | 14 | if __name__ == "__main__": 15 | tap = TapRunner(**tap_configs) 16 | tap.run() 17 | -------------------------------------------------------------------------------- /singer-containers/tap-exchange-rates/entrypoint.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from singer_container_utils import TapRunner 3 | 4 | 5 | tap_configs = dict( 6 | execute_command="tap-exchangeratesapi", 7 | required_config_keys=["start_date"], 8 | path_to_config="/tmp/config.json", 9 | path_to_catalog= "/tmp/catalog.json", 10 | path_to_state = "/tmp/state.json", 11 | path_to_output="/tmp/tap_output.txt", 12 | discover_catalog=False, 13 | ) 14 | 15 | if __name__ == "__main__": 16 | tap = TapRunner(**tap_configs) 17 | tap.run() 18 | -------------------------------------------------------------------------------- /singer-containers/target-csv/Dockerfile: -------------------------------------------------------------------------------- 1 | # Set up the container dependencies 2 | FROM python:3.8-slim 3 | RUN apt-get update && apt-get install -y gcc git libpq-dev libssl-dev 4 | 5 | # Install a singer-container helper library 6 | RUN pip install https://github.com/immuta/singer-container-utils/archive/master.zip 7 | 8 | # Install the tap 9 | RUN pip install target-csv 10 | 11 | # Set /opt/code as the default directory and copy entrypoint script 12 | RUN mkdir -p /opt/code 13 | WORKDIR /opt/code 14 | COPY entrypoint.py . 
15 | 
16 | # Run the entrypoint file on container start
17 | ENTRYPOINT [ "python", "./entrypoint.py" ]
18 | 
--------------------------------------------------------------------------------
/singer-containers/tap-covid-19/Dockerfile:
--------------------------------------------------------------------------------
1 | # Set up the container dependencies
2 | FROM python:3.8-slim
3 | RUN apt-get update && apt-get install -y gcc git libpq-dev libssl-dev
4 | 
5 | # Install a singer-container helper library
6 | RUN pip install https://github.com/immuta/singer-container-utils/archive/master.zip
7 | 
8 | # Install the tap
9 | RUN pip install tap-covid-19
10 | 
11 | # Set /opt/code as the default directory and copy entrypoint script
12 | RUN mkdir -p /opt/code
13 | WORKDIR /opt/code
14 | COPY entrypoint.py .
15 | 
16 | # Run the entrypoint file on container start
17 | ENTRYPOINT [ "python", "./entrypoint.py" ]
18 | 
--------------------------------------------------------------------------------
/singer-containers/tap-exchange-rates/Dockerfile:
--------------------------------------------------------------------------------
1 | # Set up the container dependencies
2 | FROM python:3.8-slim
3 | RUN apt-get update && apt-get install -y gcc git libpq-dev libssl-dev
4 | 
5 | # Install a singer-container helper library
6 | RUN pip install https://github.com/immuta/singer-container-utils/archive/master.zip
7 | 
8 | # Install the tap
9 | RUN pip install tap-exchangeratesapi
10 | 
11 | # Set /opt/code as the default directory and copy entrypoint script
12 | RUN mkdir -p /opt/code
13 | WORKDIR /opt/code
14 | COPY entrypoint.py .
15 | 
16 | # Run the entrypoint file on container start
17 | ENTRYPOINT [ "python", "./entrypoint.py" ]
18 | 
--------------------------------------------------------------------------------
/argo/input-files-example/mc-cp-files.sh:
--------------------------------------------------------------------------------
1 | # Make sure the local MinIO service is mapped for the MinIO CLI
2 | mc config host add argo-artifacts-local http://localhost:9000 YOURACCESSKEY YOURSECRETKEY
3 | 
4 | # If the `singer` bucket is not created, create it
5 | mc mb argo-artifacts-local/singer
6 | 
7 | # Copy the whole "tap-covid-19" config directory to the bucket
8 | mc cp --recursive \
9 |     argo/input-files-example/tap-covid-19 \
10 |     argo-artifacts-local/singer/config/
11 | 
12 | # Copy the whole "target-csv" config directory to the bucket
13 | mc cp --recursive \
14 |     argo/input-files-example/target-csv \
15 |     argo-artifacts-local/singer/config/
--------------------------------------------------------------------------------
/singer-containers/target-csv/entrypoint.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | import pathlib
3 | import zipfile
4 | 
5 | from singer_container_utils import TargetRunner
6 | 
7 | 
8 | # target-csv does not connect to a database, but rather outputs a set of files
9 | # that need to be zipped up and mapped to an artifact
10 | out = pathlib.Path("/tmp/data")
11 | out.mkdir(parents=True, exist_ok=True)
12 | 
13 | # Run the target
14 | if __name__ == "__main__":
15 |     target = TargetRunner(
16 |         execute_command="target-csv",
17 |         required_config_keys=["destination_path"],
18 |         path_to_config="/tmp/config.json",
19 |         path_to_input="/tmp/tap_input.txt",
20 |         path_to_output="/tmp/target_output.txt",
21 |     )
22 |     target.run()
23 | 
24 |     # Zip up the CSV outputs into a single archive for the output artifact,
25 |     # storing each file at the root of the zip
26 |     with zipfile.ZipFile(out / "data.zip", mode="w") as zf:
27 |         for f in out.glob("*.csv"):
28 |             zf.write(f, arcname=f.name)
29 | 
-------------------------------------------------------------------------------- /argo/template-example/invoke-template.yml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: invoked-template-example- 5 | namespace: argo 6 | spec: 7 | entrypoint: workflow-runner 8 | 9 | templates: 10 | - name: workflow-runner 11 | dag: 12 | tasks: 13 | - name: workflow-1 14 | arguments: 15 | parameters: 16 | - name: tap_image 17 | value: tap-exchange-rates 18 | templateRef: 19 | name: singer-tap-to-csv-template 20 | template: tap-to-target 21 | 22 | - name: workflow-2 23 | arguments: 24 | parameters: 25 | - name: tap_image 26 | value: tap-exchange-rates 27 | templateRef: 28 | name: singer-tap-to-csv-template 29 | template: tap-to-target 30 | 31 | - name: workflow-3 32 | arguments: 33 | parameters: 34 | - name: tap_image 35 | value: tap-exchange-rates 36 | templateRef: 37 | name: singer-tap-to-csv-template 38 | template: tap-to-target 39 | 40 | -------------------------------------------------------------------------------- /argo/template-example/workflow-template.yml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: WorkflowTemplate 3 | metadata: 4 | name: singer-tap-to-csv-template 5 | namespace: argo 6 | spec: 7 | entrypoint: tap-to-target 8 | 9 | templates: 10 | - name: tap-to-target 11 | inputs: 12 | parameters: 13 | - name: tap_image 14 | steps: 15 | - - name: tap 16 | template: singer-tap 17 | arguments: 18 | parameters: 19 | - name: tap_image 20 | value: "{{inputs.parameters.tap_image}}" 21 | - - name: target 22 | template: singer-target 23 | arguments: 24 | parameters: 25 | - name: target_image 26 | value: "target-csv" 27 | - name: tap_image 28 | value: "{{inputs.parameters.tap_image}}" 29 | artifacts: 30 | - name: tap-output 31 | from: "{{steps.tap.outputs.artifacts.tap-output}}" 32 | 33 | - name: singer-tap 34 | container: 35 | image: "stkbailey/{{inputs.parameters.tap_image}}:latest" 36 | inputs: 37 | parameters: 38 | - name: tap_image 39 | artifacts: 40 | - name: tap-config 41 | path: /tmp/config.json 42 | raw: 43 | data: | 44 | {"start_date": "2020-08-01"} 45 | outputs: 46 | artifacts: 47 | - name: tap-output 48 | path: /tmp/tap_output.txt 49 | 50 | - name: singer-target 51 | container: 52 | image: "stkbailey/{{inputs.parameters.target_image}}:latest" 53 | inputs: 54 | parameters: 55 | - name: target_image 56 | - name: tap_image 57 | artifacts: 58 | - name: target-config 59 | path: /tmp/config.json 60 | raw: 61 | data: | 62 | {"delimiter": "\t", "quotechar": "'", "destination_path": "/tmp/data/"} 63 | - name: tap-output 64 | path: /tmp/tap_input.txt 65 | outputs: 66 | artifacts: 67 | - name: target-output 68 | path: /tmp/target_output.txt 69 | - name: target-data 70 | path: /tmp/data/data.zip 71 | s3: 72 | bucket: singer 73 | key: "outputs/{{inputs.parameters.tap_image}}/results.zip" 74 | endpoint: argo-artifacts:9000 75 | insecure: true 76 | accessKeySecret: 77 | name: argo-artifacts 78 | key: accesskey 79 | secretKeySecret: 80 | name: argo-artifacts 81 | key: secretkey 82 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / 
packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # LLocal Settings 132 | .vscode 133 | .venv -------------------------------------------------------------------------------- /argo/tap-exchange-rate-workflow.yml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: singer-tap-exchange-rates- 5 | namespace: argo 6 | spec: 7 | entrypoint: tap-to-target 8 | 9 | templates: 10 | - name: tap-to-target 11 | inputs: 12 | parameters: 13 | - name: tap_image 14 | value: tap-exchange-rates 15 | - name: target_image 16 | value: target-csv 17 | steps: 18 | - - name: tap 19 | template: singer-tap 20 | arguments: 21 | parameters: 22 | - name: tap_image 23 | value: "{{inputs.parameters.tap_image}}" 24 | - - name: target 25 | template: singer-target 26 | arguments: 27 | parameters: 28 | - name: target_image 29 | value: "{{inputs.parameters.target_image}}" 30 | - name: tap_image 31 | value: "{{inputs.parameters.tap_image}}" 32 | artifacts: 33 | - name: tap-output 34 | from: "{{steps.tap.outputs.artifacts.tap-output}}" 35 | 36 | - name: singer-tap 37 | container: 38 | image: "stkbailey/{{inputs.parameters.tap_image}}:latest" 39 | inputs: 40 | parameters: 41 | - name: tap_image 42 | artifacts: 
43 |       - name: tap-config
44 |         path: /tmp/config.json
45 |         raw:
46 |           data: |
47 |             {"start_date": "2020-08-01"}
48 |     outputs:
49 |       artifacts:
50 |       - name: tap-output
51 |         path: /tmp/tap_output.txt
52 | 
53 |   - name: singer-target
54 |     container:
55 |       image: "stkbailey/{{inputs.parameters.target_image}}:latest"
56 |     inputs:
57 |       parameters:
58 |       - name: target_image
59 |       - name: tap_image
60 |       artifacts:
61 |       - name: target-config
62 |         path: /tmp/config.json
63 |         raw:
64 |           data: |
65 |             {"delimiter": "\t", "quotechar": "'", "destination_path": "/tmp/data/"}
66 |       - name: tap-output
67 |         path: /tmp/tap_input.txt
68 |     outputs:
69 |       artifacts:
70 |       - name: target-output
71 |         path: /tmp/target_output.txt
72 |       - name: target-data
73 |         path: /tmp/data/data.zip
74 |         s3:
75 |           bucket: singer
76 |           key: "outputs/{{inputs.parameters.tap_image}}/results.zip"
77 |           endpoint: argo-artifacts:9000
78 |           insecure: true
79 |           accessKeySecret:
80 |             name: argo-artifacts
81 |             key: accesskey
82 |           secretKeySecret:
83 |             name: argo-artifacts
84 |             key: secretkey
85 | 
--------------------------------------------------------------------------------
/argo/template-example/exit-handler-template.yml:
--------------------------------------------------------------------------------
1 | apiVersion: argoproj.io/v1alpha1
2 | kind: WorkflowTemplate
3 | metadata:
4 |   name: slack-exit-handler-template
5 |   namespace: argo
6 | spec:
7 |   entrypoint: exit-handler
8 | 
9 |   templates:
10 |   - name: exit-handler
11 |     inputs:
12 |       parameters:
13 |       - name: workflow_status
14 |         value: null
15 |       - name: workflow_name
16 |         value: null
17 |       - name: workflow_failures
18 |         value: null
19 |       - name: webhook_url
20 |         value: "https://slack.com/my-webhook-url"
21 |     steps:
22 |     - - name: woop-woop-message
23 |         template: slack-notify
24 |         arguments:
25 |           parameters:
26 |           - name: workflow_name
27 |             value: "{{inputs.parameters.workflow_name}}"
28 |           - name: workflow_status
29 |             value: "{{inputs.parameters.workflow_status}}"
30 |           - name: workflow_failures
31 |             value: "{{inputs.parameters.workflow_failures}}"
32 |           - name: webhook_url
33 |             value: "{{inputs.parameters.webhook_url}}"
34 |           - name: slack_color
35 |             value: "#008000"
36 |         when: "{{inputs.parameters.workflow_status}} == Succeeded"
37 |       - name: arrgghho-message
38 |         template: slack-notify
39 |         arguments:
40 |           parameters:
41 |           - name: workflow_name
42 |             value: "{{inputs.parameters.workflow_name}}"
43 |           - name: workflow_status
44 |             value: "{{inputs.parameters.workflow_status}}"
45 |           - name: workflow_failures
46 |             value: "{{inputs.parameters.workflow_failures}}"
47 |           - name: webhook_url
48 |             value: "{{inputs.parameters.webhook_url}}"
49 |           - name: slack_color
50 |             value: "#FF0000"
51 |         when: "{{inputs.parameters.workflow_status}} != Succeeded"
52 | 
53 |   - name: slack-notify
54 |     inputs:
55 |       parameters:
56 |       - name: slack_color
57 |       - name: slack_channel
58 |         value: "notify-channel"
59 |       - name: webhook_url
60 |       - name: workflow_name
61 |       - name: workflow_status
62 |       - name: workflow_failures
63 |     container:
64 |       image: technosophos/slack-notify
65 |       env:
66 |       - name: SLACK_WEBHOOK
67 |         value: "{{inputs.parameters.webhook_url}}"
68 |       - name: SLACK_TITLE
69 |         value: "Workflow Complete: {{inputs.parameters.workflow_name}}"
70 |       - name: SLACK_MESSAGE
71 |         value: |
72 |           Name: {{inputs.parameters.workflow_name}}
73 |           Status: {{inputs.parameters.workflow_status}}
74 |           Failures: {{inputs.parameters.workflow_failures}}
75 |       - name: SLACK_CHANNEL
76 |         value: "{{inputs.parameters.slack_channel}}"
77 |       - name: SLACK_COLOR
78 |         value: "{{inputs.parameters.slack_color}}"
79 | 
--------------------------------------------------------------------------------
/argo/cronworkflow-example/tap-to-target-cronworkflow.yml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: CronWorkflow 3 | metadata: 4 | generateName: singer-tap-exchange-rates- 5 | namespace: argo 6 | spec: 7 | schedule: "0 0 * * *" 8 | workflowSpec: 9 | entrypoint: tap-to-target 10 | 11 | templates: 12 | - name: tap-to-target 13 | inputs: 14 | parameters: 15 | - name: tap_image 16 | value: tap-exchange-rates 17 | - name: target_image 18 | value: target-csv 19 | steps: 20 | - - name: tap 21 | template: singer-tap 22 | arguments: 23 | parameters: 24 | - name: tap_image 25 | value: "{{inputs.parameters.tap_image}}" 26 | - - name: target 27 | template: singer-target 28 | arguments: 29 | parameters: 30 | - name: target_image 31 | value: "{{inputs.parameters.target_image}}" 32 | - name: tap_image 33 | value: "{{inputs.parameters.tap_image}}" 34 | artifacts: 35 | - name: tap-output 36 | from: "{{steps.tap.outputs.artifacts.tap-output}}" 37 | 38 | - name: singer-tap 39 | container: 40 | image: "stkbailey/{{inputs.parameters.tap_image}}:latest" 41 | inputs: 42 | parameters: 43 | - name: tap_image 44 | artifacts: 45 | - name: tap-config 46 | path: /tmp/config.json 47 | raw: 48 | data: | 49 | {"start_date": "2020-08-01"} 50 | outputs: 51 | artifacts: 52 | - name: tap-output 53 | path: /tmp/tap_output.txt 54 | 55 | - name: singer-target 56 | container: 57 | image: "stkbailey/{{inputs.parameters.target_image}}:latest" 58 | inputs: 59 | parameters: 60 | - name: target_image 61 | - name: tap_image 62 | artifacts: 63 | - name: target-config 64 | path: /tmp/config.json 65 | raw: 66 | data: | 67 | {"delimiter": "\t", "quotechar": "'", "destination_path": "/tmp/data/"} 68 | - name: tap-output 69 | path: /tmp/tap_input.txt 70 | outputs: 71 | artifacts: 72 | - name: target-output 73 | path: /tmp/target_output.txt 74 | - name: target-data 75 | path: /tmp/data/data.zip 76 | s3: 77 | bucket: singer 78 | key: "outputs/{{inputs.parameters.tap_image}}/results.zip" 79 | endpoint: argo-artifacts:9000 80 | insecure: true 81 | accessKeySecret: 82 | name: argo-artifacts 83 | key: accesskey 84 | secretKeySecret: 85 | name: argo-artifacts 86 | key: secretkey 87 | -------------------------------------------------------------------------------- /argo/tap-covid-19-example/tap-covid-19-workflow.yml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: singer-tap-covid-19- 5 | namespace: argo 6 | spec: 7 | entrypoint: tap-to-target 8 | 9 | templates: 10 | - name: tap-to-target 11 | inputs: 12 | parameters: 13 | - name: tap_image 14 | value: tap-covid-19 15 | - name: target_image 16 | value: target-csv 17 | - name: github_api_token 18 | value: 3cd19e1bac7d9696ce83a81f93ed500fd7c44d6f 19 | steps: 20 | - - name: tap 21 | template: singer-tap 22 | arguments: 23 | parameters: 24 | - name: tap_image 25 | value: "{{inputs.parameters.tap_image}}" 26 | - name: github_api_token 27 | value: "{{inputs.parameters.github_api_token}}" 28 | - - name: target 29 | template: singer-target 30 | arguments: 31 | parameters: 32 | - name: target_image 33 | value: "{{inputs.parameters.target_image}}" 34 | - name: tap_image 35 | value: "{{inputs.parameters.tap_image}}" 36 | artifacts: 37 | - name: tap-output 38 | from: "{{steps.tap.outputs.artifacts.tap-output}}" 39 | 40 | - name: singer-tap 41 | container: 42 | image: 
"stkbailey/{{inputs.parameters.tap_image}}:latest" 43 | inputs: 44 | parameters: 45 | - name: tap_image 46 | - name: github_api_token 47 | artifacts: 48 | - name: tap-config 49 | path: /tmp/config.json 50 | raw: 51 | data: | 52 | { 53 | "api_token": "{{inputs.parameters.github_api_token}}", 54 | "start_date": "2020-09-01T00:00:00Z", 55 | "user_agent": "tap-covid-19 tutorial@immuta.com" 56 | } 57 | outputs: 58 | artifacts: 59 | - name: tap-output 60 | path: /tmp/tap_output.txt 61 | 62 | - name: singer-target 63 | container: 64 | image: "stkbailey/{{inputs.parameters.target_image}}:latest" 65 | inputs: 66 | parameters: 67 | - name: target_image 68 | - name: tap_image 69 | artifacts: 70 | - name: target-config 71 | path: /tmp/config.json 72 | raw: 73 | data: | 74 | {"delimiter": "\t", "quotechar": "'", "destination_path": "/tmp/data/"} 75 | - name: tap-output 76 | path: /tmp/tap_input.txt 77 | outputs: 78 | artifacts: 79 | - name: target-output 80 | path: /tmp/target_output.txt 81 | - name: target-data 82 | path: /tmp/data/data.zip 83 | s3: 84 | bucket: singer 85 | key: "outputs/{{inputs.parameters.tap_image}}/results.zip" 86 | endpoint: argo-artifacts:9000 87 | insecure: true 88 | accessKeySecret: 89 | name: argo-artifacts 90 | key: accesskey 91 | secretKeySecret: 92 | name: argo-artifacts 93 | key: secretkey 94 | -------------------------------------------------------------------------------- /argo/input-files-example/tap-covid-19-workflow-with-files.yml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: singer-tap-covid-19- 5 | namespace: argo 6 | spec: 7 | entrypoint: tap-to-target 8 | 9 | templates: 10 | - name: tap-to-target 11 | inputs: 12 | parameters: 13 | - name: tap_image 14 | value: tap-covid-19 15 | - name: target_image 16 | value: target-csv 17 | steps: 18 | - - name: tap 19 | template: singer-tap 20 | arguments: 21 | parameters: 22 | - name: tap_image 23 | value: "{{inputs.parameters.tap_image}}" 24 | - - name: target 25 | template: singer-target 26 | arguments: 27 | parameters: 28 | - name: target_image 29 | value: "{{inputs.parameters.target_image}}" 30 | - name: tap_image 31 | value: "{{inputs.parameters.tap_image}}" 32 | artifacts: 33 | - name: tap-output 34 | from: "{{steps.tap.outputs.artifacts.tap-output}}" 35 | 36 | - name: singer-tap 37 | container: 38 | image: "stkbailey/{{inputs.parameters.tap_image}}:latest" 39 | inputs: 40 | parameters: 41 | - name: tap_image 42 | artifacts: 43 | - name: tap-config 44 | path: /tmp/config.json 45 | s3: 46 | bucket: singer 47 | key: "config/{{inputs.parameters.tap_image}}/config.json" 48 | endpoint: argo-artifacts:9000 49 | insecure: true 50 | accessKeySecret: 51 | name: argo-artifacts 52 | key: accesskey 53 | secretKeySecret: 54 | name: argo-artifacts 55 | key: secretkey 56 | - name: tap-catalog 57 | path: /tmp/catalog.json 58 | s3: 59 | bucket: singer 60 | key: "config/{{inputs.parameters.tap_image}}/catalog.json" 61 | endpoint: argo-artifacts:9000 62 | insecure: true 63 | accessKeySecret: 64 | name: argo-artifacts 65 | key: accesskey 66 | secretKeySecret: 67 | name: argo-artifacts 68 | key: secretkey 69 | - name: tap-state 70 | path: /tmp/state.json 71 | s3: 72 | bucket: singer 73 | key: "config/{{inputs.parameters.tap_image}}/state.json" 74 | endpoint: argo-artifacts:9000 75 | insecure: true 76 | accessKeySecret: 77 | name: argo-artifacts 78 | key: accesskey 79 | secretKeySecret: 80 | name: argo-artifacts 81 | 
key: secretkey 82 | outputs: 83 | artifacts: 84 | - name: tap-output 85 | path: /tmp/tap_output.txt 86 | 87 | - name: singer-target 88 | container: 89 | image: "stkbailey/{{inputs.parameters.target_image}}:latest" 90 | inputs: 91 | parameters: 92 | - name: target_image 93 | - name: tap_image 94 | artifacts: 95 | - name: target-config 96 | path: /tmp/config.json 97 | s3: 98 | bucket: singer 99 | key: "config/{{inputs.parameters.target_image}}/config.json" 100 | endpoint: argo-artifacts:9000 101 | insecure: true 102 | accessKeySecret: 103 | name: argo-artifacts 104 | key: accesskey 105 | secretKeySecret: 106 | name: argo-artifacts 107 | key: secretkey 108 | - name: tap-output 109 | path: /tmp/tap_input.txt 110 | outputs: 111 | artifacts: 112 | - name: target-output 113 | path: /tmp/target_output.txt 114 | s3: 115 | bucket: singer 116 | key: "config/{{inputs.parameters.tap_image}}/state.json" 117 | endpoint: argo-artifacts:9000 118 | insecure: true 119 | accessKeySecret: 120 | name: argo-artifacts 121 | key: accesskey 122 | secretKeySecret: 123 | name: argo-artifacts 124 | key: secretkey 125 | - name: target-data 126 | path: /tmp/data/data.zip 127 | s3: 128 | bucket: singer 129 | key: "outputs/{{inputs.parameters.tap_image}}/results.zip" 130 | endpoint: argo-artifacts:9000 131 | insecure: true 132 | accessKeySecret: 133 | name: argo-artifacts 134 | key: accesskey 135 | secretKeySecret: 136 | name: argo-artifacts 137 | key: secretkey 138 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # data-replication-on-kubernetes 2 | 3 | ELT is the hot new data pipelining pattern, and thousands of data teams are using some combination of Stitch / FiveTran, dbt and a cloud data warehouse to manage their data warehouse. There are great SaaS offerings for these tools, but a lot of folks like pain or don't have budget. As a small data team at Immuta, we were both. 
4 | 
5 | This article assumes some familiarity with [Docker](https://docker.com), [Kubernetes](https://kubernetes.io/) and the [Singer](https://singer.io) specification. Even if you're new to these technologies, though, I will try to point out helpful resources to get you moving in the right direction.
6 | 
7 | First, we will discuss the problem we are trying to solve and why using containers can make a lot of sense. Then, I'll walk you through setting up some workflows at home (as long as your home runs macOS Catalina). And finally, we'll talk about what to consider if you'd like to move this into production.
8 | 
9 | ## Motivation
10 | 
11 | Let's start the discussion with two assertions:
12 | 
13 | 1. Data replication is a "solved" problem.
14 | 2. Kubernetes is a scalable data (process) management platform.
15 | 
16 | ETL is not the reason that anyone gets into data science or engineering. There is little creativity, lots of maintenance, and no recognition until something goes wrong. Fortunately, SaaS tools like [Stitch](https://www.stitchdata.com) and [FiveTran](https://www.fivetran.com) have pretty much turned data replication into a commodity that small teams can leverage. And where there are no existing supported connectors (for internal applications, say), a data scientist could write their own [Singer](https://singer.io) script and load the data themselves.
17 | 
18 | Keep this last scenario in mind, because it's the one I want to focus on here -- though of course a "data pipeline" is hardly _just_ data replication.
19 | 
20 | The "solved" nature of data replication makes it easier for data scientists to own projects end-to-end, freeing data engineers to think about the "platform" rather than point solutions. ([StitchFix has a terrific post on this.](https://multithreaded.stitchfix.com/blog/2016/03/16/engineers-shouldnt-write-etl/)) In fact, the players in this market recognize that it's really the stuff "around" the integrations that is the differentiator: the [Meltano](https://meltano.com) project out of GitLab, for example, found a niche in being [a "runner" for integration processes](https://www.dataengineeringpodcast.com/meltano-data-integration-episode-141/), rather than managing the end-to-end analytics pipeline.
21 | 
22 | ### Singer taps and targets
23 | 
24 | A quick summary of the Singer spec mentioned above:
25 | 
26 | - a `tap` connects to a service, extracts data, then emits a standardized stream of schemas and records as JSON
27 | - a `target` reads the record stream of a tap and loads it into a warehouse
28 | 
29 | This separation of tap and target decouples the "extract" step from the "load" step. So an organization with 10 sources and 1 data warehouse has ten tap-to-target pipelines to manage. If they migrate to a new warehouse, though, they only need to "swap out" the target; they won't make a single change to any taps.
30 | 
31 | One peculiarity of this isolation is that a best practice for running taps and targets is to isolate them in their own Python virtual environments, since each tap and target may have different dependencies, be built on different Singer versions, etc. And when I first read that, I thought: containers! Yet, there is scarcely a mention of containers in the Singer community, at least as far as I could tell.
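To make this concrete, here is what a tap-to-target run looks like on a laptop. (A minimal sketch: it assumes you've installed `tap-exchangeratesapi` and `target-csv` into their own virtual environments, per the best practice above, and that the two config files already exist.)

```{zsh}
# Each connector runs from its own virtualenv to keep dependencies isolated
~/.virtualenvs/tap-exchangeratesapi/bin/tap-exchangeratesapi --config tap_config.json \
  | ~/.virtualenvs/target-csv/bin/target-csv --config target_config.json
```

Everything that follows in this article is, in essence, this one pipe -- broken into containers and orchestrated by Kubernetes.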
32 | 
33 | ### Kubernetes and Argo Workflows
34 | 
35 | At Immuta, we have a small data team but a world-class engineering organization. As a rule, I like to defer to the engineering team, who have both knowledge and skill, since I, as a data scientist, have neither of these things.
36 | 
37 | The rest of our internal infrastructure runs on Kubernetes; as the lone data scientist, I felt the peer pressure of learning k9s and managing kubeconfigs. And I'm here to say -- this is a great approach, if you're willing to do some dirty work.
38 | 
39 | Enter [Argo](https://argoproj.github.io/), the "Kubernetes-Native Workflow Engine".
40 | 
41 | Argo Workflows is an open-source, container-native workflow engine for orchestrating parallel jobs on Kubernetes. It is an alternative to other orchestration tools, such as Airflow or Prefect, and the key differentiator is that it is container-based. This Data Council talk provides a nice comparison with Airflow specifically: [Kubernetes-Native Workflow Orchestration with Argo](https://www.datacouncil.ai/talks/kubernetes-native-workflow-orchestration-with-argo)
42 | 
43 | Now that we have a motivation for using Singer and Argo together, let's get to work!
44 | 
45 | ## Tutorial
46 | 
47 | In this tutorial, we will first set up a local Kubernetes cluster with Argo and MinIO storage for artifacts and config files,
48 | then deploy a Singer tap-to-target workflow, and finally discuss enhancements and extensions.
49 | 
50 | We could go deep on any one of these areas, but I'll try to keep it shallow and manageable for the tutorial, leaving further discussion of "production" considerations to the end. The tutorial has the following prerequisites:
51 | 
52 | 1. Docker Desktop, installed locally with its built-in Kubernetes cluster enabled. (Minikube or another cluster is fine as well.)
53 | 2. Helm, which we will use to install MinIO for this tutorial.
54 | 
55 | To start, we are going to be using two of the simplest singer.io packages -- `tap-exchangeratesapi` and `target-csv` -- to demonstrate how this works.
56 | 
57 | ### 1. Setting up Argo
58 | 
59 | In this first section, we need to set up a Kubernetes cluster, Argo Workflows, an artifact repository, and a couple of storage buckets. The simplest approach is to use Docker Desktop, which is multi-platform and can deploy a Kubernetes cluster for you. This also makes local development a bit easier, since you can build containers and deploy to a local cluster without pushing to external container repositories.
60 | 
61 | #### Install Docker Desktop and enable Kubernetes
62 | 
63 | You will need a Kubernetes cluster to which you have admin access and can deploy resources using `kubectl`. If you already have that, you can skip this section. If you don't, the easiest way to get one is Docker Desktop. You'll want to follow the documentation on Docker's website.
64 | 
65 | 1. Install Docker Desktop. [Mac](https://docs.docker.com/docker-for-mac/install/)
66 | 2. Enable Kubernetes. [Mac](https://docs.docker.com/docker-for-mac/kubernetes/)
67 | 3. Bump up Docker's resource allocation to at least 12 GB to ensure enough room for MinIO storage.
68 |    - ![](assets/kubernetes-resources.png)
69 | 4. Test out the `kubectl` command. [Mac](https://docs.docker.com/docker-for-mac/kubernetes/#use-the-kubectl-command)
70 | 5. Install the latest Argo CLI. [Mac](https://github.com/argoproj/argo/releases)
71 | 
72 | Once you have that working, feel free to play around with your Kubernetes cluster.
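A couple of quick smoke tests will confirm the cluster is reachable before you move on:

```{zsh}
kubectl config get-contexts   # "docker-desktop" should be the active context
kubectl get nodes             # should report a single node in "Ready" status
```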
73 | 
74 | #### Install Argo Workflows
75 | 
76 | Next, we need to install Argo. You can follow the [Quick Start Guide](https://argoproj.github.io/argo/quick-start/) to get some context, or simply use the commands below to create an `argo` namespace in your cluster and deploy the resources.
77 | 
78 | ```{zsh}
79 | # Create the argo namespace, then deploy the "quick start" resources
80 | kubectl create ns argo
81 | kubectl apply -n argo -f https://raw.githubusercontent.com/argoproj/argo/stable/manifests/quick-start-postgres.yaml
82 | ```
83 | 
84 | You should now have an Argo server and a few new Kubernetes resource types used by Argo, including `Workflow` and `CronWorkflow`. To get a glimpse of the Argo Workflows UI, you can forward the Kubernetes port to your localhost.
85 | 
86 | ```{zsh}
87 | kubectl -n argo port-forward deployment/argo-server 2746:2746
88 | ```
89 | 
90 | ![The Argo UI](assets/argo-ui.png)
91 | 
92 | #### Set up MinIO Storage
93 | 
94 | Argo Workflows can pass files into or out of a container through the use of "artifacts". For local deployments, an easy way to configure artifact passing is through a Kubernetes deployment of [MinIO](https://min.io/). Argo has [plenty of guidance](https://argoproj.github.io/argo/configure-artifact-repository/) on setting this up with other services, but you can follow along below for a quick MinIO deployment.
95 | 
96 | Note that you'll need to have `helm` installed (essentially, a Kubernetes package manager). On Mac, you can use Homebrew: `brew install helm`. Then, run the following:
97 | 
98 | ```{zsh}
99 | # Add the MinIO helm chart
100 | helm repo add minio https://helm.min.io/
101 | helm repo update
102 | 
103 | # Deploy MinIO in the "argo" namespace (this may take a minute)
104 | helm install argo-artifacts minio/minio \
105 |   -n argo \
106 |   --set service.type=LoadBalancer \
107 |   --set defaultBucket.enabled=true \
108 |   --set defaultBucket.name=artifacts \
109 |   --set persistence.enabled=false \
110 |   --set fullnameOverride=argo-artifacts
111 | ```
112 | 
113 | You now have an artifacts server running, but it's empty! Let's create an artifacts bucket, along with a bucket for Singer configuration and outputs. To use the commands below, you'll need the MinIO CLI tool: `brew install minio/stable/mc`.
114 | 
115 | ```{zsh}
116 | # Add config host
117 | mc config host add argo-artifacts-local http://localhost:9000 YOURACCESSKEY YOURSECRETKEY
118 | 
119 | # Create buckets in MinIO
120 | mc mb argo-artifacts-local/artifacts
121 | mc mb argo-artifacts-local/singer
122 | ```
123 | 
124 | You can check the buckets in your browser by running `kubectl -n argo port-forward service/argo-artifacts 9000:9000` and visiting `http://localhost:9000`, or list them from the command line:
125 | 
126 | ```{zsh}
127 | mc ls argo-artifacts-local
128 | ```
129 | 
130 | #### Map Argo and MinIO together
131 | 
132 | Finally, we need to tell Argo where the default artifact repository is, so that it knows which bucket to map artifacts to and has the appropriate secrets for authentication. You could follow the instructions [elsewhere](https://sourcegraph.com/github.com/argoproj/argo@095d67f8d0f1d309529c8a400cb16d0a0e2765b9/-/blob/demo.md#5-install-an-artifact-repository), but for simplicity's sake, I suggest applying this "patch" to the workflow controller's ConfigMap.
133 | 
134 | ```{zsh}
135 | wget -O argo-artifact-patch.yml https://raw.githubusercontent.com/stkbailey/data-replication-on-kubernetes/master/argo/argo-artifact-patch.yml
136 | kubectl patch configmap workflow-controller-configmap -n argo --patch "$(cat argo-artifact-patch.yml)"
137 | ```
138 | 
139 | To make sure you've got everything working in this first section, try running the "artifact-passing" example from the Argo examples repository.
140 | 
141 | ```{zsh}
142 | argo submit -n argo https://raw.githubusercontent.com/argoproj/argo/master/examples/artifact-passing.yaml --watch
143 | ```
144 | 
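If the example succeeds, artifact passing is wired up. Because the patch also sets `archiveLogs: true`, the step logs are archived as well, so you should be able to see the workflow's files land in the `artifacts` bucket:

```{zsh}
mc ls --recursive argo-artifacts-local/artifacts
```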
145 | ### The Argo workflow
146 | 
147 | You've got Kubernetes, you've got Argo: let's not wait around! Run the following command:
148 | 
149 | ```{zsh}
150 | argo submit -n argo --watch https://raw.githubusercontent.com/stkbailey/data-replication-on-kubernetes/master/argo/tap-exchange-rate-workflow.yml
151 | ```
152 | 
153 | You should see a workflow kick off in your terminal. (Alternatively, you could go to that link, copy the text and paste it into the Argo Workflows "new workflow" UI.) If you receive an error message along the lines of `failed to save outputs: timed out waiting for the condition`, simply try running the workflow again.
154 | 
155 | You should see a two-step Workflow create and finish. Let's check the "outputs" storage bucket and see what's available.
156 | 
157 | ```{zsh}
158 | mc ls argo-artifacts-local/singer/outputs/tap-exchange-rates/
159 | ```
160 | 
161 | #### Dissecting the workflow
162 | 
163 | Let's walk through what this workflow does:
164 | 
165 | 1. It starts a `tap-to-target` multi-step workflow, consisting of two child steps: the `tap` step and the `target` step.
166 | 2. It kicks off the `tap` child step and passes in some configuration files, which get mapped to `/tmp/config.json`. When the tap step completes, the output file at `/tmp/tap_output.txt` is stored in the default MinIO artifact repository.
167 | 3. It then kicks off the `target` child step and maps both the configuration and the tap output into the container's file system. When the target runs, the output (from the CSV target) is mapped to the `outputs` MinIO bucket.
168 | 
169 | Argo makes all of this extremely "template-able", as we'll see in a few moments. But it is all dependent on the containers knowing where to expect certain files. Which leads us to ask: what's going on in the containers?
170 | 
171 | #### The Preamble
172 | 
173 | We begin the Workflow by adding some metadata and specifying that we want to create a "Workflow" resource. This is a single run of the workflow.
174 | 
175 | ```{yaml}
176 | apiVersion: argoproj.io/v1alpha1
177 | kind: Workflow
178 | metadata:
179 |   name: singer-workflow
180 |   namespace: argo
181 | spec:
182 |   entrypoint: tap-to-target
183 |   templates: [ ... ]
184 | ```
185 | 
186 | Next, we specify our templates. We will be building three of these: a `tap-to-target` DAG, a `tap`, and a `target`. We _could_ do this all in a single step, but we'll talk about reusability in a moment.
187 | 
188 | Let's begin with the DAG template, which takes a couple of input parameters -- the names, the images and the artifact bucket:
189 | 
190 | - name:
191 | - input parameters
192 | - steps:
193 | 
194 | The DAG template references a couple of other "templates" that also need to be defined:
195 | 
196 | - name:
197 | - container
198 | - input parameters:
199 | - input artifacts:
200 | - output artifacts:
201 | 
202 | You can re-run the workflow, changing the configuration as desired. And that's the beauty of this workflow: once set up, adding a new tap is simply a matter of changing some container locations and config files, as sketched below.
203 | 
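For instance, workflow parameters can be overridden at submit time with the Argo CLI's `-p` flag. (A hypothetical example: `tap-some-service` stands in for any tap image you've published that accepts the same raw config as the exchange-rates tap.)

```{zsh}
argo submit -n argo --watch \
    -p tap_image=tap-some-service \
    https://raw.githubusercontent.com/stkbailey/data-replication-on-kubernetes/master/argo/tap-exchange-rate-workflow.yml
```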
204 | #### Dockerizing Singer
205 | 
206 | Let's put Kubernetes to the side for a moment and focus on a typical Singer pipeline. It's a linear process:
207 | 
208 | 1. Tap inputs: configuration file, catalog file, state file.
209 | 2. Tap outputs: a stream of messages (schemas, records, state) in Singer format.
210 | 3. Target inputs: configuration file, tap output stream.
211 | 4. Target outputs: loaded/exported data (e.g. to a database or CSV file), state file.
212 | 
213 | We can treat the containers themselves as black boxes, so long as we know how to feed in the appropriate inputs and collect the outputs. To keep things simple in this tutorial, I have pre-created `tap` and `target` containers for our use. To test one out, run this command:
214 | 
215 | ```{zsh}
216 | docker run -e START_DATE=2020-08-01 stkbailey/tap-exchange-rates
217 | ```
218 | 
219 | This should kick off the `tap-exchangeratesapi`, which doesn't require any special configuration to run.
220 | 
221 | Without going into too much detail, we are wrapping the Singer "tap / target" commands in a Runner object that defines where to look for the necessary files. (It's available at [singer-container-utils](https://github.com/immuta/singer-container-utils).) There are many ways to skin that cat, but the important thing is to make it easy to reproduce across multiple taps.
222 | 
223 | We won't dig into the details, but each of these Docker containers runs a Python `entrypoint.py` script when it is initialized. The gist of the entrypoint script is:
224 | 
225 | 1. Identify the paths where `config`, `catalog`, `state` and `input` files exist.
226 | 2. Build the appropriate tap/target executable command, based on file availability.
227 | 3. Run the command and write its output to a file.
228 | 
229 | It's a lot of overhead for a single command, but when you have multiple containers to run, the abstractions make life easier. In shell terms, a tap container's logic amounts to the sketch below.
230 | 
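(This is a rough shell equivalent for illustration only -- the actual logic lives in the `TapRunner` class of `singer-container-utils`.)

```{zsh}
# Build up the tap invocation based on which files were mounted in
CMD="tap-covid-19 --config /tmp/config.json"
[ -f /tmp/catalog.json ] && CMD="${CMD} --catalog /tmp/catalog.json"
[ -f /tmp/state.json ] && CMD="${CMD} --state /tmp/state.json"

# Run the tap and write the message stream to the file Argo exports as an artifact
${CMD} > /tmp/tap_output.txt
```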
231 | ## Discussion
232 | 
233 | It is hopefully not hard to see how, with a few additional tweaks and some well-protected S3 buckets, this could be turned into a fairly robust architecture. We can easily turn the workflow specified above into an Argo `WorkflowTemplate` and reference it in a `CronWorkflow` to have it run on an hourly or daily basis.
234 | 
235 | But, exciting as all this is, it has to be noted that this is not for the faint of heart. As a data scientist, there are a lot of considerations:
236 | 
237 | - How are you building and updating the containers? Does your cluster have access to that container repository?
238 | - How are you triggering new workflows?
239 | - Networking can be a pain.
240 | 
241 | You may find that `pipelinewise` or `meltano` can do the same work for you, with less overhead. Alternatively, you may find that running taps on Argo gives you a nice method for doing custom replications.
242 | 
243 | ### Logging and observability are _different_ in containers
244 | 
245 | Workflow logs are scattered across pods, so you need a deliberate strategy for surfacing failures. We use a Slack exit-handler (see `argo/template-example/exit-handler-template.yml`) to notify our team of successes and failures.
246 | 
247 | ### Schedules and templates
248 | 
249 | The "promised land" of Argo, though, is that once it is set up, you simply have to change the image location to add a new tap -- and this is true! The only things that should change between runs are the container that's invoked and the configuration files passed in.
250 | 
251 | The questions you need to ask are:
252 | 
253 | - Do I have the budget to just pay for this (Stitch / FiveTran)?
254 | - Does a simpler (single-node) architecture work, such as Meltano?
255 | - Does a Kubernetes architecture fit with the rest of the company's infrastructure?
256 | - Can I reuse the Argo architecture for other processes, such as transformation pipelines, data quality testing, etc.?
257 | 
258 | At Immuta, we went with this architecture because:
259 | 
260 | 1. We didn't have budget (yet) for an enterprise service but found ourselves needing to run custom taps.
261 | 2. We were already comfortable with containerized applications.
262 | 3. The rest of our company's infrastructure was run on Kubernetes and leveraged other Argo products.
263 | 4. We had other projects, such as data quality jobs, that needed a platform to run on, and we did not have previous expertise with Airflow or Prefect.
264 | 
265 | Thanks for reading!
266 | 
267 | ## References
268 | 
269 | - Getting Started with Singer [GitHub](https://github.com/singer-io/getting-started)
270 | - Building a Singer Tap - Infographic [Blog](https://www.stitchdata.com/blog/how-to-build-a-singer-tap-infographic/)
271 | - Singer Container Utils [GitHub](https://github.com/immuta/singer-container-utils)
272 | 
273 | ## Draft
274 | 
275 | #### Doing this at home
276 | 
277 | 1. Create an Argo Workflow template that uses variables to select the tap and target container. This serves as the backbone of your process; everything else is parameterized.
278 | 2. Create a `target` container for your data warehouse.
279 | 3. Save the `config.json` for your target to a secure location.
280 | 
281 | Once that's set up, for each new tap, you:
282 | 
283 | 1. Create a new `tap` container. The tap runs a Python script on start that checks a few default locations for configs, catalogs, etc. Deploy the new container to ECR so that it is accessible by the Argo service user.
284 | 2. Save the `config.json`, `catalog.json`, and an initial `state.json` to a config folder on S3.
285 | 3. Create a new CronWorkflow (or WorkflowTemplate) for your job that references your new tap image and config locations.
286 | 
287 | You should see that the container emits a logging stream of updates -- but it does not emit the actual data itself. This is slightly different from how the tap would work if you were to run it locally. What we have done instead is write the tap output to a file inside the container.
288 | 
289 | This decision makes it less straightforward to simply "pipe" the output of one container to another, but it gives us greater control over where the logs (which are the data) are ultimately stored.
290 | 
291 | Now, let's try mapping a configuration file into the container, rather than providing a `START_DATE` environment variable directly.
292 | 
293 | ```{zsh}
294 | docker run \
295 |     --mount type=bind,source="$(pwd)"/singer-configs/tap-exchange-rates/config.json,target=/opt/code/config.json \
296 |     stkbailey/tap-exchange-rates:latest
297 | ```
298 | 
299 | You should see a similar result to the first run.
300 | 
301 | Next, let's take a quick look at how the target works by running:
302 | 
303 | ```{zsh}
304 | docker run \
305 |     --env DESTINATION_PATH=/tmp \
306 |     stkbailey/target-csv:latest
307 | ```
308 | 
309 | Once again, we could map some additional files into the container -- and will need to do so to pass the tap output along.
310 | 
--------------------------------------------------------------------------------