├── tac
    ├── __init__.py
    ├── fetch.py
    ├── predict.py
    ├── transform.py
    ├── task-dummy.py
    └── task.py
├── requirements.txt
├── setup.py
├── Dockerfile
└── README.md

/tac/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | luigi
2 | click
3 | boto3
4 | pykube
5 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | setup(
4 |     name='tac',
5 |     version='0.1',
6 |     packages=find_packages(),
7 | )
8 |
9 |
10 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python
2 |
3 | COPY requirements.txt /requirements.txt
4 | RUN pip install -r /requirements.txt
5 | COPY . /tac
6 | RUN pip install /tac
7 |
8 | ARG AWS_ACCESS_KEY_ID
9 | ENV AWS_ACCESS_KEY_ID=${AWS_ACCESS_KEY_ID}
10 | ARG AWS_SECRET_ACCESS_KEY
11 | ENV AWS_SECRET_ACCESS_KEY=${AWS_SECRET_ACCESS_KEY}
12 | ARG S3_BUCKET
13 | ENV S3_BUCKET=${S3_BUCKET}
14 |
15 |
--------------------------------------------------------------------------------
/tac/fetch.py:
--------------------------------------------------------------------------------
1 | import click
2 | from time import sleep
3 | from luigi.contrib.s3 import S3Target
4 | import boto3.s3.transfer  # Luigi's bug workaround
5 |
6 |
7 | def fetch_data(input_path, output_path):  # Placeholder fetch step: logs both paths, waits, then touches the output target
8 |     print('Reading from {} and writing to {}'.format(input_path, output_path))
9 |     sleep(1)  # presumably stands in for the real download time
10 |     S3Target(output_path).open('w').close()  # open the S3 target for writing and close it at once; no data is written
11 |
12 |
13 | @click.command()
14 | @click.argument('input-path')
15 | @click.argument('output-path')
16 | def cli(input_path, output_path):  # command-line entry point: two positional arguments, forwarded unchanged to fetch_data
17 |     fetch_data(input_path, output_path)
18 |
19 |
20 | if __name__ == '__main__':
21 |     cli()
22 |
--------------------------------------------------------------------------------
/tac/predict.py:
--------------------------------------------------------------------------------
1 | import click
2 | from luigi.contrib.s3 import S3Target
3 | from time import sleep
4 | import boto3.s3.transfer  # Luigi's bug workaround
5 |
6 |
7 | def predict(model_name, data_path, output_path):  # Placeholder prediction step: logs its arguments, waits, then touches the output target
8 |     print('Reading data from {}, predicting with model {} and writing to {}'
9 |           .format(data_path, model_name, output_path))
10 |     sleep(1)  # presumably stands in for the real inference time
11 |     S3Target(output_path).open('w').close()  # open the S3 target for writing and close it at once; no data is written
12 |
13 |
14 | @click.command()
15 | @click.argument('model-name')
16 | @click.argument('data-path')
17 | @click.argument('output-path')
18 | def cli(model_name, data_path, output_path):  # command-line entry point: three positional arguments, forwarded unchanged to predict
19 |     predict(model_name, data_path, output_path)
20 |
21 |
22 | if __name__ == '__main__':
23 |     cli()
24 |
--------------------------------------------------------------------------------
/tac/transform.py:
-------------------------------------------------------------------------------- 1 | import click 2 | from time import sleep 3 | from luigi.contrib.s3 import S3Target 4 | import boto3.s3.transfer # Luigi's bug workaround 5 | 6 | 7 | def transform_data(paths): 8 | print('Transforming data') 9 | sleep(3) 10 | return 123 11 | 12 | 13 | def save_result(data, path): 14 | print('Saving result') 15 | sleep(3) 16 | S3Target(path).open('w').close() 17 | 18 | 19 | @click.command() 20 | @click.argument('output-path') 21 | @click.argument('input-paths', nargs=-1) 22 | def cli(output_path, input_paths): 23 | result = transform_data(paths=input_paths) 24 | save_result(data=result, path=output_path) 25 | 26 | 27 | if __name__ == '__main__': 28 | cli() 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Task as Containers example project 2 | 3 | ## Details 4 | Please read the detailed article at [Data Revenue's Blog](https://www.datarevenue.com/blog/). 5 | 6 | ## How to run 7 | - Clone the repo and `cd` into its root. 8 | - Install requirements and the project itself. 9 | - Export your [AWS credentials](https://console.aws.amazon.com/iam/home#/security_credential) 10 | into AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables. 11 | - Export name of your S3 bucket name into S3_BUCKET env variable. 12 | - Spin up a Minikube cluster. 13 | - Build a docker image inside Minikube VM: 14 | ```bash 15 | eval $(minikube docker-env) 16 | docker build -t tac-example:v1 . 
--build-arg AWS_ACCESS_KEY_ID=$AWS_ACCESS_KEY_ID --build-arg AWS_SECRET_ACCESS_KEY=$AWS_SECRET_ACCESS_KEY --build-arg S3_BUCKET=$S3_BUCKET 17 | ``` 18 | - Run the pipeline: 19 | ```bash 20 | luigi --module tac.task MakePredictions --date 2018-01-01 21 | ``` 22 | -------------------------------------------------------------------------------- /tac/task-dummy.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | from time import sleep 3 | import luigi 4 | import os 5 | 6 | from luigi.contrib.kubernetes import KubernetesJobTask 7 | from luigi.contrib.s3 import S3Target 8 | 9 | 10 | IMAGE = 'tac-example:v1' 11 | BUCKET = os.environ['S3_BUCKET'] 12 | 13 | 14 | class SourceData(luigi.ExternalTask): 15 | date = luigi.DateParameter() 16 | 17 | def output(self): 18 | return S3Target( 19 | path='s3://{bucket}/tac-example/data/source/{date:%Y-%m-%d}.csv' 20 | .format(bucket=BUCKET, date=self.date) 21 | ) 22 | 23 | def complete(self): 24 | """Hack so we don't have to create input files manually. 25 | 26 | Luigi will always think that this task is done, without checking for 27 | presence of source files. 
28 | """ 29 | return True 30 | 31 | 32 | class FetchData(luigi.Task): 33 | date = luigi.DateParameter() 34 | 35 | def requires(self): 36 | return SourceData(date=self.date) 37 | 38 | def output(self): 39 | return S3Target( 40 | path='s3://{bucket}/tac-example/data/raw/{date:%Y-%m-%d}.csv' 41 | .format(bucket=BUCKET, date=self.date) 42 | ) 43 | 44 | def run(self): 45 | print('Reading from {} and writing to {}' 46 | .format(self.input().path, self.output().path)) 47 | sleep(1) 48 | # self.output().makedirs() 49 | self.output().open('w').close() 50 | 51 | 52 | class TransformData(KubernetesJobTask): 53 | date = luigi.DateParameter() 54 | 55 | @property 56 | def name(self): 57 | return 'transform-data' 58 | 59 | @property 60 | def spec_schema(self): 61 | return { 62 | "containers": [{ 63 | "name": self.name, 64 | "image": 'tac-example:v1', 65 | "command": self.cmd 66 | }], 67 | } 68 | 69 | def requires(self): 70 | for delta in range(1, 11): 71 | yield FetchData(date=self.date - timedelta(days=delta)) 72 | 73 | def output(self): 74 | return S3Target( 75 | path='s3://{bucket}/tac-example/data/transformed/{date:%Y-%m-%d}.csv' 76 | .format(bucket=BUCKET, date=self.date) 77 | ) 78 | 79 | @property 80 | def cmd(self): 81 | command = ['python', '-m', 'tac.transform', self.output().path] 82 | command += [item.path for item in self.input()] 83 | return command 84 | 85 | 86 | class Predict(luigi.Task): 87 | date = luigi.DateParameter() 88 | model_name = luigi.Parameter() 89 | 90 | def requires(self): 91 | return TransformData(date=self.date) 92 | 93 | def output(self): 94 | return S3Target( 95 | path='s3://{bucket}/tac-example/data/predictions/{date:%Y-%m-%d}_{model}.csv' 96 | .format(bucket=BUCKET, date=self.date, model=self.model_name) 97 | ) 98 | 99 | def run(self): 100 | print('Predicting with model {} and saving to {}' 101 | .format(self.model_name, self.output().path)) 102 | sleep(1) 103 | # self.output().makedirs() 104 | self.output().open('w').close() 105 | 106 | 107 | 
class MakePredictions(luigi.WrapperTask): 108 | date = luigi.DateParameter() 109 | 110 | def requires(self): 111 | for model_name in ['A', 'B']: 112 | yield Predict(date=self.date, model_name=model_name) 113 | -------------------------------------------------------------------------------- /tac/task.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | import luigi 4 | from luigi.contrib.s3 import S3Target 5 | from luigi.contrib.kubernetes import KubernetesJobTask 6 | import os 7 | 8 | IMAGE = "tac-example:v1" 9 | BUCKET = os.environ['S3_BUCKET'] 10 | 11 | 12 | class SourceData(luigi.ExternalTask): 13 | date = luigi.DateParameter() 14 | 15 | def output(self): 16 | return S3Target( 17 | path='s3://{bucket}/tac-example/source/{date:%Y-%m-%d}.csv' 18 | .format(bucket=BUCKET, date=self.date) 19 | ) 20 | 21 | def complete(self): 22 | """Hack so we don't have to create input files manually. 23 | 24 | Luigi will always think that this task is done, without checking for 25 | presence of source files. 
26 | """ 27 | return True 28 | 29 | 30 | class FetchData(KubernetesJobTask): 31 | date = luigi.DateParameter() 32 | 33 | @property 34 | def name(self): 35 | return 'transform-data' 36 | 37 | @property 38 | def spec_schema(self): 39 | return { 40 | "containers": [{ 41 | "name": self.name, 42 | "image": IMAGE, 43 | "command": self.cmd 44 | }], 45 | } 46 | 47 | def requires(self): 48 | return SourceData(date=self.date) 49 | 50 | def output(self): 51 | return S3Target( 52 | path='s3://{bucket}/tac-example/data/raw/{date:%Y-%m-%d}.csv' 53 | .format(bucket=BUCKET, date=self.date) 54 | ) 55 | 56 | @property 57 | def cmd(self): 58 | command = ['python', '-m', 'tac.fetch', 59 | self.input().path, self.output().path] 60 | return command 61 | 62 | 63 | class TransformData(KubernetesJobTask): 64 | date = luigi.DateParameter() 65 | 66 | @property 67 | def name(self): 68 | return 'transform-data' 69 | 70 | @property 71 | def spec_schema(self): 72 | return { 73 | "containers": [{ 74 | "name": self.name, 75 | "image": IMAGE, 76 | "command": self.cmd 77 | }], 78 | } 79 | 80 | def requires(self): 81 | for delta in range(1, 11): 82 | yield FetchData(date=self.date - timedelta(days=delta)) 83 | 84 | def output(self): 85 | return S3Target( 86 | path='s3://{bucket}/tac-example/data/transformed/{date:%Y-%m-%d}.csv' 87 | .format(bucket=BUCKET, date=self.date) 88 | ) 89 | 90 | @property 91 | def cmd(self): 92 | command = ['python', '-m', 'tac.transform', self.output().path] 93 | command += [item.path for item in self.input()] 94 | return command 95 | 96 | 97 | class Predict(KubernetesJobTask): 98 | date = luigi.DateParameter() 99 | model_name = luigi.Parameter() 100 | 101 | @property 102 | def name(self): 103 | return 'predict' 104 | 105 | @property 106 | def spec_schema(self): 107 | return { 108 | "containers": [{ 109 | "name": self.name, 110 | "image": IMAGE, 111 | "command": self.cmd 112 | }], 113 | } 114 | 115 | def requires(self): 116 | return TransformData(date=self.date) 117 | 118 | 
def output(self): 119 | return S3Target( 120 | path='s3://{bucket}/tac-example/data/predictions/{date:%Y-%m-%d}_{model}.csv' 121 | .format(bucket=BUCKET, date=self.date, model=self.model_name) 122 | ) 123 | 124 | @property 125 | def cmd(self): 126 | command = ['python', '-m', 'tac.predict', 127 | self.model_name, self.input().path, self.output().path] 128 | return command 129 | 130 | 131 | class MakePredictions(luigi.WrapperTask): 132 | date = luigi.DateParameter() 133 | 134 | def requires(self): 135 | for model_name in ['A', 'B']: 136 | yield Predict(date=self.date, model_name=model_name) 137 | --------------------------------------------------------------------------------