├── .flake8
├── .github
└── workflows
│ ├── adorable_unicorns_deploy.yaml
│ └── ghcr_flytelab.yml
├── .gitignore
├── LICENSE
├── README.md
├── projects
├── bravemusic
│ ├── .dockerignore
│ ├── .flyte
│ │ ├── remote-config.yaml
│ │ ├── remote.config
│ │ ├── sandbox-config.yaml
│ │ └── sandbox.config
│ ├── .gitpod.yml
│ ├── Dockerfile
│ ├── README.md
│ ├── bravemusic
│ │ ├── __init__.py
│ │ ├── datasource.py
│ │ ├── preprocess.py
│ │ ├── train.py
│ │ └── workflows.py
│ ├── dashboard
│ │ ├── Option_B
│ │ │ ├── app.py
│ │ │ └── serve.py
│ │ ├── app.py
│ │ ├── remote.config
│ │ └── sandbox.config
│ ├── deploy.py
│ ├── procfile
│ ├── requirements-dev.txt
│ ├── requirements.txt
│ └── setup.sh
├── destinations_similarity
│ ├── .dockerignore
│ ├── .flyte
│ │ ├── remote-config.yaml
│ │ ├── remote.config
│ │ ├── sandbox-config.yaml
│ │ └── sandbox.config
│ ├── Dockerfile
│ ├── Makefile
│ ├── README.md
│ ├── conf.py
│ ├── dashboard
│ │ ├── app.py
│ │ ├── beach_kinder.jpeg
│ │ ├── kinder.jpeg
│ │ └── requirements.txt
│ ├── deploy.py
│ ├── destinations_similarity
│ │ ├── __init__.py
│ │ ├── processing
│ │ │ ├── __init__.py
│ │ │ ├── feature_engineering.py
│ │ │ └── text_preprocessing.py
│ │ ├── scraper
│ │ │ ├── __init__.py
│ │ │ ├── brazilian_cities.py
│ │ │ └── extractor.py
│ │ ├── tasks.py
│ │ └── workflows.py
│ ├── docs
│ │ ├── Dockerfile
│ │ ├── dashboard
│ │ │ ├── dashboard.rst
│ │ │ └── streamlit.rst
│ │ ├── guides
│ │ │ ├── deploy.rst
│ │ │ ├── deploy_code.rst
│ │ │ ├── docs.rst
│ │ │ └── guide.rst
│ │ ├── images
│ │ │ ├── SolutionDiagram.png
│ │ │ ├── kinzinhoApresentando.jpg
│ │ │ ├── kinzinhoBagunceiro.jpg
│ │ │ ├── kinzinhoBigDog.png
│ │ │ ├── kinzinhoCachu.jpg
│ │ │ ├── kinzinhoGalante.jpg
│ │ │ ├── kinzinhoPensativo.jpg
│ │ │ ├── sphinx_server.png
│ │ │ └── vamoDalheLogo.jpeg
│ │ ├── model
│ │ │ ├── feature_engineering.rst
│ │ │ ├── model.rst
│ │ │ └── text_preprocessing.rst
│ │ └── scraper
│ │ │ ├── extractor.rst
│ │ │ ├── scraper.rst
│ │ │ └── source.rst
│ ├── index.rst
│ ├── requirements-dev.txt
│ ├── requirements.txt
│ └── scripts
│ │ ├── open_docs.sh
│ │ └── rebuild_docs.sh
├── weather_forecasting
│ ├── .flyte
│ │ ├── remote-config.yaml
│ │ ├── remote.config
│ │ ├── sandbox-config.yaml
│ │ └── sandbox.config
│ ├── .gitignore
│ ├── DEPLOYMENT.md
│ ├── Dockerfile
│ ├── Makefile
│ ├── README.md
│ ├── app
│ │ ├── __init__.py
│ │ └── workflow.py
│ ├── dashboard
│ │ ├── flyte.config
│ │ ├── requirements.txt
│ │ └── weather_forecasting.py
│ ├── in_container.mk
│ ├── requirements.txt
│ └── scripts
│ │ ├── activate-launch-plans.sh
│ │ ├── archive-launch-plans.sh
│ │ └── launch-plan-status.sh
└── whats_cooking_good_looking
│ ├── .flyte
│ ├── remote-config.yaml
│ ├── remote.config
│ ├── sandbox-config.yaml
│ └── sandbox.config
│ ├── Dockerfile
│ ├── README.md
│ ├── dashboard
│ ├── app.py
│ ├── remote.config
│ └── sandbox.config
│ ├── deploy.py
│ ├── docs
│ ├── actual_pipeline.png
│ ├── apply_pipeline.png
│ ├── label_studio.png
│ ├── target_pipeline.png
│ └── train_pipeline.png
│ ├── requirements-dev.txt
│ ├── requirements.txt
│ └── whats_cooking_good_looking
│ ├── __init__.py
│ ├── apply_ner_workflow.py
│ ├── config.json
│ ├── keywords.txt
│ ├── train_ner_workflow.py
│ └── utils.py
└── templates
├── _common
└── deploy.py
├── basic
├── README.md
├── cookiecutter.json
├── hooks
│ └── pre_gen_project.py
└── {{cookiecutter.project_name}}
│ ├── .dockerignore
│ ├── .flyte
│ ├── remote-config.yaml
│ ├── remote.config
│ ├── sandbox-config.yaml
│ └── sandbox.config
│ ├── Dockerfile
│ ├── README.md
│ ├── dashboard
│ ├── app.py
│ ├── remote.config
│ └── sandbox.config
│ ├── deploy.py
│ ├── requirements-dev.txt
│ ├── requirements.txt
│ └── {{cookiecutter.project_name}}
│ ├── __init__.py
│ └── workflows.py
└── pytorch-gpu
├── README.md
├── cookiecutter.json
├── hooks
└── pre_gen_project.py
└── {{cookiecutter.project_name}}
├── .dockerignore
├── .flyte
├── remote-config.yaml
├── remote.config
├── sandbox-config.yaml
└── sandbox.config
├── Dockerfile
├── README.md
├── dashboard
├── app.py
├── remote.config
└── sandbox.config
├── deploy.py
├── requirements-dev.txt
├── requirements.txt
└── {{cookiecutter.project_name}}
├── __init__.py
└── workflows.py
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 120
3 | exclude =
4 | projects/whats_cooking_good_looking/dashboard/
5 | ignore = E265, E999
6 |
--------------------------------------------------------------------------------
/.github/workflows/adorable_unicorns_deploy.yaml:
--------------------------------------------------------------------------------
1 | # This is a basic workflow to help you get started with Actions
2 |
3 | name: CI
4 |
5 | # Controls when the workflow will run
6 | on:
7 | push:
8 |
9 | # Allows you to run this workflow manually from the Actions tab
10 | workflow_dispatch:
11 |
12 | env:
13 | REGISTERY: eu.gcr.io/flyte-sandbox-342013/flytelab
14 | PROJECT_NAME: adorable-unicorns-23
15 |   PROJECT_PATH: projects/whats_cooking_good_looking
16 |
17 | # A workflow run is made up of one or more jobs that can run sequentially or in parallel
18 | jobs:
19 | # This workflow contains a single job called "build"
20 | build-and-deploy:
21 | # The type of runner that the job will run on
22 | runs-on: ubuntu-latest
23 |
24 | # Steps represent a sequence of tasks that will be executed as part of the job
25 | steps:
26 | # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it
27 | - name: Get tags
28 | id: tags
29 | run: |
30 | echo "::set-output name=date::$(date +'%Y-%m-%d-%H-%M-%S')"
31 | foo=${{ github.ref }}
32 | branch_name=${foo#refs/heads/}
33 | branch_slug=${branch_name/\//-}
34 | echo "::set-output name=tag::$branch_slug"
35 |
36 | - name: is preprod
37 | id: is-preprod
38 | env:
39 | main: main
40 | run: |
41 | if [ "${{ steps.tags.outputs.tag }}" = "main" ];then
42 | echo "::set-output name=domain::staging"
43 | echo "::set-output name=tag::${{ steps.tags.outputs.date }}"
44 | else
45 | echo "::set-output name=domain::development"
46 | echo "::set-output name=tag::${{steps.tags.outputs.tag}}-${{steps.tags.outputs.date}}"
47 | fi
48 |
49 | - uses: actions/checkout@v2
50 | - name: Build the Docker image
51 | run: docker build ${{ env.PROJECT_PATH }}/. --file ${{ env.PROJECT_PATH }}/Dockerfile --build-arg config=.flyte/remote-config.yaml --build-arg image=${{env.REGISTERY}}:${{steps.is-preprod.outputs.tag}} --build-arg creds=${{secrets.RUNNER_KEY}} --tag ${{env.REGISTERY}}:${{steps.is-preprod.outputs.tag}}
52 |
53 | - uses: google-github-actions/setup-gcloud@v0
54 | with:
55 | service_account_key: ${{ secrets.SERVICE_ACCOUNT_KEY }}
56 | project_id: flyte-sandbox-342013
57 | export_default_credentials: true
58 |
59 | - run: gcloud auth configure-docker -q
60 |
61 | - name: Push the Docker image
62 | run: docker push ${{env.REGISTERY}}:${{steps.is-preprod.outputs.tag}}
63 |
64 | - name: serialize
65 | uses: louisRDSC/FlyteSerializeAction@v1.2
66 | with:
67 | config: ${{ env.PROJECT_PATH }}/.flyte/remote.config
68 | tag: ${{env.REGISTERY}}:${{steps.is-preprod.outputs.tag}}
69 | requirements: ${{ env.PROJECT_PATH }}/requirements.txt
70 |           pkgs: whats_cooking_good_looking
71 | source: ${{ env.PROJECT_PATH }}/
72 |
73 |
74 | - name: Register
75 | uses: louisRDSC/FlyteRegisterAction@v1.3
76 | with:
77 | project: ${{ env.PROJECT_NAME }}
78 | config: ${{ env.PROJECT_PATH }}/.flyte/remote-config.yaml
79 | domain: ${{ steps.is-preprod.outputs.domain }}
80 | package: ${{ env.PROJECT_PATH }}/flyte-package.tgz
81 | version: ${{ steps.is-preprod.outputs.tag }}
82 | clientId: ${{ secrets.CLIENT_ID }}
83 | clientSecret: ${{ secrets.CLIENT_SECRET }}
84 |
85 |
--------------------------------------------------------------------------------
/.github/workflows/ghcr_flytelab.yml:
--------------------------------------------------------------------------------
1 | name: Build & Push Flytelab Docker Image
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | pull_request:
8 | create:
9 | tags:
10 | - v*
11 |
12 | jobs:
13 | push-github:
14 |     if: ${{ github.repository_owner == 'flyteorg' }}
15 | name: Push to GHCR
16 | runs-on: ubuntu-latest
17 | defaults:
18 | run:
19 | working-directory: projects/weather_forecasting
20 | steps:
21 | - uses: actions/checkout@v2
22 | with:
23 | fetch-depth: "0"
24 | - name: Push Flytelab Weather Forecasting Docker Image to Github Registry
25 | uses: whoan/docker-build-with-cache-action@v5
26 | with:
27 | # https://docs.github.com/en/packages/learn-github-packages/publishing-a-package
28 | username: "${{ secrets.FLYTE_BOT_USERNAME }}"
29 | password: "${{ secrets.FLYTE_BOT_PAT }}"
30 | image_name: ${{ github.repository_owner }}/flytelab
31 | image_tag: weather-forecasting-latest,weather-forecasting-${{ github.sha }}
32 | push_git_tag: ${{ github.event_name != 'pull_request' }}
33 | push_image_and_stages: ${{ github.event_name != 'pull_request' }}
34 | registry: ghcr.io
35 | build_extra_args: "--compress=true --build-arg=tag=ghcr.io/${{ github.repository_owner }}/flytelab:weather-forecasting-${{ github.sha }}"
36 | context: ./projects/weather_forecasting
37 | dockerfile: Dockerfile
38 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | secrets/
2 | credentials/
3 | local_explo/
4 |
5 | # Repo-specific
6 | projects/**/.config
7 | projects/**/.kube
8 | .env/**/*
9 | env.txt
10 | .cache/*
11 | .vscode
12 | **/_pb_output/*
13 | bin
14 | flyte-package.tgz
15 | .DS_Store
16 |
17 | # Byte-compiled / optimized / DLL files
18 | __pycache__/
19 | *.py[cod]
20 | *$py.class
21 |
22 | # C extensions
23 | *.so
24 |
25 | # Distribution / packaging
26 | .Python
27 | build/
28 | develop-eggs/
29 | dist/
30 | downloads/
31 | eggs/
32 | .eggs/
33 | lib/
34 | lib64/
35 | parts/
36 | sdist/
37 | var/
38 | wheels/
39 | pip-wheel-metadata/
40 | share/python-wheels/
41 | *.egg-info/
42 | .installed.cfg
43 | *.egg
44 | MANIFEST
45 |
46 | # PyInstaller
47 | # Usually these files are written by a python script from a template
48 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
49 | *.manifest
50 | *.spec
51 |
52 | # Installer logs
53 | pip-log.txt
54 | pip-delete-this-directory.txt
55 |
56 | # Unit test / coverage reports
57 | htmlcov/
58 | .tox/
59 | .nox/
60 | .coverage
61 | .coverage.*
62 | .cache
63 | nosetests.xml
64 | coverage.xml
65 | *.cover
66 | *.py,cover
67 | .hypothesis/
68 | .pytest_cache/
69 |
70 | # Translations
71 | *.mo
72 | *.pot
73 |
74 | # Django stuff:
75 | *.log
76 | local_settings.py
77 | db.sqlite3
78 | db.sqlite3-journal
79 |
80 | # Flask stuff:
81 | instance/
82 | .webassets-cache
83 |
84 | # Scrapy stuff:
85 | .scrapy
86 |
87 | # Sphinx documentation
88 | docs/_build/
89 |
90 | # PyBuilder
91 | target/
92 |
93 | # Jupyter Notebook
94 | .ipynb_checkpoints
95 |
96 | # IPython
97 | profile_default/
98 | ipython_config.py
99 |
100 | # pyenv
101 | .python-version
102 |
103 | # pipenv
104 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
105 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
106 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
107 | # install all needed dependencies.
108 | #Pipfile.lock
109 |
110 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
111 | __pypackages__/
112 |
113 | # Celery stuff
114 | celerybeat-schedule
115 | celerybeat.pid
116 |
117 | # SageMath parsed files
118 | *.sage.py
119 |
120 | # Environments
121 | .env
122 | .venv
123 | env/
124 | venv/
125 | ENV/
126 | env.bak/
127 | venv.bak/
128 |
129 | # Spyder project settings
130 | .spyderproject
131 | .spyproject
132 |
133 | # Rope project settings
134 | .ropeproject
135 |
136 | # mkdocs documentation
137 | /site
138 |
139 | # mypy
140 | .mypy_cache/
141 | .dmypy.json
142 | dmypy.json
143 |
144 | # Pyre type checker
145 | .pyre/
146 |
147 | */gtzan/*
148 | projects/bravemusic/bravemusic/gtzan/
--------------------------------------------------------------------------------
/projects/bravemusic/.dockerignore:
--------------------------------------------------------------------------------
1 | !.flyte
--------------------------------------------------------------------------------
/projects/bravemusic/.flyte/remote-config.yaml:
--------------------------------------------------------------------------------
1 | admin:
2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com
3 | endpoint: dns:///playground.hosted.unionai.cloud
4 | authType: Pkce
5 | # Change insecure flag to ensure that you use the right setting for your environment
6 | insecure: false
7 | storage:
8 | type: stow
9 | stow:
10 | kind: s3
11 | config:
12 | auth_type: iam
13 | region: us-east-2
14 | logger:
15 | # Logger settings to control logger output. Useful to debug logger:
16 | show-source: true
17 | level: 1
18 |
--------------------------------------------------------------------------------
/projects/bravemusic/.flyte/remote.config:
--------------------------------------------------------------------------------
1 | [sdk]
2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. These workflows and tasks are then serialized during the `make serialize` commands
3 | workflow_packages=bravemusic
4 |
5 | [auth]
6 | raw_output_data_prefix=s3://open-compute-playground
7 |
--------------------------------------------------------------------------------
/projects/bravemusic/.flyte/sandbox-config.yaml:
--------------------------------------------------------------------------------
1 | admin:
2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com
3 | endpoint: dns:///localhost:30081
4 | authType: Pkce
5 | insecure: true
6 | logger:
7 | show-source: true
8 | level: 0
9 | storage:
10 | connection:
11 | access-key: minio
12 | auth-type: accesskey
13 | disable-ssl: true
14 | endpoint: http://localhost:30084
15 | region: us-east-1
16 | secret-key: miniostorage
17 | type: minio
18 | container: "my-s3-bucket"
19 | enable-multicontainer: true
20 |
--------------------------------------------------------------------------------
/projects/bravemusic/.flyte/sandbox.config:
--------------------------------------------------------------------------------
1 | [sdk]
2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. These workflows and tasks are then serialized during the `make serialize` commands
3 | workflow_packages=bravemusic
4 |
5 | [auth]
6 | raw_output_data_prefix=s3://my-s3-bucket/flytelab
7 |
--------------------------------------------------------------------------------
/projects/bravemusic/.gitpod.yml:
--------------------------------------------------------------------------------
1 | tasks:
2 | - init: |
3 | python -m venv ~/venvs/brave
4 | source ~/venvs/brave/bin/activate
5 | pip install -r requirements.txt -r requirements-dev.txt
6 | command: python3 projects/bravemusic/bravemusic/workflows.py
7 |
8 |
--------------------------------------------------------------------------------
/projects/bravemusic/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9-slim-buster
2 |
3 | WORKDIR /root
4 | ENV VENV /opt/venv
5 | ENV LANG C.UTF-8
6 | ENV LC_ALL C.UTF-8
7 | ENV PYTHONPATH /root
8 |
9 | # e.g. flyte.config or sandbox.config
10 | ARG config
11 |
12 | RUN apt-get update && \
13 | apt-get install -y \
14 | libsm6 \
15 | libxext6 \
16 | libxrender-dev \
17 | ffmpeg \
18 | build-essential
19 |
20 | # Install the AWS cli separately to prevent issues with boto being written over
21 | RUN pip3 install awscli
22 |
23 | ENV VENV /opt/venv
24 |
25 | # Virtual environment
26 | RUN python3 -m venv ${VENV}
27 | ENV PATH="${VENV}/bin:$PATH"
28 |
29 | # Install Python dependencies
30 | COPY requirements.txt /root
31 | RUN pip install -r /root/requirements.txt
32 |
33 | COPY bravemusic /root/bravemusic
34 | COPY $config /root/flyte.config
35 |
36 | # This image is supplied by the build script and will be used to determine the version
37 | # when registering tasks, workflows, and launch plans
38 | ARG image
39 | ENV FLYTE_INTERNAL_IMAGE $image
40 |
--------------------------------------------------------------------------------
/projects/bravemusic/README.md:
--------------------------------------------------------------------------------
1 | # Design Doc: Brave-Hyenas-2
2 | ## MLOps Community: Engineering labs
3 |
4 | | Team name | brave-hyenas-2 |
5 | |---------------------|:----------------------------------------:|
6 | |Project name | brave-hyenas-2 |
7 | | Project description | Hackathon - brave-hyenas-2 team |
8 | |Using GPUs? (Yes/No) | No |
9 |
10 |
11 |
12 | ### Problem Statement
13 | What problem are you solving?
14 | It’s usually hard to correctly identify which music genre is playing, so our team took on the challenge of classifying music genres with deep learning.
15 |
16 |
17 | ### ......
18 |
19 |
20 |
21 |
22 |
23 | 
24 |
25 |
26 |
27 |
28 | ### Solution (work in progress) ...
29 |
--------------------------------------------------------------------------------
/projects/bravemusic/bravemusic/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/bravemusic/bravemusic/__init__.py
--------------------------------------------------------------------------------
/projects/bravemusic/bravemusic/datasource.py:
--------------------------------------------------------------------------------
1 | import os
2 | import tarfile
3 | import git
4 |
5 | GIT_URL = "https://huggingface.co/datasets/marsyas/gtzan"
6 | GTZAN_PATH = "./gtzan"
7 | GTZAN_ZIP_FILE_PATH = "./gtzan/data"
8 | GTZAN_ZIP_FILE_NAME = "genres.tar.gz"
9 |
10 |
11 | class Progress(git.remote.RemoteProgress):
12 | def update(self, op_code, cur_count, max_count=None, message=""):
13 | print(self._cur_line)
14 |
15 |
16 | def download_gtzan_repo():
17 | if not os.path.isdir(GTZAN_PATH) or not any(os.scandir(GTZAN_PATH)):
18 | git.Repo.clone_from(url=GIT_URL, to_path=GTZAN_PATH, progress=Progress())
19 | extract_gtzan_repo_tarball()
20 | else:
21 | print("dataset already exists")
22 |
23 |
24 | def extract_gtzan_repo_tarball():
25 | # open file
26 | file = tarfile.open(f"{GTZAN_ZIP_FILE_PATH}/{GTZAN_ZIP_FILE_NAME}")
27 | # extracting file
28 | file.extractall(GTZAN_ZIP_FILE_PATH)
29 | file.close()
30 |
31 |
32 | if __name__ == "__main__":
33 | download_gtzan_repo()
34 |
--------------------------------------------------------------------------------
/projects/bravemusic/bravemusic/preprocess.py:
--------------------------------------------------------------------------------
1 | import os
2 | import math
3 | import librosa
4 | from datasource import download_gtzan_repo, GTZAN_ZIP_FILE_PATH
5 |
6 | SAMPLE_RATE = 22050
7 | TRACK_DURATION = 30 # measured in seconds
8 | SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION
9 | BAD_FORMATS = ["jazz.00054.wav"]
10 |
11 |
12 | def clean_dataset():
13 | for (dir_path, dir_names, filenames) in os.walk(f"{GTZAN_ZIP_FILE_PATH}/genres/"):
14 | print(dir_path)
15 | [
16 |             os.remove(f"{dir_path}/{filename}")
17 | for filename in filenames
18 | if not filename.endswith(".wav")
19 | ]
20 | [
21 | os.renames(
22 | old=f"{dir_path}/{filename}",
23 | new=f"{dir_path}/{filename}".replace("._", ""),
24 | )
25 | for filename in filenames
26 | if f"{dir_path}/{filename}".startswith("._")
27 | ]
28 | [
29 | os.remove(f"{dir_path}/{filename}")
30 | for filename in filenames
31 | if filename.startswith("._")
32 | ]
33 |
34 |
35 | def preprocess(
36 | dataset_path: str,
37 | num_mfcc: int = 13,
38 | n_fft: int = 2048,
39 | hop_length: int = 512,
40 | num_segments: int = 10,
41 | ) -> dict:
42 | data = {"mapping": [], "labels": [], "mfcc": []}
43 |
44 | samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
45 | num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / hop_length)
46 |
47 | # loop through all genre sub-folder
48 | for i, (dir_path, dir_names, filenames) in enumerate(
49 | os.walk(f"{GTZAN_ZIP_FILE_PATH}/genres/")
50 | ):
51 | # ensure we're processing a genre sub-folder level
52 |         if dir_path != dataset_path:
53 | # save genre label (i.e., sub-folder name) in the mapping
54 | semantic_label = dir_path.split("/")[-1]
55 | print(semantic_label)
56 | data["mapping"].append(semantic_label)
57 | print("Processing: {}".format(semantic_label))
58 |
59 | # process all audio files in genre sub-dir
60 | for f in filenames:
61 | if f not in BAD_FORMATS:
62 | # load audio file
63 | file_path = os.path.join(dir_path, f)
64 | signal, sample_rate = librosa.load(path=file_path, sr=SAMPLE_RATE)
65 |
66 | # process all segments of audio file
67 | for d in range(num_segments):
68 |
69 | # calculate start and finish sample for current segment
70 | start = samples_per_segment * d
71 | finish = start + samples_per_segment
72 |
73 | # extract mfcc
74 | mfcc = librosa.feature.mfcc(
75 | y=signal[start:finish],
76 | sr=sample_rate,
77 | n_mfcc=num_mfcc,
78 | n_fft=n_fft,
79 | hop_length=hop_length,
80 | )
81 | mfcc = mfcc.T
82 |
83 | # store only mfcc feature with expected number of vectors
84 | if len(mfcc) == num_mfcc_vectors_per_segment:
85 | data["mfcc"].append(mfcc.tolist())
86 | data["labels"].append(i - 1)
87 | # print("{}, segment:{}".format(file_path, d + 1))
88 | return data
89 |
90 |
91 | if __name__ == "__main__":
92 | download_gtzan_repo()
93 | clean_dataset()
94 | data = preprocess(dataset_path=GTZAN_ZIP_FILE_PATH)
95 | print(data)
96 |
--------------------------------------------------------------------------------
/projects/bravemusic/bravemusic/train.py:
--------------------------------------------------------------------------------
1 | import json
2 | import typing
3 | import warnings
4 | import numpy as np
5 | from tensorflow import keras
6 | from dataclasses import dataclass
7 | from preprocess import preprocess
8 | from datasource import GTZAN_ZIP_FILE_PATH
9 | from dataclasses_json import dataclass_json
10 | from flytekit.types.directory import FlyteDirectory
11 | from sklearn.model_selection import train_test_split
12 |
13 |
14 | warnings.filterwarnings("ignore")
15 | MODELSAVE = [typing.TypeVar("str")]
16 | model_file = typing.NamedTuple("Model", model=FlyteDirectory[MODELSAVE])
17 |
18 |
19 | @dataclass_json
20 | @dataclass
21 | class Hyperparameters(object):
22 | batch_size: int = 32
23 | metrics: str = "accuracy"
24 |     loss: str = "sparse_categorical_crossentropy"
25 | epochs: int = 30
26 | learning_rate: float = 0.0001
27 |
28 |
29 | def train(
30 | data: dict,
31 | hp: Hyperparameters
32 | ) -> model_file:
33 | # with open("data.json", "r") as fp:
34 | # data = json.load(fp)
35 |
36 | # convert lists to numpy arrays
37 | X = np.array(data["mfcc"])
38 | y = np.array(data["labels"])
39 |
40 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
41 |
42 | model = keras.Sequential(
43 | [
44 | keras.layers.Flatten(input_shape=(X.shape[1], X.shape[2])),
45 | keras.layers.Dense(512, activation="relu"),
46 | keras.layers.Dense(256, activation="relu"),
47 | keras.layers.Dense(64, activation="relu"),
48 | keras.layers.Dense(10, activation="softmax"),
49 | ]
50 | )
51 | optimiser = keras.optimizers.Adam(learning_rate=hp.learning_rate)
52 | model.compile(
53 | optimizer=optimiser,
54 | loss=hp.loss,
55 | metrics=[hp.metrics],
56 | )
57 | # train model
58 | model.fit(
59 | X_train,
60 | y_train,
61 | validation_data=(X_test, y_test),
62 | batch_size=hp.batch_size,
63 | epochs=hp.epochs,
64 | )
65 |
66 | Dir = "model"
67 | model.save(Dir)
68 | return model
69 |
70 |
71 | if __name__ == '__main__':
72 | data = preprocess(dataset_path=GTZAN_ZIP_FILE_PATH)
73 | model = train(
74 | data=data,
75 | hp=Hyperparameters(epochs=1)
76 | )
77 |
--------------------------------------------------------------------------------
/projects/bravemusic/bravemusic/workflows.py:
--------------------------------------------------------------------------------
1 | import json
2 | import typing
3 | import warnings
4 | from train import Hyperparameters, train
5 | from flytekit import Resources, task, workflow
6 | from preprocess import clean_dataset, preprocess
7 | from flytekit.types.directory import FlyteDirectory
8 | from datasource import download_gtzan_repo, GTZAN_ZIP_FILE_PATH
9 |
10 |
11 | SAMPLE_RATE = 22050
12 | TRACK_DURATION = 30 # measured in seconds
13 | warnings.filterwarnings("ignore")
14 | SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION
15 |
16 | MODELSAVE = [typing.TypeVar("str")]
17 | model_file = typing.NamedTuple("Model", model=FlyteDirectory[MODELSAVE])
18 | workflow_outputs = typing.NamedTuple("WorkflowOutputs", model=FlyteDirectory[MODELSAVE])
19 |
20 |
21 | @task
22 | def download_gtzan_dataset():
23 | download_gtzan_repo()
24 |
25 |
26 | @task
27 | def clean_gtzan_dataset():
28 | clean_dataset()
29 |
30 |
31 | @task(cache_version="1.0", cache=True, limits=Resources(mem="2000Mi"))
32 | def preprocess_gtzan_dataset(
33 | dataset_path: str
34 | ) -> dict:
35 | processed_data = preprocess(dataset_path=dataset_path)
36 | return processed_data
37 |
38 |
39 | @task(cache_version="1.0", cache=True, limits=Resources(mem="2000Mi"))
40 | def train_gtzan_dataset(
41 | data: dict,
42 | hp: Hyperparameters,
43 | )-> model_file:
44 | model = train(data=data, hp=hp)
45 | Dir = "model"
46 | model.save(Dir)
47 | return (Dir,)
48 |
49 |
50 | @workflow
51 | def flyteworkflow(
52 | dataset_path: str = GTZAN_ZIP_FILE_PATH
53 | )-> workflow_outputs:
54 | download_gtzan_dataset()
55 | clean_gtzan_dataset()
56 | processed_data = preprocess_gtzan_dataset(
57 | dataset_path=dataset_path,
58 | )
59 | model = train_gtzan_dataset(
60 | data=processed_data,
61 | hp=Hyperparameters(epochs=10)
62 | )
63 |
64 | return (model.model,)
65 |
66 |
67 | if __name__ == "__main__":
68 | print(f"Running {__file__} main...")
69 | print(flyteworkflow())
70 |
--------------------------------------------------------------------------------
/projects/bravemusic/dashboard/Option_B/app.py:
--------------------------------------------------------------------------------
1 | from fastapi import File
2 | import streamlit as st
3 | from streamlit_option_menu import option_menu
4 | import requests
5 | from pydub import AudioSegment
6 |
7 | with st.sidebar:
8 | selected = option_menu(
9 | menu_title="Main Menu", # required
10 | options=["Home", "Project Design", "Meet The Team"], # required
11 | icons=["house", "diagram-2", "people"], # optional
12 | menu_icon="cast", # optional
13 | default_index=0, # optional
14 | )
15 |
16 |
17 | if selected == "Home":
18 | st.markdown(
19 |         """Music Genre Classification""",
20 | unsafe_allow_html=True,
21 | )
22 | global type
23 | UploadAudio = st.file_uploader("Upload Music To Classify", type=["wav", "mp3"])
24 |     st.markdown("""Play:""", unsafe_allow_html=True)
25 | st.audio(UploadAudio)
26 |
27 | if st.button("Predict"):
28 | if UploadAudio is not None:
29 |             if UploadAudio.type == "audio/mpeg":
30 | UploadAudio = AudioSegment.from_mp3(UploadAudio)
31 | UploadAudio.export("file.wav", format="wav")
32 | response = requests.post("http://127.0.0.1:8000/predict", data=UploadAudio)
33 | prediction = response
34 | st.success(f"You're Listening to: {prediction}")
35 |
36 |
37 | if selected == "Project Design":
38 | st.markdown(
39 |         """Our Project Holistic View""",
40 | unsafe_allow_html=True,
41 | )
42 | if selected == "Meet The Team":
43 | st.markdown(
44 |         """Meet Our Amazing Team""",
45 | unsafe_allow_html=True,
46 | )
47 |
--------------------------------------------------------------------------------
/projects/bravemusic/dashboard/Option_B/serve.py:
--------------------------------------------------------------------------------
1 | import joblib
2 | import uvicorn
3 | import numpy as np
4 | import pandas as pd
5 | from pydantic import BaseModel
6 | import mlflow
7 | from fastapi import FastAPI, File, UploadFile
9 | import tensorflow as tf
10 | import librosa
11 | import math
12 |
13 | # TODO
14 | # To accept Audio file sending from streamlit
15 |
16 |
17 | # Initiate app instance
18 | app = FastAPI(title="Brave Hyena", version="1.0", description="Trying Locally")
19 |
20 | # in deployment we use remote.fetch_workflow_execution to get the model
21 | model = tf.keras.models.load_model("< Copy paste the flyteworkflow output url>")
22 | genre = {
23 | 0: "blues",
24 | 1: "classical",
25 | 2: "country",
26 | 3: "disco",
27 | 4: "hiphop",
28 | 5: "jazz",
29 | 6: "metal",
30 | 7: "pop",
31 | 8: "reggae",
32 | 9: "rock",
33 | }
34 |
35 |
36 | # Api root or home endpoint
37 | @app.get("/")
38 | @app.get("/home")
39 | def read_home():
40 | """
41 | Home endpoint which can be used to test the availability of the application.
42 | """
43 | return {"message": "Looks Good"}
44 |
45 |
46 | data = {"mfcc": []}
47 |
48 |
49 | @app.post("/predict")
50 | async def predict(file: UploadFile = File(...)):
51 | # Extract data in correct order
52 | hop_length = 512
53 | num_segments = 10
54 | SAMPLE_RATE = 22050
55 | TRACK_DURATION = 30 # measured in seconds
56 | SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION
57 |
58 | samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
59 | num_mfcc_vectors_per_segment = math.ceil(samples_per_segment / hop_length)
60 | audio, sample_rate = librosa.load(file.file, 22050)
61 | for d in range(num_segments):
62 | start = samples_per_segment * d
63 | finish = start + samples_per_segment
64 | mfcc = librosa.feature.mfcc(
65 | audio[start:finish], sample_rate, n_mfcc=13, n_fft=2048, hop_length=512
66 | )
67 | mfcc = mfcc.T
68 | break
69 |
70 |     if len(mfcc) == num_mfcc_vectors_per_segment:
71 |         data["mfcc"].append(mfcc.tolist())
72 |     else:
73 |         print("It's not the same as the Trained data")
75 | test = np.array(data["mfcc"])
76 | predict_x = model.predict(test)
77 | prediction = np.argmax(predict_x, axis=1)
78 |
79 |     return {"Genre": genre[round(prediction.mean())]}
81 |
82 |
83 | def main():
84 |     uvicorn.run(app, host="0.0.0.0", port=8000, reload=True)
87 |
88 |
89 | if __name__ == "__main__":
90 | main()
91 |
--------------------------------------------------------------------------------
/projects/bravemusic/dashboard/app.py:
--------------------------------------------------------------------------------
1 | import os
2 | from argparse import ArgumentParser
3 | from pathlib import Path
4 |
5 | import streamlit as st
6 |
7 | from flytekit.remote import FlyteRemote
8 | from flytekit.models import filters
9 | from flytekit.models.admin.common import Sort
10 |
15 | from streamlit_option_menu import option_menu
16 | import numpy as np
17 | from pydub import AudioSegment
18 | import librosa
19 | import math
20 | import tensorflow as tf
21 |
22 |
23 | PROJECT_NAME = "flytelab-final".replace("_", "-")
24 | WORKFLOW_NAME = "final.workflows.main"
25 |
26 |
27 | parser = ArgumentParser()
28 | parser.add_argument("--remote", action="store_true")
29 | args = parser.parse_args()
30 |
31 | backend = os.getenv("FLYTE_BACKEND", "remote" if args.remote else "sandbox")
32 |
33 | # configuration for accessing a Flyte cluster backend
34 | remote = FlyteRemote.from_config(
35 | default_project=PROJECT_NAME,
36 | default_domain="development",
37 | config_file_path=Path(__file__).parent / f"{backend}.config",
38 | )
39 |
40 | # get the latest workflow execution
41 | [latest_execution, *_], _ = remote.client.list_executions_paginated(
42 | PROJECT_NAME,
43 | "development",
44 | limit=1,
45 | filters=[
46 | filters.Equal("launch_plan.name", WORKFLOW_NAME),
47 | filters.Equal("phase", "SUCCEEDED"),
48 | ],
49 | sort_by=Sort.from_python_std("desc(execution_created_at)"),
50 | )
51 |
52 | wf_execution = remote.fetch_workflow_execution(name=latest_execution.id.name)
53 | remote.sync(wf_execution, sync_nodes=False)
54 | modelurl = wf_execution.outputs["o0"]
55 | print(modelurl)
56 |
57 |
58 | ############
59 | # App Code #
60 | ############
61 |
62 |
63 | with st.sidebar:
64 | selected = option_menu(
65 | menu_title="Main Menu", # required
66 | options=["Home", "Project Design", "Meet The Team"], # required
67 | icons=["house", "diagram-2", "people"], # optional
68 | menu_icon="cast", # optional
69 | default_index=0, # optional
70 | )
71 |
72 |
73 | if selected == "Home":
74 | st.markdown(
75 |         """Music Genre Classification""",
76 | unsafe_allow_html=True,
77 | )
78 | # in deployment we use remote.fetch_workflow_execution to get the model url
79 | model = tf.keras.models.load_model(modelurl)
80 | genre = {
81 | 0: "Blues",
82 | 1: "Classical",
83 | 2: "Country",
84 | 3: "Disco",
85 | 4: "Hiphop",
86 | 5: "Jazz",
87 | 6: "Metal",
88 | 7: "Pop",
89 | 8: "Reggae",
90 | 9: "Rock",
91 | }
92 |
93 | global type
94 | UploadAudio = st.file_uploader("Upload Music To Classify", type=["wav", "mp3"])
95 |     st.markdown("""Play:""", unsafe_allow_html=True)
96 | st.audio(UploadAudio)
97 | hop_length = 512
98 | num_segments = 10
99 | SAMPLE_RATE = 22050
100 | TRACK_DURATION = 30 # measured in seconds
101 | SAMPLES_PER_TRACK = SAMPLE_RATE * TRACK_DURATION
102 |
103 | data = {"mfcc": []}
104 |
105 | if st.button("Predict"):
106 | if UploadAudio is not None:
107 | type = UploadAudio.type
108 | if type == "audio/mpeg":
109 | UploadAudio = AudioSegment.from_mp3(UploadAudio)
110 | UploadAudio.export("file.wav", format="wav")
111 | samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
112 | num_mfcc_vectors_per_segment = math.ceil(
113 | samples_per_segment / hop_length
114 | )
115 | audio, sample_rate = librosa.load(UploadAudio, 22050)
116 | for d in range(num_segments):
117 | start = samples_per_segment * d
118 | finish = start + samples_per_segment
119 | mfcc = librosa.feature.mfcc(
120 | audio[start:finish],
121 | sample_rate,
122 | n_mfcc=13,
123 | n_fft=2048,
124 | hop_length=512,
125 | )
126 | mfcc = mfcc.T
127 | break
128 |
129 |                 if len(mfcc) == num_mfcc_vectors_per_segment:
130 |                     data["mfcc"].append(mfcc.tolist())
131 |                 else:
132 |                     print("It's not the same as the Trained data")
134 |
135 | test = np.array(data["mfcc"])
136 | predict_x = model.predict(test)
137 | predictions = np.argmax(predict_x, axis=1)
138 | prediction = genre[round(predictions.mean())]
139 |
140 | st.markdown(
141 |                     f"""You're Listening to: {prediction}""",
142 | unsafe_allow_html=True,
143 | )
144 |
145 | else:
146 | samples_per_segment = int(SAMPLES_PER_TRACK / num_segments)
147 | num_mfcc_vectors_per_segment = math.ceil(
148 | samples_per_segment / hop_length
149 | )
150 | audio, sample_rate = librosa.load(UploadAudio, 22050)
151 | for d in range(num_segments):
152 | start = samples_per_segment * d
153 | finish = start + samples_per_segment
154 | mfcc = librosa.feature.mfcc(
155 | audio[start:finish],
156 | sample_rate,
157 | n_mfcc=13,
158 | n_fft=2048,
159 | hop_length=512,
160 | )
161 | mfcc = mfcc.T
162 | break
163 |
164 |                 if len(mfcc) == num_mfcc_vectors_per_segment:
165 |                     data["mfcc"].append(mfcc.tolist())
166 |                 else:
167 |                     print("It's not the same as the Trained data")
169 |
170 | test = np.array(data["mfcc"])
171 | predict_x = model.predict(test)
172 | predictions = np.argmax(predict_x, axis=1)
173 | prediction = genre[round(predictions.mean())]
174 |
175 | st.markdown(
176 |                     f"""You're Listening to: {prediction}""",
177 | unsafe_allow_html=True,
178 | )
179 | # st.success(f"You're Listening to: {genre[round(prediction.mean())]}")
180 |
181 |
182 | if selected == "Project Design":
183 | st.markdown(
184 |         """Our Project Holistic View""",
185 | unsafe_allow_html=True,
186 | )
187 | if selected == "Meet The Team":
188 | st.markdown(
189 |         """Meet Our Amazing Team""",
190 | unsafe_allow_html=True,
191 | )
192 |
--------------------------------------------------------------------------------
/projects/bravemusic/dashboard/remote.config:
--------------------------------------------------------------------------------
1 | [platform]
2 | url=playground.hosted.unionai.cloud
3 | insecure=False
4 |
5 | [credentials]
6 | client_id=flytepropeller
7 | auth_mode=basic
8 | authorization_metadata-key=flyte-authorization
9 | oauth_scopes=all
10 |
--------------------------------------------------------------------------------
/projects/bravemusic/dashboard/sandbox.config:
--------------------------------------------------------------------------------
1 | [platform]
2 | url=localhost:30081
3 | insecure=True
4 |
5 | [aws]
6 | access_key_id=minio
7 | secret_access_key=miniostorage
8 | endpoint=http://localhost:30084
9 |
--------------------------------------------------------------------------------
/projects/bravemusic/deploy.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | import uuid
4 | from pathlib import Path
5 |
6 | import docker
7 | import git
8 | import typer
9 |
10 |
11 | app = typer.Typer()
12 |
13 | docker_client = docker.from_env()
14 |
15 |
16 | IMAGE_NAME = "flytelab"
17 | REGISTRY = "ghcr.io/Abdullahi-Ahmed".lower()
18 | PROJECT_NAME = "flytelab-bravemusic".replace("_", "-").lower()
19 | DESCRIPTION = "Hackathon brave-hyenas-2 team project"
20 |
21 |
22 | def create_project(remote: bool):
23 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}-config.yaml"
24 | output = subprocess.run(
25 | [
26 | "flytectl",
27 | "get",
28 | "project",
29 | PROJECT_NAME,
30 | "--config",
31 | config,
32 | ],
33 | capture_output=True,
34 | check=True,
35 | )
36 | if output.stdout.decode().strip():
37 | return
38 |
39 | typer.echo(f"Creating project {PROJECT_NAME}")
40 | subprocess.run(
41 | [
42 | "flytectl",
43 | "create",
44 | "project",
45 | "--project",
46 | PROJECT_NAME,
47 | "--name",
48 | PROJECT_NAME,
49 | "--id",
50 | PROJECT_NAME,
51 | "--description",
52 | DESCRIPTION,
53 | "--config",
54 | config,
55 | ],
56 | check=True,
57 | )
58 |
59 |
60 | def get_version(fast: bool):
61 | repo = git.Repo(".", search_parent_directories=True)
62 | if not fast and repo.is_dirty():
63 | typer.echo(
64 | "Please commit git changes before building. If you haven't updated any system/python dependencies "
65 | "but want to deploy task/workflow code changes, use the --fast flag to do fast registration.",
66 | err=True,
67 | )
68 | raise typer.Exit(code=1)
69 | commit = repo.rev_parse("HEAD")
70 | return commit.hexsha
71 |
72 |
73 | def get_tag(version, registry=None):
74 | return f"{REGISTRY if registry is None else registry}/{IMAGE_NAME}:{PROJECT_NAME}-{version}"
75 |
76 |
77 | def sandbox_docker_build(tag):
78 | typer.echo("Building image in Flyte sandbox")
79 | subprocess.run(
80 | [
81 | "flytectl",
82 | "sandbox",
83 | "exec",
84 | "--",
85 | "docker",
86 | "build",
87 | ".",
88 | "--tag",
89 | tag,
90 | ],
91 | check=True,
92 | )
93 |
94 |
95 | def docker_build(tag: str, remote: bool) -> docker.models.images.Image:
96 | client = docker.from_env()
97 |
98 | # TODO: image build, push, flytectl serialization and registration
99 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}.config"
100 |
101 | typer.echo(f"Building image: {tag}")
102 | image, build_logs = client.images.build(
103 | path=".",
104 | dockerfile="Dockerfile",
105 | tag=tag,
106 | buildargs={
107 | "image": tag,
108 | "config": str(config),
109 | },
110 | )
111 | for line in build_logs:
112 | typer.echo(line)
113 | return image
114 |
115 |
116 | def docker_push(image: docker.models.images.Image):
117 | for line in docker_client.api.push(image.tags[0], stream=True, decode=True):
118 | typer.echo(line)
119 |
120 |
121 | def serialize(tag: str, remote: bool, fast: bool):
122 | typer.echo("Serializing Flyte workflows")
123 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}.config"
124 | package = Path(".") / "flyte-package.tgz"
125 | if package.exists():
126 | os.remove(package)
127 | subprocess.run(
128 | [
129 | "pyflyte",
130 | "-c",
131 | str(config),
132 | "--pkgs",
133 | "bravemusic",
134 | "package",
135 | "--force",
136 | "--image",
137 | tag,
138 | *(["--fast"] if fast else ["--in-container-source-path", "/root"]),
139 | ],
140 | check=True,
141 | # inject the FLYTE_SANDBOX environment variable to the serialization runtime
142 | env={"FLYTE_SANDBOX": "1" if not remote else "0", **os.environ},
143 | )
144 |
145 |
146 | def register(version: str, remote: bool, fast: bool, domain: str):
147 | typer.echo("Registering Flyte workflows")
148 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}-config.yaml"
149 | if fast:
150 | version = f"{version}-fast{uuid.uuid4().hex[:7]}"
151 | subprocess.run(
152 | [
153 | "flytectl",
154 | "-c",
155 | config,
156 | "register",
157 | "files",
158 | "--project",
159 | PROJECT_NAME,
160 | "--domain",
161 | domain,
162 | "--archive",
163 | "flyte-package.tgz",
164 | "--force",
165 | "--version",
166 | version,
167 | ],
168 | check=True,
169 | )
170 | typer.echo(f"Successfully registered version {version}")
171 |
172 |
173 | @app.command()
174 | def main(
175 | remote: bool = False,
176 | fast: bool = False,
177 | domain: str = "staging",
178 | registry: str = None,
179 | ):
180 |     if remote and fast:
181 |         typer.echo(
182 |             "Fast registration is not enabled when deploying to remote. "
183 |             "Please deploy your workflows without the --fast flag.", err=True,
184 |         )
185 |         raise typer.Exit(code=1)
186 | create_project(remote)
187 | version = get_version(fast)
188 | tag = get_tag(version, registry)
189 | if not fast:
190 | if remote:
191 | docker_push(docker_build(tag, remote))
192 | else:
193 | sandbox_docker_build(tag)
194 | serialize(tag, remote, fast)
195 | register(version, remote, fast, domain)
196 |
197 |
198 | if __name__ == "__main__":
199 | app()
200 |
--------------------------------------------------------------------------------
/projects/bravemusic/procfile:
--------------------------------------------------------------------------------
1 | web: sh setup.sh && streamlit run ./dashboard/app.py
--------------------------------------------------------------------------------
/projects/bravemusic/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | docker==5.0.3
2 | GitPython==3.1.27
3 | streamlit==1.8.1
4 | typer==0.4.1
5 | click==8.0.4
6 | streamlit_option_menu==0.3.2
7 | pydub==0.25.1
8 | itsdangerous==2.1.2
9 |
--------------------------------------------------------------------------------
/projects/bravemusic/requirements.txt:
--------------------------------------------------------------------------------
1 | flytekit>=0.30.3
2 | pandas==1.4.1
3 | s3fs==2022.2.0
4 | scikit-learn==1.0.2
5 | librosa==0.9.1
6 | tensorflow==2.8.0
7 | numpy==1.21
8 | joblib==1.1.0
9 | requests==2.27.1
10 | dataclasses_json==0.5.7
11 | black==22.1.0
--------------------------------------------------------------------------------
/projects/bravemusic/setup.sh:
--------------------------------------------------------------------------------
1 | mkdir -p ~/.streamlit/
2 |
3 | echo "\
4 | [general]\n\
5 | email = \"your-email@domain.com\"\n\
6 | " > ~/.streamlit/credentials.toml
7 |
8 | echo "\
9 | [server]\n\
10 | headless = true\n\
11 | enableCORS=false\n\
12 | port = $PORT\n\
13 | " > ~/.streamlit/config.toml
--------------------------------------------------------------------------------
/projects/destinations_similarity/.dockerignore:
--------------------------------------------------------------------------------
1 | !.flyte
--------------------------------------------------------------------------------
/projects/destinations_similarity/.flyte/remote-config.yaml:
--------------------------------------------------------------------------------
1 | admin:
2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com
3 | endpoint: dns:///playground.hosted.unionai.cloud
4 | authType: Pkce
5 | # Change insecure flag to ensure that you use the right setting for your environment
6 | insecure: false
7 | storage:
8 | type: stow
9 | stow:
10 | kind: s3
11 | config:
12 | auth_type: iam
13 | region: us-east-2
14 | logger:
15 | # Logger settings to control logger output. Useful to debug logger:
16 | show-source: true
17 | level: 1
18 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/.flyte/remote.config:
--------------------------------------------------------------------------------
1 | [sdk]
2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. These workflows and tasks are then serialized during the `make serialize` commands
3 | workflow_packages=destinations_similarity
4 |
5 | [auth]
6 | raw_output_data_prefix=s3://open-compute-playground
7 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/.flyte/sandbox-config.yaml:
--------------------------------------------------------------------------------
1 | admin:
2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com
3 | endpoint: dns:///localhost:30081
4 | authType: Pkce
5 | insecure: true
6 | logger:
7 | show-source: true
8 | level: 0
9 | storage:
10 | connection:
11 | access-key: minio
12 | auth-type: accesskey
13 | disable-ssl: true
14 | endpoint: http://localhost:30084
15 | region: us-east-1
16 | secret-key: miniostorage
17 | type: minio
18 | container: "my-s3-bucket"
19 | enable-multicontainer: true
20 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/.flyte/sandbox.config:
--------------------------------------------------------------------------------
1 | [sdk]
2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. These workflows and tasks are then serialized during the `make serialize` commands
3 | workflow_packages=destinations_similarity
4 |
5 | [auth]
6 | raw_output_data_prefix=s3://my-s3-bucket/flytelab
7 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/Dockerfile:
--------------------------------------------------------------------------------
1 | # syntax=docker/dockerfile:1
2 |
3 | # Dockerfile that sets up the Flyte image
4 |
5 | FROM pytorch/pytorch:1.11.0-cuda11.3-cudnn8-runtime
6 | # Build arguments, declared after FROM so they are in scope for this stage
7 | ARG image
8 | ARG config
9 |
10 | # Defining environment variables
11 | ENV APPUSER="flyte"
12 | ENV LANG="C.UTF-8"
13 | ENV LC_ALL="C.UTF-8"
14 | ENV PATH="/home/${APPUSER}/.local/bin:${PATH}"
15 | ENV PYTHONPATH="/home/${APPUSER}:${PYTHONPATH}"
16 |
17 | # Updating and cleaning system
18 | RUN apt-get update && \
19 | apt-get upgrade -y && \
20 | apt-get install -y build-essential git && \
21 | apt-get autoremove -yqq --purge && \
22 | apt-get clean
23 |
24 | # Changing the user so that the container is non-root
25 | RUN useradd -u 1024 -m "${APPUSER}"
26 | USER "${APPUSER}"
27 | WORKDIR "/home/${APPUSER}"
28 |
29 | # Setup virtual environment
30 | ENV VENV="/home/${APPUSER}/venv"
31 | RUN python -m venv ${VENV}
32 | ENV PATH="${VENV}/bin:${PATH}"
33 |
34 | # Copy requirements
35 | COPY requirements.txt "/home/${APPUSER}/requirements.txt"
36 |
37 | # Upgrade pip, install dependencies and awscli
38 | RUN python -m pip install -U pip && \
39 | pip install -r requirements.txt awscli
40 |
41 | # Copy the code and configuration
42 | COPY --chown="${APPUSER}:${APPUSER}" \
43 | destinations_similarity/ "/home/${APPUSER}/destinations_similarity"
44 | COPY $config "/home/${APPUSER}/flyte.config"
45 |
46 | # Download nltk files
47 | RUN python -c "import nltk; nltk.download('punkt'); nltk.download('stopwords')"
48 |
49 | # Tag the image
50 | ENV FLYTE_INTERNAL_IMAGE="$image"
51 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/Makefile:
--------------------------------------------------------------------------------
1 | open-docs:
2 | @sh scripts/open_docs.sh
3 |
4 | rebuild-docs:
5 | @sh scripts/rebuild_docs.sh
6 |
7 | stop-docs-server:
8 | @docker stop sphinx-nginx
--------------------------------------------------------------------------------
/projects/destinations_similarity/README.md:
--------------------------------------------------------------------------------
1 | # Destinations Similarity
2 |
3 | ### Short description
4 |
5 | Similar destination search.
6 |
7 | ### Problem statement
8 |
9 | Kinzinho is an adventurous dog who wants to know all about the destinations he could go to. To do so, Kinzinho decided to extract public data from Wikipedia and Wikivoyage to get to know them all! But now he has realized it's too much and wants some recommendations based on where he's traveled before. Can we help him?
10 |
11 | Tip: of course! Everything is 'paw-sible' when you are a dog! :D
12 |
13 | 
14 |
15 |
16 | ### Solution implementation
17 |
18 | The solution to the problem was to extract the public database of Brazilian cities from Wikidata and model the relevant characteristics of cities to build a unique representation of each city. From there, we were able to determine the similarities by calculating the distances between the
19 | vector representations.
20 |
21 | The system's workflow was implemented in Flyte and is shown below:
22 |
23 | 
24 |
25 | ### Detailed solution
26 |
27 | **Objective**: To help Kinzinho define his next travel destination, we seek to find other cities similar to the last travel destination he liked the most.
28 |
29 | **Strategy Solution**: To make a good evaluation between the cities, we chose to make a vector representation of each city in Brazil, encoding general information about each city such as its history, its geography, its climate and its tourist attractions. We chose this strategy because with a vector representation of the city we were able to apply similarity calculation operations between cities, considering various information about them.
30 |
31 | **Input data**: For our solution we use the following data from Wikipedia PT: summary, history, geography, climate; from Wikivoyage EN: summary, "See" section, "Do" section.
32 |
33 | **Preprocessing**: To process the data and extract only important information, we apply a series of pre-processing to clean unnecessary information and homogenize the texts.
34 |
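As an illustration, here is a minimal sketch of this kind of cleaning step (a hypothetical `clean_text` helper; the actual pipeline lives in `destinations_similarity/processing/text_preprocessing.py` and may differ in its exact rules):

    import re
    import unicodedata

    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize

    def clean_text(text: str, language: str = "portuguese") -> str:
        """Lowercase, strip accents/punctuation and drop stopwords."""
        # Remove accents, keeping only ASCII characters
        text = unicodedata.normalize("NFKD", text).encode("ascii", "ignore").decode()
        # Keep only letters and whitespace, lowercased
        text = re.sub(r"[^a-zA-Z\s]", " ", text.lower())
        # Tokenize and drop stopwords (assumes the nltk 'punkt' and 'stopwords'
        # resources were downloaded, as done in the project's Dockerfile)
        tokens = [
            token for token in word_tokenize(text, language=language)
            if token not in stopwords.words(language)
        ]
        return " ".join(tokens)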
35 | **Model**: To generate the best vector representation of each city's features, we used a pre-trained state-of-the-art model based on Transformers (BERTimbau). As a vector representation of each feature of the city, we use the output of the last layer of the BERTimbau language model. The vector representation of each city is generated from the average of the vectors of its features.
36 |
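For intuition, a simplified sketch of how such an embedding can be computed with the `transformers` library (assuming the public BERTimbau checkpoint `neuralmind/bert-base-portuguese-cased`; the workflow's actual model loading and pooling may differ):

    import torch
    from transformers import AutoModel, AutoTokenizer

    CHECKPOINT = "neuralmind/bert-base-portuguese-cased"  # assumed BERTimbau checkpoint
    tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)
    model = AutoModel.from_pretrained(CHECKPOINT)

    def embed(text: str) -> torch.Tensor:
        """Mean-pool the last hidden layer to get one vector per text feature."""
        inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze(0)

    # The city vector is the average of its per-feature vectors
    city_vector = torch.stack([embed("História..."), embed("Clima...")]).mean(dim=0)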
37 | **Similarity**: To calculate the similarity between the vector representations of each city, we use [Faiss](https://github.com/facebookresearch/faiss), a highly optimized similarity search library, to calculate the Euclidean distance between an input query vector (the vector of the last city visited by Kinzinho) and all the other city vectors available in our portfolio.
38 |
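A minimal sketch of this nearest-neighbor search with Faiss (toy data; the real index is built from the city embeddings produced by the workflow):

    import faiss
    import numpy as np

    # Toy example: 5 cities represented by 4-dimensional vectors
    city_vectors = np.random.rand(5, 4).astype("float32")

    index = faiss.IndexFlatL2(city_vectors.shape[1])  # exact Euclidean (L2) search
    index.add(city_vectors)

    query = city_vectors[:1]  # vector of the last city Kinzinho visited
    distances, neighbors = index.search(query, 3)
    print(neighbors)  # indices of the 3 closest cities (the query itself comes first)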
39 |
40 | ### Streamlit interface
41 |
42 | > ###### WARNING
43 | > It is important to note that our app on Streamlit does not have all the cities available in the original database, for design reasons. The original database has about 5,000 cities, while the one used in the app presents only about 400 cities, the ones with pages on Wikivoyage. However, with some adjustments and a properly provisioned environment, it is possible to extend this analysis to all other cities.
44 |
45 | The user interaction interface was built using the Streamlit tool. After local testing, the stable version of the tool was posted on Streamlit's public server. You can access the interface through the link below.
46 |
47 |
48 | ### Sphinx docs
49 |
50 | > ###### WARNING
51 | > This documentation was built with Unix operating systems (or executions using WSL) in mind.
52 |
53 | It is possible to generate detailed HTML documentation for the project through Sphinx automation, with an NGINX server hosting the static HTML pages.
54 |
55 | 
56 |
57 | There's not much mystery to building the HTML documentation; we've already automated most of it. Generally speaking, Sphinx creates static HTML pages from manually written documentation and from information embedded in the code itself. These generated static pages are then moved into a container running an NGINX image, which hosts the documentation site.
58 |
59 | To build the Docker image responsible for the documentation and start hosting the server, just run the command
60 |
61 | make open-docs
62 |
63 | > ###### WARNING
64 | > For this command to work, your system must be able to run Makefiles, and the working directory must be `projects/destinations_similarity`.
65 |
66 | Once the command has executed successfully, you can use the command below to check whether the container is running normally on your machine.
67 |
68 |
69 | docker ps
70 |
71 | the result should be
72 |
73 |
74 | $ docker ps
75 | CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
76 | ... nginx "/docker-entrypoint.…" 36 seconds ago Up 35 seconds 0.0.0.0:8080->80/tcp, :::8080->80/tcp sphinx-nginx
77 |
78 |
79 | ### Team responsible for the project
80 |
81 | If you want to get in touch with the team members, use the communication channels below.
82 |
83 | | | E-mail | Linkedin |
84 | |---------------- |------------------------- |------------------------------------------------------------ |
85 | | Sergio Junior | sergio.junior@hurb.com | https://www.linkedin.com/in/sergio-barreto-jr/ |
86 | | Renata Gotler | renata.gotler@hurb.com | https://www.linkedin.com/in/renata-gotler/ |
87 | | Matheus Moreno | matheus.moreno@hurb.com | https://www.linkedin.com/in/matheusfmoreno/ |
88 | | Patrick Braz | patrick.braz@hurb.com | https://www.linkedin.com/in/patrick-franco-braz-752948163/ |
89 |
90 | ### Acknowledgments
91 |
92 | Kinzinho and his humans would like to thank everyone involved in making this project possible. They would also like to thank [Hurb](https://us.hurb.com/?pos=us) for supporting and encouraging participation in the Hackathon as both training and recognition of the team's potential. And finally, they thank Kinzinho himself for making his humans' days better.
93 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/conf.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Configuration file for the Sphinx documentation builder.
3 | #
4 | # This file only contains a selection of the most common options. For a full
5 | # list see the documentation:
6 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
7 |
8 | # -- Path setup --------------------------------------------------------------
9 |
10 | # If extensions (or modules to document with autodoc) are in another directory,
11 | # add these directories to sys.path here. If the directory is relative to the
12 | # documentation root, use os.path.abspath to make it absolute, like shown here.
13 |
14 | import os
15 | import sys
16 | #sys.path.insert(0, os.path.abspath("."))
17 | sys.path.insert(0, "/home")
18 |
19 | # -- Project information -----------------------------------------------------
20 | #from nightswatch.version import __version__
21 |
22 | # The master toctree document.
23 | master_doc = "index"
24 |
25 | project = "Destination Similarity"
26 | copyright = "2022, hurb.com"
27 | author = "data.science@hurb.com"
28 | release = "0.0.1"
29 | # -- General configuration ---------------------------------------------------
30 |
31 | # Add any Sphinx extension module names here, as strings. They can be
32 | # extensions coming with Sphinx (named "sphinx.ext.*") or your custom
33 | # ones.
34 | extensions = [
35 | "sphinx_rtd_theme",
36 | "sphinx.ext.autodoc",
37 | "sphinx.ext.viewcode",
38 | "sphinx.ext.napoleon",
39 | "sphinx.ext.autosummary"
40 | ]
41 |
42 | # Add any paths that contain templates here, relative to this directory.
43 | templates_path = ["_templates"]
44 |
45 | # List of patterns, relative to source directory, that match files and
46 | # directories to ignore when looking for source files.
47 | # This pattern also affects html_static_path and html_extra_path.
48 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store", "env", ".flyte", "scripts"]
49 |
50 | # The suffix(es) of source filenames.
51 | # You can specify multiple suffix as a list of string:
52 | # source_suffix = ['.rst', '.md']
53 | source_suffix = [".rst"]
54 |
55 | # If true, the current module name will be prepended to all description
56 | # unit titles (such as .. function::).
57 | add_module_names = True
58 |
59 | # A boolean that decides whether codeauthor and sectionauthor directives produce any output in the
60 | # built files.
61 | show_authors = True
62 |
63 | suppress_warnings = [
64 | ]
65 |
66 | autodoc_mock_imports = ["flytekit", "faiss", "torch", "requests", "BeautifulSoup", "pandas", "streamlit", "numpy", "bs4",
67 |                         "docker", "git", "typer", "sklearn", "argparse", "transformers", "swifter", "nltk",
68 | "unidecode", "deep_translator"]
69 |
70 | # -- Options for HTML output -------------------------------------------------
71 |
72 | # The theme to use for HTML and HTML Help pages. See the documentation for
73 | # a list of builtin themes.
74 | #
75 | html_theme = "sphinx_rtd_theme"
76 |
77 | html_theme_options = {
78 | "analytics_anonymize_ip": False,
79 | "logo_only": False,
80 | "display_version": True,
81 | "prev_next_buttons_location": "bottom",
82 | "style_external_links": True,
83 | "vcs_pageview_mode": "",
84 | "style_nav_header_background": "gray",
85 | "collapse_navigation": False,
86 | "sticky_navigation": True,
87 | "navigation_depth": 4,
88 | "includehidden": True,
89 | "titles_only": False
90 | }
91 |
92 | html_logo = "docs/images/vamoDalheLogo.jpeg"
93 |
94 | # Add any paths that contain custom static files (such as style sheets) here,
95 | # relative to this directory. They are copied after the builtin static files,
96 | # so a file named "default.css" will overwrite the builtin "default.css".
97 | html_static_path = ["_static"]
98 |
99 | # ---------------------------------------------------
100 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/dashboard/app.py:
--------------------------------------------------------------------------------
1 | """Module for the Streamlit app."""
2 |
3 | # pylint: disable=no-value-for-parameter
4 |
5 | import os
6 | import sys
7 | import logging
8 | from argparse import ArgumentParser
9 |
10 | from typing import List
11 | import faiss
12 | import streamlit as st
13 | import pandas as pd
14 | import numpy as np
15 | from PIL import Image
16 |
17 | GCS_BUCKET_PATH = "https://storage.googleapis.com/dsc-public-info/datasets/"
18 | EMBEDDINGS_FILENAME = "flytelab_embeddings.parquet"
19 | DATASET_FILENAME = "flytelab_dataset.parquet"
20 |
21 | CURRENT_DIRECTORY = os.path.dirname(os.path.abspath(__file__))
22 |
23 | # Logging config
24 | LOGGER = logging.getLogger(__name__)
25 |
26 | logging.basicConfig(
27 | stream=sys.stdout,
28 | level=logging.INFO,
29 | format="[%(asctime)s] %(name)s: %(levelname)s | %(message)s"
30 | )
31 |
32 |
33 | def retrieve_dataframe_from_remote(dataset_name: str) -> pd.DataFrame:
34 | """Retrieve a dataset saved as Parquet from remote."""
35 | return pd.read_parquet(GCS_BUCKET_PATH + dataset_name)
36 |
37 |
38 | def get_k_nearest_neighbors(
39 | embeddings: pd.DataFrame, k_neighbors: int, city_name: str, state_name: str
40 | ) -> pd.DataFrame:
41 | """Retrieve the k-nearest neighbors of a city.
42 |
43 | Args:
44 | embeddings (pd.DataFrame): city vectors
45 |         k_neighbors (int): number of similar cities to present
46 | city_name (str): last city visited
47 | state_name (str): last state visited
48 | Returns:
49 | pd.DataFrame: the cities most similar to city_name
50 | """
51 | # Retrieve vectors to search
52 | vec_name = embeddings[~(
53 | (embeddings['city'] == city_name) & (embeddings['state'] == state_name)
54 | )].reset_index(drop=True)
55 | vec = vec_name.drop(['city', 'state'], axis=1)
56 |
57 | # Initialize faiss
58 | index = faiss.IndexFlatL2(vec.shape[1])
59 | index.add(np.ascontiguousarray(np.float32(vec.values)))
60 |
61 | # Build query
62 | query = embeddings[(
63 | (embeddings['city'] == city_name) & (embeddings['state'] == state_name)
64 | )].drop(['city', 'state'], axis=1).values
65 | query = np.float32(query)
66 |
67 | # Retrieve k-nearest neighbors
68 | _, indexes = index.search(query, k_neighbors)
69 | nearest = vec_name[['city', 'state']].iloc[indexes[0]]
70 |
71 | return nearest
72 |
73 |
74 | def build_output(
75 | dataset: pd.DataFrame, nearest_cities: pd.DataFrame,
76 | columns_to_retrieve: List[str]
77 | ) -> str:
78 | """Build the output text of inference.
79 |
80 | Args:
81 |         dataset (pd.DataFrame): dataset scraped from Wikipedia and Wikivoyage
82 | nearest_cities (pd.DataFrame): output model of the nearest cities
83 | columns_to_retrieve (List[str]): list of columns to add to output
84 |
85 | Returns:
86 | str: Markdown-formatted text
87 | """
88 | output = ""
89 | default_desc = (
90 | "\nOops... Unfortunately we don't have records for this city. "
91 | "\U0001F615\n"
92 | )
93 |
94 | for _, row in nearest_cities.iterrows():
95 | output += f"\n## {row.city}, {row.state}\n"
96 |
97 | pois_suggestion = dataset[
98 | (dataset['city'] == row.city) & (dataset['state'] == row.state)
99 | ][columns_to_retrieve].iloc[0]
100 |
101 | for column in columns_to_retrieve:
102 | section = ' '.join(column.split('_')[:-2]).capitalize()
103 | output += (
104 | f"\n### {section}"
105 | f"\n{pois_suggestion[column] or default_desc}"
106 | )
107 |
108 | return output
109 |
110 |
111 | if __name__ == '__main__':
112 | # Retrieve arguments
113 | parser = ArgumentParser()
114 | parser.add_argument("--remote", action="store_true")
115 | args = parser.parse_args()
116 | backend = os.getenv(
117 | "FLYTE_BACKEND", 'remote' if args.remote else 'sandbox')
118 |
119 | # Retrieve datasets from remote
120 | embs_df = retrieve_dataframe_from_remote(EMBEDDINGS_FILENAME)
121 | wiki_df = retrieve_dataframe_from_remote(DATASET_FILENAME)
122 |
123 | # App definition
124 | st.write(
125 | "# Flytelab: Destinations Similarity\n"
126 |         "Kinder is an adventurous dog who loves to travel! He especially "
127 |         "enjoys nature spots: beaches, waterfalls, trails and more, which "
128 |         "Brazil surely has in abundance. He wants experiences in other "
129 | "cities but he doesn't know where to go.\n"
130 | "## So he is now asking, **where should I go next**?"
131 | )
132 |
133 | beach_kinder = Image.open(
134 | os.path.join(CURRENT_DIRECTORY, 'beach_kinder.jpeg'))
135 | st.image(beach_kinder, caption='Kinder in love with the beach')
136 |
137 | st.write(
138 | "Help Kinder by selecting a city you like in Brazil below so we can "
139 | "recommend similar places that he will most certainly enjoy!"
140 | )
141 |
142 | # Select city, state, and n of recommendations
143 | desired_state = st.selectbox(
144 | 'From state...',
145 | embs_df['state'].unique().tolist()
146 | )
147 | desired_city = st.selectbox(
148 | 'I like the city:',
149 | embs_df[embs_df['state'] == desired_state]['city'].unique().tolist()
150 | )
151 |
152 | n_cities = st.slider('How many recommendations do you want?', 1, 30, 5)
153 |
154 | # Get recommendations
155 | cities_recommended = get_k_nearest_neighbors(
156 | embeddings=embs_df, k_neighbors=n_cities,
157 | city_name=desired_city, state_name=desired_state
158 | )
159 |
160 | st.write("## So, where next?")
161 | st.write(build_output(
162 | dataset=wiki_df, nearest_cities=cities_recommended,
163 | columns_to_retrieve=[
164 | 'summary_wikivoyage_en'
165 | ]
166 | ))
167 |
168 | kinder = Image.open(os.path.join(CURRENT_DIRECTORY, 'kinder.jpeg'))
169 | st.image(kinder, caption='The marvelous Kinder')
170 |
171 | st.write(
172 | "We hope you enjoy the recommendations! See you on your next trip."
173 | )
174 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/dashboard/beach_kinder.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/dashboard/beach_kinder.jpeg
--------------------------------------------------------------------------------
/projects/destinations_similarity/dashboard/kinder.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/dashboard/kinder.jpeg
--------------------------------------------------------------------------------
/projects/destinations_similarity/dashboard/requirements.txt:
--------------------------------------------------------------------------------
1 | flytekit>=0.30.3
2 | pandas>=1.3.5
3 | requests~=2.27.1
4 | faiss-cpu~=1.7.2
5 | numpy>=1.21.5
6 | deep-translator~=1.8.3
7 | streamlit
--------------------------------------------------------------------------------
/projects/destinations_similarity/deploy.py:
--------------------------------------------------------------------------------
1 | """Deployment script for Flyte projects."""
2 |
3 | import os
4 | import uuid
5 | import subprocess
6 | from pathlib import Path
7 |
8 | import docker
9 | import git
10 | import typer
11 |
12 |
13 | IMAGE_NAME = "flytelab"
14 | REGISTRY = "ghcr.io/patrickfbraz".lower()
15 | PROJECT_NAME = "vamos-dalhe"
16 | DESCRIPTION = "Hurb project to the Flyte Hackathon"
17 |
18 |
19 | app = typer.Typer()
20 |
21 | docker_client = docker.from_env()
22 |
23 |
24 | def create_project(remote: bool):
25 | """Create project on Flyte cluster."""
26 | config_type = 'remote' if remote else 'sandbox'
27 | config = Path(".flyte") / f"{config_type}-config.yaml"
28 |
29 | output = subprocess.run(
30 | [
31 | "flytectl", "get", "project", PROJECT_NAME,
32 | "--config", config,
33 | ],
34 | capture_output=True,
35 | check=True,
36 | )
37 |
38 | if not output.stdout.decode().strip():
39 | typer.echo(f"Creating project {PROJECT_NAME}...")
40 | subprocess.run(
41 | [
42 | "flytectl", "create", "project",
43 | "--project", PROJECT_NAME,
44 | "--name", PROJECT_NAME,
45 | "--id", PROJECT_NAME,
46 | "--description", DESCRIPTION,
47 | "--config", config,
48 | ],
49 | check=True,
50 | )
51 |
52 |
53 | def get_version(fast: bool):
54 | """Get git version of code."""
55 | repo = git.Repo(".", search_parent_directories=True)
56 |
57 | if not fast and repo.is_dirty():
58 | typer.echo(
59 | "Please commit git changes before building. If you haven't updated"
60 | " any system/python dependencies but want to deploy task/workflow "
61 | "code changes, use the --fast flag to do fast registration.",
62 | err=True
63 | )
64 | raise typer.Exit(code=1)
65 |
66 | commit = repo.rev_parse("HEAD")
67 | return commit.hexsha
68 |
69 |
70 | def get_tag(version, registry=None):
71 | """Get the tag of the project's image."""
72 | return (
73 | f"{REGISTRY if registry is None else registry}/{IMAGE_NAME}:"
74 | f"{PROJECT_NAME}-{version}"
75 | )
76 |
77 |
78 | def sandbox_docker_build(tag):
79 | """Build image on the sandbox cluster."""
80 | typer.echo("Building image in Flyte sandbox...")
81 | subprocess.run(
82 | [
83 | "flytectl", "sandbox", "exec", "--",
84 | "docker", "build", ".", "--tag", tag,
85 | ],
86 | check=True,
87 | )
88 |
89 |
90 | def docker_build(tag: str, remote: bool) -> docker.models.images.Image:
91 | """Build the image locally."""
92 | client = docker.from_env()
93 |
94 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}.config"
95 |
96 | typer.echo(f"Building image: {tag}...")
97 | image, build_logs = client.images.build(
98 | path=".", dockerfile="Dockerfile", tag=tag,
99 | buildargs={"image": tag, "config": str(config)}, rm=True,
100 | )
101 |
102 | for line in build_logs:
103 | typer.echo(line)
104 |
105 | return image
106 |
107 |
108 | def docker_push(image: docker.models.images.Image):
109 | """Push the image to the remote registry."""
110 | for line in docker_client.api.push(
111 | image.tags[0], stream=True, decode=True
112 | ):
113 | typer.echo(line)
114 |
115 |
116 | def serialize(tag: str, remote: bool, fast: bool):
117 | """Perform serialization of source code."""
118 | typer.echo("Serializing Flyte workflows...")
119 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}.config"
120 |
121 | package = Path(".") / "flyte-package.tgz"
122 | if package.exists():
123 | os.remove(package)
124 |
125 | subprocess.run(
126 | [
127 | "pyflyte", "-c", str(config),
128 | "--pkgs", "destinations_similarity",
129 | "package", "--force", "--image", tag,
130 | *(
131 | ["--fast"] if fast
132 | else ["--in-container-source-path", "/home/flyte"]
133 | ),
134 | ],
135 | check=True,
136 | # Inject the FLYTE_SANDBOX env variable to the serialization runtime
137 | env={"FLYTE_SANDBOX": "1" if not remote else "0", **os.environ},
138 | )
139 |
140 |
141 | def register(version: str, remote: bool, fast: bool, domain: str):
142 | """Register workflows to cluster."""
143 | typer.echo("Registering Flyte workflows...")
144 | config_type = 'remote' if remote else 'sandbox'
145 | config = Path(".flyte") / f"{config_type}-config.yaml"
146 |
147 | if fast:
148 | version = f"{version}-fast{uuid.uuid4().hex[:7]}"
149 |
150 | subprocess.run(
151 | [
152 | "flytectl", "-c", config, "register", "files",
153 | "--project", PROJECT_NAME,
154 | "--domain", domain,
155 | "--archive", "flyte-package.tgz",
156 | "--force",
157 | "--version", version
158 | ],
159 | check=True,
160 | )
161 | typer.echo(f"Successfully registered version {version}.")
162 |
163 |
164 | @app.command()
165 | def main(
166 | remote: bool = False, fast: bool = False, domain: str = "development",
167 | registry: str = None
168 | ) -> None:
169 | """Deploy Flyte workflows locally or remotely."""
170 |     if remote and fast:
171 |         typer.echo(
172 |             "Fast registration is not enabled when deploying to remote. "
173 |             "Please deploy your workflows without the --fast flag.",
174 |             err=True
175 |         )
176 |         raise typer.Exit(code=1)
176 |
177 | create_project(remote)
178 | version = get_version(fast)
179 | tag = get_tag(version, registry)
180 | if not fast:
181 | if remote:
182 | docker_push(docker_build(tag, remote))
183 | else:
184 | sandbox_docker_build(tag)
185 | serialize(tag, remote, fast)
186 | register(version, remote, fast, domain)
187 |
188 |
189 | if __name__ == "__main__":
190 | app()
191 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/destinations_similarity/__init__.py:
--------------------------------------------------------------------------------
1 | """Package with the source code for the destinations_similarity project."""
2 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/destinations_similarity/processing/__init__.py:
--------------------------------------------------------------------------------
1 | """Processing submodule for the project."""
2 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/destinations_similarity/processing/feature_engineering.py:
--------------------------------------------------------------------------------
1 | """Feature engineering for the data."""
2 |
3 | import torch
4 | import pandas as pd
5 | from transformers import AutoTokenizer, AutoModel
6 |
7 |
8 | BASE_MODEL = 'neuralmind/bert-base-portuguese-cased'
9 |
10 |
11 | class TextVectorizer():
12 | """Class used to vectorize text."""
13 |
14 | def __init__(self, model: str = BASE_MODEL) -> None:
15 | """Initialize class.
16 |
17 | Args:
18 | model (str): huggingface path
19 | """
20 | self.tokenizer = AutoTokenizer.from_pretrained(
21 | model, do_lower_case=False)
22 | self.model = AutoModel.from_pretrained(model)
23 |
24 | def encode_inputs(self, series_text: pd.Series) -> torch.Tensor:
25 | """Encode inputs.
26 |
27 | Args:
28 | series_text (pd.Series): text to be vectorized
29 |
30 | Returns:
31 | torch.tensor: tokens ids
32 | """
33 | input_ids = self.tokenizer(
34 | list(series_text), padding=True, truncation=True,
35 | max_length=256, return_tensors="pt", add_special_tokens=True
36 | )
37 | return input_ids
38 |
39 | def get_df_embedding(self, input_ids: pd.Series) -> pd.DataFrame:
40 | """Generate DataFrame with all text vector representations.
41 |
42 | Args:
43 | input_ids (torch.tensor): tokens ids
44 |
45 | Returns:
46 | pd.DataFrame: input id vectors
47 | """
48 | with torch.no_grad():
49 | outs = self.model(
50 | input_ids['input_ids']
51 | )[0][:, 1:-1, :].mean(axis=1).cpu().numpy()
52 | return pd.DataFrame(outs)
53 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/destinations_similarity/processing/text_preprocessing.py:
--------------------------------------------------------------------------------
1 | """Text preprocessing tools."""
2 |
3 | # pylama: ignore=W0611
4 | # pylint: disable=unused-import,broad-except
5 |
6 | import re
7 | from typing import List
8 | from string import punctuation
9 |
10 | import swifter
11 | import pandas as pd
12 | from nltk.tokenize import word_tokenize
13 | from nltk.corpus import stopwords
14 | from unidecode import unidecode
15 | from deep_translator import GoogleTranslator
16 |
17 |
18 | def lower_text(text: str) -> str:
19 | """Lower a text.
20 |
21 | Args:
22 | text (str): text to be lowered
23 |
24 | Returns:
25 | str: lower text
26 | """
27 | return text.lower()
28 |
29 |
30 | def clean_text(texts: str) -> str:
31 | """Remove unnecessary parts of the text.
32 |
33 | Args:
34 | text (str): text to be cleaned
35 |
36 | Returns:
37 | str: cleaned text
38 | """
39 | # Remove empty lines
40 | clean_empt_msg = re.compile(r'\n\s*\n')
41 | text = re.sub(clean_empt_msg, " ", texts)
42 |
43 | # Transliterate into ASCII
44 | text = unidecode(text)
45 |
46 | # Remove API mensage
47 | clean_msg = re.compile(r'\(.*?\)')
48 | text = re.sub(clean_msg, ' ', text)
49 |
50 | # Remove HTML characteres
51 | cleanr = re.compile(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
52 | text = re.sub(cleanr, ' ', text)
53 |
54 | # Remove punctuations and numbers
55 | clean_pontuation = re.compile(r'[^a-zA-Z]')
56 | text = re.sub(clean_pontuation, ' ', text)
57 |
58 | # Single character removal
59 | clean_char = re.compile(r"\s+[a-zA-Z]\s+")
60 | text = re.sub(clean_char, ' ', text)
61 |
62 | # Removing multiple spaces
63 | clean_space = re.compile(r'\s+')
64 | text = re.sub(clean_space, ' ', text)
65 |
66 | return text
67 |
68 |
69 | def remove_stopwords(
70 | list_tokens: List[str],
71 | stopword_list: List[str] = stopwords.words('portuguese')
72 | ) -> str:
73 | """Remove stopwords of the text.
74 |
75 | Args:
76 | list_tokens (List[str]): list of sentence tokens
77 | stopword_list (List[str], optional): list of stopwords. Defaults to
78 | nltk's portuguese stopwords.
79 |
80 | Returns:
81 | List[str]: text without stopwords
82 | """
83 | stopword = (
84 | stopword_list +
85 | list(punctuation) +
86 | ["\n", 'municipio', 'clima']
87 | )
88 |
89 | txt_wo_stopwords = filter(lambda item: item not in stopword, list_tokens)
90 | return " ".join(txt_wo_stopwords)
91 |
92 |
93 | def tokenizer(text: str) -> List[str]:
94 | """Tokenize the text.
95 |
96 | Args:
97 | text (str): text to be tokenized
98 |
99 | Returns:
100 | List[str]: list of sentence tokens
101 | """
102 | return word_tokenize(text)
103 |
104 |
105 | def preprocess_text(dataframe: pd.DataFrame, column_name: str) -> pd.Series:
106 | """Execute all of the preprocess methods.
107 |
108 | Args:
109 | dataframe (pd.DataFrame): dataframe with column to be processed
110 | column_name (str): column name to be processed
111 |
112 | Returns:
113 | pd.Series: column processed
114 | """
115 | aux = dataframe[column_name].str.lower()
116 | aux = aux.swifter.apply(lambda x: clean_text(str(x)))
117 | aux = aux.swifter.apply(
118 | lambda x: remove_stopwords(list_tokens=tokenizer(x)))
119 | return aux
120 |
121 |
122 | def translate_description_series(
123 | dataframe: pd.DataFrame, column_name: str, target_lang: str = 'pt'
124 | ) -> pd.Series:
125 | """Translate columns to another language.
126 |
127 | Args:
128 | dataframe (pd.DataFrame): dataframe with column to be translated
129 | column_name (str): column name to be translated
130 | target_lang (str): taget language
131 |
132 | Returns:
133 | pd.Series: column translated
134 | """
135 | dataframe[column_name] = dataframe[column_name].fillna("")
136 | dataframe[column_name] = dataframe[column_name].swifter.apply(
137 | lambda x: translate_description(x, target_lang)
138 | if isinstance(x, str) else x
139 | )
140 | return dataframe[column_name]
141 |
142 |
143 | def translate_description(text: str, target_lang: str = 'pt') -> str:
144 | """Translate non-portuguese text.
145 |
146 | Args:
147 | text (str): column name to be translated
148 | target_lang (str): taget language
149 |
150 | Returns:
151 | str: text translated
152 | """
153 | try:
154 | return GoogleTranslator(
155 | source='auto', target=target_lang).translate(text)
156 | except Exception:
157 | return text
158 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/destinations_similarity/scraper/__init__.py:
--------------------------------------------------------------------------------
1 | """Scraper submodule for the project."""
2 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/destinations_similarity/scraper/brazilian_cities.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | """Module used to extract the base data from the wikis."""
3 |
4 | import json
5 | from typing import Any
6 |
7 | import requests
8 | import pandas as pd
9 |
10 |
11 | WIKIDATA_ENDPOINT = 'https://query.wikidata.org/sparql'
12 |
13 | WIKIDATA_QUERY = """
14 | PREFIX schema: <http://schema.org/>
15 |
16 | SELECT ?cityLabel ?stateLabel ?wikivoyageLabel ?wikipediaLabel WHERE {
17 | ?city wdt:P31 wd:Q3184121;
18 | wdt:P131 ?state.
19 | OPTIONAL {
20 | ?wikipedia schema:about ?city.
21 | ?wikipedia schema:isPartOf <https://pt.wikipedia.org/>;
22 | schema:name ?wikipediaLabel.
23 | }
24 | OPTIONAL {
25 | ?wikivoyage schema:about ?city.
26 | ?wikivoyage schema:isPartOf <https://en.wikivoyage.org/>;
27 | schema:name ?wikivoyageLabel.
28 | }
29 | SERVICE wikibase:label {
30 | bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en".
31 | }
32 | }
33 | """
34 |
35 |
36 | def get_dataframe(df_object: object, **kwargs) -> pd.DataFrame:
37 | """Generate a pandas DataFrame from a DataFrame-like object.
38 |
39 | Args:
40 | df_object (object): A DataFrame-like object (dict, list, etc).
41 |
42 | Returns:
43 | pd.DataFrame: The DataFrame.
44 | """
45 | dataframe = pd.DataFrame(df_object)
46 |
47 | if kwargs.get('generate_city_id'):
48 | dataframe['city_id'] = [(row + 1) for row in range(dataframe.shape[0])]
49 |
50 | return dataframe
51 |
52 |
53 | def get_brazilian_cities_data(save_data: callable, *args, **kwargs) -> Any:
54 | """Get data from brazilian cities from Wikimedia pages.
55 |
56 | Args:
57 | save_data (callable): Function to process the retrieved data.
58 |
59 | Returns:
60 | Any: Type returned by save_data.
61 | """
62 | request = requests.get(
63 | WIKIDATA_ENDPOINT,
64 | params={
65 | 'query': WIKIDATA_QUERY,
66 | 'format': 'json',
67 | },
68 | allow_redirects=True,
69 | stream=True,
70 | )
71 |
72 | response = json.loads(request.text)
73 | cities_raw = response['results']['bindings']
74 |
75 | cities = sorted([{
76 | 'city': elem.get('cityLabel', {}).get('value'),
77 | 'state': elem.get('stateLabel', {}).get('value'),
78 | 'title_wikipedia_pt': elem.get('wikipediaLabel', {}).get('value'),
79 | 'title_wikivoyage_en': elem.get('wikivoyageLabel', {}).get('value'),
80 | } for elem in cities_raw], key=lambda x: x['city'])
81 |
82 | return save_data(cities, *args, **kwargs)
83 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/destinations_similarity/scraper/extractor.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | """Base driver to scrape data from Wikimedia websites."""
3 |
4 | import re
5 | import json
6 | from typing import Dict, List
7 |
8 | import requests
9 | from bs4 import BeautifulSoup
10 |
11 |
12 | APPLICATION_HEADERS = {
13 | 'User-Agent': 'destinations_similarity/0.1'
14 | }
15 |
16 |
17 | class WikiExtractor(object):
18 | """Class for extracting content from Wikimedia."""
19 |
20 | def __init__(self, wiki: str, lang: str):
21 | """Initialize driver."""
22 | self.wiki = wiki
23 | self.lang = lang
24 | self.rest_url = f"https://{lang}.{wiki}.org/api/rest_v1"
25 |
26 | # Create Session object for faster retrieval
27 | self.session = requests.Session()
28 | self.session.headers.update(APPLICATION_HEADERS)
29 |
30 | @classmethod
31 | def clean_content(cls, text: str, tags: List[str] = None) -> List[str]:
32 | """Remove HTML tags and citations from text.
33 |
34 | Args:
35 | text (str): The text to be cleaned.
36 | tags (List[str], optional): List of tags to be extracted.
37 | Defaults to ['p', 'li'].
38 |
39 | Returns:
40 | List[str]: A list with each piece of text extracted from the
41 | specified tags.
42 | """
43 | tags = tags or ['p', 'li']
44 | soup = BeautifulSoup(text, "html.parser")
45 | return [
46 | re.sub(r'\[.*?\]|<.*?>', '', str(x)).strip()
47 | for x in soup.find_all(tags)
48 | ]
49 |
50 | def extract_images(self, page: str) -> List[str]:
51 | """Retrieve images (as links) for a specified page.
52 |
53 | Args:
54 | page (str): The name of the page.
55 |
56 | Returns:
57 | List[str]: A list with the URLs of the images.
58 | """
59 | request = self.session.get(f"{self.rest_url}/page/media-list/{page}")
60 | response = json.loads(request.text)
61 | items = response.get('items', [])
62 |
63 | images_links = []
64 |
65 | for item in items:
66 | if item['type'] == 'image' and 'srcset' in item:
67 | images_links += [f"https:{item['srcset'][0]['src']}"]
68 |
69 | return images_links
70 |
71 | def extract_content_raw(
72 | self, page: str, summary: bool, sections: List[str] = None
73 | ) -> Dict[str, str]:
74 | """Retrieve the HTML-formatted sections from a page.
75 |
76 | Args:
77 | page (str): The name of the page.
78 | summary (bool): Boolean that specifies if the summary for the page
79 | must be retrieved.
80 | sections (List[str], optional): A list of sections to be retrieved.
81 | Defaults to None.
82 |
83 | Returns:
84 | Dict[str, str]: A dictionary with the sections, where each key is
85 | the section name.
86 | """
87 | sections = sections or []
88 |
89 | request = self.session.get(
90 | f"{self.rest_url}/page/mobile-sections/{page}")
91 | response = json.loads(request.text)
92 |
93 | sections_data = {}
94 |
95 | # Retrieve summary
96 | if summary and 'lead' in response:
97 | sections_data['summary'] = response['lead']['sections'][0]['text']
98 |
99 | # Retrieve sections and subsections (with HTML tags)
100 | if sections and 'remaining' in response:
101 | page_sections = response['remaining']['sections']
102 |
103 | # Get index of sections found
104 | idx_sections_found = [
105 | i for i, section in enumerate(page_sections)
106 | if section.get('line') in sections
107 | ]
108 |
109 | # Get level of each section, to identify subsections
110 | levels = [section.get('toclevel', -2) for section in page_sections]
111 |
112 | for start in idx_sections_found:
113 | try:
114 | # Get index next section at same toclevel
115 | end = next(
116 | i + (start + 1)
117 | for i, level in enumerate(levels[start + 1:])
118 | if level <= levels[start]
119 | )
120 | except StopIteration: # End of page reached
121 | end = len(page_sections)
122 |
123 | # Update dictionary
124 | sections_data[page_sections[start]['line']] = '\n'.join([
125 | subsection.get('text', '')
126 | for subsection in page_sections[start:end]
127 | ])
128 |
129 | return sections_data
130 |
131 | def extract_content(
132 | self, page: str, summary: bool, sections: List[str] = None,
133 | sections_tags: Dict[str, List[str]] = None,
134 | section_types: Dict[str, str] = None
135 | ) -> Dict[str, str]:
136 | """Retrieve formatted (clean) text from Wikipedia."""
137 | sections = sections or []
138 | results = self.extract_content_raw(page, summary, sections)
139 |
140 | # Get tags to keep and types to convert
141 | sections_tags = {
142 |             section: (sections_tags or {}).get(section, ['p', 'li'])
143 | for section in ['summary'] + sections
144 | }
145 | section_types = {
146 |             section: (section_types or {}).get(section, 'str')
147 | for section in ['summary'] + sections
148 | }
149 |
150 | # Clean the sections
151 | for section in results:
152 | if section_types[section] == 'list':
153 | results[section] = self.clean_content(
154 | results[section], tags=sections_tags[section])
155 | elif section_types[section] == 'str':
156 | results[section] = '\n'.join(self.clean_content(
157 | results[section], tags=sections_tags[section]))
158 | else:
159 | raise NotImplementedError(
160 | f"No implementation for this type of output: "
161 | f"{section_types[section]}"
162 | )
163 |
164 | return results
165 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/destinations_similarity/tasks.py:
--------------------------------------------------------------------------------
1 | """Tasks for the destinations_similarity Flyte project."""
2 |
3 | import sys
4 | import logging
5 | from typing import List, Dict, Tuple
6 |
7 | import torch
8 | import pandas as pd
9 | import numpy as np
10 | from unidecode import unidecode
11 | from flytekit import task, Resources
12 | from flytekit.types.file import FlyteFile
13 |
14 | from destinations_similarity.scraper.extractor import WikiExtractor
15 | from destinations_similarity.scraper.brazilian_cities import (
16 | get_brazilian_cities_data, get_dataframe)
17 | from destinations_similarity.processing.text_preprocessing import (
18 | translate_description_series, preprocess_text)
19 | from destinations_similarity.processing.feature_engineering import (
20 | TextVectorizer)
21 |
22 |
23 | # Logging config
24 | LOGGER = logging.getLogger(__name__)
25 |
26 | logging.basicConfig(
27 | stream=sys.stdout,
28 | level=logging.INFO,
29 | format="[%(asctime)s] %(name)s: %(levelname)s | %(message)s"
30 | )
31 |
32 | # Flyte configuration
33 | LIGHT_RESOURCES = Resources(cpu="0.5", mem="1Gi")
34 | BASE_RESOURCES = Resources(cpu="1", mem="2Gi")
35 | INTENSIVE_RESOURCES = Resources(cpu="2", mem="16Gi")
36 |
37 |
38 | @task(retries=3, requests=LIGHT_RESOURCES)
39 | def get_base_data(generate_city_id: bool) -> pd.DataFrame:
40 | """Retrieve base data for the dataset.
41 |
42 | Args:
43 | generate_city_id (bool): Informs if an ID must be generated for each
44 | row of the dataset.
45 |
46 | Returns:
47 | pd.DataFrame: Base dataset.
48 | """
49 | return get_brazilian_cities_data(
50 | get_dataframe, generate_city_id=generate_city_id)
51 |
52 |
53 | @task(retries=3, requests=LIGHT_RESOURCES)
54 | def scrap_wiki(
55 | base_data: pd.DataFrame, wiki: str, lang: str, summary: bool,
56 | sections: List[str], sections_tags: Dict[str, List[str]],
57 | sections_types: Dict[str, str]
58 | ) -> pd.DataFrame:
59 | """Scrap a Wikimedia page for info.
60 |
61 | Args:
62 | base_data (pd.DataFrame): Base dataset.
63 | wiki (str): Type of wiki ('wikipedia', 'wikivoyage').
64 | lang (str): Language of wiki.
65 | summary (bool): If the summary must be retrieved.
66 | sections (List[str]): Which sections must be retrieved.
67 | sections_tags (Dict[str, List[str]]): Which HTML tags must be preserved
68 | for a given section.
69 | sections_types (Dict[str, str]): How each section will be
70 | saved on the dataset, 'str' or 'list'.
71 |
72 | Returns:
73 | pd.DataFrame: The updated dataset.
74 | """
75 | # Initialize scraper
76 | extractor = WikiExtractor(wiki=wiki, lang=lang)
77 |
78 | # Setup fields for the sections
79 | sections_fields = {
80 | section: f"{section}_{wiki}_{lang}".lower().replace(' ', '_')
81 | for section in ['summary'] + sections
82 | }
83 |
84 | # Initialize dataset
85 | dataset = base_data.copy()
86 | dataset[f"images_{wiki}_{lang}"] = [[] for _ in range(len(dataset))]
87 | for section, field in sections_fields.items():
88 | dataset[field] = (
89 | [[] for _ in range(len(dataset))]
90 | if sections_types.get(section) == 'list' else ""
91 | )
92 |
93 | # Retrieve data for each city
94 | for i, row in dataset.iterrows():
95 | page_name = row[f"title_{wiki}_{lang}"]
96 |
97 | # Set content
98 | page_content = extractor.extract_content(
99 | page_name, summary=summary, sections=sections,
100 | sections_tags=sections_tags, section_types=sections_types
101 | )
102 | for section, text in page_content.items():
103 | dataset.at[i, sections_fields[section]] = text
104 |
105 | # Set images links
106 | page_images = extractor.extract_images(page_name)
107 | dataset.at[i, f"images_{wiki}_{lang}"] = page_images
108 |
109 | return dataset
110 |
111 |
112 | @task(cache=True, cache_version='1.0', requests=LIGHT_RESOURCES)
113 | def merge_dataframes(
114 | df_x: pd.DataFrame, df_y: pd.DataFrame, join: str
115 | ) -> pd.DataFrame:
116 | """Merge two DataFrames together.
117 |
118 | Args:
119 | df_x (pd.DataFrame): First DataFrame.
120 | df_y (pd.DataFrame): Second DataFrame.
121 | join (str): The type of merge, 'inner' or 'outer'.
122 |
123 | Returns:
124 | pd.DataFrame: The concatenation of the DataFrames.
125 | """
126 | df_y_columns = df_y.columns.difference(df_x.columns)
127 | return pd.concat([df_x, df_y[df_y_columns]], axis=1, join=join)
128 |
129 |
130 | @task(cache=True, cache_version='1.0')
131 | def check_if_remote(uri: str) -> Tuple[bool, FlyteFile]:
132 | """Check if a URI points to a remote file."""
133 | if uri:
134 | return True, uri
135 | return False, uri
136 |
137 |
138 | @task(retries=3, requests=LIGHT_RESOURCES)
139 | def retrieve_dataset_from_remote(uri: FlyteFile) -> pd.DataFrame:
140 | """Retrieve a dataset from a remote URL.
141 |
142 | Args:
143 |         uri (FlyteFile): Remote address of the dataset. Must be a Parquet file.
144 |
145 | Returns:
146 | pd.DataFrame: DataFrame with the dataset.
147 | """
148 | # Download file if it has a remote source
149 | if uri.remote_source is not None:
150 | uri.download()
151 |
152 | dataset_df = pd.read_parquet(uri.path)
153 | dataset_df.columns = dataset_df.columns.astype(str)
154 | LOGGER.info("Retrieved dataset from '%s'.", uri.remote_source or uri.path)
155 | return dataset_df
156 |
157 |
158 | @task(cache=True, cache_version='1.0', requests=BASE_RESOURCES)
159 | def preprocess_input_data(
160 | dataframe: pd.DataFrame, columns_to_translate: List[str],
161 | columns_to_process: List[str], wikivoyage_summary: str
162 | ) -> pd.DataFrame:
163 | """Preprocess the scraped data.
164 |
165 | Args:
166 | dataframe (pd.DataFrame): remote dataframe with cities features
167 | columns_to_translate (List[str]): city features to be translated
168 | columns_to_process (List[str]): city features to be processed
169 | wikivoyage_summary (str): summary wikivoyage column name
170 |
171 | Returns:
172 | pd.DataFrame: remote dataframe pre-processed
173 | """
174 | LOGGER.info("Preprocessing input data.")
175 |
176 | if wikivoyage_summary:
177 | dataframe = dataframe[
178 | dataframe[wikivoyage_summary].notna()
179 | ].copy().reset_index(drop=True)
180 | LOGGER.info("Using %s rows of data.", dataframe.shape[0])
181 |
182 | # Translate columns
183 | for col in columns_to_translate:
184 | dataframe[col] = translate_description_series(dataframe, col)
185 |
186 | LOGGER.info("Columns %s translated.", columns_to_translate)
187 |
188 | # Process specified columns
189 | for col in columns_to_process:
190 | dataframe[col] = dataframe[col].fillna("").swifter.apply(
191 | lambda x: unidecode(x) if isinstance(x, str) else x).str.lower()
192 | dataframe[col] = preprocess_text(dataframe, col)
193 |
194 | LOGGER.info("Columns %s processed.", columns_to_process)
195 | dataframe.columns = dataframe.columns.astype(str)
196 | return dataframe
197 |
198 |
199 | @task(cache=True, cache_version='1.0', requests=INTENSIVE_RESOURCES)
200 | def vectorize_columns(
201 | dataframe: pd.DataFrame, columns_to_vec: List[str],
202 | city_column: str, state_column: str
203 | ) -> List[pd.DataFrame]:
204 | """Generate embeddings with the cities' infos.
205 |
206 | Args:
207 | dataframe (pd.DataFrame): remote dataset pre-processed
208 | columns_to_vec (List[str]): city features to be vectorized
209 | city_column (str): city column name
210 | state_column (str): state column name
211 |
212 | Returns:
213 | List[pd.DataFrame]: list of dataframes with city feature vectors
214 | """
215 | model = TextVectorizer()
216 | model.model.to('cuda')
217 | column_embeddings = []
218 |
219 | LOGGER.info("Generating embeddings for columns.")
220 |
221 | # Generate embeddings for each column
222 | for col in columns_to_vec:
223 | inputs_ids = model.encode_inputs(dataframe[col]).to('cuda')
224 | embeddings = model.get_df_embedding(inputs_ids)
225 | city_embeddings = pd.concat(
226 | [dataframe[[city_column, state_column]], embeddings], axis=1)
227 | city_embeddings.columns = city_embeddings.columns.astype(str)
228 | column_embeddings.append(city_embeddings)
229 |
230 | LOGGER.info("Embeddings generated.")
231 | return column_embeddings
232 |
233 |
234 | @task(cache=True, cache_version='1.0', requests=INTENSIVE_RESOURCES)
235 | def build_mean_embedding(
236 | list_dataframes: List[pd.DataFrame]
237 | ) -> pd.DataFrame:
238 | """Build mean embeddings for cities.
239 |
240 | Args:
241 | list_dataframes (List[pd.DataFrame]): list of dataframes with
242 | city feature vectors
243 |
244 | Returns:
245 | pd.DataFrame: city vectors
246 | """
247 | LOGGER.info("Building mean embeddings.")
248 |
249 | # Retrieve embeddings
250 | column_embeddings = [data.iloc[:, 2:].values for data in list_dataframes]
251 |
252 | # Compute mean embeddings
253 | aux = torch.Tensor(np.array(column_embeddings))
254 | aux_mean = aux.mean(axis=0)
255 | aux_mean = pd.DataFrame(aux_mean).astype("float")
256 | aux_mean = aux_mean.fillna(0)
257 | aux_mean = pd.concat(
258 | [list_dataframes[0][['city', 'state']], aux_mean], axis=1)
259 | aux_mean.columns = aux_mean.columns.astype(str)
260 |
261 | LOGGER.info("Mean embeddings calculated.")
262 | return aux_mean
263 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/destinations_similarity/workflows.py:
--------------------------------------------------------------------------------
1 | """Workflows for the destinations_similarity Flyte project."""
2 |
3 | from datetime import timedelta
4 | from typing import List
5 |
6 | import pandas as pd
7 | from flytekit import workflow, conditional, LaunchPlan, FixedRate
8 |
9 | from destinations_similarity import tasks
10 |
11 |
12 | @workflow
13 | def generate_dataset() -> pd.DataFrame:
14 | """Generate the dataset to be used for training.
15 |
16 | Returns:
17 | pd.DataFrame: The generated dataset.
18 | """
19 | base_data = tasks.get_base_data(generate_city_id=False)
20 |
21 | # Retrieve data from pt.wikipedia.org
22 | data_wikipedia_pt = tasks.scrap_wiki(
23 | base_data=base_data, wiki='wikipedia', lang='pt', summary=True,
24 | sections=['Clima', 'Economia', 'História', 'Geografia'],
25 | sections_tags={}, sections_types={}
26 | )
27 |
28 | # Retrieve data from en.wikivoyage.org
29 | data_wikivoyage_en = tasks.scrap_wiki(
30 | base_data=base_data, wiki='wikivoyage', lang='en', summary=True,
31 | sections=['Do', 'See', 'Go next'],
32 | sections_tags={'Go next': ['a', 'b']},
33 | sections_types={'Go next': 'list'}
34 | )
35 |
36 | # Merge data
37 | dataset = tasks.merge_dataframes(
38 | df_x=data_wikipedia_pt, df_y=data_wikivoyage_en, join='outer')
39 |
40 | return dataset
41 |
42 |
43 | @workflow
44 | def build_knowledge_base(
45 | columns_to_translate: List[str], columns_to_process: List[str],
46 | summary_wikivoyage_column_name: str, remote_dataset: str = ""
47 | ) -> pd.DataFrame:
48 | """Generate knowledge database.
49 |
50 | Args:
51 | columns_to_translate (List[str]): city features to be translated
52 | columns_to_process (List[str]): city features to be processed
53 | summary_wikivoyage_column_name (str): summary wikivoyage column name
54 | remote_dataset (str, optional): Remote dataset's URL. Generates
55 | dataset if no path is specified.
56 |
57 | Returns:
58 | pd.DataFrame: The generated dataset.
59 | """
60 | remote, flyte_file = tasks.check_if_remote(uri=remote_dataset)
61 |
62 | dataframe = (
63 | conditional("remote_dataset")
64 | .if_(remote.is_true()) # pylint: disable=no-member
65 | .then(tasks.retrieve_dataset_from_remote(uri=flyte_file))
66 | .else_()
67 | .then(generate_dataset())
68 | )
69 |
70 | dataframe_processed = tasks.preprocess_input_data(
71 | dataframe=dataframe,
72 | columns_to_translate=columns_to_translate,
73 | columns_to_process=columns_to_process,
74 | wikivoyage_summary=summary_wikivoyage_column_name
75 | )
76 |
77 | list_dataframes = tasks.vectorize_columns(
78 | dataframe=dataframe_processed,
79 | columns_to_vec=columns_to_process,
80 | city_column='city',
81 | state_column='state'
82 | )
83 |
84 | city_vectors = tasks.build_mean_embedding(list_dataframes=list_dataframes)
85 |
86 | return city_vectors
87 |
88 |
89 | # Launch plans
90 | build_knowledge_base_lp = LaunchPlan.get_or_create(
91 | name='build_knowledge_base_default_lp',
92 | workflow=build_knowledge_base,
93 | default_inputs={
94 | 'columns_to_translate': [
95 | "see_wikivoyage_en",
96 | "do_wikivoyage_en",
97 | "summary_wikivoyage_en"
98 | ],
99 | 'columns_to_process': [
100 | "summary_wikipedia_pt",
101 | "história_wikipedia_pt",
102 | "geografia_wikipedia_pt",
103 | "clima_wikipedia_pt",
104 | "see_wikivoyage_en",
105 | "do_wikivoyage_en",
106 | "summary_wikivoyage_en"
107 | ],
108 | 'summary_wikivoyage_column_name': "summary_wikivoyage_en",
109 | 'remote_dataset':
110 | "https://storage.googleapis.com"
111 | "/dsc-public-info/datasets/flytelab_dataset.parquet",
112 | },
113 | schedule=FixedRate(duration=timedelta(weeks=4))
114 | )
115 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.7.12-slim-buster
2 |
3 | RUN apt-get update && \
4 | pip install --no-cache-dir --upgrade pip && \
5 | pip install --no-cache-dir sphinx==4.2.0 sphinx_rtd_theme==1.0.0
6 |
7 | WORKDIR /home
8 |
9 | CMD ["sh","-c","rm -rf _build/ && sphinx-build -b html . _build/"]
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/dashboard/dashboard.rst:
--------------------------------------------------------------------------------
1 | Streamlit Dashboard
2 | ---------------------------
3 |
4 | The user interface was built using the Streamlit tool. After local testing, the stable version of the app was published on Streamlit's public server. You can access the interface through the link below.
5 |
6 |
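If you prefer to run the dashboard locally instead, here is a minimal sketch (assuming the commands are run from ``projects/destinations_similarity``):

.. code-block::

    pip install -r dashboard/requirements.txt
    streamlit run dashboard/app.py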
7 |
8 | To see the app's code, just follow the link below:
9 |
10 | .. toctree::
11 | streamlit
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/dashboard/streamlit.rst:
--------------------------------------------------------------------------------
1 | .. _streamlit:
2 |
3 | Streamlit app
4 | ---------------------------
5 |
6 | .. automodule:: dashboard.app
7 | :members:
8 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/guides/deploy.rst:
--------------------------------------------------------------------------------
1 | Deploy project
2 | ----------------------------------
3 |
4 | To deploy the project, just follow the steps recommended in the Flyte README. The deployment is performed by the functions in ``deploy.py``. Just run the following command:
5 |
6 | .. code-block::
7 |
8 |     python3 deploy.py
9 |
10 |
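``deploy.py`` also accepts a few options (documented in the code linked below). A sketch of common invocations, mirroring the flags defined in ``deploy.py``:

.. code-block::

    # deploy to the local Flyte sandbox (default)
    python3 deploy.py

    # deploy to the remote cluster, targeting a specific domain
    python3 deploy.py --remote --domain development

    # fast-register only task/workflow code changes (sandbox only)
    python3 deploy.py --fast
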
11 | .. toctree::
12 | deploy_code
13 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/guides/deploy_code.rst:
--------------------------------------------------------------------------------
1 | .. _deploy_code:
2 |
3 | Deploy code
4 | ----------------------------------
5 |
6 | .. automodule:: deploy
7 | :members:
8 |
9 |
10 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/guides/docs.rst:
--------------------------------------------------------------------------------
1 | .. _docs:
2 |
3 | Building Sphinx docs
4 | ---------------------------
5 |
6 | .. warning::
7 | This documentation was built thinking about linux/mac operating systems or executions using WSL.
8 |
9 |
10 | In addition to the local README, we decided to detail a few more things about the project, perhaps in a more playful way, to make it easier to understand.
11 |
12 | There's not much mystery to building the HTML documentation; we've already automated most of it. Generally speaking, Sphinx builds static HTML pages from manually written text and from information inserted into the code (such as docstrings). These static pages are then copied into a folder of a container running an NGINX image, which hosts the documentation page.
13 |
14 |
15 | To build the Docker image responsible for the documentation and start the hosting server, just run the command below.
16 |
17 | .. warning::
18 |     For this command to work, your system must be able to run Makefiles (``make``), and the working directory must be ``projects/destinations_similarity``.
19 |
20 | .. code-block::
21 |
22 | make open-docs
23 |
24 | Once the command has completed successfully, you can use the command below to check that the container is running on your machine.
25 |
26 | .. code-block::
27 |
28 | docker ps
29 |
30 | The result should be:
31 |
32 | .. code-block::
33 |
34 | $ docker ps
35 | CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
36 | 84cb390d977f nginx "/docker-entrypoint.…" 36 seconds ago Up 35 seconds 0.0.0.0:8080->80/tcp, :::8080->80/tcp sphinx-nginx
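
When you are done, the container can be stopped by the name shown in the listing above (``sphinx-nginx`` in this example):

.. code-block::

    docker stop sphinx-nginx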
37 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/guides/guide.rst:
--------------------------------------------------------------------------------
1 | Developer Guide
2 | ---------------------------
3 |
4 | Here is some information that might be useful and maybe even cause some doubts about the project. Feel free to ask any of the contributors if something is not clear.
5 |
6 | May the dogs be with you!
7 |
8 | .. toctree::
9 | docs
10 | deploy
11 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/images/SolutionDiagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/SolutionDiagram.png
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/images/kinzinhoApresentando.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/kinzinhoApresentando.jpg
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/images/kinzinhoBagunceiro.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/kinzinhoBagunceiro.jpg
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/images/kinzinhoBigDog.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/kinzinhoBigDog.png
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/images/kinzinhoCachu.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/kinzinhoCachu.jpg
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/images/kinzinhoGalante.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/kinzinhoGalante.jpg
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/images/kinzinhoPensativo.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/kinzinhoPensativo.jpg
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/images/sphinx_server.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/sphinx_server.png
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/images/vamoDalheLogo.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/destinations_similarity/docs/images/vamoDalheLogo.jpeg
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/model/feature_engineering.rst:
--------------------------------------------------------------------------------
1 | .. _feature_engineering:
2 |
3 | Feature engineering
4 | ----------------------
5 |
6 | .. automodule:: destinations_similarity.processing.feature_engineering
7 | :members:
8 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/model/model.rst:
--------------------------------------------------------------------------------
1 | .. _model:
2 |
3 | Machine Learning Model
4 | ---------------------------
5 |
6 | **Objective**: To help Kinzinho define his next travel destination, we seek to find other cities similar to the last travel destination he liked the most.
7 |
8 | **Strategy Solution**: To make a good evaluation between the cities, we chose to make a vector representation of each city in Brazil, encoding general information about each city such as its history, its geography, its climate and its tourist attractions. We chose this strategy because with a vector representation of the city we were able to apply similarity calculation operations between cities, considering various information about them.
9 |
10 | **Input data**: For our solution we use the following data from Wikipedia and Wikivoyage: "summary_wikipedia_pt", "história_wikipedia_pt", "geografia_wikipedia_pt", "clima_wikipedia_pt", "see_wikivoyage_en", "do_wikivoyage_en", "summary_wikivoyage_en".
11 |
12 | **Preprocessing**: To extract only the important information from the data, we apply a series of pre-processing steps that clean unnecessary content and homogenize the texts.
13 |
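As an illustration, here is a minimal sketch of what this step does to a single column (assuming the nltk ``punkt`` and ``stopwords`` corpora are available locally):

.. code-block:: python

    import pandas as pd

    from destinations_similarity.processing import text_preprocessing

    df = pd.DataFrame({
        "clima_wikipedia_pt": [
            "O clima do município é tropical, com verões quentes (dados de 2020)."
        ]
    })

    # Lower-cases, strips accents/markup/numbers and removes Portuguese stopwords.
    processed = text_preprocessing.preprocess_text(df, "clima_wikipedia_pt")
    print(processed[0])
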
14 | **Model**: To generate the best vector representation of each city's features, we used a pre-trained state-of-the-art model based on Transformers (BERTimbau). As a vector representation of each feature of the city, we use the output of the last layer of the BERTimbau language model. The vector representation of each city is generated from the average of the vectors of its features.
15 |
16 | **Similarity**: To calculate the similarity between the vector representations of the cities, we use a highly optimized library (faiss) to compute the Euclidean distance between an input query vector (the vector of the last city visited by Kinzinho) and the vectors of all the cities available in our portfolio.
17 |
18 |
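The snippet below is a minimal sketch of this search with toy vectors; in the project the vectors come from averaging the BERTimbau embeddings of each city's features, and the query city is removed from the index beforehand:

.. code-block:: python

    import faiss
    import numpy as np
    import pandas as pd

    # Toy 4-dimensional city vectors; the real ones are 768-dimensional.
    cities = ["Rio de Janeiro", "Salvador", "Curitiba", "Manaus", "Bonito"]
    vectors = pd.DataFrame(np.random.rand(5, 4).astype("float32"), index=cities)

    index = faiss.IndexFlatL2(vectors.shape[1])         # exact L2 (Euclidean) index
    index.add(np.ascontiguousarray(vectors.values))     # index all city vectors

    query = np.ascontiguousarray(vectors.loc[["Bonito"]].values)  # last city visited
    _, neighbors = index.search(query, 3)               # three closest vectors
    print([cities[i] for i in neighbors[0]])
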
19 | To see the dataset processing codes, access the links below.
20 |
21 | .. toctree::
22 | text_preprocessing
23 | feature_engineering
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/model/text_preprocessing.rst:
--------------------------------------------------------------------------------
1 | .. _text_preprocessing:
2 |
3 | Text preprocessing
4 | ----------------------
5 |
6 | .. automodule:: destinations_similarity.processing.text_preprocessing
7 | :members:
8 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/scraper/extractor.rst:
--------------------------------------------------------------------------------
1 | .. _extractor:
2 |
3 | Extractor
4 | ----------------------
5 |
6 | .. automodule:: destinations_similarity.scraper.extractor
7 | :members:
8 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/scraper/scraper.rst:
--------------------------------------------------------------------------------
1 | .. _source:
2 |
3 | Generate source dataset
4 | ----------------------------------
5 |
6 | Kinzinho's humans built this module to meet all the needs for extracting information about Brazilian cities. This information is further enriched with data extracted from Wikipedia and Wikivoyage.
7 |
8 | .. toctree::
9 | source
10 |
11 |
12 | Wikipedia and Wikivoyager scraping
13 | ----------------------------------
14 |
15 | As with the module above, the humans were quite clever in exploring the Wikipedia and Wikivoyage APIs using the information extracted from the Brazilian city data sources. Unlike the raw HTML pages, the APIs return the information well separated, which makes searching for the content of interest much easier.
16 |
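For illustration, a minimal sketch of how the extractor is used (the page name and sections are just examples; the real workflow iterates over the pages found by the source dataset module):

.. code-block:: python

    from destinations_similarity.scraper.extractor import WikiExtractor

    extractor = WikiExtractor(wiki="wikivoyage", lang="en")

    # Clean text of the summary plus the "See" and "Do" sections of a single page.
    content = extractor.extract_content(
        "Rio de Janeiro", summary=True, sections=["See", "Do"],
        sections_tags={}, section_types={},
    )
    print(content.get("summary", "")[:200])

    # Links to the images of the same page.
    print(extractor.extract_images("Rio de Janeiro")[:3])
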
17 | .. toctree::
18 | extractor
19 |
20 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/docs/scraper/source.rst:
--------------------------------------------------------------------------------
1 | .. _scraper:
2 |
3 | Source Dataset
4 | ----------------------------------
5 |
6 | .. automodule:: destinations_similarity.scraper.brazilian_cities
7 | :members:
8 |
9 |
10 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/index.rst:
--------------------------------------------------------------------------------
1 | Destination Similarity - Vamo Dalhe
2 | ====================================
3 |
4 | .. toctree::
5 | :maxdepth: 2
6 |
7 |
8 | Hi, I hope you are well!
9 |
10 | This documentation page is entirely dedicated to the Destination Similarity project.
11 |
12 | First of all, we would like to comment on the team's mascot, Kinzinho. Kinzinho is an adventurous puppy who has a great interest in traveling and seeing different places. To help his cause, it would be ideal for him to know about all the possible places and, in addition, to seek places similar to others he loves so much.
13 |
14 | .. image:: ./docs/images/kinzinhoGalante.jpg
15 | :width: 400
16 | :align: center
17 |
18 | The following sections present the project and how Kinzinho's humans worked to try to help him.
19 |
20 | ---------------------------
21 |
22 | Getting the data
23 | ---------------------------
24 |
25 | Kinzinho is an adventurous little dog who wanted to know about all the destinations he could go to. Kinzinho had the trouble of having to search for public databases that could be used. It is important to point out that his main focus was Brazil (not least because Kinzinho is passionate about its nature). So his humans immediately came up with the idea of using information from Wikipedia and Wikivoyage.
26 |
27 | .. image:: ./docs/images/kinzinhoCachu.jpg
28 | :width: 400
29 | :align: center
30 |
31 |
32 | The links below document the code written by Kinzinho's humans to search for all possible destinations in Brazil.
33 |
34 | .. toctree::
35 | :maxdepth: 2
36 |
37 | docs/scraper/scraper
38 | ...
39 |
40 | Machine Learning Model
41 | ---------------------------
42 |
43 | Kinzinho has so many adventurous options that he had to extract public data from Wikipedia and Wikivoyager to get to know all of them! But he has now realized that it is too much and wants some recommendations based on where people traveled before. Can we help him?
44 |
45 | Of course! Everything is 'paw-sible' when you have a dog :D !!
46 |
47 | .. toctree::
48 | :maxdepth: 2
49 |
50 | docs/model/model
51 | ...
52 |
53 | Creating a user interface (Streamlit)
54 | --------------------------------------
55 |
56 | Once the similarity inference model was ready, Kinzinho's humans built an interface to help not only him, but everyone interested in being as adventurous as Kinzinho is.
57 |
58 | .. image:: ./docs/images/kinzinhoBagunceiro.jpg
59 | :width: 400
60 | :align: center
61 |
62 |
63 | Shall we start the mess? I mean... Travel?
64 |
65 |
66 | .. toctree::
67 | docs/dashboard/dashboard
68 |
69 |
70 | Developer guides (How to)
71 | ---------------------------
72 |
73 | Kinzinho is not a programming professional, but he asked his humans to leave a minimally detailed description so that evaluators and other Hackathon participants could understand a few things about the project. Access the link below for more details.
74 |
75 | .. toctree::
76 | docs/guides/guide
77 |
78 |
79 | Project references
80 | ---------------------------
81 |
82 | * [Wikipedia API] - https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples
83 | * [Calculate vector similarity] - https://github.com/facebookresearch/faiss
84 | * [Translate model] - https://deep-translator.readthedocs.io/en/latest/
85 | * [NLP Model] - https://huggingface.co/neuralmind/bert-base-portuguese-cased
86 | * [Streamlit docs] - https://docs.streamlit.io/
87 | * [Flyte docs] - https://docs.flyte.org/en/latest/
88 | * [Sphinx docs] - https://www.sphinx-doc.org/en/master/
89 |
90 | Acknowledgments
91 | ---------------------------
92 |
93 | Kinzinho and his humans would like to thank everyone involved who made this project possible. They would also like to thank `Hurb's `_ support for allowing and encouraging participation in the hackathon as training and recognition of the team's potential. And finally, they thank Kinzinho himself for making the days of the humans around him better.
--------------------------------------------------------------------------------
/projects/destinations_similarity/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | docker
2 | gitpython
3 | streamlit
4 | typer
5 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/requirements.txt:
--------------------------------------------------------------------------------
1 | flytekit>=0.30.3
2 | pandas>=1.3.5
3 | beautifulsoup4~=4.10.0
4 | requests~=2.27.1
5 | transformers[torch]~=4.17.0
6 | numpy>=1.21.5
7 | unidecode~=1.3.4
8 | torch~=1.11.0
9 | swifter~=1.1.2
10 | nltk~=3.7
11 | deep-translator~=1.8.3
12 |
--------------------------------------------------------------------------------
/projects/destinations_similarity/scripts/open_docs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | echo "Building sphinx HTML from $(pwd)"
3 | docker build --tag sphinx-server ./docs/
4 | docker run --rm --name build-docs -it -v "$(pwd)":/home/ sphinx-server
5 | docker run --rm --name sphinx-nginx -v "$(pwd)"/_build/:/usr/share/nginx/html:ro -d -p 8080:80 nginx
6 | echo "Sphinx docs are hosted at: http://localhost:8080/"
--------------------------------------------------------------------------------
/projects/destinations_similarity/scripts/rebuild_docs.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | docker stop sphinx-nginx
3 | docker run --rm --name build-docs -it -v "$(pwd)":/home/ sphinx-server
4 | docker run --rm --name sphinx-nginx -v "$(pwd)"/_build/:/usr/share/nginx/html:ro -d -p 8080:80 nginx
5 | echo "Sphinx docs are hosted at: http://localhost:8080/"
--------------------------------------------------------------------------------
/projects/weather_forecasting/.flyte/remote-config.yaml:
--------------------------------------------------------------------------------
1 | admin:
2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com
3 | endpoint: dns:///playground.hosted.unionai.cloud
4 | authType: Pkce
5 | # Change insecure flag to ensure that you use the right setting for your environment
6 | insecure: false
7 | storage:
8 | type: stow
9 | stow:
10 | kind: s3
11 | config:
12 | auth_type: iam
13 | region: us-east-2
14 | logger:
15 | # Logger settings to control logger output. Useful to debug logger:
16 | show-source: true
17 | level: 1
18 |
--------------------------------------------------------------------------------
/projects/weather_forecasting/.flyte/remote.config:
--------------------------------------------------------------------------------
1 | [sdk]
2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. These workflows and tasks are then serialized during the `make serialize` commands
3 | workflow_packages=app
4 |
5 | [auth]
6 | # Uncomment if you want to use a service account for all your task and workflow executions. This service account should be created by you and available on the k8s cluster, and it will be used to read and write data from the backend store like S3/GCS, or to connect to any services that you use in your tasks.
7 | # It is also the identity used to write task execution outputs to the blobstore (e.g. s3).
8 | # kubernetes_service_account=demo
9 | # You can set this prefix to specify where task output schema and blobs should be written to.
10 | raw_output_data_prefix=s3://open-compute-playground
11 |
--------------------------------------------------------------------------------
/projects/weather_forecasting/.flyte/sandbox-config.yaml:
--------------------------------------------------------------------------------
1 | admin:
2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com
3 | endpoint: dns:///localhost:30081
4 | authType: Pkce
5 | insecure: true
6 | logger:
7 | show-source: true
8 | level: 0
9 | storage:
10 | connection:
11 | access-key: minio
12 | auth-type: accesskey
13 | disable-ssl: true
14 | endpoint: http://localhost:30084
15 | region: us-east-1
16 | secret-key: miniostorage
17 | type: minio
18 | container: "my-s3-bucket"
19 | enable-multicontainer: true
--------------------------------------------------------------------------------
/projects/weather_forecasting/.flyte/sandbox.config:
--------------------------------------------------------------------------------
1 | [sdk]
2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. These workflows and tasks are then serialized during the `make serialize` commands
3 | workflow_packages=app
4 |
5 | [auth]
6 | # Uncomment if you want to use a service account for all your task and workflow executions. This service account should be created by you and available on the k8s cluster, and it will be used to read and write data from the backend store like S3/GCS, or to connect to any services that you use in your tasks.
7 | # It is also the identity used to write task execution outputs to the blobstore (e.g. s3).
8 | # kubernetes_service_account=demo
9 | # You can set this prefix to specify where task output schema and blobs should be written to.
10 | raw_output_data_prefix=s3://my-s3-bucket/flytelab
11 |
--------------------------------------------------------------------------------
/projects/weather_forecasting/.gitignore:
--------------------------------------------------------------------------------
1 | .kube
--------------------------------------------------------------------------------
/projects/weather_forecasting/DEPLOYMENT.md:
--------------------------------------------------------------------------------
1 | # Deployment Instructions
2 |
3 | This page contains notes regarding how to use Flyte in an end-to-end ML
4 | system.
5 |
6 | ## Project Setup
7 |
8 | ### Sandbox
9 |
10 | Create project:
11 |
12 | ```bash
13 | flytectl create project \
14 | --name flytelab \
15 | --id flytelab \
16 | --description "flytelab: ml projects in flyte" \
17 | --config .flyte/sandbox-config.yaml \
18 | --project flytelab
19 | ```
20 |
21 | Update cluster resource attributes:
22 |
23 | ```bash
24 | flyte-cli -i \
25 | -h localhost:30081 \
26 | -p flytelab \
27 | -d development update-cluster-resource-attributes \
28 | --attributes projectQuotaCpu 16 \
29 | --attributes projectQuotaMemory 30Gi
30 | ```
31 |
32 | ### Remote
33 |
34 | Create project:
35 |
36 | ```bash
37 | flytectl create project \
38 | --name flytelab \
39 | --id flytelab \
40 | --description "flytelab: ml projects in flyte" \
41 | --config .flyte/remote-config.yaml \
42 | --project flytelab
43 | ```
44 |
45 |
46 | Make sure the NOAA API key is available in the shell session:
47 | ```bash
48 | eval $(sed 's/^/export /g' env.txt)
49 | ```
50 |
51 | ## Workflow Registration
52 |
53 | ## Sandbox
54 |
55 | ### Register Workflows
56 |
57 | ```bash
58 | FLYTECTL_CONFIG=.flyte/sandbox-config.yaml REGISTRY=ghcr.io/flyteorg make register
59 | ```
60 |
61 | ### Fast Registering New Code
62 |
63 | In case you've only changed user code and not system-level dependencies:
64 |
65 | ```bash
66 | FLYTECTL_CONFIG=.flyte/sandbox-config.yaml REGISTRY=ghcr.io/flyteorg make fast_register
67 | ```
68 |
69 | ## Production [playground.hosted.unionai.cloud](https://playground.hosted.unionai.cloud/console)
70 |
71 | ### Register Workflows
72 |
73 | ```bash
74 | FLYTECTL_CONFIG=.flyte/remote-config.yaml FLYTE_CONFIG=.flyte/remote.config REGISTRY=ghcr.io/flyteorg make register
75 | ```
76 |
77 | ### Fast Registering New Code
78 |
79 | ```bash
80 | FLYTECTL_CONFIG=.flyte/remote-config.yaml REGISTRY=ghcr.io/flyteorg make fast_register
81 | ```
82 |
83 | ### Activating Launch Plans
84 |
85 | List launch plan versions
86 |
87 | ```bash
88 | ./scripts/launch-plan-status.sh
89 | ```
90 |
91 | To activate launch plans
92 |
93 | ```bash
94 | ./scripts/activate-launch-plans.sh # [VERSION] argument is optional to activate a specific version
95 | ```
96 |
97 | To deactivate:
98 |
99 | ```bash
100 | ./scripts/archive-launch-plans.sh # [VERSION] argument is optional to archive a specific version
101 | ```
102 |
--------------------------------------------------------------------------------
/projects/weather_forecasting/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.8-slim-buster
2 | LABEL org.opencontainers.image.source https://github.com/flyteorg/flytelab
3 |
4 | WORKDIR /root
5 | ENV VENV /opt/venv
6 | ENV LANG C.UTF-8
7 | ENV LC_ALL C.UTF-8
8 | ENV PYTHONPATH /root
9 |
10 | # e.g. flyte.config or sandbox.config
11 | ARG config
12 |
13 | # This is necessary for opencv to work
14 | RUN apt-get update && \
15 | apt-get install -y \
16 | libsm6 \
17 | libxext6 \
18 | libxrender-dev \
19 | ffmpeg \
20 | build-essential
21 |
22 | # Install the AWS cli separately to prevent issues with boto being written over
23 | RUN pip3 install awscli
24 |
25 | ENV VENV /opt/venv
26 | # Virtual environment
27 | RUN python3 -m venv ${VENV}
28 | ENV PATH="${VENV}/bin:$PATH"
29 |
30 | # Install Python dependencies
31 | COPY requirements.txt /root
32 | RUN pip install -r /root/requirements.txt
33 |
34 | COPY app /root/app
35 | COPY $config /root/flyte.config
36 |
37 | # This tag is supplied by the build script and will be used to determine the version
38 | # when registering tasks, workflows, and launch plans
39 | ARG tag
40 | ENV FLYTE_INTERNAL_IMAGE $tag
41 | ARG noaa_api_key
42 | ENV NOAA_API_KEY $noaa_api_key
43 |
--------------------------------------------------------------------------------
/projects/weather_forecasting/Makefile:
--------------------------------------------------------------------------------
1 | export REPOSITORY=flytelab
2 |
3 | VERSION=$(shell git rev-parse HEAD)
4 | IMAGE_NAME=flytelab
5 |
6 | ifeq ($(INSECURE), true)
7 | INSECURE=-i
8 | endif
9 |
10 | ifeq ($(NOPUSH), true)
11 | NOPUSH=1
12 | endif
13 |
14 | ifndef FLYTECTL_CONFIG
15 | FLYTECTL_CONFIG=~/.flyte/config.yaml
16 | endif
17 |
18 |
19 | # If the REGISTRY environment variable has been set, that means the image name will not just be tagged as
20 | # flytecookbook: but rather,
21 | # docker.io/lyft/flytecookbook: or whatever your REGISTRY is.
22 | ifneq ($(origin REGISTRY), undefined)
23 | FULL_IMAGE_NAME = ${REGISTRY}/${IMAGE_NAME}
24 | else
25 | FULL_IMAGE_NAME = ${IMAGE_NAME}
26 | endif
27 |
28 | export FLYTE_HOST ?= localhost:30081
29 | export PREFIX ?= weather-forecasting
30 | export FLYTE_CONFIG ?= .flyte/sandbox.config
31 |
32 | # The Flyte project and domain that we want to register under
33 | export PROJECT ?= flytelab
34 | export DOMAIN ?= development
35 | export DESCRIPTION ?= 'ML projects using Flyte'
36 |
37 | # This specifies where fast-registered code is uploaded to during registration.
38 | # If you're not using the standard minio deployment on flyte sandbox: update this path to something that
39 | # - you have write access to
40 | # - flytepropeller can read (depending on the role it uses)
41 | export ADDL_DISTRIBUTION_DIR ?= s3://my-s3-bucket/flyte-fast-distributions
42 |
43 | FLYTE_INTERNAL_IMAGE=${FULL_IMAGE_NAME}:${PREFIX}-${VERSION}
44 | FLYTE_INTERNAL_LATEST=${FULL_IMAGE_NAME}:${PREFIX}-latest
45 |
46 | # targets for local development
47 | venv:
48 | @virtualenv ./.venv/weather-forecasting
49 |
50 | deps:
51 | @pip install -r requirements.txt
52 |
53 | env.txt:
54 | @echo "NOAA_API_KEY=''" > env.txt
55 |
56 | .PHONY: env-export
57 | env-export:
58 | @eval $(sed 's/^/export /g' env.txt)
59 |
60 | # flyte-related targets
61 | .PHONY: create-project
62 | create-project:
63 | flyte-cli register-project -h ${FLYTE_HOST} ${INSECURE} -p ${PROJECT} -n ${PROJECT} -d ${DESCRIPTION}
64 |
65 | .PHONY: _requires-commit
66 | _requires-commit:
67 | @if [ -n "$(shell git status --porcelain)" ]; then \
68 | echo "Please commit git changes before building"; \
69 | exit 1; \
70 | fi;
71 |
72 | .PHONY: docker-build
73 | docker-build: _requires-commit
74 | ifndef NOAA_API_KEY
75 | $(error NOAA_API_KEY must be defined)
76 | endif
77 | @echo "Building: ${FLYTE_INTERNAL_IMAGE}"
78 | docker build . \
79 | --build-arg tag="${FLYTE_INTERNAL_IMAGE}" \
80 | --build-arg config="${FLYTE_CONFIG}" \
81 | --build-arg noaa_api_key="${NOAA_API_KEY}" \
82 | -t "${FLYTE_INTERNAL_IMAGE}" \
83 | -t "${FLYTE_INTERNAL_LATEST}" \
84 | -f ./Dockerfile
85 |
86 | .PHONY: docker-push
87 | docker-push: docker-build
88 | @echo "Pushing: ${FLYTE_INTERNAL_IMAGE}"
89 | docker push "${FLYTE_INTERNAL_IMAGE}"
90 | docker push "${FLYTE_INTERNAL_LATEST}"
91 |
92 | .PHONY: serialize
93 | serialize:
94 | echo ${CURDIR}
95 | pyflyte -c flyte.config --pkgs app package \
96 | --force \
97 | --in-container-source-path /root \
98 | --image ${FULL_IMAGE_NAME}:${PREFIX}-${VERSION}
99 |
100 | .PHONY: register
101 | register: docker-push serialize
102 | flytectl -c ${FLYTECTL_CONFIG} \
103 | register files \
104 | --project flytelab \
105 | --domain development \
106 | --archive flyte-package.tgz \
107 | --force \
108 | --version ${VERSION}
109 |
110 | .PHONY: fast_serialize
111 | fast_serialize:
112 | echo ${CURDIR}
113 | pyflyte -c flyte.config --pkgs app package \
114 | --force \
115 | --in-container-source-path /root \
116 | --fast \
117 | --image ${FLYTE_INTERNAL_LATEST}
118 |
119 | .PHONY: fast_register
120 | fast_register: fast_serialize
121 | flytectl -c ${FLYTECTL_CONFIG} \
122 | register files \
123 | --project flytelab \
124 | --domain development \
125 | --archive flyte-package.tgz \
126 | --version fast${VERSION}
127 |
--------------------------------------------------------------------------------
/projects/weather_forecasting/README.md:
--------------------------------------------------------------------------------
1 | # Weather Forecasting
2 |
3 | [](https://share.streamlit.io/flyteorg/flytelab/main/projects/weather_forecasting/dashboard/weather_forecasting.py)
4 |
5 | The purpose of this project is to train a model to perform weather forecasting
6 | using [noaa.gov](https://www.ncei.noaa.gov/) data.
7 |
8 | **Note:** _For the best reading experience on GitHub, we recommend installing the_
9 | _[markdown diagrams browser extension](https://github.com/marcozaccari/markdown-diagrams-browser-extension)_
10 | _to render all of the diagrams_ and _[mathjax (chrome)](https://github.com/orsharir/github-mathjax)_
11 | _for math rendering._
12 |
13 | ## Prototype
14 |
15 | Since the term "weather forecasting" is quite expansive, we'll scope the prototype
16 | to the problem of next-day mean temperature prediction (MTP) at a set of specified
17 | locations. At a high level, our trained model function should look like:
18 |
19 | ```python
20 | locations = ["Seattle, WA, USA", "Atlanta, GA, USA", "Hyderabad, India"]
21 | predictions: List[float] = predict_mean_temperature(locations)
22 | ```
23 |
24 | Where `locations` might be more precisely defined by country, city, zipcode, etc.
25 |
26 | ### Training Data
27 |
28 | For training data, we'll use the [integrated surface database (ISD)](https://www.ncdc.noaa.gov/isd)
29 | to obtain global hourly weather data.
30 |
31 | - [ISD Home page](https://www.ncdc.noaa.gov/isd)
32 | - [Web Browser GUI](https://www.ncei.noaa.gov/access/search/data-search/global-hourly)
33 | - [Web Services API](https://www.ncdc.noaa.gov/cdo-web/webservices/ncdcwebservices)
34 | - [Full ISD Documentation](https://www.ncei.noaa.gov/data/global-hourly/doc/isd-format-document.pdf)
35 |
36 |
37 | ### Training Pipeline
38 |
39 | There are several options for training the MTP model:
40 |
41 | #### Online Training
42 |
43 | Online model that updates its parameters daily based on a fixed set of historical
44 | temperature data (and potentially other related data).
45 |
46 | The model would estimate the function $x_t = f(x_{t - 1}, \ldots, x_{t - n}, x_{t - 365})$, where $x_t$ is the mean temperature for a particular day $t$. A minimal sketch of constructing these look-back features follows the diagram below.
47 |
48 | ```mermaid
49 | graph TB
50 |
51 | subgraph "look-back"
52 | x1["x (t - 1)"]
53 | x2["x (t - 2)"]
54 | xdot["..."]
55 | x14["x (t - 14)"]
56 | end
57 |
58 | x["x (t)"]
59 | x1 --> x
60 | x2 --> x
61 | xdot --> x
62 | x14 --> x
63 | ```
64 |
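To make the look-back structure concrete, here is a minimal sketch (the `pandas` representation and all names are assumptions, not taken from the workflow code) of turning a daily mean-temperature series into supervised training instances:

```python
# Minimal sketch: build (look-back features, next-day target) rows from a
# daily mean-temperature series. `n_lags` mirrors the 14-day look-back in
# the diagram above; none of these names come from the actual workflow code.
import pandas as pd

def make_training_frame(mean_temp: pd.Series, n_lags: int = 14) -> pd.DataFrame:
    frame = pd.DataFrame({"target": mean_temp})
    for lag in range(1, n_lags + 1):
        frame[f"lag_{lag}"] = mean_temp.shift(lag)
    return frame.dropna()  # drop days without a complete look-back window
```
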
65 | #### Offline Training
66 |
67 | Offline model trained once on historical data going back `n` years. The data would be
68 | processed in the same form as for the online model; it would just have more training instances.
69 |
70 | #### Hybrid Training
71 |
72 | Combined offline-online model. An offline model can be trained on historical data
73 | and can be updated on a daily basis as new data points are obtained.
74 |
75 | ### Pipeline Tasks
76 |
77 | At a high level, the pipeline should look something like:
78 |
79 | ```python
80 | for today in days:
81 | # update the model
82 | training_instance = get_data_point(today)
83 | current_model = get_current_model()
84 | updated_model = update_model(current_model, training_instance)
85 | write_model(updated_model)
86 |
87 | # get prediction for tomorrow
88 | tomorrow = today + 1
89 | prediction_instance = get_data_point(tomorrow)
90 | mean_temp_tomorrow = updated_model(prediction_instance)
91 | write_predictions(mean_temp_tomorrow, tomorrow)
92 |
93 | # evaluate trailing performance of the model
94 | prediction_today = get_prediction(today) # yesterday's prediction for today
95 | trailing_performance = evaluate_model(prediction_today, training_instance["mean_temp"])
96 | write_performance(trailing_performance, today)
97 | ```
98 |
99 | ### Model Architecture
100 |
101 | For the prototype, we'll start with an SGD regression model using `sklearn`, which is
102 | able to express a confidence interval of its predictions.
103 |
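As a rough sketch of what the daily update step might look like with `sklearn` (illustrative only; the estimator settings and variable names are assumptions, not the workflow's actual code):

```python
# Minimal sketch: online update of an SGD regression model with partial_fit,
# so the model can be refreshed each day as a new training instance arrives.
# Hyperparameters and names are placeholder choices.
import numpy as np
from sklearn.linear_model import SGDRegressor

model = SGDRegressor(learning_rate="constant", eta0=0.01)

def update_model(model: SGDRegressor, features: np.ndarray, target: np.ndarray) -> SGDRegressor:
    model.partial_fit(features, target)  # one SGD pass over the new instance(s)
    return model
```
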
104 | ## Extensions
105 |
106 | After a prototype is up and running, here are some extensions to make a more sophisticated model:
107 |
108 | - perform hourly predictions
109 | - predict other data points such as precipitation
110 | - experiment with other model architectures, like ensembles, to improve performance
111 | - for locations that don't have weather data, interpolate predictions for neighboring areas
112 |
113 | ## Repo Structure
114 |
115 | - `Dockerfile`: dockerfile for building the image associated with the weather forecasting app
116 | - `.flyte`: directory containing flyte .ini config files
117 | - `app`: this directory contains the flyte workflows
118 | - `v1`: unmaintained version of the weather forecasting app
119 | - `v2`: latest version of the weather forecasting app
120 | - `dashboard`: source for the streamlit app
121 | - `flytelab`: python package for weather forecasting-specific functions. **NOTE:** Currently the source code
122 | in this package is only used by the unmaintained `v1` weather forecasting app. In the future, functionality from
123 | the `v2/workflow.py` script might be refactored into the `flytelab.weather_forecasting` package.
124 | - `scripts`: contains bash utility scripts for activating/archiving launch plans
125 |
126 | ## Setup
127 |
128 | ```
129 | $ make venv
130 | $ source ./.venv/weather-forecasting/bin/activate
131 | $ make deps
132 | $ make env.txt
133 | ```
134 |
135 | Replace the empty `NOAA_API_KEY` value in `env.txt` with an [official API key](https://www.ncdc.noaa.gov/cdo-web/token).
136 |
137 |
138 | ## Usage
139 |
140 | Export environment variables
141 |
142 | ```
143 | $ eval $(sed 's/^/export /g' env.txt)
144 | ```
145 |
146 | Run the workflow locally
147 | ```
148 | python app/workflow.py
149 | ```
150 |
151 |
152 | ## Deployment
153 |
154 | [DEPLOYMENT.md](DEPLOYMENT.md) contains instructions for how to deploy the weather forecasting workflow
155 | to a local or remote sandbox.
156 |
157 | ## Streamlit App
158 |
159 | To run the app locally while connecting to https://demo.nuclyde.io/console as the backend:
160 |
161 | ```
162 | pip install streamlit
163 | export FLYTE_CREDENTIALS_CLIENT_SECRET= # replace with client secret
164 | export FLYTE_CREDENTIALS_CLIENT_ID=flytepropeller
165 | export FLYTE_CREDENTIALS_AUTH_MODE=basic
166 | export FLYTE_CREDENTIALS_AUTHORIZATION_METADATA_KEY=flyte-authorization
167 | export FLYTE_CREDENTIALS_OAUTH_SCOPES=all
168 | streamlit run dashboard/weather_forecasting.py
169 | ```
170 |
171 | [Live Demo](https://share.streamlit.io/flyteorg/flytelab/main/projects/weather_forecasting/dashboard/weather_forecasting.py)
172 |
173 | ## Resources
174 |
175 | Here are some additional resources related to this project:
176 |
177 | - [awesome online machine learning](https://github.com/MaxHalford/awesome-online-machine-learning)
178 | - [The correct way to evaluate online machine learning models](https://maxhalford.github.io/blog/online-learning-evaluation/)
179 | - [Time Series Modeling using Scikit, Pandas, and Numpy](https://towardsdatascience.com/time-series-modeling-using-scikit-pandas-and-numpy-682e3b8db8d1)
180 |
--------------------------------------------------------------------------------
/projects/weather_forecasting/app/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/weather_forecasting/app/__init__.py
--------------------------------------------------------------------------------
/projects/weather_forecasting/dashboard/flyte.config:
--------------------------------------------------------------------------------
1 | [platform]
2 | url=demo.nuclyde.io
3 | insecure=False
4 |
5 | [credentials]
6 | client_id=flytepropeller
7 | auth_mode=basic
8 | authorization_metadata-key=flyte-authorization
9 | oauth_scopes=all
10 |
--------------------------------------------------------------------------------
/projects/weather_forecasting/dashboard/requirements.txt:
--------------------------------------------------------------------------------
1 | flytekit==0.30.0
2 | flyteidl
3 | geopy
4 | joblib
5 | pandas
6 | pandera
7 | requests
8 | sklearn
9 | timezonefinder
10 | pygments>=2.7.4 # not directly required, pinned by Snyk to avoid a vulnerability
11 |
--------------------------------------------------------------------------------
/projects/weather_forecasting/dashboard/weather_forecasting.py:
--------------------------------------------------------------------------------
1 | import os
2 | from dataclasses import dataclass
3 | from datetime import datetime
4 | from typing import Optional, List
5 |
6 | import pandas as pd
7 | import streamlit as st
8 | from dataclasses_json import dataclass_json
9 |
10 | from flytekit.remote import FlyteRemote
11 | from flytekit.models import filters
12 | from flytekit.models.admin.common import Sort
13 |
14 |
15 | @dataclass_json
16 | @dataclass
17 | class Scores:
18 | # keep track of mean absolute error
19 | train_exp_mae: float = 0.0
20 | valid_exp_mae: float = 0.0
21 |
22 |
23 | @dataclass_json
24 | @dataclass
25 | class Prediction:
26 | air_temp: Optional[float]
27 | dew_temp: Optional[float]
28 | date: datetime
29 | error: Optional[str] = None
30 | imputed: bool = False
31 |
32 |
33 | @dataclass_json
34 | @dataclass
35 | class Forecast:
36 | created_at: datetime
37 | model_id: str
38 | predictions: List[Prediction]
39 |
40 |
41 | LOGO = "https://docs.flyte.org/en/latest/_static/flyte_circle_gradient_1_4x4.png"
42 |
43 | LAUNCH_PLAN_MAP = {
44 | "seattle": "seattle_weather_forecast_v2",
45 | "atlanta": "atlanta_weather_forecast_v2",
46 | "hyderabad": "hyderabad_weather_forecast_v2",
47 | "mumbai": "mumbai_weather_forecast_v2",
48 | "taipei": "taipei_weather_forecast_v2",
49 | "appleton": "appleton_weather_forecast_v2",
50 | "dharamshala": "dharamshala_weather_forecast_v2",
51 | "fremont": "fremont_weather_forecast_v2",
52 | }
53 |
54 |
55 | CITY_LABEL_MAP = {
56 | "atlanta": "Atlanta, GA USA",
57 | "seattle": "Seattle, WA USA",
58 | "hyderabad": "Hyderabad, Telangana India",
59 | "mumbai": "Mumbai, MH India",
60 | "taipei": "Taipei, Taiwan",
61 | "appleton": "Green Bay, WI USA",
62 | "dharamshala": "Dharamsala, HP India",
63 | "fremont": "Fremont, CA USA",
64 | }
65 |
66 |
67 | remote = FlyteRemote.from_config(
68 | default_project="flytelab",
69 | default_domain="development",
70 | config_file_path=os.path.join(
71 | os.path.abspath(os.path.dirname(__file__)), "flyte.config"
72 | )
73 | )
74 |
75 | st.set_page_config(
76 | page_title="flytelab - weather forecasts",
77 | page_icon=LOGO,
78 | )
79 |
80 | _, _, col, *_ = st.columns(5)
81 | with col:
82 | st.image(LOGO, width=100)
83 | st.title("Flytelab: Weather Forecasts ⛈☀️☔️")
84 |
85 | """
86 | This app displays the weather forecasts produced by a model
87 | that was trained using [flyte](https://flyte.org/). For more information
88 | see the [flytelab weather forecasting project](https://github.com/flyteorg/flytelab/tree/main/projects/weather_forecasting).
89 | """
90 |
91 | selected_city = st.selectbox(
92 | "Select a City",
93 | options=[
94 | "atlanta",
95 | "seattle",
96 | "hyderabad",
97 | "mumbai",
98 | "taipei",
99 | "appleton",
100 | "dharamshala",
101 | "fremont",
102 | ],
103 | format_func=lambda x: CITY_LABEL_MAP[x]
104 | )
105 |
106 | [latest_execution, *_], _ = remote.client.list_executions_paginated(
107 | "flytelab",
108 | "development",
109 | limit=1,
110 | filters=[
111 | filters.Equal("launch_plan.name", LAUNCH_PLAN_MAP[selected_city]),
112 | filters.Equal("phase", "SUCCEEDED"),
113 | ],
114 | sort_by=Sort.from_python_std("desc(execution_created_at)"),
115 | )
116 |
117 | wf_execution = remote.fetch_workflow_execution(name=latest_execution.id.name)
118 | remote.sync(wf_execution, sync_nodes=False)
119 | forecast = Forecast.from_dict(wf_execution.outputs["forecast"])
120 | scores = wf_execution.outputs["scores"]
121 |
122 | with st.expander("Model Metadata"):
123 | st.markdown(f"""
124 | ```
125 | model_id: {forecast.model_id}
126 | created_at: {forecast.created_at}
127 | training exp-weighted-mae: {scores.train_exp_mae}
128 | validation exp-weighted-mae: {scores.valid_exp_mae}
129 | ```
130 | """)
131 |
132 | st.markdown(f"""
133 | ## {CITY_LABEL_MAP[selected_city]}
134 |
135 | Air Temperature and Dew Temperature Forecast (°C)
136 | """)
137 |
138 | air_temp = []
139 | dew_temp = []
140 | datetime_index = []
141 | for p in forecast.predictions:
142 | date = p.date.replace(tzinfo=None)
143 | if date < pd.Timestamp.now().floor("D").to_pydatetime():
144 | continue
145 | air_temp.append(p.air_temp)
146 | dew_temp.append(p.dew_temp)
147 | datetime_index.append(date)
148 |
149 | data = pd.DataFrame(
150 | {"air_temp": air_temp, "dew_temp": dew_temp},
151 | index=datetime_index
152 | )
153 |
154 | st.line_chart(data)
155 |
156 | st.markdown(f"""
157 | Predictions powered by [flyte](https://flyte.org/)
158 | """)
159 |
--------------------------------------------------------------------------------
/projects/weather_forecasting/in_container.mk:
--------------------------------------------------------------------------------
1 | SERIALIZED_PB_OUTPUT_DIR := /tmp/output
2 |
3 | .PHONY: clean
4 | clean:
5 | rm -rf $(SERIALIZED_PB_OUTPUT_DIR)/*
6 |
7 | $(SERIALIZED_PB_OUTPUT_DIR): clean
8 | mkdir -p $(SERIALIZED_PB_OUTPUT_DIR)
9 |
10 | .PHONY: serialize
11 | serialize: $(SERIALIZED_PB_OUTPUT_DIR)
12 | pyflyte --config /root/flyte.config serialize workflows -f $(SERIALIZED_PB_OUTPUT_DIR)
13 |
14 | .PHONY: register
15 | register: serialize
16 | flyte-cli register-files -h ${FLYTE_HOST} ${INSECURE_FLAG} -p ${PROJECT} -d development -v ${VERSION} --kubernetes-service-account ${SERVICE_ACCOUNT} --output-location-prefix ${OUTPUT_DATA_PREFIX} $(SERIALIZED_PB_OUTPUT_DIR)/*
17 |
18 | .PHONY: fast_serialize
19 | fast_serialize: $(SERIALIZED_PB_OUTPUT_DIR)
20 | pyflyte --config /root/flyte.config serialize fast workflows -f $(SERIALIZED_PB_OUTPUT_DIR)
21 |
22 | .PHONY: fast_register
23 | fast_register: fast_serialize
24 | flyte-cli fast-register-files -h ${FLYTE_HOST} ${INSECURE_FLAG} -p ${PROJECT} -d development --kubernetes-service-account ${SERVICE_ACCOUNT} --output-location-prefix ${OUTPUT_DATA_PREFIX} --additional-distribution-dir ${ADDL_DISTRIBUTION_DIR} $(SERIALIZED_PB_OUTPUT_DIR)/*
25 |
--------------------------------------------------------------------------------
/projects/weather_forecasting/requirements.txt:
--------------------------------------------------------------------------------
1 | flytekit==0.30.0
2 | flytekitplugins-pandera==0.30.0
3 | flyteidl
4 | geopy
5 | joblib
6 | pandas
7 | pandera
8 | requests
9 | sklearn
10 | timezonefinder
11 |
--------------------------------------------------------------------------------
/projects/weather_forecasting/scripts/activate-launch-plans.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | if [ -z "$1" ]
3 | then
4 | version=$(git rev-parse HEAD)
5 | else
6 | version=$1
7 | fi
8 |
9 | locations="atlanta seattle hyderabad mumbai taipei appleton dharamshala fremont"
10 |
11 |
12 | activate () {
13 | flytectl -c .flyte/remote-config.yaml \
14 | update launchplan \
15 | -p flytelab \
16 | -d development \
17 | "$1_weather_forecast_v2" \
18 | --version $version \
19 | --activate
20 | }
21 |
22 | for location in $locations
23 | do
24 | echo activating launch plan version $version for $location
25 | activate $location
26 | echo
27 | done
28 |
--------------------------------------------------------------------------------
/projects/weather_forecasting/scripts/archive-launch-plans.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | if [ -z "$1" ]
3 | then
4 | version=$(git rev-parse HEAD)
5 | else
6 | version=$1
7 | fi
8 |
9 | locations="atlanta seattle hyderabad mumbai taipei appleton dharamshala fremont"
10 |
11 |
12 | archive () {
13 | flytectl -c .flyte/remote-config.yaml \
14 | update launchplan \
15 | -p flytelab \
16 | -d development \
17 | "$1_weather_forecast_v2" \
18 | --version $version \
19 | --archive
20 | }
21 |
22 | for location in $locations
23 | do
24 | echo archiving launch plan version $version for $location
25 | archive $location
26 | done
27 |
--------------------------------------------------------------------------------
/projects/weather_forecasting/scripts/launch-plan-status.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 | if [ -z "$1" ]
3 | then
4 | version=$(git rev-parse HEAD)
5 | else
6 | version=$1
7 | fi
8 |
9 | locations="atlanta seattle hyderabad mumbai taipei appleton dharamshala fremont"
10 |
11 |
12 | get-status () {
13 | flytectl -c .flyte/remote-config.yaml \
14 | get launchplan \
15 | -p flytelab \
16 | -d development \
17 | -o yaml \
18 | --latest \
19 | "$1_weather_forecast_v2"
20 | }
21 |
22 | for location in $locations
23 | do
24 | echo "launch plan status for $location, version: $version"
25 | get-status $location | grep state
26 | echo
27 | done
28 |
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/.flyte/remote-config.yaml:
--------------------------------------------------------------------------------
1 | admin:
2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com
3 | endpoint: dns:///playground.hosted.unionai.cloud
4 | authType: ClientSecret
5 | # Change insecure flag to ensure that you use the right setting for your environment
6 | storage:
7 | type: stow
8 | stow:
9 | kind: s3
10 | config:
11 | auth_type: iam
12 | region: us-east-2
13 | logger:
14 | # Logger settings to control logger output. Useful to debug logger:
15 | show-source: true
16 | level: 1
17 |
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/.flyte/remote.config:
--------------------------------------------------------------------------------
1 | [sdk]
2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. These workflows and tasks are then serialized during the `make serialize` commands
3 | workflow_packages=.projects.whats_cooking_good_looking.whats_cooking_good_looking
4 |
5 | [auth]
6 | raw_output_data_prefix=s3://open-compute-playground
7 |
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/.flyte/sandbox-config.yaml:
--------------------------------------------------------------------------------
1 | admin:
2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com
3 | endpoint: dns:///localhost:30081
4 | authType: Pkce
5 | insecure: true
6 | logger:
7 | show-source: true
8 | level: 0
9 | storage:
10 | connection:
11 | access-key: minio
12 | auth-type: accesskey
13 | disable-ssl: true
14 | endpoint: http://localhost:30084
15 | region: us-east-1
16 | secret-key: miniostorage
17 | type: minio
18 | container: "my-s3-bucket"
19 | enable-multicontainer: true
20 |
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/.flyte/sandbox.config:
--------------------------------------------------------------------------------
1 | [sdk]
2 | # This option specifies the python packages where-in to search for workflows and tasks workflow packages. These workflows and tasks are then serialized during the `make serialize` commands
3 | workflow_packages=whats_cooking_good_looking
4 |
5 | [auth]
6 | raw_output_data_prefix=s3://my-s3-bucket/flytelab
7 |
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9-slim-buster
2 |
3 | WORKDIR /root
4 | ENV VENV /opt/venv
5 | ENV LANG C.UTF-8
6 | ENV LC_ALL C.UTF-8
7 | ENV PYTHONPATH /root
8 |
9 | # e.g. flyte.config or sandbox.config
10 | ARG config
11 |
12 |
13 | RUN apt-get update && \
14 | apt-get install -y \
15 | libsm6 \
16 | libxext6 \
17 | libxrender-dev \
18 | ffmpeg \
19 | build-essential
20 |
21 | # Install the AWS cli separately to prevent issues with boto being written over
22 | RUN pip3 install awscli
23 |
24 | ENV VENV /opt/venv
25 |
26 | # Virtual environment
27 | RUN python3 -m venv ${VENV}
28 | ENV PATH="${VENV}/bin:$PATH"
29 |
30 | # Install Python dependencies
31 | COPY requirements.txt /root
32 | RUN pip install -r /root/requirements.txt
33 |
34 | COPY whats_cooking_good_looking /root/whats_cooking_good_looking
35 | COPY $config /root/flyte.config
36 |
37 |
38 | ARG creds
39 | RUN echo $creds | base64 -d > /root/google_creds.json
40 | ENV GOOGLE_APPLICATION_CREDENTIALS "/root/google_creds.json"
41 |
42 | # This image is supplied by the build script and will be used to determine the version
43 | # when registering tasks, workflows, and launch plans
44 | ARG image
45 | ENV FLYTE_INTERNAL_IMAGE $image
46 |
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/README.md:
--------------------------------------------------------------------------------
1 | # Whats_cooking_good_looking
2 |
3 | ## Problem statement
4 |
5 | The world of beauty is in constant evolution: new molecules, new brands, newly discovered benefits. Innovating is time-consuming, especially when you are a large corporation, so a possible strategy is to find "indie brands" that innovate and that the public likes, and buy them out.
6 | Social networks are a perfect place to talk about beauty, and therefore a great place to discover such brands.
7 | The problem then lies in how to detect the new brands and trendy products or benefits, to help the mergers division find the right acquisition.
8 |
9 | ## Target Solution implementation
10 |
11 |
12 |
13 |
14 |
15 | In order to extract brands from a tweet, we implemented a NER (named entity recognition) pipeline that:
16 | 1. Retrieves tweets related to beauty based on keywords
17 | 2. Applies a pretrained NER model to those posts (see the sketch after this list)
18 | 3. Sends model results to a labelling interface and waits for a manual annotation to check given results
19 | 4. Computes evaluation metrics (so far only accuracy but it would be interesting to compute precision and recall as well)
20 | 5. 1. If metrics are good enough (defined by a business standard), the pipeline ends
21 | 5. 2. If metrics are not good enough, it sends those labeled posts to a training task and loops back through steps 2 - 3 - 4
22 |
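As a rough sketch of step 2 above (illustrative only; the real task lives in `whats_cooking_good_looking/apply_ner_workflow.py`, and the example tweet and brand name are made up):

```python
# Minimal sketch: run a pretrained spaCy pipeline over one tweet and collect
# the character-level entity spans, in the same spirit as the apply pipeline.
import spacy

nlp = spacy.load("en_core_web_sm")  # pretrained English model
doc = nlp("Loving the new rose water toner from GlowLab!")  # made-up tweet
entities = [
    {"label": ent.label_, "start_char": ent.start_char, "end_char": ent.end_char}
    for ent in doc.ents
]
print(entities)
```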
23 |
24 | ## Actual Solution implementation
25 |
26 |
27 |
28 |
29 |
30 | The project is cut into 3 steps:
31 |
32 | ### 1. NER application pipeline
33 | 1. Retrieves tweets related to beauty based on keywords
34 | 2. Applies a NER model to those posts
35 | 3. Sends model results, in a format that Label Studio can load, to a GCS bucket
36 |
37 | To run this pipeline locally please run
38 | ```python whats_cooking_good_looking/apply_ner_workflow.py```
39 |
40 |
41 |
42 |
43 |
44 | ### 2. Manual labelling part in Label Studio
45 |
46 |
47 |
48 |
49 |
50 |
51 | ### 3. NER training pipeline
52 | 1. Retrieves labelled tasks (Label Studio output)
53 | 2. Computes model accuracy based on those labelled observations
54 | 3. 1. If metrics are good enough, the pipeline stops
55 | 3. 2. If metrics are not good enough, labelled tasks are used as input to train a new NER model
56 |
57 | The goal was to create a feedback loop in which it was possible to iterate by training NER models on new manual annotations. We chose to split the work into two pipelines to avoid the networking constraints that we would otherwise have to handle and that were out of scope for this hackathon.
58 |
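As a rough sketch of the accuracy computation in step 2 (illustrative only; the span format and names are assumptions, and the actual logic lives in `whats_cooking_good_looking/train_ner_workflow.py`):

```python
# Minimal sketch: entity-level accuracy as the fraction of annotated spans
# that the model predicted exactly (same label and character offsets).
def entity_accuracy(predicted: list, annotated: list) -> float:
    if not annotated:
        return 1.0
    matched = sum(1 for span in annotated if span in predicted)
    return matched / len(annotated)

# spans as (label, start_char, end_char) tuples; values are made up
predictions = [("BRAND", 38, 45)]
annotations = [("BRAND", 38, 45), ("PRODUCT", 15, 31)]
print(entity_accuracy(predictions, annotations))  # 0.5
```
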
59 | To run this pipeline locally please run
60 | ```python whats_cooking_good_looking/train_ner_workflow.py```
61 |
62 |
63 |
64 |
65 | ## Pipeline deployment
66 |
67 | The goal of this pipeline is to deploy workflows automatically to the playground on pushed commits.
68 |
69 | ### Steps and flow
70 | The steps are:
71 | 1. Build the Dockerfile of our project
72 | 2. Push the built Docker image to the container registry of our GCP project
73 | 3. Serialize our workflows using the pyflyte CLI
74 | 4. Register the packages to our playground project
75 |
76 | Steps 3 and 4 use external Docker-based GitHub Actions that we made ourselves: [serialize](https://github.com/louisRDSC/FlyteSerializeAction), [register](https://github.com/louisRDSC/flyteRegisterAction)
77 |
78 | For all branches except main, the workflows are registered in the development environment. For main, the workflows are registered in the staging environment. Future iterations could register them in the production environment when a tag is created.
79 |
80 | In development, each version of a workflow is named as follows: \-\.
81 | This allows us to easily recognize our work while working simultaneously on different branches.
82 |
83 | In staging, each version of a workflow is named as follows: \
84 |
85 | ### How to make it work
86 |
87 | Four secrets are required for the pipeline to work:
88 | - ClIENT_ID: the ID to authenticate with the playground
89 | - ClIENT_SECRET: the secret to authenticate with the playground
90 | - RUNNER_KEY: the BASE64-encoded JSON key of a GCP service account with read and write rights on the GCS bucket where the data is pulled from and pushed to.
91 | - SERVICE_ACCOUNT_KEY: the JSON key of a service account with write rights on the container registry where we push our images
92 |
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/dashboard/app.py:
--------------------------------------------------------------------------------
1 | import os
2 | from argparse import ArgumentParser
3 | from pathlib import Path
4 |
5 | import streamlit as st
6 | from flytekit.models import filters
7 | from flytekit.models.admin.common import Sort
8 | from flytekit.remote import FlyteRemote
9 | from sklearn.datasets import load_digits
10 |
11 | PROJECT_NAME = "flytelab-whats_cooking_good_looking".replace("_", "-")
12 | WORKFLOW_NAME = "whats_cooking_good_looking.workflows.main"
13 |
14 |
15 | parser = ArgumentParser()
16 | parser.add_argument("--remote", action="store_true")
17 | args = parser.parse_args()
18 |
19 | backend = os.getenv("FLYTE_BACKEND", 'remote' if args.remote else 'sandbox')
20 |
21 | # configuration for accessing a Flyte cluster backend
22 | remote = FlyteRemote.from_config(
23 | default_project=PROJECT_NAME,
24 | default_domain="development",
25 | config_file_path=Path(__file__).parent / f"{backend}.config",
26 | )
27 |
28 | # get the latest workflow execution
29 | [latest_execution, *_], _ = remote.client.list_executions_paginated(
30 | PROJECT_NAME,
31 | "development",
32 | limit=1,
33 | filters=[
34 | filters.Equal("launch_plan.name", WORKFLOW_NAME),
35 | filters.Equal("phase", "SUCCEEDED"),
36 | ],
37 | sort_by=Sort.from_python_std("desc(execution_created_at)"),
38 | )
39 |
40 | wf_execution = remote.fetch_workflow_execution(name=latest_execution.id.name)
41 | remote.sync(wf_execution, sync_nodes=False)
42 | model = wf_execution.outputs["o0"]
43 | print(model)
44 |
45 |
46 | ############
47 | # App Code #
48 | ############
49 |
50 | data = load_digits(as_frame=True)
51 |
52 | st.write("# Flytelab: whats_cooking_good_looking")
53 | st.write("### TBD")
54 | st.write(f"Model: `{model}`")
55 |
56 | st.write("Use the slider below to select a sample for prediction")
57 |
58 | sample_index = st.slider(
59 | "Sample Number",
60 | min_value=0,
61 | max_value=data.frame.shape[0] - 1,
62 | value=0,
63 | step=1,
64 | )
65 |
66 | st.image(data.images[sample_index], clamp=True, width=300)
67 | st.write(f"Ground Truth: {data.target[sample_index]}")
68 | st.write(f"Prediction: {model.predict(data.frame[data.feature_names].loc[[sample_index]])[0]}")
69 |
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/dashboard/remote.config:
--------------------------------------------------------------------------------
1 | [platform]
2 | url=playground.hosted.unionai.cloud
3 | insecure=False
4 |
5 | [credentials]
6 | client_id=flytepropeller
7 | auth_mode=basic
8 | authorization_metadata-key=flyte-authorization
9 | oauth_scopes=all
10 |
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/dashboard/sandbox.config:
--------------------------------------------------------------------------------
1 | [platform]
2 | url=localhost:30081
3 | insecure=True
4 |
5 | [aws]
6 | access_key_id=minio
7 | secret_access_key=miniostorage
8 | endpoint=http://localhost:30084
9 |
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/deploy.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | from pathlib import Path
3 |
4 | import docker
5 | import git
6 | import typer
7 |
8 | app = typer.Typer()
9 |
10 | docker_client = docker.from_env()
11 |
12 |
13 | IMAGE_NAME = "flytelab"
14 | REGISTRY = "eu.gcr.io/flyte-sandbox-342013"
15 | PROJECT_NAME = "adorable-unicorns-23"
16 | DESCRIPTION = "TBD"
17 |
18 |
19 | def create_project(remote: bool):
20 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}-config.yaml"
21 | output = subprocess.run(
22 | [
23 | "flytectl",
24 | "get",
25 | "project",
26 | PROJECT_NAME,
27 | "--config", config,
28 | ],
29 | capture_output=True,
30 | )
31 | if output.stdout.decode().strip():
32 | return
33 |
34 | typer.echo(f"Creating project {PROJECT_NAME}")
35 | subprocess.run(
36 | [
37 | "flytectl",
38 | "create",
39 | "project",
40 | "--project", PROJECT_NAME,
41 | "--name", PROJECT_NAME,
42 | "--id", PROJECT_NAME,
43 | "--description", DESCRIPTION,
44 | "--config", config,
45 | ]
46 | )
47 |
48 |
49 | def get_version():
50 | repo = git.Repo(".", search_parent_directories=True)
51 | if repo.is_dirty():
52 | typer.echo("Please commit git changes before building", err=True)
53 | raise typer.Exit(code=1)
54 | commit = repo.rev_parse("HEAD")
55 | return commit.hexsha
56 |
57 |
58 | def get_tag(version):
59 | return f"{REGISTRY}/{IMAGE_NAME}:{PROJECT_NAME}-{version}"
60 |
61 |
62 | def sandbox_docker_build(tag):
63 | typer.echo("Building image in Flyte sandbox")
64 | subprocess.run([
65 | "flytectl",
66 | "sandbox",
67 | "exec",
68 | "--",
69 | "docker",
70 | "build",
71 | ".",
72 | "--tag",
73 | tag,
74 | ])
75 |
76 |
77 | def docker_build(tag: str, remote: bool) -> docker.models.images.Image:
78 | client = docker.from_env()
79 |
80 | # TODO: image build, push, flytectl serialization and registration
81 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}.config"
82 |
83 | typer.echo(f"Building image: {tag}")
84 | image, build_logs = client.images.build(
85 | path=".",
86 | dockerfile="Dockerfile",
87 | tag=tag,
88 | buildargs={
89 | "image": tag,
90 | "config": str(config),
91 | }
92 | )
93 | for line in build_logs:
94 | typer.echo(line)
95 | return image
96 |
97 |
98 | def docker_push(image: docker.models.images.Image):
99 | for line in docker_client.api.push(image.tags[0], stream=True, decode=True):
100 | typer.echo(line)
101 |
102 |
103 | def serialize(tag: str):
104 | typer.echo("Serializing Flyte workflows")
105 | subprocess.run([
106 | "pyflyte",
107 | "-c", ".flyte/remote.config",
108 | #"--pkgs", "whats_cooking_good_looking",
109 | "package",
110 | "--force",
111 | "--in-container-source-path", "/root",
112 | "--image", tag
113 | ])
114 |
115 |
116 | def register(version: str, remote: bool, domain: str):
117 | typer.echo("Registering Flyte workflows")
118 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}-config.yaml"
119 | subprocess.run([
120 | "flytectl",
121 | "-c", config,
122 | "register",
123 | "files",
124 |         "--project", PROJECT_NAME,
125 | "--domain", domain,
126 | "--archive", "flyte-package.tgz",
127 | "--force",
128 | "--version", version
129 | ])
130 |
131 |
132 | @app.command()
133 | def main(remote: bool = False, domain: str = "development"):
134 | create_project(remote)
135 | version = get_version()
136 | tag = get_tag(version)
137 | if remote:
138 | docker_push(docker_build(tag, remote))
139 | else:
140 | sandbox_docker_build(tag)
141 | serialize(tag)
142 | register(version, remote, domain)
143 |
144 |
145 | if __name__ == "__main__":
146 | app()
147 |
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/docs/actual_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/whats_cooking_good_looking/docs/actual_pipeline.png
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/docs/apply_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/whats_cooking_good_looking/docs/apply_pipeline.png
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/docs/label_studio.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/whats_cooking_good_looking/docs/label_studio.png
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/docs/target_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/whats_cooking_good_looking/docs/target_pipeline.png
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/docs/train_pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/whats_cooking_good_looking/docs/train_pipeline.png
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | docker
2 | gitpython
3 | streamlit
4 | typer
5 |
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/requirements.txt:
--------------------------------------------------------------------------------
1 | flytekit>=0.30.3
2 | s3fs>=2022.2.0
3 |
4 | snscrape==0.4.3.20220106
5 | spacy==3.2.3
6 | https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl
7 | google-cloud-storage==2.2.1
8 | gcsfs==2022.2.0
9 | click==7.1.2
10 | pandas==1.3.5
11 |
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/whats_cooking_good_looking/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/projects/whats_cooking_good_looking/whats_cooking_good_looking/__init__.py
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/whats_cooking_good_looking/apply_ner_workflow.py:
--------------------------------------------------------------------------------
1 | import json
2 | from pathlib import Path
3 | from typing import List
4 |
5 | import spacy
6 | from flytekit import Resources, task, workflow
7 | from snscrape.modules.twitter import TwitterSearchScraper
8 |
9 | from whats_cooking_good_looking.utils import (doc_to_spans, download_from_gcs,
10 | load_config, upload_to_gcs)
11 |
12 | SPACY_MODEL = {"en": "en_core_web_sm"}
13 |
14 | CACHE_VERSION = "2.2"
15 | request_resources = Resources(cpu="1", mem="500Mi", storage="500Mi")
16 | limit_resources = Resources(cpu="2", mem="1000Mi", storage="1000Mi")
17 |
18 |
19 | @task
20 | def get_tweets_list(
21 | keyword_list: List[str], lang: str = "en", max_results: int = 1000
22 | ) -> str:
23 | """Collects `max_results` tweets mentioning any of the words in `keywords_list` written in language `lang`.
24 |
25 | Args:
26 | keyword_list (List[str]): List of keywords that tweets must mention at least one of.
27 | lang (str, optional): Language in which tweets must be written(iso-code). Defaults to "en".
28 | max_results (int, optional): Number of maximum tweets to retrieve. Defaults to 1000.
29 |
30 | Returns:
31 | str: json dumped results with following shape
32 | [
33 | {
34 |                 "date": "2022-03-25 16:23:01+00:00",
35 | "tweet_id": "XXXXXXX",
36 | "text": "some tweet",
37 | "username": "some user"
38 | },
39 | ]
40 | """
41 | keywords_query = " OR ".join(keyword_list)
42 | query = f"({keywords_query}) lang:{lang}"
43 | tweets_list = []
44 | for tweet_idx, tweet_post in enumerate(TwitterSearchScraper(query).get_items()):
45 | if tweet_idx == max_results:
46 | break
47 | tweets_list.append(
48 | {
49 | "date": str(tweet_post.date),
50 | "tweet_id": str(tweet_post.id),
51 | "text": str(tweet_post.content),
52 | "username": str(tweet_post.username),
53 | }
54 | )
55 | return json.dumps(tweets_list)
56 |
57 |
58 | @task
59 | def load_model(
60 | lang: str,
61 | from_gcs: bool,
62 | gcs_bucket: str,
63 | gcs_source_blob_name: str,
64 | ) -> spacy.Language:
65 | """Loads spacy model either from gcs if specified or given the source language.
66 |
67 | Args:
68 | lang (str): Language in which tweets must be written(iso-code).
69 | from_gcs (bool): True if needs to download custom spacy model from gcs.
70 | gcs_bucket (str): bucket name where to retrieve spacy model if from_gcs.
71 | gcs_source_blob_name (str): blob name where to retrieve spacy model if from_gcs.
72 |
73 | Returns:
74 | Language: spacy model.
75 | """
76 | if from_gcs:
77 | Path("tmp").mkdir(parents=True, exist_ok=True)
78 | output_filename = download_from_gcs(
79 | gcs_bucket, gcs_source_blob_name, "tmp", explicit_filepath=True
80 | )[0]
81 | nlp = spacy.load(output_filename)
82 | else:
83 | model_name = SPACY_MODEL[lang]
84 | nlp = spacy.load(model_name)
85 | return nlp
86 |
87 |
88 | @task
89 | def apply_model(
90 |     nlp: spacy.Language, tweets_list: str, bucket_name: str, source_blob_name: str
91 | ) -> str:
92 | """Applies spacy model to each tweet to extract entities from and convert them into
93 | Label studio task format.
94 |
95 | Args:
96 | nlp (Language): Spacy model to use for inference.
97 | tweets_list (str): json dumped list of tweets.
98 | bucket_name (str): Name of the GCS bucket to upload to.
99 | source_blob_name (str): File name of GCS uploaded file.
100 |
101 | Returns:
102 | str: json dumped results with following shape
103 | [
104 | {
105 |                 "date": "2022-03-25 16:23:01+00:00",
106 |                 "tweet_id": "XXXXXXX",
107 |                 "text": "some tweet",
108 |                 "username": "some user",
109 | "entities": [
110 | {
111 | "label": "some label",
112 | "start_char": "index beginning char entity",
113 | "end_char": "index end char entity"
114 | },
115 | ]
116 |
117 | }
118 | ]
119 | """
120 | entities = set()
121 | labelstudio_tasks = []
122 | model_name = SPACY_MODEL["en"]
123 | for tweet in json.loads(tweets_list):
124 | predictions = []
125 | text = tweet["text"]
126 | doc = nlp(text)
127 | spans, ents = doc_to_spans(doc)
128 | entities |= ents
129 | predictions.append({"model_version": model_name, "result": spans})
130 | labelstudio_tasks.append({"data": {"text": text}, "predictions": predictions})
131 | with open("tasks.json", mode="w") as f:
132 | json.dump(labelstudio_tasks, f, indent=2)
133 | json_labelstudio_tasks = json.dumps(labelstudio_tasks)
134 | upload_to_gcs(
135 | bucket_name, source_blob_name, json_labelstudio_tasks, content_type=None
136 | )
137 | return json_labelstudio_tasks
138 |
139 |
140 | @workflow
141 | def main() -> str:
142 | """Main workflow searching for entities in beauty related tweets.
143 |
144 | Returns:
145 |         str: json dumped results with the following shape
146 | [
147 | {
148 |                 "date": "2022-03-25 16:23:01+00:00",
149 | "tweet_id": "XXXXXXX",
150 | "text": "some tweet",
151 |                 "username": "some user",
152 | "entities": [
153 | {
154 | "label": "some label",
155 | "start_char": "index beginning char entity",
156 | "end_char": "index end char entity"
157 | },
158 | ]
159 |
160 | }
161 | ]
162 | """
163 | config = load_config("apply")
164 | tweets_list = get_tweets_list(
165 | keyword_list=config["keyword_list"],
166 | lang=config["lang"],
167 | max_results=config["max_results"],
168 | )
169 | nlp = load_model(
170 | lang=config["lang"],
171 | from_gcs=config["from_gcs"],
172 | gcs_bucket=config["bucket_name"],
173 | gcs_source_blob_name=config["gcs_spacy_model_blob_name"],
174 | )
175 | return apply_model(
176 | nlp=nlp,
177 | tweets_list=tweets_list,
178 | bucket_name=config["bucket_name"],
179 | source_blob_name=config["applied_model_output_blob_name"],
180 | )
181 |
182 |
183 | if __name__ == "__main__":
184 | print(f"Applied model: {main()}")
185 |
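For reference, here is a minimal local sketch of the Label Studio task format that `apply_model` builds, assuming the `en_core_web_sm` pipeline is installed (`python -m spacy download en_core_web_sm`), that the script runs from the project root so the package is importable, and using the `doc_to_spans` helper defined in `utils.py`; the tweet text is invented for illustration:

```python
# Sketch: turn one tweet into a Label Studio task, mirroring apply_model above.
import json

import spacy

from whats_cooking_good_looking.utils import doc_to_spans

nlp = spacy.load("en_core_web_sm")
tweet = {"text": "Trying the new rose water toner from Paris today"}

doc = nlp(tweet["text"])
spans, entities = doc_to_spans(doc)

# One Label Studio task: the raw text plus model predictions in span format.
task = {
    "data": {"text": tweet["text"]},
    "predictions": [{"model_version": "en_core_web_sm", "result": spans}],
}
print(json.dumps(task, indent=2))
print("Entity labels seen:", entities)
```

Each such dict is what `apply_model` serializes and uploads to GCS so it can later be imported into Label Studio as a pre-annotated task.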
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/whats_cooking_good_looking/config.json:
--------------------------------------------------------------------------------
1 | {"apply":
2 | {
3 | "lang": "en",
4 | "keyword_list": ["beauty", "skin", "hair"],
5 | "max_results": 1000,
6 | "from_gcs": false,
7 | "bucket_name": "wcgl_data",
8 | "gcs_spacy_model_blob_name": "",
9 | "applied_model_output_blob_name": "label_in/tasks_out.json"
10 | },
11 | "train":
12 | {
13 | "bucket_name": "wcgl_data",
14 | "training_iterations": 10,
15 | "bucket_label_out_name": "wcgl_label_out",
16 | "model_name": "dummy",
17 | "label_studio_output_blob_name": "annotations.json",
18 | "model_output_blob_name": "spacy_model/models/dummy.pkl"
19 | }
20 | }
21 |
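As a quick orientation, a sketch of how this file is consumed: `load_config` (defined in `utils.py`) returns either the `apply` or the `train` section as a plain dict, and the workflows read their parameters from it:

```python
# Sketch: reading config.json the same way the workflows do.
from whats_cooking_good_looking.utils import load_config

apply_cfg = load_config("apply")
train_cfg = load_config("train")

print(apply_cfg["keyword_list"])         # ['beauty', 'skin', 'hair']
print(apply_cfg["max_results"])          # 1000
print(train_cfg["training_iterations"])  # 10
```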
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/whats_cooking_good_looking/keywords.txt:
--------------------------------------------------------------------------------
1 | açai
2 | acid serum
3 | acne prone
4 | active charcoal
5 | adaptogens
6 | advanced hydrators
7 | african black soap
8 | agentnateur
9 | ahas
10 | algae
11 | almond milk
12 | aloe soothing
13 | aloe vera
14 | alpha hydroxy acids
15 | amino
16 | ampoule
17 | amrezy highlighter
18 | anti pollution skincare
19 | antioxidant lip
20 | aox
21 | ardellbeauty
22 | ascorbic acid
23 | ash blonde
24 | babylights
25 | baking setting
26 | bakuchiol
27 | balayage tutorials
28 | balm
29 | bar soap
30 | beauty blender
31 | beauty oil
32 | beauty sponge
33 | beauty wellness
34 | benzoate
35 | bergamot
36 | beta carotene
37 | beta glucan
38 | biba eyeshadow
39 | big hair
40 | blaze
41 | blonde lightener
42 | blue light
43 | blueberry
44 | blush duo
45 | blush stix
46 | body butter
47 | body shimmer
48 | bottom lashes
49 | bounce
50 | box dye
51 | brassy
52 | bridal hair
53 | bright blonde
54 | brightening essence
55 | brightening serum
56 | brightens
57 | bronde
58 | bronzer blush
59 | bronzer brush
60 | bronzer highlighter
61 | brow blade
62 | brow duo
63 | brow pen
64 | brow routine
65 | brow styler
66 | browbone
67 | brown brow
68 | buckthorn
69 | burdock root
70 | bushy brow
71 | butyrospermum parkii
72 | calendula
73 | calm serum
74 | camu camu
75 | carotene
76 | carrageenan
77 | cbd
78 | cellulose mask
79 | cica
80 | cilantro
81 | cleansing balm
82 | collagen
83 | cucumber water
84 | damp hair
85 | deep hydration
86 | deeper lip
87 | dermaplane
88 | detox
89 | dew wet
90 | dewy glow
91 | dipbrow gel
92 | diy lip scrub
93 | double cleansing
94 | dragon fruit
95 | dream palette
96 | drmigy
97 | dry oil
98 | dryness
99 | elastin
100 | elastin
101 | elemis
102 | encapsulated
103 | endorphins
104 | enlight halo
105 | enlight powder
106 | environmental aggressors
107 | enzyme
108 | enzyme cleansing
109 | eraser
110 | essence oil
111 | essential oils
112 | exfoliator
113 | eye brush
114 | eye creams
115 | eye masks
116 | eyeliner easy
117 | eyeliner tutorial
118 | face massaging
119 | face mist
120 | face roller
121 | face serum
122 | facial hydrating
123 | falsies
124 | faux mink
125 | ferment
126 | fine mist
127 | finish foundation
128 | flaking
129 | flare ups
130 | flash palette
131 | flyliner
132 | fornax
133 | fouurthraybeauty
134 | fragrance free
135 | full on glam
136 | full reveal
137 | full tutorial
138 | fuller
139 | fuse
140 | gel like texture
141 | gel liners
142 | gel lips
143 | gemstone
144 | gentle exfoliation
145 | gentle peel
146 | ginseng
147 | glam eyeshadow
148 | glam palette
149 | glassy
150 | gloss set
151 | glossy lip
152 | glow gloss
153 | gluconolactone
154 | golden blonde
155 | grapeseed
156 | green level
157 | green tea
158 | grey blending
159 | growth factors
160 | gua sha
161 | hada labo
162 | hair oil
163 | halo powder
164 | harmful ingredients
165 | hazel
166 | healthy Glow
167 | heat damage
168 | helichrysum
169 | hemp
170 | hooded eyes
171 | hormonal
172 | hormonal acne
173 | hormones
174 | hyaluronic acid
175 | hydra mist
176 | hydrating foundation
177 | hydrating gel
178 | hydrating serum
179 | hydrocolloid
180 | hydrogel
181 | hydrojelly mask
182 | hydroquinone
183 | injectables
184 | injections
185 | iunik
186 | jeju
187 | jelly cleanser
188 | jojoba
189 | jojoba seed
190 | kakadu plum
191 | klairs
192 | kombucha
193 | konjac sponge
194 | korean skin care
195 | kose
196 | krave
197 | kush brow
198 | kush fiber
199 | kvd
200 | lactobacillus
201 | lanolin
202 | laser treatment
203 | lash glue
204 | lash line
205 | lashes super long
206 | lemon
207 | lifts
208 | light blonde
209 | light hydrating
210 | light makeup
211 | light to dark
212 | lip glosses
213 | lip kit
214 | lip liners
215 | lip lines
216 | lip mask
217 | lip oil
218 | lip topper
219 | lip treatment
220 | lipid
221 | lipoic acid
222 | liquid glass
223 | liquid peel
224 | liquid powder
225 | lob
226 | long lashes
227 | long lasting
228 | long wearing
229 | longwear foundation shade
230 | lotus
231 | low ph
232 | lower lash
233 | lowlights
234 | luminous foundation shade
235 | lunarbeauty
236 | magenta
237 | magnesium
238 | makeup geek
239 | makeup hair
240 | mandelic acid
241 | matcha
242 | matifying
243 | matte eyeshadows
244 | matte metal
245 | matte shadows
246 | matte velvet
247 | mehronmakeup
248 | melasma
249 | melted matte
250 | microdermabrasion
251 | microneedling
252 | milk honey
253 | milky
254 | mineral spf
255 | mini palette
256 | mink lashes
257 | mist
258 | misty mauve
259 | mochi
260 | moringa
261 | mothership palette
262 | mud
263 | mugwort
264 | multi purpose
265 | multi tonal
266 | murad
267 | nabla
268 | nanoparticles
269 | natural finish
270 | natural hyaluronic
271 | natural oils
272 | neck cream
273 | neon pink and neon yellow
274 | neroli
275 | niacinamide booster
276 | night cream
277 | night repair
278 | night routine
279 | nude lip
280 | nude shades
281 | oats
282 | oil Infused
283 | oil soluble
284 | oil vision
285 | olive
286 | omega
287 | overnight mask
288 | packaging free
289 | pad
290 | pale yellow
291 | palette dark
292 | palette fornax
293 | palette shades
294 | palmitate
295 | palmitoyl
296 | papaya
297 | paraben
298 | passionfruit
299 | patch testing
300 | peanut
301 | peel off mask
302 | peptides antioxidants
303 | perma
304 | permagel
305 | peroxide
306 | phloretin
307 | physical exfoliant
308 | pigment palette
309 | pink salt
310 | plouise makeup
311 | pollution protection
312 | porous
313 | powder palette
314 | priming
315 | priming moisturizer
316 | prism
317 | pro pigment
318 | probiotic skincare
319 | propanediol
320 | propolis
321 | protection cream
322 | purifying mask
323 | purito
324 | purple hair
325 | purple raven
326 | purples
327 | rainbow hair
328 | red light therapy
329 | refillable
330 | regrowth
331 | reiki
332 | reishi
333 | rejuvenating
334 | rejuvenation
335 | remover
336 | renewal mask
337 | resurfacing mask
338 | retinoid
339 | retinol
340 | rice toner
341 | root melt
342 | root shadow
343 | rootagé
344 | rose elixir
345 | rose hip oil
346 | rose toner
347 | rose water
348 | rosehip
349 | rosewater facial
350 | routine styling
351 | safari
352 | safflower
353 | sake
354 | sakura
355 | salicylic acid
356 | sativa
357 | seaweed
358 | semi opaque
359 | semi sheer
360 | serum retinol
361 | set powder
362 | shades in matte
363 | shea
364 | sheet masks
365 | shimmer shades
366 | shine spray
367 | silver hair
368 | silver smoke
369 | single layer
370 | sleeping masks
371 | sls
372 | smashbox studio
373 | smoky eye
374 | smoky quartz
375 | smudge brush
376 | smudge proof
377 | snail
378 | snowflake
379 | spoolie
380 | squalene
381 | stearate
382 | stem cell
383 | stick masks
384 | stix
385 | strengthening
386 | sugarpill
387 | sulfur
388 | sunflower
389 | super hydrating
390 | super pigmented
391 | super sensitive
392 | supercharged complex
393 | superfood
394 | sweet almond
395 | tattoo liner
396 | tea seed
397 | tea tree
398 | textured hair
399 | tinted mineral
400 | tony moly
401 | topper
402 | toxins
403 | treatment at home
404 | tube
405 | tuberose
406 | tyrosinase
407 | ultra matte
408 | upf
409 | urea
410 | vegan skincare
411 | velour lashes
412 | velvetine
413 | visible redness
414 | vitamin c serum
415 | volume and length
416 | walnuts
417 | water gel
418 | water loss
419 | waterline
420 | watermelon
421 | watery
422 | wet balayage
423 | wet balm
424 | wet hair
425 | white liquid
426 | white lotus
427 | willow bark
428 | witch hazel
429 | wiz taupe
430 | yellow shampoo
431 | zulu
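This keyword file is not read directly by the workflows above (they take `keyword_list` from `config.json`), but a search query can be built from it the same way `get_tweets_list` does; the relative path below is an assumption about the working directory:

```python
# Sketch: build the snscrape query string from the keyword file,
# mirroring get_tweets_list in apply_ner_workflow.py.
from pathlib import Path

keywords = Path("whats_cooking_good_looking/keywords.txt").read_text(encoding="utf-8").splitlines()
keywords = [kw.strip() for kw in keywords if kw.strip()]

# Only the first few keywords, to keep the example query short.
query = f"({' OR '.join(keywords[:3])}) lang:en"
print(query)  # (açai OR acid serum OR acne prone) lang:en
```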
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/whats_cooking_good_looking/train_ner_workflow.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pickle
3 | import random
4 | from collections import defaultdict
5 |
6 | import spacy
7 | from flytekit import Resources, dynamic, task, workflow
8 | from spacy.language import Language
9 | from spacy.training import Example
10 | from spacy.util import compounding, minibatch
11 |
12 | from whats_cooking_good_looking.apply_ner_workflow import load_model
13 | from whats_cooking_good_looking.utils import (download_bytes_from_gcs,
14 | load_config, upload_to_gcs)
15 |
16 | SPACY_MODEL = {"en": "en_core_web_sm"}
17 |
18 | CACHE_VERSION = "2.2"
19 | request_resources = Resources(cpu="1", mem="500Mi", storage="500Mi")
20 | limit_resources = Resources(cpu="2", mem="1000Mi", storage="1000Mi")
21 |
22 | THRESHOLD_ACCURACY = 0.7
23 |
24 |
25 | @task
26 | def evaluate_ner(labelstudio_tasks: bytes) -> dict:
27 |     """Computes the accuracy of each NER model from Label Studio annotation output.
28 |
29 | Args:
30 |         labelstudio_tasks (bytes): json dumped list of Label Studio annotation dicts with the following format
31 | [
32 | {
33 | "result": [
34 | {
35 | "value": {"start": 10, "end": 17, "text": "Chennai", "labels": ["LOC"]},
36 | "from_name": "label",
37 | "to_name": "text",
38 | "type": "labels",
39 | "origin": "manual",
40 | }
41 | ],
42 | "predictions": [
43 | {
44 | "result": {"start": 10, "end": 17, "text": "Chennai", "labels": ["LOC"]},
45 | "model_version": "en_core_web_sm",
46 | }
47 | ],
48 | }
49 | ]
50 |
51 | Returns:
52 | dict: mapping {model_name: accuracy}
53 |
54 | """
55 | model_acc = dict()
56 | model_hits = defaultdict(int)
57 |     tasks = json.loads(labelstudio_tasks)
58 |     for ls_task in tasks:
59 |         annotation_result = ls_task["result"][0]["value"]
60 |         annotation_result.pop("id", None)
61 | for prediction in ls_task["predictions"]:
62 | model_version = prediction["model_version"]
63 | model_hits[model_version] += int(prediction["result"] == annotation_result)
64 |
65 |     num_task = len(tasks)
66 | for model_name, num_hits in model_hits.items():
67 | acc = num_hits / num_task
68 | model_acc[model_name] = acc
69 |         print(f"Accuracy for {model_name}: {acc:.2%}")
70 | return model_acc
71 |
72 |
73 | @task
74 | def load_tasks(bucket_name: str, source_blob_name: str) -> bytes:
75 | """Loads Label Studio annotations.
76 |
77 | Args:
78 | bucket_name (str): GCS bucket name where tasks are stored.
79 | source_blob_name (str): GCS blob name where tasks are stored.
80 |
81 | Returns:
82 |         bytes: json dumped tasks
83 | """
84 | labelstudio_tasks = download_bytes_from_gcs(
85 | bucket_name=bucket_name, source_blob_name=source_blob_name
86 | )
87 | return labelstudio_tasks
88 |
89 |
90 | @task
91 | def format_tasks_for_train(labelstudio_tasks: bytes) -> str:
92 |     """Formats Label Studio output into training data for a custom spaCy model.
93 |
94 | Args:
95 |         labelstudio_tasks (bytes): json dumped labelstudio_tasks
96 |
97 | Returns:
98 | str: json dumped train data formatted
99 | """
100 | train_data = []
101 | for ls_task in json.loads(labelstudio_tasks):
102 | entities = [
103 | (ent["value"]["start"], ent["value"]["end"], label)
104 | for ent in ls_task["result"]
105 | for label in ent["value"]["labels"]
106 | ]
107 | if entities != []:
108 | train_data.append((ls_task["task"]["data"]["text"], {"entities": entities}))
109 | return json.dumps(train_data)
110 |
111 |
112 | @task
113 | def train_model(
114 | train_data: str,
115 | nlp: Language,
116 | training_iterations: int,
117 | bucket_out: str,
118 | source_blob_name: str,
119 | ) -> Language:
120 |     """Uses newly labelled data to improve the spaCy NER model and uploads the trained model to GCS.
121 |
122 | Args:
123 |         train_data (str): json dumped training data. Once loaded, the format \
124 | should be the following:
125 | train_data = [
126 | ("Text to detect Entities in.", {"entities": [(15, 23, "PRODUCT")]}),
127 | ("Flyte is another example of organisation.", {"entities": [(0, 6, "ORG")]}),
128 | ]
129 | nlp (Language): Spacy base model to train on.
130 |         training_iterations (int): Number of training iterations to make.
131 |
132 | Returns:
133 | Language: Trained spacy model
134 | """
135 | train_data = json.loads(train_data)
136 | ner = nlp.get_pipe("ner")
137 | for _, annotations in train_data:
138 | for ent in annotations.get("entities"):
139 | ner.add_label(ent[2])
140 | pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
141 | unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
142 | print("Starting model training")
143 | with nlp.disable_pipes(*unaffected_pipes):
144 |         optimizer = nlp.create_optimizer()
145 | for iteration in range(training_iterations):
146 | random.shuffle(train_data)
147 | losses = {}
148 | batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
149 | for batch in batches:
150 | for text, annotations in batch:
151 | doc = nlp.make_doc(text)
152 | example = Example.from_dict(doc, annotations)
153 | nlp.update([example], drop=0.35, losses=losses, sgd=optimizer)
154 |             print("Iteration", iteration)
155 | print("Losses", losses)
156 |     print("Model training completed!")
157 | upload_to_gcs(bucket_out, source_blob_name, pickle.dumps(nlp))
158 |     print("Model upload to GCS completed!")
159 | return nlp
160 |
161 |
162 | @dynamic(
163 | cache=False,
164 | requests=request_resources,
165 | limits=limit_resources,
166 | )
167 | def train_model_if_necessary(
168 | labelstudio_tasks: bytes,
169 | metrics_dict: dict,
170 | model_name: str,
171 | training_iterations: int,
172 | bucket_out: str,
173 | model_output_blob_name: str,
174 | ):
175 |     """Checks the model accuracy. If it is high enough, the pipeline stops; otherwise it trains a new model \
176 |     and uploads it to GCS.
177 |
178 | Args:
179 | labelstudio_tasks (bytes): Label studio annotations
180 | metrics_dict (dict): mapping between model name and accuracy
181 | model_name (str): model name from which we get accuracy
182 | training_iterations (int): number of training iterations for the spacy NER model
183 | """
184 | if metrics_dict[model_name] >= THRESHOLD_ACCURACY:
185 | print(f"No need to train. Accuracy of {metrics_dict[model_name]} is above threshold {THRESHOLD_ACCURACY}")
186 | else:
187 | train_data = format_tasks_for_train(labelstudio_tasks=labelstudio_tasks)
188 | nlp = load_model(
189 | lang="en",
190 | from_gcs=False,
191 | gcs_bucket=bucket_out,
192 | gcs_source_blob_name=model_output_blob_name,
193 | )
194 | nlp = train_model(
195 | train_data=train_data,
196 | nlp=nlp,
197 | training_iterations=training_iterations,
198 | bucket_out=bucket_out,
199 | source_blob_name=model_output_blob_name,
200 | )
201 |
202 |
203 | @workflow
204 | def main():
205 | """Main training workflow evaluating model based on labelled observations.
206 | * If accuracy is high enough, the pipeline ends
207 | * If accuracy is below threshold, the pipeline trains a new model based on those
208 | observations and dumps it on GCS
209 | """
210 | config = load_config("train")
211 | labelstudio_tasks = load_tasks(
212 | bucket_name=config["bucket_label_out_name"],
213 | source_blob_name=config["label_studio_output_blob_name"],
214 | )
215 | metrics_dict = evaluate_ner(labelstudio_tasks=labelstudio_tasks)
216 | train_model_if_necessary(
217 | labelstudio_tasks=labelstudio_tasks,
218 | metrics_dict=metrics_dict,
219 | training_iterations=config["training_iterations"],
220 | model_name=config["model_name"],
221 | bucket_out=config["bucket_name"],
222 | model_output_blob_name=config["model_output_blob_name"],
223 | )
224 |
225 |
226 | if __name__ == "__main__":
227 | main()
228 |
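To make the data handoff concrete, here is a small sketch of how one Label Studio annotation becomes the `(text, {"entities": [...]})` tuple that `train_model` consumes, mirroring `format_tasks_for_train` above; the annotation below is an invented example, not real labelled data:

```python
# Sketch: convert one Label Studio annotation into a spaCy training tuple.
ls_task = {
    "task": {"data": {"text": "Loving this rose water toner"}},
    "result": [
        {"value": {"start": 12, "end": 28, "text": "rose water toner", "labels": ["PRODUCT"]}}
    ],
}

entities = [
    (ent["value"]["start"], ent["value"]["end"], label)
    for ent in ls_task["result"]
    for label in ent["value"]["labels"]
]
train_example = (ls_task["task"]["data"]["text"], {"entities": entities})
print(train_example)
# ('Loving this rose water toner', {'entities': [(12, 28, 'PRODUCT')]})
```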
--------------------------------------------------------------------------------
/projects/whats_cooking_good_looking/whats_cooking_good_looking/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from itertools import groupby
4 | from pathlib import Path
5 | from typing import List, Union
6 |
7 | from google.cloud import storage
8 |
9 |
10 | def load_config(train_or_apply: str) -> dict:
11 | """Load config"""
12 | config_file_path = Path(__file__).parent.resolve() / "config.json"
13 | with open(config_file_path, "r") as f:
14 | config = json.load(f)
15 | return config[train_or_apply]
16 |
17 |
18 | def doc_to_spans(doc):
19 | """This function converts spaCy docs to the list of named entity spans in Label Studio compatible JSON format"""
20 | tokens = [(tok.text, tok.idx, tok.ent_type_) for tok in doc]
21 | results = []
22 | entities = set()
23 | for entity, group in groupby(tokens, key=lambda t: t[-1]):
24 | if not entity:
25 | continue
26 | group = list(group)
27 | _, start, _ = group[0]
28 | word, last, _ = group[-1]
29 | text = " ".join(item[0] for item in group)
30 | end = last + len(word)
31 | results.append(
32 | {
33 | "from_name": "label",
34 | "to_name": "text",
35 | "type": "labels",
36 | "value": {"start": start, "end": end, "text": text, "labels": [entity]},
37 | }
38 | )
39 | entities.add(entity)
40 |
41 | return results, entities
42 |
43 |
44 | def load_train_data(train_data_files: List[str]) -> List:
45 | """Load jsonl train data as a list, ready to be ingested by spacy model.
46 |
47 | Args:
48 |         train_data_files (List[str]): Paths of jsonl files to load.
49 |
50 | Returns:
51 | List: Tuple of texts and dict of entities to be used for training.
52 | """
53 | train_data = []
54 | for data_file in train_data_files:
55 | with open(data_file, "r") as f:
56 | for json_str in list(f):
57 | train_data_dict = json.loads(json_str)
58 | train_text = train_data_dict["text"]
59 | train_entities = {
60 | "entities": [
61 | tuple(entity_elt) for entity_elt in train_data_dict["entities"]
62 | ]
63 | }
64 | formatted_train_line = (train_text, train_entities)
65 | train_data.append(formatted_train_line)
66 | return train_data
67 |
68 |
69 | def download_from_gcs(
70 | bucket_name: str,
71 | source_blob_name: str,
72 | destination_folder: str,
73 | explicit_filepath: bool = False,
74 | ) -> Union[str, List[str]]:
75 | """Download gcs data locally.
76 |
77 | Args:
78 | bucket_name (str): Name of the GCS bucket.
79 | source_blob_name (str): GCS path to data in the bucket.
80 | destination_folder (str): Folder to download GCS data to.
81 | explicit_filepath (bool, optional): Decides whether to return explicit list of filepath instead \
82 |             of the destination folder only. Defaults to False.
83 |
84 | Returns:
85 |         Union[str, List[str]]: Local destination folder, or the list of downloaded file paths if explicit_filepath is True.
86 | """
87 | storage_client = storage.Client()
88 | bucket = storage_client.bucket(bucket_name)
89 | blobs = bucket.list_blobs(prefix=source_blob_name)
90 | filepath_list = []
91 | for blob in blobs:
92 | if not blob.name.endswith("/"):
93 | filename = blob.name.replace("/", "_")
94 | local_path = os.path.join(destination_folder, filename)
95 | blob.download_to_filename(local_path)
96 | filepath_list.append(local_path)
97 | print(f"Downloaded at {destination_folder}")
98 | if explicit_filepath:
99 | return filepath_list
100 | return destination_folder
101 |
102 |
103 | def download_bytes_from_gcs(bucket_name, source_blob_name):
104 | storage_client = storage.Client()
105 | bucket = storage_client.bucket(bucket_name)
106 | blob = bucket.blob(source_blob_name)
107 | return blob.download_as_string()
108 |
109 |
110 | def upload_to_gcs(bucket_name, source_blob_name, data, content_type=None):
111 | storage_client = storage.Client()
112 | bucket = storage_client.bucket(bucket_name)
113 | blob = bucket.blob(source_blob_name)
114 | blob.upload_from_string(data, content_type=content_type)
115 |
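A usage sketch of the GCS helpers above, round-tripping a small JSON payload; the bucket name comes from `config.json`, but credentials and bucket access are assumed to be set up already (e.g. via `GOOGLE_APPLICATION_CREDENTIALS`):

```python
# Sketch: upload a small JSON payload to GCS and read it back with the helpers above.
import json

from whats_cooking_good_looking.utils import download_bytes_from_gcs, upload_to_gcs

payload = json.dumps([{"text": "some tweet"}])
upload_to_gcs("wcgl_data", "label_in/example.json", payload, content_type="application/json")

raw = download_bytes_from_gcs("wcgl_data", "label_in/example.json")
print(json.loads(raw))  # [{'text': 'some tweet'}]
```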
--------------------------------------------------------------------------------
/templates/_common/deploy.py:
--------------------------------------------------------------------------------
1 | import os
2 | import subprocess
3 | import uuid
4 | from pathlib import Path
5 |
6 | import docker
7 | import git
8 | import typer
9 |
10 |
11 | app = typer.Typer()
12 |
13 | docker_client = docker.from_env()
14 |
15 |
16 | IMAGE_NAME = "flytelab"
17 | REGISTRY = "ghcr.io/{{cookiecutter.github_username}}".lower()
18 | PROJECT_NAME = "{{cookiecutter.flyte_project}}"
19 | DESCRIPTION = "{{cookiecutter.description}}"
20 |
21 |
22 | def create_project(remote: bool):
23 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}-config.yaml"
24 | output = subprocess.run(
25 | [
26 | "flytectl",
27 | "get",
28 | "project",
29 | PROJECT_NAME,
30 | "--config", config,
31 | ],
32 | capture_output=True,
33 | check=True,
34 | )
35 | if output.stdout.decode().strip():
36 | return
37 |
38 | typer.echo(f"Creating project {PROJECT_NAME}")
39 | subprocess.run(
40 | [
41 | "flytectl",
42 | "create",
43 | "project",
44 | "--project", PROJECT_NAME,
45 | "--name", PROJECT_NAME,
46 | "--id", PROJECT_NAME,
47 | "--description", DESCRIPTION,
48 | "--config", config,
49 | ],
50 | check=True,
51 | )
52 |
53 |
54 | def get_version(fast: bool):
55 | repo = git.Repo(".", search_parent_directories=True)
56 | if not fast and repo.is_dirty():
57 | typer.echo(
58 | "Please commit git changes before building. If you haven't updated any system/python dependencies "
59 | "but want to deploy task/workflow code changes, use the --fast flag to do fast registration.",
60 | err=True
61 | )
62 | raise typer.Exit(code=1)
63 | commit = repo.rev_parse("HEAD")
64 | return commit.hexsha
65 |
66 |
67 | def get_tag(version, registry=None):
68 | return f"{REGISTRY if registry is None else registry}/{IMAGE_NAME}:{PROJECT_NAME}-{version}"
69 |
70 |
71 | def sandbox_docker_build(tag):
72 | typer.echo("Building image in Flyte sandbox")
73 | subprocess.run(
74 | [
75 | "flytectl",
76 | "sandbox",
77 | "exec",
78 | "--",
79 | "docker",
80 | "build",
81 | ".",
82 | "--tag",
83 | tag,
84 | ],
85 | check=True,
86 | )
87 |
88 |
89 | def docker_build(tag: str, remote: bool) -> docker.models.images.Image:
90 | client = docker.from_env()
91 |
92 | # TODO: image build, push, flytectl serialization and registration
93 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}.config"
94 |
95 | typer.echo(f"Building image: {tag}")
96 | image, build_logs = client.images.build(
97 | path=".",
98 | dockerfile="Dockerfile",
99 | tag=tag,
100 | buildargs={
101 | "image": tag,
102 | "config": str(config),
103 | }
104 | )
105 | for line in build_logs:
106 | typer.echo(line)
107 | return image
108 |
109 |
110 | def docker_push(image: docker.models.images.Image):
111 | for line in docker_client.api.push(image.tags[0], stream=True, decode=True):
112 | typer.echo(line)
113 |
114 |
115 | def serialize(tag: str, remote: bool, fast: bool):
116 | typer.echo("Serializing Flyte workflows")
117 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}.config"
118 | package = Path(".") / "flyte-package.tgz"
119 | if package.exists():
120 | os.remove(package)
121 | subprocess.run(
122 | [
123 | "pyflyte",
124 | "-c", str(config),
125 | "--pkgs", "{{cookiecutter.project_name}}",
126 | "package",
127 | "--force",
128 | "--image", tag,
129 | *(
130 | ["--fast"]
131 | if fast
132 | else ["--in-container-source-path", "/root"]
133 | ),
134 | ],
135 | check=True,
136 | # inject the FLYTE_SANDBOX environment variable to the serialization runtime
137 | env={"FLYTE_SANDBOX": "1" if not remote else "0", **os.environ},
138 | )
139 |
140 |
141 | def register(version: str, remote: bool, fast: bool, domain: str):
142 | typer.echo("Registering Flyte workflows")
143 | config = Path(".flyte") / f"{'remote' if remote else 'sandbox'}-config.yaml"
144 | if fast:
145 | version = f"{version}-fast{uuid.uuid4().hex[:7]}"
146 | subprocess.run(
147 | [
148 | "flytectl",
149 | "-c", config,
150 | "register",
151 | "files",
152 | "--project", PROJECT_NAME,
153 | "--domain", domain,
154 | "--archive", "flyte-package.tgz",
155 | "--force",
156 | "--version", version
157 | ],
158 | check=True,
159 | )
160 | typer.echo(f"Successfully registered version {version}")
161 |
162 |
163 | @app.command()
164 | def main(remote: bool = False, fast: bool = False, domain: str = "development", registry: str = None):
165 | if remote and fast:
166 |         typer.echo(
167 |             "Fast registration is not enabled when deploying to remote. "
168 |             "Please deploy your workflows without the --fast flag.", err=True
169 |         )
170 |         raise typer.Exit(code=1)
171 | create_project(remote)
172 | version = get_version(fast)
173 | tag = get_tag(version, registry)
174 | if not fast:
175 | if remote:
176 | docker_push(docker_build(tag, remote))
177 | else:
178 | sandbox_docker_build(tag)
179 | serialize(tag, remote, fast)
180 | register(version, remote, fast, domain)
181 |
182 |
183 | if __name__ == "__main__":
184 | app()
185 |
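For a quick smoke test of the CLI wiring without a shell, the Typer app can be driven through `typer.testing.CliRunner`; this is a sketch that assumes it runs from a generated project directory where `deploy.py` is importable and where `flytectl` and Docker are available:

```python
# Sketch: invoke the deploy CLI programmatically. Equivalent to running
# `python deploy.py --remote --domain development` from the project directory.
from typer.testing import CliRunner

from deploy import app

runner = CliRunner()
result = runner.invoke(app, ["--remote", "--domain", "development"])
print(result.output)
```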
--------------------------------------------------------------------------------
/templates/basic/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/templates/basic/README.md
--------------------------------------------------------------------------------
/templates/basic/cookiecutter.json:
--------------------------------------------------------------------------------
1 | {
2 | "project_name": null,
3 | "project_author": null,
4 | "github_username": null,
5 | "flyte_project": "{{ cookiecutter.project_author|lower|replace(' ', '-')|replace('_', '-') }}",
6 | "description": "A flytelab project"
7 | }
8 |
--------------------------------------------------------------------------------
/templates/basic/hooks/pre_gen_project.py:
--------------------------------------------------------------------------------
1 | import re
2 | import sys
3 |
4 | PROJECT_NAME_REGEX = r'^[_a-zA-Z][_a-zA-Z0-9]+$'
5 | PROJECT_AUTHOR_REGEX = r'^[-_a-zA-Z0-9 ]+$'
6 | FLYTE_PROJECT_REGEX = r'^[-a-z0-9]+$'
7 |
8 | project_name = '{{ cookiecutter.project_name }}'
9 | project_author = '{{ cookiecutter.project_author }}'
10 | flyte_project = '{{ cookiecutter.flyte_project }}' or '{{ cookiecutter.project_author }}'.lower().replace("_", "-").replace(" ", "-")
11 |
12 | if not re.match(PROJECT_NAME_REGEX, project_name):
13 | print(f"ERROR: project_name '{project_name}' is invalid. Must match the expression {PROJECT_NAME_REGEX}")
14 | sys.exit(1)
15 |
16 | if not re.match(PROJECT_AUTHOR_REGEX, project_author):
17 | print(f"ERROR: project_author '{project_author}' is invalid. Must match the expression {PROJECT_AUTHOR_REGEX}")
18 | sys.exit(1)
19 |
20 | if not re.match(FLYTE_PROJECT_REGEX, flyte_project):
21 | print(f"ERROR: flyte_project '{flyte_project}' is invalid. Must match the expression {FLYTE_PROJECT_REGEX}")
22 | sys.exit(1)
23 |
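To illustrate which values this hook accepts and rejects, a small sketch using the same regular expressions:

```python
# Sketch: values that pass or fail the cookiecutter validation above.
import re

PROJECT_NAME_REGEX = r'^[_a-zA-Z][_a-zA-Z0-9]+$'
FLYTE_PROJECT_REGEX = r'^[-a-z0-9]+$'

print(bool(re.match(PROJECT_NAME_REGEX, "my_project")))  # True
print(bool(re.match(PROJECT_NAME_REGEX, "my-project")))  # False: hyphens not allowed in project_name
print(bool(re.match(FLYTE_PROJECT_REGEX, "jane-doe")))   # True
print(bool(re.match(FLYTE_PROJECT_REGEX, "Jane Doe")))   # False: lowercase letters, digits, hyphens only
```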
--------------------------------------------------------------------------------
/templates/basic/{{cookiecutter.project_name}}/.dockerignore:
--------------------------------------------------------------------------------
1 | !.flyte
--------------------------------------------------------------------------------
/templates/basic/{{cookiecutter.project_name}}/.flyte/remote-config.yaml:
--------------------------------------------------------------------------------
1 | admin:
2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com
3 | endpoint: dns:///playground.hosted.unionai.cloud
4 | authType: Pkce
5 | # Change insecure flag to ensure that you use the right setting for your environment
6 | insecure: false
7 | storage:
8 | type: stow
9 | stow:
10 | kind: s3
11 | config:
12 | auth_type: iam
13 | region: us-east-2
14 | logger:
15 | # Logger settings to control logger output. Useful to debug logger:
16 | show-source: true
17 | level: 1
18 |
--------------------------------------------------------------------------------
/templates/basic/{{cookiecutter.project_name}}/.flyte/remote.config:
--------------------------------------------------------------------------------
1 | [sdk]
2 | # This option specifies the python packages in which to search for workflow and task definitions. These workflows and tasks are then serialized during the `make serialize` commands
3 | workflow_packages={{cookiecutter.project_name}}
4 |
5 | [auth]
6 | raw_output_data_prefix=s3://open-compute-playground
7 |
--------------------------------------------------------------------------------
/templates/basic/{{cookiecutter.project_name}}/.flyte/sandbox-config.yaml:
--------------------------------------------------------------------------------
1 | admin:
2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com
3 | endpoint: dns:///localhost:30081
4 | authType: Pkce
5 | insecure: true
6 | logger:
7 | show-source: true
8 | level: 0
9 | storage:
10 | connection:
11 | access-key: minio
12 | auth-type: accesskey
13 | disable-ssl: true
14 | endpoint: http://localhost:30084
15 | region: us-east-1
16 | secret-key: miniostorage
17 | type: minio
18 | container: "my-s3-bucket"
19 | enable-multicontainer: true
20 |
--------------------------------------------------------------------------------
/templates/basic/{{cookiecutter.project_name}}/.flyte/sandbox.config:
--------------------------------------------------------------------------------
1 | [sdk]
2 | # This option specifies the python packages in which to search for workflow and task definitions. These workflows and tasks are then serialized during the `make serialize` commands
3 | workflow_packages={{cookiecutter.project_name}}
4 |
5 | [auth]
6 | raw_output_data_prefix=s3://my-s3-bucket/flytelab
7 |
--------------------------------------------------------------------------------
/templates/basic/{{cookiecutter.project_name}}/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.9-slim-buster
2 |
3 | WORKDIR /root
4 | ENV VENV /opt/venv
5 | ENV LANG C.UTF-8
6 | ENV LC_ALL C.UTF-8
7 | ENV PYTHONPATH /root
8 |
9 | # e.g. flyte.config or sandbox.config
10 | ARG config
11 |
12 | RUN apt-get update && \
13 | apt-get install -y \
14 | libsm6 \
15 | libxext6 \
16 | libxrender-dev \
17 | ffmpeg \
18 | build-essential
19 |
20 | # Install the AWS cli separately to prevent issues with boto being written over
21 | RUN pip3 install awscli
22 |
23 | ENV VENV /opt/venv
24 |
25 | # Virtual environment
26 | RUN python3 -m venv ${VENV}
27 | ENV PATH="${VENV}/bin:$PATH"
28 |
29 | # Install Python dependencies
30 | COPY requirements.txt /root
31 | RUN pip install -r /root/requirements.txt
32 |
33 | COPY {{cookiecutter.project_name}} /root/{{cookiecutter.project_name}}
34 | COPY $config /root/flyte.config
35 |
36 | # This image is supplied by the build script and will be used to determine the version
37 | # when registering tasks, workflows, and launch plans
38 | ARG image
39 | ENV FLYTE_INTERNAL_IMAGE $image
40 |
--------------------------------------------------------------------------------
/templates/basic/{{cookiecutter.project_name}}/README.md:
--------------------------------------------------------------------------------
1 | # {{cookiecutter.project_name}}
2 |
3 | {{cookiecutter.description}}
4 |
--------------------------------------------------------------------------------
/templates/basic/{{cookiecutter.project_name}}/dashboard/app.py:
--------------------------------------------------------------------------------
1 | import os
2 | from argparse import ArgumentParser
3 | from pathlib import Path
4 |
5 | import streamlit as st
6 |
7 | from flytekit.remote import FlyteRemote
8 | from flytekit.models import filters
9 | from flytekit.models.admin.common import Sort
10 |
11 | from sklearn.datasets import load_digits
12 |
13 |
14 | PROJECT_NAME = "{{cookiecutter.flyte_project}}"
15 | WORKFLOW_NAME = "{{cookiecutter.project_name}}.workflows.main"
16 |
17 |
18 | parser = ArgumentParser()
19 | parser.add_argument("--remote", action="store_true")
20 | args = parser.parse_args()
21 |
22 | backend = os.getenv("FLYTE_BACKEND", 'remote' if args.remote else 'sandbox')
23 |
24 | # configuration for accessing a Flyte cluster backend
25 | remote = FlyteRemote.from_config(
26 | default_project=PROJECT_NAME,
27 | default_domain="development",
28 | config_file_path=Path(__file__).parent / f"{backend}.config",
29 | )
30 |
31 | # get the latest workflow execution
32 | [latest_execution, *_], _ = remote.client.list_executions_paginated(
33 | PROJECT_NAME,
34 | "development",
35 | limit=1,
36 | filters=[
37 | filters.Equal("launch_plan.name", WORKFLOW_NAME),
38 | filters.Equal("phase", "SUCCEEDED"),
39 | ],
40 | sort_by=Sort.from_python_std("desc(execution_created_at)"),
41 | )
42 |
43 | wf_execution = remote.fetch_workflow_execution(name=latest_execution.id.name)
44 | remote.sync(wf_execution, sync_nodes=False)
45 | model = wf_execution.outputs["o0"]
46 | print(model)
47 |
48 |
49 | ############
50 | # App Code #
51 | ############
52 |
53 | data = load_digits(as_frame=True)
54 |
55 | st.write("# Flytelab: {{cookiecutter.project_name}}")
56 | st.write("### {{cookiecutter.description}}")
57 | st.write(f"Model: `{model}`")
58 |
59 | st.write("Use the slider below to select a sample for prediction")
60 |
61 | sample_index = st.slider(
62 | "Sample Number",
63 | min_value=0,
64 | max_value=data.frame.shape[0] - 1,
65 | value=0,
66 | step=1,
67 | )
68 |
69 | st.image(data.images[sample_index], clamp=True, width=300)
70 | st.write(f"Ground Truth: {data.target[sample_index]}")
71 | st.write(f"Prediction: {model.predict(data.frame[data.feature_names].loc[[sample_index]])[0]}")
72 |
--------------------------------------------------------------------------------
/templates/basic/{{cookiecutter.project_name}}/dashboard/remote.config:
--------------------------------------------------------------------------------
1 | [platform]
2 | url=playground.hosted.unionai.cloud
3 | insecure=False
4 |
5 | [credentials]
6 | client_id=flytepropeller
7 | auth_mode=basic
8 | authorization_metadata-key=flyte-authorization
9 | oauth_scopes=all
10 |
--------------------------------------------------------------------------------
/templates/basic/{{cookiecutter.project_name}}/dashboard/sandbox.config:
--------------------------------------------------------------------------------
1 | [platform]
2 | url=localhost:30081
3 | insecure=True
4 |
5 | [aws]
6 | access_key_id=minio
7 | secret_access_key=miniostorage
8 | endpoint=http://localhost:30084
9 |
--------------------------------------------------------------------------------
/templates/basic/{{cookiecutter.project_name}}/deploy.py:
--------------------------------------------------------------------------------
1 | ../../_common/deploy.py
--------------------------------------------------------------------------------
/templates/basic/{{cookiecutter.project_name}}/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | docker
2 | gitpython
3 | streamlit
4 | typer
5 |
--------------------------------------------------------------------------------
/templates/basic/{{cookiecutter.project_name}}/requirements.txt:
--------------------------------------------------------------------------------
1 | flytekit>=0.30.3
2 | pandas
3 | s3fs
4 | sklearn
5 |
--------------------------------------------------------------------------------
/templates/basic/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/templates/basic/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/__init__.py
--------------------------------------------------------------------------------
/templates/basic/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/workflows.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 | from sklearn.datasets import load_digits
3 | from sklearn.linear_model import LogisticRegression
4 |
5 | from flytekit import task, workflow
6 |
7 |
8 | @task
9 | def get_dataset() -> pd.DataFrame:
10 | return load_digits(as_frame=True).frame
11 |
12 |
13 | @task
14 | def train_model(dataset: pd.DataFrame) -> LogisticRegression:
15 | model = LogisticRegression()
16 | features, target = dataset[[x for x in dataset if x != "target"]], dataset["target"]
17 | return model.fit(features, target)
18 |
19 |
20 | @workflow
21 | def main() -> LogisticRegression:
22 | return train_model(dataset=get_dataset())
23 |
24 |
25 | if __name__ == "__main__":
26 | print(f"trained model: {main()}")
27 |
--------------------------------------------------------------------------------
/templates/pytorch-gpu/README.md:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/templates/pytorch-gpu/README.md
--------------------------------------------------------------------------------
/templates/pytorch-gpu/cookiecutter.json:
--------------------------------------------------------------------------------
1 | {
2 | "project_name": null,
3 | "project_author": null,
4 | "github_username": null,
5 | "description": "A flytelab project"
6 | }
7 |
--------------------------------------------------------------------------------
/templates/pytorch-gpu/hooks/pre_gen_project.py:
--------------------------------------------------------------------------------
1 | import re
2 | import sys
3 |
4 | PROJECT_NAME_REGEX = r'^[_a-zA-Z][_a-zA-Z0-9]+$'
5 | PROJECT_AUTHOR_REGEX = r'^[-_a-zA-Z0-9 ]+$'
6 | FLYTE_PROJECT_REGEX = r'^[-a-z0-9]+$'
7 |
8 | project_name = '{{ cookiecutter.project_name }}'
9 | project_author = '{{ cookiecutter.project_author }}'
10 | flyte_project = '{{ cookiecutter.flyte_project }}' or '{{ cookiecutter.project_author }}'.lower().replace("_", "-").replace(" ", "-")
11 |
12 | if not re.match(PROJECT_NAME_REGEX, project_name):
13 | print(f"ERROR: project_name '{project_name}' is invalid. Must match the expression {PROJECT_NAME_REGEX}")
14 | sys.exit(1)
15 |
16 | if not re.match(PROJECT_AUTHOR_REGEX, project_author):
17 | print(f"ERROR: project_author '{project_author}' is invalid. Must match the expression {PROJECT_AUTHOR_REGEX}")
18 | sys.exit(1)
19 |
20 | if not re.match(FLYTE_PROJECT_REGEX, flyte_project):
21 | print(f"ERROR: flyte_project '{flyte_project}' is invalid. Must match the expression {FLYTE_PROJECT_REGEX}")
22 | sys.exit(1)
23 |
--------------------------------------------------------------------------------
/templates/pytorch-gpu/{{cookiecutter.project_name}}/.dockerignore:
--------------------------------------------------------------------------------
1 | !.flyte
--------------------------------------------------------------------------------
/templates/pytorch-gpu/{{cookiecutter.project_name}}/.flyte/remote-config.yaml:
--------------------------------------------------------------------------------
1 | admin:
2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com
3 | endpoint: dns:///playground.hosted.unionai.cloud
4 | authType: Pkce
5 | # Change insecure flag to ensure that you use the right setting for your environment
6 | insecure: false
7 | storage:
8 | type: stow
9 | stow:
10 | kind: s3
11 | config:
12 | auth_type: iam
13 | region: us-east-2
14 | logger:
15 | # Logger settings to control logger output. Useful to debug logger:
16 | show-source: true
17 | level: 1
18 |
--------------------------------------------------------------------------------
/templates/pytorch-gpu/{{cookiecutter.project_name}}/.flyte/remote.config:
--------------------------------------------------------------------------------
1 | [sdk]
2 | # This option specifies the python packages in which to search for workflow and task definitions. These workflows and tasks are then serialized during the `make serialize` commands
3 | workflow_packages={{cookiecutter.project_name}}
4 |
5 | [auth]
6 | raw_output_data_prefix=s3://open-compute-playground
7 |
--------------------------------------------------------------------------------
/templates/pytorch-gpu/{{cookiecutter.project_name}}/.flyte/sandbox-config.yaml:
--------------------------------------------------------------------------------
1 | admin:
2 | # For GRPC endpoints you might want to use dns:///flyte.myexample.com
3 | endpoint: dns:///localhost:30081
4 | authType: Pkce
5 | insecure: true
6 | logger:
7 | show-source: true
8 | level: 0
9 | storage:
10 | connection:
11 | access-key: minio
12 | auth-type: accesskey
13 | disable-ssl: true
14 | endpoint: http://localhost:30084
15 | region: us-east-1
16 | secret-key: miniostorage
17 | type: minio
18 | container: "my-s3-bucket"
19 | enable-multicontainer: true
20 |
--------------------------------------------------------------------------------
/templates/pytorch-gpu/{{cookiecutter.project_name}}/.flyte/sandbox.config:
--------------------------------------------------------------------------------
1 | [sdk]
2 | # This option specifies the python packages in which to search for workflow and task definitions. These workflows and tasks are then serialized during the `make serialize` commands
3 | workflow_packages={{cookiecutter.project_name}}
4 |
5 | [auth]
6 | raw_output_data_prefix=s3://my-s3-bucket/flytelab
7 |
--------------------------------------------------------------------------------
/templates/pytorch-gpu/{{cookiecutter.project_name}}/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM pytorch/pytorch:1.9.0-cuda10.2-cudnn7-runtime
2 |
3 | WORKDIR /root
4 | ENV VENV /opt/venv
5 | ENV LANG C.UTF-8
6 | ENV LC_ALL C.UTF-8
7 | ENV PYTHONPATH /root
8 |
9 | # e.g. flyte.config or sandbox.config
10 | ARG config
11 |
12 | RUN apt-get update && \
13 | apt-get install -y \
14 | libsm6 \
15 | libxext6 \
16 | libxrender-dev \
17 | ffmpeg \
18 | build-essential
19 |
20 | # Install the AWS cli separately to prevent issues with boto being written over
21 | RUN pip3 install awscli
22 |
23 | ENV VENV /opt/venv
24 |
25 | # Virtual environment
26 | RUN python3 -m venv ${VENV}
27 | ENV PATH="${VENV}/bin:$PATH"
28 |
29 | # Install Python dependencies
30 | COPY requirements.txt /root
31 | RUN pip install -r /root/requirements.txt
32 |
33 | COPY {{cookiecutter.project_name}} /root/{{cookiecutter.project_name}}
34 | COPY $config /root/flyte.config
35 |
36 | # This image is supplied by the build script and will be used to determine the version
37 | # when registering tasks, workflows, and launch plans
38 | ARG image
39 | ENV FLYTE_INTERNAL_IMAGE $image
40 |
--------------------------------------------------------------------------------
/templates/pytorch-gpu/{{cookiecutter.project_name}}/README.md:
--------------------------------------------------------------------------------
1 | # {{cookiecutter.project_name}}
2 |
3 | {{cookiecutter.description}}
4 |
--------------------------------------------------------------------------------
/templates/pytorch-gpu/{{cookiecutter.project_name}}/dashboard/app.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import os
3 | import sys
4 | from argparse import ArgumentParser
5 | from pathlib import Path
6 |
7 | import streamlit as st
8 | import torch
9 |
10 | from flytekit.remote import FlyteRemote
11 | from flytekit.models import filters
12 | from flytekit.models.admin.common import Sort
13 | from sklearn.datasets import load_digits
14 |
15 |
16 | # import flytelab project source
17 | sys.path.append(str(Path(__file__).parent.parent))
18 | importlib.import_module("{{cookiecutter.project_name}}")
19 |
20 |
21 | PROJECT_NAME = "{{cookiecutter.flyte_project}}"
22 | WORKFLOW_NAME = "{{cookiecutter.project_name}}.workflows.main"
23 |
24 |
25 | parser = ArgumentParser()
26 | parser.add_argument("--remote", action="store_true")
27 | args = parser.parse_args()
28 |
29 | backend = os.getenv("FLYTE_BACKEND", 'remote' if args.remote else 'sandbox')
30 |
31 | # configuration for accessing a Flyte cluster backend
32 | remote = FlyteRemote.from_config(
33 | default_project=PROJECT_NAME,
34 | default_domain="development",
35 | config_file_path=Path(__file__).parent / f"{backend}.config",
36 | )
37 |
38 | # get the latest workflow execution
39 | [latest_execution, *_], _ = remote.client.list_executions_paginated(
40 | PROJECT_NAME,
41 | "development",
42 | limit=1,
43 | filters=[
44 | filters.Equal("launch_plan.name", WORKFLOW_NAME),
45 | filters.Equal("phase", "SUCCEEDED"),
46 | ],
47 | sort_by=Sort.from_python_std("desc(execution_created_at)"),
48 | )
49 |
50 | wf_execution = remote.fetch_workflow_execution(name=latest_execution.id.name)
51 | remote.sync(wf_execution, sync_nodes=False)
52 | model = wf_execution.outputs["o0"]
53 | print(model)
54 |
55 |
56 | ############
57 | # App Code #
58 | ############
59 |
60 | data = load_digits(as_frame=True)
61 |
62 | st.write("# Flytelab: {{cookiecutter.project_name}}")
63 | st.write("### {{cookiecutter.description}}")
64 | st.write(f"Model: `{model}`")
65 |
66 | st.write("Use the slider below to select a sample for prediction")
67 |
68 | sample_index = st.slider(
69 | "Sample Number",
70 | min_value=0,
71 | max_value=data.frame.shape[0] - 1,
72 | value=0,
73 | step=1,
74 | )
75 |
76 | st.image(data.images[sample_index], clamp=True, width=300)
77 | st.write(f"Ground Truth: {data.target[sample_index]}")
78 |
79 | data = torch.from_numpy(data.frame[data.feature_names].loc[[sample_index]].values).float()
80 | prediction = model(data).argmax().item()
81 | st.write(f"Prediction: {prediction}")
82 |
--------------------------------------------------------------------------------
/templates/pytorch-gpu/{{cookiecutter.project_name}}/dashboard/remote.config:
--------------------------------------------------------------------------------
1 | [platform]
2 | url=playground.hosted.unionai.cloud
3 | insecure=False
4 |
5 | [credentials]
6 | client_id=flytepropeller
7 | auth_mode=basic
8 | authorization_metadata-key=flyte-authorization
9 | oauth_scopes=all
10 |
--------------------------------------------------------------------------------
/templates/pytorch-gpu/{{cookiecutter.project_name}}/dashboard/sandbox.config:
--------------------------------------------------------------------------------
1 | [platform]
2 | url=localhost:30081
3 | insecure=True
4 |
5 | [aws]
6 | access_key_id=minio
7 | secret_access_key=miniostorage
8 | endpoint=http://localhost:30084
9 |
--------------------------------------------------------------------------------
/templates/pytorch-gpu/{{cookiecutter.project_name}}/deploy.py:
--------------------------------------------------------------------------------
1 | ../../_common/deploy.py
--------------------------------------------------------------------------------
/templates/pytorch-gpu/{{cookiecutter.project_name}}/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | docker
2 | gitpython
3 | streamlit
4 | typer
5 |
--------------------------------------------------------------------------------
/templates/pytorch-gpu/{{cookiecutter.project_name}}/requirements.txt:
--------------------------------------------------------------------------------
1 | flytekit>=0.30.3
2 | pandas
3 | s3fs
4 | sklearn
5 | torch
6 |
--------------------------------------------------------------------------------
/templates/pytorch-gpu/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/flyteorg/flytelab/67ff9a779004209c82f9f5df89f2873cc5acdd52/templates/pytorch-gpu/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/__init__.py
--------------------------------------------------------------------------------
/templates/pytorch-gpu/{{cookiecutter.project_name}}/{{cookiecutter.project_name}}/workflows.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | import numpy as np
4 | import pandas as pd
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | import torch.optim as optim
9 | from sklearn.datasets import load_digits
10 |
11 | from flytekit import task, workflow, Resources
12 |
13 |
14 | dataset_resources = Resources(cpu="1", mem="1Gi", storage="1Gi")
15 |
16 | # This conditional is used at deployment time to determine whether the
17 | # task uses CPUs or GPUs. The "FLYTE_SANDBOX" environment variable is
18 | # automatically set by the `deploy.py` script when serializing tasks/workflows
19 | training_resources = (
20 | Resources(cpu="1", mem="1Gi", storage="1Gi")
21 | if int(os.getenv("FLYTE_SANDBOX", "0"))
22 | else Resources(gpu="1", mem="4Gi", storage="4Gi")
23 | )
24 |
25 |
26 | DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
27 |
28 |
29 | class Model(nn.Module):
30 | def __init__(self, input_dim: int, hidden_dim: int, output_dim: int):
31 | super(Model, self).__init__()
32 | self.layer1 = nn.Linear(input_dim, hidden_dim)
33 | self.layer2 = nn.Linear(hidden_dim, hidden_dim)
34 | self.layer3 = nn.Linear(hidden_dim, output_dim)
35 |
36 | def forward(self, input):
37 | x = F.relu(self.layer1(input))
38 | x = F.relu(self.layer2(x))
39 |         return F.log_softmax(self.layer3(x), dim=1)
40 |
41 |
42 | @task(requests=dataset_resources, limits=dataset_resources)
43 | def get_dataset() -> pd.DataFrame:
44 | return load_digits(as_frame=True).frame
45 |
46 |
47 | def dataset_iterator(features, target, n_batches: int):
48 | for X, y in zip(np.array_split(features, n_batches), np.array_split(target, n_batches)):
49 | yield (
50 | torch.from_numpy(X.values).float().to(DEVICE),
51 | torch.from_numpy(y.values).long().to(DEVICE)
52 | )
53 |
54 |
55 | @task(requests=training_resources, limits=training_resources)
56 | def train_model(
57 | dataset: pd.DataFrame,
58 | hidden_dim: int,
59 | n_epochs: int,
60 | batch_size: int,
61 | learning_rate: float,
62 | ) -> Model:
63 | features, target = dataset[[x for x in dataset if x != "target"]], dataset["target"]
64 |
65 | # define the model
66 | n_classes = target.nunique()
67 | model = Model(features.shape[1], hidden_dim, n_classes).to(DEVICE)
68 | opt = optim.SGD(model.parameters(), lr=learning_rate)
69 |
70 | # iterate through n_epochs and n_batches of the training data
71 | n_batches = int(features.shape[0] / batch_size)
72 | for epoch in range(1, n_epochs + 1):
73 | for batch, (X, y) in enumerate(dataset_iterator(features, target, n_batches), 1):
74 |
75 | opt.zero_grad()
76 | y_hat = model(X)
77 | loss = F.nll_loss(y_hat, y)
78 | loss.backward()
79 | opt.step()
80 |
81 | accuracy = (y_hat.argmax(1) == y).float().mean()
82 |
83 | print(
84 | f"epoch={epoch:02d}: "
85 | f"batch {batch:02d}/{n_batches} - "
86 | f"loss={loss.item():0.04f}; "
87 | f"accuracy={accuracy:0.04f}"
88 | )
89 |
90 | return model.to("cpu")
91 |
92 |
93 | @workflow
94 | def main(
95 | hidden_dim: int = 300,
96 | n_epochs: int = 30,
97 | batch_size: int = 64,
98 | learning_rate: float = 0.001,
99 | ) -> Model:
100 | return train_model(
101 | dataset=get_dataset(),
102 | hidden_dim=hidden_dim,
103 | n_epochs=n_epochs,
104 | batch_size=batch_size,
105 | learning_rate=learning_rate,
106 | )
107 |
108 |
109 | if __name__ == "__main__":
110 | print(f"trained model: {main()}")
111 |
--------------------------------------------------------------------------------